5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 # if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105 # pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107 # pragma optimize("w",on )
108 # endif /* BUGGY_MSC6 */
112 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 I32 whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* Capture buffer count, (OPEN). */
135 I32 cpar; /* Capture buffer count, (CLOSE). */
136 I32 nestroot; /* root parens we are in - used by accept */
139 regnode **open_parens; /* pointers to open parens */
140 regnode **close_parens; /* pointers to close parens */
141 regnode *opend; /* END node in program */
142 I32 utf8; /* whether the pattern is utf8 or not */
143 I32 orig_utf8; /* whether the pattern was originally in utf8 */
144 /* XXX use this for future optimisation of case
145 * where pattern must be upgraded to utf8. */
146 I32 uni_semantics; /* If a d charset modifier should use unicode
147 rules, even if the pattern is not in
149 HV *paren_names; /* Paren names */
151 regnode **recurse; /* Recurse regops */
152 I32 recurse_count; /* Number of recurse regops */
155 I32 override_recoding;
156 I32 in_multi_char_class;
157 struct reg_code_block *code_blocks; /* positions of literal (?{})
159 int num_code_blocks; /* size of code_blocks[] */
160 int code_index; /* next code_blocks[] slot */
162 char *starttry; /* -Dr: where regtry was called. */
163 #define RExC_starttry (pRExC_state->starttry)
165 SV *runtime_code_qr; /* qr with the runtime code blocks */
167 const char *lastparse;
169 AV *paren_name_list; /* idx -> name */
170 #define RExC_lastparse (pRExC_state->lastparse)
171 #define RExC_lastnum (pRExC_state->lastnum)
172 #define RExC_paren_name_list (pRExC_state->paren_name_list)
176 #define RExC_flags (pRExC_state->flags)
177 #define RExC_pm_flags (pRExC_state->pm_flags)
178 #define RExC_precomp (pRExC_state->precomp)
179 #define RExC_rx_sv (pRExC_state->rx_sv)
180 #define RExC_rx (pRExC_state->rx)
181 #define RExC_rxi (pRExC_state->rxi)
182 #define RExC_start (pRExC_state->start)
183 #define RExC_end (pRExC_state->end)
184 #define RExC_parse (pRExC_state->parse)
185 #define RExC_whilem_seen (pRExC_state->whilem_seen)
186 #ifdef RE_TRACK_PATTERN_OFFSETS
187 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
189 #define RExC_emit (pRExC_state->emit)
190 #define RExC_emit_start (pRExC_state->emit_start)
191 #define RExC_emit_bound (pRExC_state->emit_bound)
192 #define RExC_naughty (pRExC_state->naughty)
193 #define RExC_sawback (pRExC_state->sawback)
194 #define RExC_seen (pRExC_state->seen)
195 #define RExC_size (pRExC_state->size)
196 #define RExC_npar (pRExC_state->npar)
197 #define RExC_nestroot (pRExC_state->nestroot)
198 #define RExC_extralen (pRExC_state->extralen)
199 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
200 #define RExC_utf8 (pRExC_state->utf8)
201 #define RExC_uni_semantics (pRExC_state->uni_semantics)
202 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
203 #define RExC_open_parens (pRExC_state->open_parens)
204 #define RExC_close_parens (pRExC_state->close_parens)
205 #define RExC_opend (pRExC_state->opend)
206 #define RExC_paren_names (pRExC_state->paren_names)
207 #define RExC_recurse (pRExC_state->recurse)
208 #define RExC_recurse_count (pRExC_state->recurse_count)
209 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
210 #define RExC_contains_locale (pRExC_state->contains_locale)
211 #define RExC_override_recoding (pRExC_state->override_recoding)
212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
215 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
216 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
217 ((*s) == '{' && regcurly(s, FALSE)))
220 #undef SPSTART /* dratted cpp namespace... */
223 * Flags to be passed up and down.
225 #define WORST 0 /* Worst case. */
226 #define HASWIDTH 0x01 /* Known to match non-null strings. */
228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
229 * character. (There needs to be a case: in the switch statement in regexec.c
230 * for any node marked SIMPLE.) Note that this is not the same thing as
233 #define SPSTART 0x04 /* Starts with * or + */
234 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
235 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
239 /* whether trie related optimizations are enabled */
240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
241 #define TRIE_STUDY_OPT
242 #define FULL_TRIE_STUDY
248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
249 #define PBITVAL(paren) (1 << ((paren) & 7))
250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
254 /* If not already in utf8, do a longjmp back to the beginning */
255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
256 #define REQUIRE_UTF8 STMT_START { \
257 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
260 /* This converts the named class defined in regcomp.h to its equivalent class
261 * number defined in handy.h. */
262 #define namedclass_to_classnum(class) ((int) ((class) / 2))
263 #define classnum_to_namedclass(classnum) ((classnum) * 2)
265 /* About scan_data_t.
267 During optimisation we recurse through the regexp program performing
268 various inplace (keyhole style) optimisations. In addition study_chunk
269 and scan_commit populate this data structure with information about
270 what strings MUST appear in the pattern. We look for the longest
271 string that must appear at a fixed location, and we look for the
272 longest string that may appear at a floating location. So for instance
277 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
278 strings (because they follow a .* construct). study_chunk will identify
279 both FOO and BAR as being the longest fixed and floating strings respectively.
281 The strings can be composites, for instance
285 will result in a composite fixed substring 'foo'.
287 For each string some basic information is maintained:
289 - offset or min_offset
290 This is the position the string must appear at, or not before.
291 It also implicitly (when combined with minlenp) tells us how many
292 characters must match before the string we are searching for.
293 Likewise when combined with minlenp and the length of the string it
294 tells us how many characters must appear after the string we have
298 Only used for floating strings. This is the rightmost point that
299 the string can appear at. If set to I32 max it indicates that the
300 string can occur infinitely far to the right.
303 A pointer to the minimum number of characters of the pattern that the
304 string was found inside. This is important as in the case of positive
305 lookahead or positive lookbehind we can have multiple patterns
310 The minimum length of the pattern overall is 3, the minimum length
311 of the lookahead part is 3, but the minimum length of the part that
312 will actually match is 1. So 'FOO's minimum length is 3, but the
313 minimum length for the F is 1. This is important as the minimum length
314 is used to determine offsets in front of and behind the string being
315 looked for. Since strings can be composites this is the length of the
316 pattern at the time it was committed with a scan_commit. Note that
317 the length is calculated by study_chunk, so that the minimum lengths
318 are not known until the full pattern has been compiled, thus the
319 pointer to the value.
323 In the case of lookbehind the string being searched for can be
324 offset past the start point of the final matching string.
325 If this value was just blithely removed from the min_offset it would
326 invalidate some of the calculations for how many chars must match
327 before or after (as they are derived from min_offset and minlen and
328 the length of the string being searched for).
329 When the final pattern is compiled and the data is moved from the
330 scan_data_t structure into the regexp structure the information
331 about lookbehind is factored in, with the information that would
332 have been lost precalculated in the end_shift field for the
335 The fields pos_min and pos_delta are used to store the minimum offset
336 and the delta to the maximum offset at the current point in the pattern.
340 typedef struct scan_data_t {
341 /*I32 len_min; unused */
342 /*I32 len_delta; unused */
346 I32 last_end; /* min value, <0 unless valid. */
349 SV **longest; /* Either &l_fixed, or &l_float. */
350 SV *longest_fixed; /* longest fixed string found in pattern */
351 I32 offset_fixed; /* offset where it starts */
352 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
353 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
354 SV *longest_float; /* longest floating string found in pattern */
355 I32 offset_float_min; /* earliest point in string it can appear */
356 I32 offset_float_max; /* latest point in string it can appear */
357 I32 *minlen_float; /* pointer to the minlen relevant to the string */
358 I32 lookbehind_float; /* is the position of the string modified by LB */
362 struct regnode_charclass_class *start_class;
366 * Forward declarations for pregcomp()'s friends.
369 static const scan_data_t zero_scan_data =
370 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
372 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
373 #define SF_BEFORE_SEOL 0x0001
374 #define SF_BEFORE_MEOL 0x0002
375 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
376 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
379 # define SF_FIX_SHIFT_EOL (0+2)
380 # define SF_FL_SHIFT_EOL (0+4)
382 # define SF_FIX_SHIFT_EOL (+2)
383 # define SF_FL_SHIFT_EOL (+4)
386 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
387 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
389 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
390 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
391 #define SF_IS_INF 0x0040
392 #define SF_HAS_PAR 0x0080
393 #define SF_IN_PAR 0x0100
394 #define SF_HAS_EVAL 0x0200
395 #define SCF_DO_SUBSTR 0x0400
396 #define SCF_DO_STCLASS_AND 0x0800
397 #define SCF_DO_STCLASS_OR 0x1000
398 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
399 #define SCF_WHILEM_VISITED_POS 0x2000
401 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
402 #define SCF_SEEN_ACCEPT 0x8000
404 #define UTF cBOOL(RExC_utf8)
406 /* The enums for all these are ordered so things work out correctly */
407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
417 #define OOB_NAMEDCLASS -1
419 /* There is no code point that is out-of-bounds, so this is problematic. But
420 * its only current use is to initialize a variable that is always set before
422 #define OOB_UNICODE 0xDEADBEEF
424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
428 /* length of regex to show in messages that don't mark a position within */
429 #define RegexLengthToShowInErrorMessages 127
432 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
433 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
434 * op/pragma/warn/regcomp.
436 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
437 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
442 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
443 * arg. Show regex, up to a maximum length. If it's too long, chop and add
446 #define _FAIL(code) STMT_START { \
447 const char *ellipses = ""; \
448 IV len = RExC_end - RExC_precomp; \
451 SAVEFREESV(RExC_rx_sv); \
452 if (len > RegexLengthToShowInErrorMessages) { \
453 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
454 len = RegexLengthToShowInErrorMessages - 10; \
460 #define FAIL(msg) _FAIL( \
461 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
462 msg, (int)len, RExC_precomp, ellipses))
464 #define FAIL2(msg,arg) _FAIL( \
465 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
466 arg, (int)len, RExC_precomp, ellipses))
469 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
471 #define Simple_vFAIL(m) STMT_START { \
472 const IV offset = RExC_parse - RExC_precomp; \
473 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
480 #define vFAIL(m) STMT_START { \
482 SAVEFREESV(RExC_rx_sv); \
487 * Like Simple_vFAIL(), but accepts two arguments.
489 #define Simple_vFAIL2(m,a1) STMT_START { \
490 const IV offset = RExC_parse - RExC_precomp; \
491 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
492 (int)offset, RExC_precomp, RExC_precomp + offset); \
496 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
498 #define vFAIL2(m,a1) STMT_START { \
500 SAVEFREESV(RExC_rx_sv); \
501 Simple_vFAIL2(m, a1); \
506 * Like Simple_vFAIL(), but accepts three arguments.
508 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
509 const IV offset = RExC_parse - RExC_precomp; \
510 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
515 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
517 #define vFAIL3(m,a1,a2) STMT_START { \
519 SAVEFREESV(RExC_rx_sv); \
520 Simple_vFAIL3(m, a1, a2); \
524 * Like Simple_vFAIL(), but accepts four arguments.
526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
527 const IV offset = RExC_parse - RExC_precomp; \
528 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
529 (int)offset, RExC_precomp, RExC_precomp + offset); \
532 #define vFAIL4(m,a1,a2,a3) STMT_START { \
534 SAVEFREESV(RExC_rx_sv); \
535 Simple_vFAIL4(m, a1, a2, a3); \
538 /* m is not necessarily a "literal string", in this macro */
539 #define reg_warn_non_literal_string(loc, m) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
542 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
545 #define ckWARNreg(loc,m) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 (int)offset, RExC_precomp, RExC_precomp + offset); \
551 #define ckWARNregdep(loc,m) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
555 (int)offset, RExC_precomp, RExC_precomp + offset); \
558 #define ckWARN2regdep(loc,m, a1) STMT_START { \
559 const IV offset = loc - RExC_precomp; \
560 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
562 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 #define ckWARN2reg(loc, m, a1) STMT_START { \
566 const IV offset = loc - RExC_precomp; \
567 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
568 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define vWARN3(loc, m, a1, a2) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
574 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
577 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
578 const IV offset = loc - RExC_precomp; \
579 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
580 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
583 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
584 const IV offset = loc - RExC_precomp; \
585 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
586 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
589 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
590 const IV offset = loc - RExC_precomp; \
591 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
592 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
595 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
596 const IV offset = loc - RExC_precomp; \
597 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
598 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
602 /* Allow for side effects in s */
603 #define REGC(c,s) STMT_START { \
604 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
607 /* Macros for recording node offsets. 20001227 mjd@plover.com
608 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
609 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
610 * Element 0 holds the number n.
611 * Position is 1 indexed.
613 #ifndef RE_TRACK_PATTERN_OFFSETS
614 #define Set_Node_Offset_To_R(node,byte)
615 #define Set_Node_Offset(node,byte)
616 #define Set_Cur_Node_Offset
617 #define Set_Node_Length_To_R(node,len)
618 #define Set_Node_Length(node,len)
619 #define Set_Node_Cur_Length(node)
620 #define Node_Offset(n)
621 #define Node_Length(n)
622 #define Set_Node_Offset_Length(node,offset,len)
623 #define ProgLen(ri) ri->u.proglen
624 #define SetProgLen(ri,x) ri->u.proglen = x
626 #define ProgLen(ri) ri->u.offsets[0]
627 #define SetProgLen(ri,x) ri->u.offsets[0] = x
628 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
630 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
631 __LINE__, (int)(node), (int)(byte))); \
633 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
635 RExC_offsets[2*(node)-1] = (byte); \
640 #define Set_Node_Offset(node,byte) \
641 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
642 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
644 #define Set_Node_Length_To_R(node,len) STMT_START { \
646 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
647 __LINE__, (int)(node), (int)(len))); \
649 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
651 RExC_offsets[2*(node)] = (len); \
656 #define Set_Node_Length(node,len) \
657 Set_Node_Length_To_R((node)-RExC_emit_start, len)
658 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
659 #define Set_Node_Cur_Length(node) \
660 Set_Node_Length(node, RExC_parse - parse_start)
662 /* Get offsets and lengths */
663 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
664 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
666 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
667 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
668 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
672 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
673 #define EXPERIMENTAL_INPLACESCAN
674 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
676 #define DEBUG_STUDYDATA(str,data,depth) \
677 DEBUG_OPTIMISE_MORE_r(if(data){ \
678 PerlIO_printf(Perl_debug_log, \
679 "%*s" str "Pos:%"IVdf"/%"IVdf \
680 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
681 (int)(depth)*2, "", \
682 (IV)((data)->pos_min), \
683 (IV)((data)->pos_delta), \
684 (UV)((data)->flags), \
685 (IV)((data)->whilem_c), \
686 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
687 is_inf ? "INF " : "" \
689 if ((data)->last_found) \
690 PerlIO_printf(Perl_debug_log, \
691 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
692 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
693 SvPVX_const((data)->last_found), \
694 (IV)((data)->last_end), \
695 (IV)((data)->last_start_min), \
696 (IV)((data)->last_start_max), \
697 ((data)->longest && \
698 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
699 SvPVX_const((data)->longest_fixed), \
700 (IV)((data)->offset_fixed), \
701 ((data)->longest && \
702 (data)->longest==&((data)->longest_float)) ? "*" : "", \
703 SvPVX_const((data)->longest_float), \
704 (IV)((data)->offset_float_min), \
705 (IV)((data)->offset_float_max) \
707 PerlIO_printf(Perl_debug_log,"\n"); \
710 /* Mark that we cannot extend a found fixed substring at this point.
711 Update the longest found anchored substring and the longest found
712 floating substrings if needed. */
715 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
717 const STRLEN l = CHR_SVLEN(data->last_found);
718 const STRLEN old_l = CHR_SVLEN(*data->longest);
719 GET_RE_DEBUG_FLAGS_DECL;
721 PERL_ARGS_ASSERT_SCAN_COMMIT;
723 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
724 SvSetMagicSV(*data->longest, data->last_found);
725 if (*data->longest == data->longest_fixed) {
726 data->offset_fixed = l ? data->last_start_min : data->pos_min;
727 if (data->flags & SF_BEFORE_EOL)
729 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
731 data->flags &= ~SF_FIX_BEFORE_EOL;
732 data->minlen_fixed=minlenp;
733 data->lookbehind_fixed=0;
735 else { /* *data->longest == data->longest_float */
736 data->offset_float_min = l ? data->last_start_min : data->pos_min;
737 data->offset_float_max = (l
738 ? data->last_start_max
739 : data->pos_min + data->pos_delta);
740 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
741 data->offset_float_max = I32_MAX;
742 if (data->flags & SF_BEFORE_EOL)
744 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
746 data->flags &= ~SF_FL_BEFORE_EOL;
747 data->minlen_float=minlenp;
748 data->lookbehind_float=0;
751 SvCUR_set(data->last_found, 0);
753 SV * const sv = data->last_found;
754 if (SvUTF8(sv) && SvMAGICAL(sv)) {
755 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
761 data->flags &= ~SF_BEFORE_EOL;
762 DEBUG_STUDYDATA("commit: ",data,0);
765 /* These macros set, clear and test whether the synthetic start class ('ssc',
766 * given by the parameter) matches an empty string (EOS). This uses the
767 * 'next_off' field in the node, to save a bit in the flags field. The ssc
768 * stands alone, so there is never a next_off, so this field is otherwise
769 * unused. The EOS information is used only for compilation, but theoretically
770 * it could be passed on to the execution code. This could be used to store
771 * more than one bit of information, but only this one is currently used. */
772 #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
773 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
774 #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
776 /* Can match anything (initialization) */
778 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
780 PERL_ARGS_ASSERT_CL_ANYTHING;
782 ANYOF_BITMAP_SETALL(cl);
783 cl->flags = ANYOF_UNICODE_ALL;
786 /* If any portion of the regex is to operate under locale rules,
787 * initialization includes it. The reason this isn't done for all regexes
788 * is that the optimizer was written under the assumption that locale was
789 * all-or-nothing. Given the complexity and lack of documentation in the
790 * optimizer, and that there are inadequate test cases for locale, so many
791 * parts of it may not work properly, it is safest to avoid locale unless
793 if (RExC_contains_locale) {
794 ANYOF_CLASS_SETALL(cl); /* /l uses class */
795 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
798 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
802 /* Can match anything (initialization) */
804 S_cl_is_anything(const struct regnode_charclass_class *cl)
808 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
810 for (value = 0; value < ANYOF_MAX; value += 2)
811 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
813 if (!(cl->flags & ANYOF_UNICODE_ALL))
815 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
820 /* Can match anything (initialization) */
822 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
824 PERL_ARGS_ASSERT_CL_INIT;
826 Zero(cl, 1, struct regnode_charclass_class);
828 cl_anything(pRExC_state, cl);
829 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
832 /* These two functions currently do the exact same thing */
833 #define cl_init_zero S_cl_init
835 /* 'AND' a given class with another one. Can create false positives. 'cl'
836 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
837 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
839 S_cl_and(struct regnode_charclass_class *cl,
840 const struct regnode_charclass_class *and_with)
842 PERL_ARGS_ASSERT_CL_AND;
844 assert(PL_regkind[and_with->type] == ANYOF);
846 /* I (khw) am not sure all these restrictions are necessary XXX */
847 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
848 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
849 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
850 && !(and_with->flags & ANYOF_LOC_FOLD)
851 && !(cl->flags & ANYOF_LOC_FOLD)) {
854 if (and_with->flags & ANYOF_INVERT)
855 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
856 cl->bitmap[i] &= ~and_with->bitmap[i];
858 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
859 cl->bitmap[i] &= and_with->bitmap[i];
860 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
862 if (and_with->flags & ANYOF_INVERT) {
864 /* Here, the and'ed node is inverted. Get the AND of the flags that
865 * aren't affected by the inversion. Those that are affected are
866 * handled individually below */
867 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
868 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
869 cl->flags |= affected_flags;
871 /* We currently don't know how to deal with things that aren't in the
872 * bitmap, but we know that the intersection is no greater than what
873 * is already in cl, so let there be false positives that get sorted
874 * out after the synthetic start class succeeds, and the node is
875 * matched for real. */
877 /* The inversion of these two flags indicate that the resulting
878 * intersection doesn't have them */
879 if (and_with->flags & ANYOF_UNICODE_ALL) {
880 cl->flags &= ~ANYOF_UNICODE_ALL;
882 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
883 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
886 else { /* and'd node is not inverted */
887 U8 outside_bitmap_but_not_utf8; /* Temp variable */
889 if (! ANYOF_NONBITMAP(and_with)) {
891 /* Here 'and_with' doesn't match anything outside the bitmap
892 * (except possibly ANYOF_UNICODE_ALL), which means the
893 * intersection can't either, except for ANYOF_UNICODE_ALL, in
894 * which case we don't know what the intersection is, but it's no
895 * greater than what cl already has, so can just leave it alone,
896 * with possible false positives */
897 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
898 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
899 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
902 else if (! ANYOF_NONBITMAP(cl)) {
904 /* Here, 'and_with' does match something outside the bitmap, and cl
905 * doesn't have a list of things to match outside the bitmap. If
906 * cl can match all code points above 255, the intersection will
907 * be those above-255 code points that 'and_with' matches. If cl
908 * can't match all Unicode code points, it means that it can't
909 * match anything outside the bitmap (since the 'if' that got us
910 * into this block tested for that), so we leave the bitmap empty.
912 if (cl->flags & ANYOF_UNICODE_ALL) {
913 ARG_SET(cl, ARG(and_with));
915 /* and_with's ARG may match things that don't require UTF8.
916 * And now cl's will too, in spite of this being an 'and'. See
917 * the comments below about the kludge */
918 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
922 /* Here, both 'and_with' and cl match something outside the
923 * bitmap. Currently we do not do the intersection, so just match
924 * whatever cl had at the beginning. */
928 /* Take the intersection of the two sets of flags. However, the
929 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
930 * kludge around the fact that this flag is not treated like the others
931 * which are initialized in cl_anything(). The way the optimizer works
932 * is that the synthetic start class (SSC) is initialized to match
933 * anything, and then the first time a real node is encountered, its
934 * values are AND'd with the SSC's with the result being the values of
935 * the real node. However, there are paths through the optimizer where
936 * the AND never gets called, so those initialized bits are set
937 * inappropriately, which is not usually a big deal, as they just cause
938 * false positives in the SSC, which will just mean a probably
939 * imperceptible slow down in execution. However this bit has a
940 * higher false positive consequence in that it can cause utf8.pm,
941 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
942 * bigger slowdown and also causes significant extra memory to be used.
943 * In order to prevent this, the code now takes a different tack. The
944 * bit isn't set unless some part of the regular expression needs it,
945 * but once set it won't get cleared. This means that these extra
946 * modules won't get loaded unless there was some path through the
947 * pattern that would have required them anyway, and so any false
948 * positives that occur by not ANDing them out when they could be
949 * aren't as severe as they would be if we treated this bit like all
951 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
952 & ANYOF_NONBITMAP_NON_UTF8;
953 cl->flags &= and_with->flags;
954 cl->flags |= outside_bitmap_but_not_utf8;
958 /* 'OR' a given class with another one. Can create false positives. 'cl'
959 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
960 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
962 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
964 PERL_ARGS_ASSERT_CL_OR;
966 if (or_with->flags & ANYOF_INVERT) {
968 /* Here, the or'd node is to be inverted. This means we take the
969 * complement of everything not in the bitmap, but currently we don't
970 * know what that is, so give up and match anything */
971 if (ANYOF_NONBITMAP(or_with)) {
972 cl_anything(pRExC_state, cl);
975 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
976 * <= (B1 | !B2) | (CL1 | !CL2)
977 * which is wasteful if CL2 is small, but we ignore CL2:
978 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
979 * XXXX Can we handle case-fold? Unclear:
980 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
981 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
983 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
984 && !(or_with->flags & ANYOF_LOC_FOLD)
985 && !(cl->flags & ANYOF_LOC_FOLD) ) {
988 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
989 cl->bitmap[i] |= ~or_with->bitmap[i];
990 } /* XXXX: logic is complicated otherwise */
992 cl_anything(pRExC_state, cl);
995 /* And, we can just take the union of the flags that aren't affected
996 * by the inversion */
997 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
999 /* For the remaining flags:
1000 ANYOF_UNICODE_ALL and inverted means to not match anything above
1001 255, which means that the union with cl should just be
1002 what cl has in it, so can ignore this flag
1003 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1004 is 127-255 to match them, but then invert that, so the
1005 union with cl should just be what cl has in it, so can
1008 } else { /* 'or_with' is not inverted */
1009 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1010 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1011 && (!(or_with->flags & ANYOF_LOC_FOLD)
1012 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1015 /* OR char bitmap and class bitmap separately */
1016 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1017 cl->bitmap[i] |= or_with->bitmap[i];
1018 ANYOF_CLASS_OR(or_with, cl);
1020 else { /* XXXX: logic is complicated, leave it along for a moment. */
1021 cl_anything(pRExC_state, cl);
1024 if (ANYOF_NONBITMAP(or_with)) {
1026 /* Use the added node's outside-the-bit-map match if there isn't a
1027 * conflict. If there is a conflict (both nodes match something
1028 * outside the bitmap, but what they match outside is not the same
1029 * pointer, and hence not easily compared until XXX we extend
1030 * inversion lists this far), give up and allow the start class to
1031 * match everything outside the bitmap. If that stuff is all above
1032 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1033 if (! ANYOF_NONBITMAP(cl)) {
1034 ARG_SET(cl, ARG(or_with));
1036 else if (ARG(cl) != ARG(or_with)) {
1038 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1039 cl_anything(pRExC_state, cl);
1042 cl->flags |= ANYOF_UNICODE_ALL;
1047 /* Take the union */
1048 cl->flags |= or_with->flags;
1052 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1053 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1054 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1055 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1060 dump_trie(trie,widecharmap,revcharmap)
1061 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1062 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1064 These routines dump out a trie in a somewhat readable format.
1065 The _interim_ variants are used for debugging the interim
1066 tables that are used to generate the final compressed
1067 representation which is what dump_trie expects.
1069 Part of the reason for their existence is to provide a form
1070 of documentation as to how the different representations function.
1075 Dumps the final compressed table form of the trie to Perl_debug_log.
1076 Used for debugging make_trie().
1080 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1081 AV *revcharmap, U32 depth)
1084 SV *sv=sv_newmortal();
1085 int colwidth= widecharmap ? 6 : 4;
1087 GET_RE_DEBUG_FLAGS_DECL;
1089 PERL_ARGS_ASSERT_DUMP_TRIE;
1091 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1092 (int)depth * 2 + 2,"",
1093 "Match","Base","Ofs" );
1095 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1096 SV ** const tmp = av_fetch( revcharmap, state, 0);
1098 PerlIO_printf( Perl_debug_log, "%*s",
1100 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1101 PL_colors[0], PL_colors[1],
1102 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1103 PERL_PV_ESCAPE_FIRSTCHAR
1108 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1109 (int)depth * 2 + 2,"");
1111 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1112 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1113 PerlIO_printf( Perl_debug_log, "\n");
1115 for( state = 1 ; state < trie->statecount ; state++ ) {
1116 const U32 base = trie->states[ state ].trans.base;
1118 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1120 if ( trie->states[ state ].wordnum ) {
1121 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1123 PerlIO_printf( Perl_debug_log, "%6s", "" );
1126 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1131 while( ( base + ofs < trie->uniquecharcount ) ||
1132 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1133 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1136 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1138 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1139 if ( ( base + ofs >= trie->uniquecharcount ) &&
1140 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1141 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1143 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1145 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1147 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1151 PerlIO_printf( Perl_debug_log, "]");
1154 PerlIO_printf( Perl_debug_log, "\n" );
1156 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1157 for (word=1; word <= trie->wordcount; word++) {
1158 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1159 (int)word, (int)(trie->wordinfo[word].prev),
1160 (int)(trie->wordinfo[word].len));
1162 PerlIO_printf(Perl_debug_log, "\n" );
1165 Dumps a fully constructed but uncompressed trie in list form.
1166 List tries normally only are used for construction when the number of
1167 possible chars (trie->uniquecharcount) is very high.
1168 Used for debugging make_trie().
1171 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1172 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1176 SV *sv=sv_newmortal();
1177 int colwidth= widecharmap ? 6 : 4;
1178 GET_RE_DEBUG_FLAGS_DECL;
1180 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1182 /* print out the table precompression. */
1183 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1184 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1185 "------:-----+-----------------\n" );
1187 for( state=1 ; state < next_alloc ; state ++ ) {
1190 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1191 (int)depth * 2 + 2,"", (UV)state );
1192 if ( ! trie->states[ state ].wordnum ) {
1193 PerlIO_printf( Perl_debug_log, "%5s| ","");
1195 PerlIO_printf( Perl_debug_log, "W%4x| ",
1196 trie->states[ state ].wordnum
1199 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1200 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1202 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1204 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1205 PL_colors[0], PL_colors[1],
1206 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1207 PERL_PV_ESCAPE_FIRSTCHAR
1209 TRIE_LIST_ITEM(state,charid).forid,
1210 (UV)TRIE_LIST_ITEM(state,charid).newstate
1213 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1214 (int)((depth * 2) + 14), "");
1217 PerlIO_printf( Perl_debug_log, "\n");
1222 Dumps a fully constructed but uncompressed trie in table form.
1223 This is the normal DFA style state transition table, with a few
1224 twists to facilitate compression later.
1225 Used for debugging make_trie().
1228 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1229 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1234 SV *sv=sv_newmortal();
1235 int colwidth= widecharmap ? 6 : 4;
1236 GET_RE_DEBUG_FLAGS_DECL;
1238 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1241 print out the table precompression so that we can do a visual check
1242 that they are identical.
1245 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1247 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1248 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1250 PerlIO_printf( Perl_debug_log, "%*s",
1252 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1253 PL_colors[0], PL_colors[1],
1254 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1255 PERL_PV_ESCAPE_FIRSTCHAR
1261 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1263 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1264 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1267 PerlIO_printf( Perl_debug_log, "\n" );
1269 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1271 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1272 (int)depth * 2 + 2,"",
1273 (UV)TRIE_NODENUM( state ) );
1275 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1276 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1278 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1280 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1282 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1283 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1285 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1286 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1294 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1295 startbranch: the first branch in the whole branch sequence
1296 first : start branch of sequence of branch-exact nodes.
1297 May be the same as startbranch
1298 last : Thing following the last branch.
1299 May be the same as tail.
1300 tail : item following the branch sequence
1301 count : words in the sequence
1302 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1303 depth : indent depth
1305 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1307 A trie is an N'ary tree where the branches are determined by digital
1308 decomposition of the key. IE, at the root node you look up the 1st character and
1309 follow that branch repeat until you find the end of the branches. Nodes can be
1310 marked as "accepting" meaning they represent a complete word. Eg:
1314 would convert into the following structure. Numbers represent states, letters
1315 following numbers represent valid transitions on the letter from that state, if
1316 the number is in square brackets it represents an accepting state, otherwise it
1317 will be in parenthesis.
1319 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1323 (1) +-i->(6)-+-s->[7]
1325 +-s->(3)-+-h->(4)-+-e->[5]
1327 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1329 This shows that when matching against the string 'hers' we will begin at state 1
1330 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1331 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1332 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1333 single traverse. We store a mapping from accepting to state to which word was
1334 matched, and then when we have multiple possibilities we try to complete the
1335 rest of the regex in the order in which they occured in the alternation.
1337 The only prior NFA like behaviour that would be changed by the TRIE support is
1338 the silent ignoring of duplicate alternations which are of the form:
1340 / (DUPE|DUPE) X? (?{ ... }) Y /x
1342 Thus EVAL blocks following a trie may be called a different number of times with
1343 and without the optimisation. With the optimisations dupes will be silently
1344 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1345 the following demonstrates:
1347 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1349 which prints out 'word' three times, but
1351 'words'=~/(word|word|word)(?{ print $1 })S/
1353 which doesnt print it out at all. This is due to other optimisations kicking in.
1355 Example of what happens on a structural level:
1357 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1359 1: CURLYM[1] {1,32767}(18)
1370 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1371 and should turn into:
1373 1: CURLYM[1] {1,32767}(18)
1375 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1383 Cases where tail != last would be like /(?foo|bar)baz/:
1393 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1394 and would end up looking like:
1397 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1404 d = uvuni_to_utf8_flags(d, uv, 0);
1406 is the recommended Unicode-aware way of saying
1411 #define TRIE_STORE_REVCHAR(val) \
1414 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1415 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1416 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1417 SvCUR_set(zlopp, kapow - flrbbbbb); \
1420 av_push(revcharmap, zlopp); \
1422 char ooooff = (char)val; \
1423 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1427 #define TRIE_READ_CHAR STMT_START { \
1430 /* if it is UTF then it is either already folded, or does not need folding */ \
1431 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1433 else if (folder == PL_fold_latin1) { \
1434 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1435 if ( foldlen > 0 ) { \
1436 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1442 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1443 skiplen = UNISKIP(uvc); \
1444 foldlen -= skiplen; \
1445 scan = foldbuf + skiplen; \
1448 /* raw data, will be folded later if needed */ \
1456 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1457 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1458 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1459 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1461 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1462 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1463 TRIE_LIST_CUR( state )++; \
1466 #define TRIE_LIST_NEW(state) STMT_START { \
1467 Newxz( trie->states[ state ].trans.list, \
1468 4, reg_trie_trans_le ); \
1469 TRIE_LIST_CUR( state ) = 1; \
1470 TRIE_LIST_LEN( state ) = 4; \
1473 #define TRIE_HANDLE_WORD(state) STMT_START { \
1474 U16 dupe= trie->states[ state ].wordnum; \
1475 regnode * const noper_next = regnext( noper ); \
1478 /* store the word for dumping */ \
1480 if (OP(noper) != NOTHING) \
1481 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1483 tmp = newSVpvn_utf8( "", 0, UTF ); \
1484 av_push( trie_words, tmp ); \
1488 trie->wordinfo[curword].prev = 0; \
1489 trie->wordinfo[curword].len = wordlen; \
1490 trie->wordinfo[curword].accept = state; \
1492 if ( noper_next < tail ) { \
1494 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1495 trie->jump[curword] = (U16)(noper_next - convert); \
1497 jumper = noper_next; \
1499 nextbranch= regnext(cur); \
1503 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1504 /* chain, so that when the bits of chain are later */\
1505 /* linked together, the dups appear in the chain */\
1506 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1507 trie->wordinfo[dupe].prev = curword; \
1509 /* we haven't inserted this word yet. */ \
1510 trie->states[ state ].wordnum = curword; \
1515 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1516 ( ( base + charid >= ucharcount \
1517 && base + charid < ubound \
1518 && state == trie->trans[ base - ucharcount + charid ].check \
1519 && trie->trans[ base - ucharcount + charid ].next ) \
1520 ? trie->trans[ base - ucharcount + charid ].next \
1521 : ( state==1 ? special : 0 ) \
1525 #define MADE_JUMP_TRIE 2
1526 #define MADE_EXACT_TRIE 4
1529 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1532 /* first pass, loop through and scan words */
1533 reg_trie_data *trie;
1534 HV *widecharmap = NULL;
1535 AV *revcharmap = newAV();
1537 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1542 regnode *jumper = NULL;
1543 regnode *nextbranch = NULL;
1544 regnode *convert = NULL;
1545 U32 *prev_states; /* temp array mapping each state to previous one */
1546 /* we just use folder as a flag in utf8 */
1547 const U8 * folder = NULL;
1550 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1551 AV *trie_words = NULL;
1552 /* along with revcharmap, this only used during construction but both are
1553 * useful during debugging so we store them in the struct when debugging.
1556 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1557 STRLEN trie_charcount=0;
1559 SV *re_trie_maxbuff;
1560 GET_RE_DEBUG_FLAGS_DECL;
1562 PERL_ARGS_ASSERT_MAKE_TRIE;
1564 PERL_UNUSED_ARG(depth);
1571 case EXACTFU_TRICKYFOLD:
1572 case EXACTFU: folder = PL_fold_latin1; break;
1573 case EXACTF: folder = PL_fold; break;
1574 case EXACTFL: folder = PL_fold_locale; break;
1575 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1578 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1580 trie->startstate = 1;
1581 trie->wordcount = word_count;
1582 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1583 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1585 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1586 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1587 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1590 trie_words = newAV();
1593 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1594 if (!SvIOK(re_trie_maxbuff)) {
1595 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1597 DEBUG_TRIE_COMPILE_r({
1598 PerlIO_printf( Perl_debug_log,
1599 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1600 (int)depth * 2 + 2, "",
1601 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1602 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1606 /* Find the node we are going to overwrite */
1607 if ( first == startbranch && OP( last ) != BRANCH ) {
1608 /* whole branch chain */
1611 /* branch sub-chain */
1612 convert = NEXTOPER( first );
1615 /* -- First loop and Setup --
1617 We first traverse the branches and scan each word to determine if it
1618 contains widechars, and how many unique chars there are, this is
1619 important as we have to build a table with at least as many columns as we
1622 We use an array of integers to represent the character codes 0..255
1623 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1624 native representation of the character value as the key and IV's for the
1627 *TODO* If we keep track of how many times each character is used we can
1628 remap the columns so that the table compression later on is more
1629 efficient in terms of memory by ensuring the most common value is in the
1630 middle and the least common are on the outside. IMO this would be better
1631 than a most to least common mapping as theres a decent chance the most
1632 common letter will share a node with the least common, meaning the node
1633 will not be compressible. With a middle is most common approach the worst
1634 case is when we have the least common nodes twice.
1638 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1639 regnode *noper = NEXTOPER( cur );
1640 const U8 *uc = (U8*)STRING( noper );
1641 const U8 *e = uc + STR_LEN( noper );
1643 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1645 const U8 *scan = (U8*)NULL;
1646 U32 wordlen = 0; /* required init */
1648 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1650 if (OP(noper) == NOTHING) {
1651 regnode *noper_next= regnext(noper);
1652 if (noper_next != tail && OP(noper_next) == flags) {
1654 uc= (U8*)STRING(noper);
1655 e= uc + STR_LEN(noper);
1656 trie->minlen= STR_LEN(noper);
1663 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1664 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1665 regardless of encoding */
1666 if (OP( noper ) == EXACTFU_SS) {
1667 /* false positives are ok, so just set this */
1668 TRIE_BITMAP_SET(trie,0xDF);
1671 for ( ; uc < e ; uc += len ) {
1672 TRIE_CHARCOUNT(trie)++;
1677 U8 folded= folder[ (U8) uvc ];
1678 if ( !trie->charmap[ folded ] ) {
1679 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1680 TRIE_STORE_REVCHAR( folded );
1683 if ( !trie->charmap[ uvc ] ) {
1684 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1685 TRIE_STORE_REVCHAR( uvc );
1688 /* store the codepoint in the bitmap, and its folded
1690 TRIE_BITMAP_SET(trie, uvc);
1692 /* store the folded codepoint */
1693 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1696 /* store first byte of utf8 representation of
1697 variant codepoints */
1698 if (! UNI_IS_INVARIANT(uvc)) {
1699 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1702 set_bit = 0; /* We've done our bit :-) */
1707 widecharmap = newHV();
1709 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1712 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1714 if ( !SvTRUE( *svpp ) ) {
1715 sv_setiv( *svpp, ++trie->uniquecharcount );
1716 TRIE_STORE_REVCHAR(uvc);
1720 if( cur == first ) {
1721 trie->minlen = chars;
1722 trie->maxlen = chars;
1723 } else if (chars < trie->minlen) {
1724 trie->minlen = chars;
1725 } else if (chars > trie->maxlen) {
1726 trie->maxlen = chars;
1728 if (OP( noper ) == EXACTFU_SS) {
1729 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1730 if (trie->minlen > 1)
1733 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1734 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1735 * - We assume that any such sequence might match a 2 byte string */
1736 if (trie->minlen > 2 )
1740 } /* end first pass */
1741 DEBUG_TRIE_COMPILE_r(
1742 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1743 (int)depth * 2 + 2,"",
1744 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1745 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1746 (int)trie->minlen, (int)trie->maxlen )
1750 We now know what we are dealing with in terms of unique chars and
1751 string sizes so we can calculate how much memory a naive
1752 representation using a flat table will take. If it's over a reasonable
1753 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1754 conservative but potentially much slower representation using an array
1757 At the end we convert both representations into the same compressed
1758 form that will be used in regexec.c for matching with. The latter
1759 is a form that cannot be used to construct with but has memory
1760 properties similar to the list form and access properties similar
1761 to the table form making it both suitable for fast searches and
1762 small enough that its feasable to store for the duration of a program.
1764 See the comment in the code where the compressed table is produced
1765 inplace from the flat tabe representation for an explanation of how
1766 the compression works.
1771 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1774 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1776 Second Pass -- Array Of Lists Representation
1778 Each state will be represented by a list of charid:state records
1779 (reg_trie_trans_le) the first such element holds the CUR and LEN
1780 points of the allocated array. (See defines above).
1782 We build the initial structure using the lists, and then convert
1783 it into the compressed table form which allows faster lookups
1784 (but cant be modified once converted).
1787 STRLEN transcount = 1;
1789 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1790 "%*sCompiling trie using list compiler\n",
1791 (int)depth * 2 + 2, ""));
1793 trie->states = (reg_trie_state *)
1794 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1795 sizeof(reg_trie_state) );
1799 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1801 regnode *noper = NEXTOPER( cur );
1802 U8 *uc = (U8*)STRING( noper );
1803 const U8 *e = uc + STR_LEN( noper );
1804 U32 state = 1; /* required init */
1805 U16 charid = 0; /* sanity init */
1806 U8 *scan = (U8*)NULL; /* sanity init */
1807 STRLEN foldlen = 0; /* required init */
1808 U32 wordlen = 0; /* required init */
1809 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1812 if (OP(noper) == NOTHING) {
1813 regnode *noper_next= regnext(noper);
1814 if (noper_next != tail && OP(noper_next) == flags) {
1816 uc= (U8*)STRING(noper);
1817 e= uc + STR_LEN(noper);
1821 if (OP(noper) != NOTHING) {
1822 for ( ; uc < e ; uc += len ) {
1827 charid = trie->charmap[ uvc ];
1829 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1833 charid=(U16)SvIV( *svpp );
1836 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1843 if ( !trie->states[ state ].trans.list ) {
1844 TRIE_LIST_NEW( state );
1846 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1847 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1848 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1853 newstate = next_alloc++;
1854 prev_states[newstate] = state;
1855 TRIE_LIST_PUSH( state, charid, newstate );
1860 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1864 TRIE_HANDLE_WORD(state);
1866 } /* end second pass */
1868 /* next alloc is the NEXT state to be allocated */
1869 trie->statecount = next_alloc;
1870 trie->states = (reg_trie_state *)
1871 PerlMemShared_realloc( trie->states,
1873 * sizeof(reg_trie_state) );
1875 /* and now dump it out before we compress it */
1876 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1877 revcharmap, next_alloc,
1881 trie->trans = (reg_trie_trans *)
1882 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1889 for( state=1 ; state < next_alloc ; state ++ ) {
1893 DEBUG_TRIE_COMPILE_MORE_r(
1894 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1898 if (trie->states[state].trans.list) {
1899 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1903 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1904 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1905 if ( forid < minid ) {
1907 } else if ( forid > maxid ) {
1911 if ( transcount < tp + maxid - minid + 1) {
1913 trie->trans = (reg_trie_trans *)
1914 PerlMemShared_realloc( trie->trans,
1916 * sizeof(reg_trie_trans) );
1917 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1919 base = trie->uniquecharcount + tp - minid;
1920 if ( maxid == minid ) {
1922 for ( ; zp < tp ; zp++ ) {
1923 if ( ! trie->trans[ zp ].next ) {
1924 base = trie->uniquecharcount + zp - minid;
1925 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1926 trie->trans[ zp ].check = state;
1932 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1933 trie->trans[ tp ].check = state;
1938 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1939 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1940 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1941 trie->trans[ tid ].check = state;
1943 tp += ( maxid - minid + 1 );
1945 Safefree(trie->states[ state ].trans.list);
1948 DEBUG_TRIE_COMPILE_MORE_r(
1949 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1952 trie->states[ state ].trans.base=base;
1954 trie->lasttrans = tp + 1;
1958 Second Pass -- Flat Table Representation.
1960 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1961 We know that we will need Charcount+1 trans at most to store the data
1962 (one row per char at worst case) So we preallocate both structures
1963 assuming worst case.
1965 We then construct the trie using only the .next slots of the entry
1968 We use the .check field of the first entry of the node temporarily to
1969 make compression both faster and easier by keeping track of how many non
1970 zero fields are in the node.
1972 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1975 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1976 number representing the first entry of the node, and state as a
1977 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1978 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1979 are 2 entrys per node. eg:
1987 The table is internally in the right hand, idx form. However as we also
1988 have to deal with the states array which is indexed by nodenum we have to
1989 use TRIE_NODENUM() to convert.
1992 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1993 "%*sCompiling trie using table compiler\n",
1994 (int)depth * 2 + 2, ""));
1996 trie->trans = (reg_trie_trans *)
1997 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1998 * trie->uniquecharcount + 1,
1999 sizeof(reg_trie_trans) );
2000 trie->states = (reg_trie_state *)
2001 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2002 sizeof(reg_trie_state) );
2003 next_alloc = trie->uniquecharcount + 1;
2006 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2008 regnode *noper = NEXTOPER( cur );
2009 const U8 *uc = (U8*)STRING( noper );
2010 const U8 *e = uc + STR_LEN( noper );
2012 U32 state = 1; /* required init */
2014 U16 charid = 0; /* sanity init */
2015 U32 accept_state = 0; /* sanity init */
2016 U8 *scan = (U8*)NULL; /* sanity init */
2018 STRLEN foldlen = 0; /* required init */
2019 U32 wordlen = 0; /* required init */
2021 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2023 if (OP(noper) == NOTHING) {
2024 regnode *noper_next= regnext(noper);
2025 if (noper_next != tail && OP(noper_next) == flags) {
2027 uc= (U8*)STRING(noper);
2028 e= uc + STR_LEN(noper);
2032 if ( OP(noper) != NOTHING ) {
2033 for ( ; uc < e ; uc += len ) {
2038 charid = trie->charmap[ uvc ];
2040 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2041 charid = svpp ? (U16)SvIV(*svpp) : 0;
2045 if ( !trie->trans[ state + charid ].next ) {
2046 trie->trans[ state + charid ].next = next_alloc;
2047 trie->trans[ state ].check++;
2048 prev_states[TRIE_NODENUM(next_alloc)]
2049 = TRIE_NODENUM(state);
2050 next_alloc += trie->uniquecharcount;
2052 state = trie->trans[ state + charid ].next;
2054 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2056 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2059 accept_state = TRIE_NODENUM( state );
2060 TRIE_HANDLE_WORD(accept_state);
2062 } /* end second pass */
2064 /* and now dump it out before we compress it */
2065 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2067 next_alloc, depth+1));
2071 * Inplace compress the table.*
2073 For sparse data sets the table constructed by the trie algorithm will
2074 be mostly 0/FAIL transitions or to put it another way mostly empty.
2075 (Note that leaf nodes will not contain any transitions.)
2077 This algorithm compresses the tables by eliminating most such
2078 transitions, at the cost of a modest bit of extra work during lookup:
2080 - Each states[] entry contains a .base field which indicates the
2081 index in the state[] array wheres its transition data is stored.
2083 - If .base is 0 there are no valid transitions from that node.
2085 - If .base is nonzero then charid is added to it to find an entry in
2088 -If trans[states[state].base+charid].check!=state then the
2089 transition is taken to be a 0/Fail transition. Thus if there are fail
2090 transitions at the front of the node then the .base offset will point
2091 somewhere inside the previous nodes data (or maybe even into a node
2092 even earlier), but the .check field determines if the transition is
2096 The following process inplace converts the table to the compressed
2097 table: We first do not compress the root node 1,and mark all its
2098 .check pointers as 1 and set its .base pointer as 1 as well. This
2099 allows us to do a DFA construction from the compressed table later,
2100 and ensures that any .base pointers we calculate later are greater
2103 - We set 'pos' to indicate the first entry of the second node.
2105 - We then iterate over the columns of the node, finding the first and
2106 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2107 and set the .check pointers accordingly, and advance pos
2108 appropriately and repreat for the next node. Note that when we copy
2109 the next pointers we have to convert them from the original
2110 NODEIDX form to NODENUM form as the former is not valid post
2113 - If a node has no transitions used we mark its base as 0 and do not
2114 advance the pos pointer.
2116 - If a node only has one transition we use a second pointer into the
2117 structure to fill in allocated fail transitions from other states.
2118 This pointer is independent of the main pointer and scans forward
2119 looking for null transitions that are allocated to a state. When it
2120 finds one it writes the single transition into the "hole". If the
2121 pointer doesnt find one the single transition is appended as normal.
2123 - Once compressed we can Renew/realloc the structures to release the
2126 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2127 specifically Fig 3.47 and the associated pseudocode.
2131 const U32 laststate = TRIE_NODENUM( next_alloc );
2134 trie->statecount = laststate;
2136 for ( state = 1 ; state < laststate ; state++ ) {
2138 const U32 stateidx = TRIE_NODEIDX( state );
2139 const U32 o_used = trie->trans[ stateidx ].check;
2140 U32 used = trie->trans[ stateidx ].check;
2141 trie->trans[ stateidx ].check = 0;
2143 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2144 if ( flag || trie->trans[ stateidx + charid ].next ) {
2145 if ( trie->trans[ stateidx + charid ].next ) {
2147 for ( ; zp < pos ; zp++ ) {
2148 if ( ! trie->trans[ zp ].next ) {
2152 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2153 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2154 trie->trans[ zp ].check = state;
2155 if ( ++zp > pos ) pos = zp;
2162 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2164 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2165 trie->trans[ pos ].check = state;
2170 trie->lasttrans = pos + 1;
2171 trie->states = (reg_trie_state *)
2172 PerlMemShared_realloc( trie->states, laststate
2173 * sizeof(reg_trie_state) );
2174 DEBUG_TRIE_COMPILE_MORE_r(
2175 PerlIO_printf( Perl_debug_log,
2176 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2177 (int)depth * 2 + 2,"",
2178 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2181 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2184 } /* end table compress */
2186 DEBUG_TRIE_COMPILE_MORE_r(
2187 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2188 (int)depth * 2 + 2, "",
2189 (UV)trie->statecount,
2190 (UV)trie->lasttrans)
2192 /* resize the trans array to remove unused space */
2193 trie->trans = (reg_trie_trans *)
2194 PerlMemShared_realloc( trie->trans, trie->lasttrans
2195 * sizeof(reg_trie_trans) );
2197 { /* Modify the program and insert the new TRIE node */
2198 U8 nodetype =(U8)(flags & 0xFF);
2202 regnode *optimize = NULL;
2203 #ifdef RE_TRACK_PATTERN_OFFSETS
2206 U32 mjd_nodelen = 0;
2207 #endif /* RE_TRACK_PATTERN_OFFSETS */
2208 #endif /* DEBUGGING */
2210 This means we convert either the first branch or the first Exact,
2211 depending on whether the thing following (in 'last') is a branch
2212 or not and whther first is the startbranch (ie is it a sub part of
2213 the alternation or is it the whole thing.)
2214 Assuming its a sub part we convert the EXACT otherwise we convert
2215 the whole branch sequence, including the first.
2217 /* Find the node we are going to overwrite */
2218 if ( first != startbranch || OP( last ) == BRANCH ) {
2219 /* branch sub-chain */
2220 NEXT_OFF( first ) = (U16)(last - first);
2221 #ifdef RE_TRACK_PATTERN_OFFSETS
2223 mjd_offset= Node_Offset((convert));
2224 mjd_nodelen= Node_Length((convert));
2227 /* whole branch chain */
2229 #ifdef RE_TRACK_PATTERN_OFFSETS
2232 const regnode *nop = NEXTOPER( convert );
2233 mjd_offset= Node_Offset((nop));
2234 mjd_nodelen= Node_Length((nop));
2238 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2239 (int)depth * 2 + 2, "",
2240 (UV)mjd_offset, (UV)mjd_nodelen)
2243 /* But first we check to see if there is a common prefix we can
2244 split out as an EXACT and put in front of the TRIE node. */
2245 trie->startstate= 1;
2246 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2248 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2252 const U32 base = trie->states[ state ].trans.base;
2254 if ( trie->states[state].wordnum )
2257 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2258 if ( ( base + ofs >= trie->uniquecharcount ) &&
2259 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2260 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2262 if ( ++count > 1 ) {
2263 SV **tmp = av_fetch( revcharmap, ofs, 0);
2264 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2265 if ( state == 1 ) break;
2267 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2269 PerlIO_printf(Perl_debug_log,
2270 "%*sNew Start State=%"UVuf" Class: [",
2271 (int)depth * 2 + 2, "",
2274 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2275 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2277 TRIE_BITMAP_SET(trie,*ch);
2279 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2281 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2285 TRIE_BITMAP_SET(trie,*ch);
2287 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2288 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2294 SV **tmp = av_fetch( revcharmap, idx, 0);
2296 char *ch = SvPV( *tmp, len );
2298 SV *sv=sv_newmortal();
2299 PerlIO_printf( Perl_debug_log,
2300 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2301 (int)depth * 2 + 2, "",
2303 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2304 PL_colors[0], PL_colors[1],
2305 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2306 PERL_PV_ESCAPE_FIRSTCHAR
2311 OP( convert ) = nodetype;
2312 str=STRING(convert);
2315 STR_LEN(convert) += len;
2321 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2326 trie->prefixlen = (state-1);
2328 regnode *n = convert+NODE_SZ_STR(convert);
2329 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2330 trie->startstate = state;
2331 trie->minlen -= (state - 1);
2332 trie->maxlen -= (state - 1);
2334 /* At least the UNICOS C compiler choked on this
2335 * being argument to DEBUG_r(), so let's just have
2338 #ifdef PERL_EXT_RE_BUILD
2344 regnode *fix = convert;
2345 U32 word = trie->wordcount;
2347 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2348 while( ++fix < n ) {
2349 Set_Node_Offset_Length(fix, 0, 0);
2352 SV ** const tmp = av_fetch( trie_words, word, 0 );
2354 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2355 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2357 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2365 NEXT_OFF(convert) = (U16)(tail - convert);
2366 DEBUG_r(optimize= n);
2372 if ( trie->maxlen ) {
2373 NEXT_OFF( convert ) = (U16)(tail - convert);
2374 ARG_SET( convert, data_slot );
2375 /* Store the offset to the first unabsorbed branch in
2376 jump[0], which is otherwise unused by the jump logic.
2377 We use this when dumping a trie and during optimisation. */
2379 trie->jump[0] = (U16)(nextbranch - convert);
2381 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2382 * and there is a bitmap
2383 * and the first "jump target" node we found leaves enough room
2384 * then convert the TRIE node into a TRIEC node, with the bitmap
2385 * embedded inline in the opcode - this is hypothetically faster.
2387 if ( !trie->states[trie->startstate].wordnum
2389 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2391 OP( convert ) = TRIEC;
2392 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2393 PerlMemShared_free(trie->bitmap);
2396 OP( convert ) = TRIE;
2398 /* store the type in the flags */
2399 convert->flags = nodetype;
2403 + regarglen[ OP( convert ) ];
2405 /* XXX We really should free up the resource in trie now,
2406 as we won't use them - (which resources?) dmq */
2408 /* needed for dumping*/
2409 DEBUG_r(if (optimize) {
2410 regnode *opt = convert;
2412 while ( ++opt < optimize) {
2413 Set_Node_Offset_Length(opt,0,0);
2416 Try to clean up some of the debris left after the
2419 while( optimize < jumper ) {
2420 mjd_nodelen += Node_Length((optimize));
2421 OP( optimize ) = OPTIMIZED;
2422 Set_Node_Offset_Length(optimize,0,0);
2425 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2427 } /* end node insert */
2428 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2430 /* Finish populating the prev field of the wordinfo array. Walk back
2431 * from each accept state until we find another accept state, and if
2432 * so, point the first word's .prev field at the second word. If the
2433 * second already has a .prev field set, stop now. This will be the
2434 * case either if we've already processed that word's accept state,
2435 * or that state had multiple words, and the overspill words were
2436 * already linked up earlier.
2443 for (word=1; word <= trie->wordcount; word++) {
2445 if (trie->wordinfo[word].prev)
2447 state = trie->wordinfo[word].accept;
2449 state = prev_states[state];
2452 prev = trie->states[state].wordnum;
2456 trie->wordinfo[word].prev = prev;
2458 Safefree(prev_states);
2462 /* and now dump out the compressed format */
2463 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2465 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2467 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2468 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2470 SvREFCNT_dec_NN(revcharmap);
2474 : trie->startstate>1
2480 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2482 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2484 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2485 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2488 We find the fail state for each state in the trie, this state is the longest proper
2489 suffix of the current state's 'word' that is also a proper prefix of another word in our
2490 trie. State 1 represents the word '' and is thus the default fail state. This allows
2491 the DFA not to have to restart after its tried and failed a word at a given point, it
2492 simply continues as though it had been matching the other word in the first place.
2494 'abcdgu'=~/abcdefg|cdgu/
2495 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2496 fail, which would bring us to the state representing 'd' in the second word where we would
2497 try 'g' and succeed, proceeding to match 'cdgu'.
2499 /* add a fail transition */
2500 const U32 trie_offset = ARG(source);
2501 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2503 const U32 ucharcount = trie->uniquecharcount;
2504 const U32 numstates = trie->statecount;
2505 const U32 ubound = trie->lasttrans + ucharcount;
2509 U32 base = trie->states[ 1 ].trans.base;
2512 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2513 GET_RE_DEBUG_FLAGS_DECL;
2515 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2517 PERL_UNUSED_ARG(depth);
2521 ARG_SET( stclass, data_slot );
2522 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2523 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2524 aho->trie=trie_offset;
2525 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2526 Copy( trie->states, aho->states, numstates, reg_trie_state );
2527 Newxz( q, numstates, U32);
2528 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2531 /* initialize fail[0..1] to be 1 so that we always have
2532 a valid final fail state */
2533 fail[ 0 ] = fail[ 1 ] = 1;
2535 for ( charid = 0; charid < ucharcount ; charid++ ) {
2536 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2538 q[ q_write ] = newstate;
2539 /* set to point at the root */
2540 fail[ q[ q_write++ ] ]=1;
2543 while ( q_read < q_write) {
2544 const U32 cur = q[ q_read++ % numstates ];
2545 base = trie->states[ cur ].trans.base;
2547 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2548 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2550 U32 fail_state = cur;
2553 fail_state = fail[ fail_state ];
2554 fail_base = aho->states[ fail_state ].trans.base;
2555 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2557 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2558 fail[ ch_state ] = fail_state;
2559 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2561 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2563 q[ q_write++ % numstates] = ch_state;
2567 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2568 when we fail in state 1, this allows us to use the
2569 charclass scan to find a valid start char. This is based on the principle
2570 that theres a good chance the string being searched contains lots of stuff
2571 that cant be a start char.
2573 fail[ 0 ] = fail[ 1 ] = 0;
2574 DEBUG_TRIE_COMPILE_r({
2575 PerlIO_printf(Perl_debug_log,
2576 "%*sStclass Failtable (%"UVuf" states): 0",
2577 (int)(depth * 2), "", (UV)numstates
2579 for( q_read=1; q_read<numstates; q_read++ ) {
2580 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2582 PerlIO_printf(Perl_debug_log, "\n");
2585 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2590 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2591 * These need to be revisited when a newer toolchain becomes available.
2593 #if defined(__sparc64__) && defined(__GNUC__)
2594 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2595 # undef SPARC64_GCC_WORKAROUND
2596 # define SPARC64_GCC_WORKAROUND 1
2600 #define DEBUG_PEEP(str,scan,depth) \
2601 DEBUG_OPTIMISE_r({if (scan){ \
2602 SV * const mysv=sv_newmortal(); \
2603 regnode *Next = regnext(scan); \
2604 regprop(RExC_rx, mysv, scan); \
2605 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2606 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2607 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2611 /* The below joins as many adjacent EXACTish nodes as possible into a single
2612 * one. The regop may be changed if the node(s) contain certain sequences that
2613 * require special handling. The joining is only done if:
2614 * 1) there is room in the current conglomerated node to entirely contain the
2616 * 2) they are the exact same node type
2618 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2619 * these get optimized out
2621 * If a node is to match under /i (folded), the number of characters it matches
2622 * can be different than its character length if it contains a multi-character
2623 * fold. *min_subtract is set to the total delta of the input nodes.
2625 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2626 * and contains LATIN SMALL LETTER SHARP S
2628 * This is as good a place as any to discuss the design of handling these
2629 * multi-character fold sequences. It's been wrong in Perl for a very long
2630 * time. There are three code points in Unicode whose multi-character folds
2631 * were long ago discovered to mess things up. The previous designs for
2632 * dealing with these involved assigning a special node for them. This
2633 * approach doesn't work, as evidenced by this example:
2634 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2635 * Both these fold to "sss", but if the pattern is parsed to create a node that
2636 * would match just the \xDF, it won't be able to handle the case where a
2637 * successful match would have to cross the node's boundary. The new approach
2638 * that hopefully generally solves the problem generates an EXACTFU_SS node
2641 * It turns out that there are problems with all multi-character folds, and not
2642 * just these three. Now the code is general, for all such cases, but the
2643 * three still have some special handling. The approach taken is:
2644 * 1) This routine examines each EXACTFish node that could contain multi-
2645 * character fold sequences. It returns in *min_subtract how much to
2646 * subtract from the the actual length of the string to get a real minimum
2647 * match length; it is 0 if there are no multi-char folds. This delta is
2648 * used by the caller to adjust the min length of the match, and the delta
2649 * between min and max, so that the optimizer doesn't reject these
2650 * possibilities based on size constraints.
2651 * 2) Certain of these sequences require special handling by the trie code,
2652 * so, if found, this code changes the joined node type to special ops:
2653 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2654 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2655 * is used for an EXACTFU node that contains at least one "ss" sequence in
2656 * it. For non-UTF-8 patterns and strings, this is the only case where
2657 * there is a possible fold length change. That means that a regular
2658 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2659 * with length changes, and so can be processed faster. regexec.c takes
2660 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2661 * pre-folded by regcomp.c. This saves effort in regex matching.
2662 * However, the pre-folding isn't done for non-UTF8 patterns because the
2663 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2664 * down by forcing the pattern into UTF8 unless necessary. Also what
2665 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2666 * possibilities for the non-UTF8 patterns are quite simple, except for
2667 * the sharp s. All the ones that don't involve a UTF-8 target string are
2668 * members of a fold-pair, and arrays are set up for all of them so that
2669 * the other member of the pair can be found quickly. Code elsewhere in
2670 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2671 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2672 * described in the next item.
2673 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2674 * 'ss' or not is not knowable at compile time. It will match iff the
2675 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2676 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2677 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2678 * described in item 3). An assumption that the optimizer part of
2679 * regexec.c (probably unwittingly) makes is that a character in the
2680 * pattern corresponds to at most a single character in the target string.
2681 * (And I do mean character, and not byte here, unlike other parts of the
2682 * documentation that have never been updated to account for multibyte
2683 * Unicode.) This assumption is wrong only in this case, as all other
2684 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2685 * virtue of having this file pre-fold UTF-8 patterns. I'm
2686 * reluctant to try to change this assumption, so instead the code punts.
2687 * This routine examines EXACTF nodes for the sharp s, and returns a
2688 * boolean indicating whether or not the node is an EXACTF node that
2689 * contains a sharp s. When it is true, the caller sets a flag that later
2690 * causes the optimizer in this file to not set values for the floating
2691 * and fixed string lengths, and thus avoids the optimizer code in
2692 * regexec.c that makes the invalid assumption. Thus, there is no
2693 * optimization based on string lengths for EXACTF nodes that contain the
2694 * sharp s. This only happens for /id rules (which means the pattern
2698 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2699 if (PL_regkind[OP(scan)] == EXACT) \
2700 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2703 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2704 /* Merge several consecutive EXACTish nodes into one. */
2705 regnode *n = regnext(scan);
2707 regnode *next = scan + NODE_SZ_STR(scan);
2711 regnode *stop = scan;
2712 GET_RE_DEBUG_FLAGS_DECL;
2714 PERL_UNUSED_ARG(depth);
2717 PERL_ARGS_ASSERT_JOIN_EXACT;
2718 #ifndef EXPERIMENTAL_INPLACESCAN
2719 PERL_UNUSED_ARG(flags);
2720 PERL_UNUSED_ARG(val);
2722 DEBUG_PEEP("join",scan,depth);
2724 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2725 * EXACT ones that are mergeable to the current one. */
2727 && (PL_regkind[OP(n)] == NOTHING
2728 || (stringok && OP(n) == OP(scan)))
2730 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2733 if (OP(n) == TAIL || n > next)
2735 if (PL_regkind[OP(n)] == NOTHING) {
2736 DEBUG_PEEP("skip:",n,depth);
2737 NEXT_OFF(scan) += NEXT_OFF(n);
2738 next = n + NODE_STEP_REGNODE;
2745 else if (stringok) {
2746 const unsigned int oldl = STR_LEN(scan);
2747 regnode * const nnext = regnext(n);
2749 /* XXX I (khw) kind of doubt that this works on platforms where
2750 * U8_MAX is above 255 because of lots of other assumptions */
2751 /* Don't join if the sum can't fit into a single node */
2752 if (oldl + STR_LEN(n) > U8_MAX)
2755 DEBUG_PEEP("merg",n,depth);
2758 NEXT_OFF(scan) += NEXT_OFF(n);
2759 STR_LEN(scan) += STR_LEN(n);
2760 next = n + NODE_SZ_STR(n);
2761 /* Now we can overwrite *n : */
2762 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2770 #ifdef EXPERIMENTAL_INPLACESCAN
2771 if (flags && !NEXT_OFF(n)) {
2772 DEBUG_PEEP("atch", val, depth);
2773 if (reg_off_by_arg[OP(n)]) {
2774 ARG_SET(n, val - n);
2777 NEXT_OFF(n) = val - n;
2785 *has_exactf_sharp_s = FALSE;
2787 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2788 * can now analyze for sequences of problematic code points. (Prior to
2789 * this final joining, sequences could have been split over boundaries, and
2790 * hence missed). The sequences only happen in folding, hence for any
2791 * non-EXACT EXACTish node */
2792 if (OP(scan) != EXACT) {
2793 const U8 * const s0 = (U8*) STRING(scan);
2795 const U8 * const s_end = s0 + STR_LEN(scan);
2797 /* One pass is made over the node's string looking for all the
2798 * possibilities. to avoid some tests in the loop, there are two main
2799 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2803 /* Examine the string for a multi-character fold sequence. UTF-8
2804 * patterns have all characters pre-folded by the time this code is
2806 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2807 length sequence we are looking for is 2 */
2810 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2811 if (! len) { /* Not a multi-char fold: get next char */
2816 /* Nodes with 'ss' require special handling, except for EXACTFL
2817 * and EXACTFA for which there is no multi-char fold to this */
2818 if (len == 2 && *s == 's' && *(s+1) == 's'
2819 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2822 OP(scan) = EXACTFU_SS;
2825 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2826 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2827 COMBINING_DIAERESIS_UTF8
2828 COMBINING_ACUTE_ACCENT_UTF8,
2830 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2831 COMBINING_DIAERESIS_UTF8
2832 COMBINING_ACUTE_ACCENT_UTF8,
2837 /* These two folds require special handling by trie's, so
2838 * change the node type to indicate this. If EXACTFA and
2839 * EXACTFL were ever to be handled by trie's, this would
2840 * have to be changed. If this node has already been
2841 * changed to EXACTFU_SS in this loop, leave it as is. (I
2842 * (khw) think it doesn't matter in regexec.c for UTF
2843 * patterns, but no need to change it */
2844 if (OP(scan) == EXACTFU) {
2845 OP(scan) = EXACTFU_TRICKYFOLD;
2849 else { /* Here is a generic multi-char fold. */
2850 const U8* multi_end = s + len;
2852 /* Count how many characters in it. In the case of /l and
2853 * /aa, no folds which contain ASCII code points are
2854 * allowed, so check for those, and skip if found. (In
2855 * EXACTFL, no folds are allowed to any Latin1 code point,
2856 * not just ASCII. But there aren't any of these
2857 * currently, nor ever likely, so don't take the time to
2858 * test for them. The code that generates the
2859 * is_MULTI_foo() macros croaks should one actually get put
2860 * into Unicode .) */
2861 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2862 count = utf8_length(s, multi_end);
2866 while (s < multi_end) {
2869 goto next_iteration;
2879 /* The delta is how long the sequence is minus 1 (1 is how long
2880 * the character that folds to the sequence is) */
2881 *min_subtract += count - 1;
2885 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2887 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2888 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2889 * nodes can't have multi-char folds to this range (and there are
2890 * no existing ones in the upper latin1 range). In the EXACTF
2891 * case we look also for the sharp s, which can be in the final
2892 * position. Otherwise we can stop looking 1 byte earlier because
2893 * have to find at least two characters for a multi-fold */
2894 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2896 /* The below is perhaps overboard, but this allows us to save a
2897 * test each time through the loop at the expense of a mask. This
2898 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2899 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2900 * are 64. This uses an exclusive 'or' to find that bit and then
2901 * inverts it to form a mask, with just a single 0, in the bit
2902 * position where 'S' and 's' differ. */
2903 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2904 const U8 s_masked = 's' & S_or_s_mask;
2907 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2908 if (! len) { /* Not a multi-char fold. */
2909 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2911 *has_exactf_sharp_s = TRUE;
2918 && ((*s & S_or_s_mask) == s_masked)
2919 && ((*(s+1) & S_or_s_mask) == s_masked))
2922 /* EXACTF nodes need to know that the minimum length
2923 * changed so that a sharp s in the string can match this
2924 * ss in the pattern, but they remain EXACTF nodes, as they
2925 * won't match this unless the target string is is UTF-8,
2926 * which we don't know until runtime */
2927 if (OP(scan) != EXACTF) {
2928 OP(scan) = EXACTFU_SS;
2932 *min_subtract += len - 1;
2939 /* Allow dumping but overwriting the collection of skipped
2940 * ops and/or strings with fake optimized ops */
2941 n = scan + NODE_SZ_STR(scan);
2949 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2953 /* REx optimizer. Converts nodes into quicker variants "in place".
2954 Finds fixed substrings. */
2956 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2957 to the position after last scanned or to NULL. */
2959 #define INIT_AND_WITHP \
2960 assert(!and_withp); \
2961 Newx(and_withp,1,struct regnode_charclass_class); \
2962 SAVEFREEPV(and_withp)
2964 /* this is a chain of data about sub patterns we are processing that
2965 need to be handled separately/specially in study_chunk. Its so
2966 we can simulate recursion without losing state. */
2968 typedef struct scan_frame {
2969 regnode *last; /* last node to process in this frame */
2970 regnode *next; /* next node to process when last is reached */
2971 struct scan_frame *prev; /*previous frame*/
2972 I32 stop; /* what stopparen do we use */
2976 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2979 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2980 I32 *minlenp, I32 *deltap,
2985 struct regnode_charclass_class *and_withp,
2986 U32 flags, U32 depth)
2987 /* scanp: Start here (read-write). */
2988 /* deltap: Write maxlen-minlen here. */
2989 /* last: Stop before this one. */
2990 /* data: string data about the pattern */
2991 /* stopparen: treat close N as END */
2992 /* recursed: which subroutines have we recursed into */
2993 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2996 I32 min = 0; /* There must be at least this number of characters to match */
2998 regnode *scan = *scanp, *next;
3000 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3001 int is_inf_internal = 0; /* The studied chunk is infinite */
3002 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3003 scan_data_t data_fake;
3004 SV *re_trie_maxbuff = NULL;
3005 regnode *first_non_open = scan;
3006 I32 stopmin = I32_MAX;
3007 scan_frame *frame = NULL;
3008 GET_RE_DEBUG_FLAGS_DECL;
3010 PERL_ARGS_ASSERT_STUDY_CHUNK;
3013 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3017 while (first_non_open && OP(first_non_open) == OPEN)
3018 first_non_open=regnext(first_non_open);
3023 while ( scan && OP(scan) != END && scan < last ){
3024 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3025 node length to get a real minimum (because
3026 the folded version may be shorter) */
3027 bool has_exactf_sharp_s = FALSE;
3028 /* Peephole optimizer: */
3029 DEBUG_STUDYDATA("Peep:", data,depth);
3030 DEBUG_PEEP("Peep",scan,depth);
3032 /* Its not clear to khw or hv why this is done here, and not in the
3033 * clauses that deal with EXACT nodes. khw's guess is that it's
3034 * because of a previous design */
3035 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3037 /* Follow the next-chain of the current node and optimize
3038 away all the NOTHINGs from it. */
3039 if (OP(scan) != CURLYX) {
3040 const int max = (reg_off_by_arg[OP(scan)]
3042 /* I32 may be smaller than U16 on CRAYs! */
3043 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3044 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3048 /* Skip NOTHING and LONGJMP. */
3049 while ((n = regnext(n))
3050 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3051 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3052 && off + noff < max)
3054 if (reg_off_by_arg[OP(scan)])
3057 NEXT_OFF(scan) = off;
3062 /* The principal pseudo-switch. Cannot be a switch, since we
3063 look into several different things. */
3064 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3065 || OP(scan) == IFTHEN) {
3066 next = regnext(scan);
3068 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3070 if (OP(next) == code || code == IFTHEN) {
3071 /* NOTE - There is similar code to this block below for handling
3072 TRIE nodes on a re-study. If you change stuff here check there
3074 I32 max1 = 0, min1 = I32_MAX, num = 0;
3075 struct regnode_charclass_class accum;
3076 regnode * const startbranch=scan;
3078 if (flags & SCF_DO_SUBSTR)
3079 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3080 if (flags & SCF_DO_STCLASS)
3081 cl_init_zero(pRExC_state, &accum);
3083 while (OP(scan) == code) {
3084 I32 deltanext, minnext, f = 0, fake;
3085 struct regnode_charclass_class this_class;
3088 data_fake.flags = 0;
3090 data_fake.whilem_c = data->whilem_c;
3091 data_fake.last_closep = data->last_closep;
3094 data_fake.last_closep = &fake;
3096 data_fake.pos_delta = delta;
3097 next = regnext(scan);
3098 scan = NEXTOPER(scan);
3100 scan = NEXTOPER(scan);
3101 if (flags & SCF_DO_STCLASS) {
3102 cl_init(pRExC_state, &this_class);
3103 data_fake.start_class = &this_class;
3104 f = SCF_DO_STCLASS_AND;
3106 if (flags & SCF_WHILEM_VISITED_POS)
3107 f |= SCF_WHILEM_VISITED_POS;
3109 /* we suppose the run is continuous, last=next...*/
3110 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3112 stopparen, recursed, NULL, f,depth+1);
3115 if (max1 < minnext + deltanext)
3116 max1 = minnext + deltanext;
3117 if (deltanext == I32_MAX)
3118 is_inf = is_inf_internal = 1;
3120 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3122 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3123 if ( stopmin > minnext)
3124 stopmin = min + min1;
3125 flags &= ~SCF_DO_SUBSTR;
3127 data->flags |= SCF_SEEN_ACCEPT;
3130 if (data_fake.flags & SF_HAS_EVAL)
3131 data->flags |= SF_HAS_EVAL;
3132 data->whilem_c = data_fake.whilem_c;
3134 if (flags & SCF_DO_STCLASS)
3135 cl_or(pRExC_state, &accum, &this_class);
3137 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3139 if (flags & SCF_DO_SUBSTR) {
3140 data->pos_min += min1;
3141 data->pos_delta += max1 - min1;
3142 if (max1 != min1 || is_inf)
3143 data->longest = &(data->longest_float);
3146 delta += max1 - min1;
3147 if (flags & SCF_DO_STCLASS_OR) {
3148 cl_or(pRExC_state, data->start_class, &accum);
3150 cl_and(data->start_class, and_withp);
3151 flags &= ~SCF_DO_STCLASS;
3154 else if (flags & SCF_DO_STCLASS_AND) {
3156 cl_and(data->start_class, &accum);
3157 flags &= ~SCF_DO_STCLASS;
3160 /* Switch to OR mode: cache the old value of
3161 * data->start_class */
3163 StructCopy(data->start_class, and_withp,
3164 struct regnode_charclass_class);
3165 flags &= ~SCF_DO_STCLASS_AND;
3166 StructCopy(&accum, data->start_class,
3167 struct regnode_charclass_class);
3168 flags |= SCF_DO_STCLASS_OR;
3169 SET_SSC_EOS(data->start_class);
3173 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3176 Assuming this was/is a branch we are dealing with: 'scan' now
3177 points at the item that follows the branch sequence, whatever
3178 it is. We now start at the beginning of the sequence and look
3185 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3187 If we can find such a subsequence we need to turn the first
3188 element into a trie and then add the subsequent branch exact
3189 strings to the trie.
3193 1. patterns where the whole set of branches can be converted.
3195 2. patterns where only a subset can be converted.
3197 In case 1 we can replace the whole set with a single regop
3198 for the trie. In case 2 we need to keep the start and end
3201 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3202 becomes BRANCH TRIE; BRANCH X;
3204 There is an additional case, that being where there is a
3205 common prefix, which gets split out into an EXACT like node
3206 preceding the TRIE node.
3208 If x(1..n)==tail then we can do a simple trie, if not we make
3209 a "jump" trie, such that when we match the appropriate word
3210 we "jump" to the appropriate tail node. Essentially we turn
3211 a nested if into a case structure of sorts.
3216 if (!re_trie_maxbuff) {
3217 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3218 if (!SvIOK(re_trie_maxbuff))
3219 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3221 if ( SvIV(re_trie_maxbuff)>=0 ) {
3223 regnode *first = (regnode *)NULL;
3224 regnode *last = (regnode *)NULL;
3225 regnode *tail = scan;
3230 SV * const mysv = sv_newmortal(); /* for dumping */
3232 /* var tail is used because there may be a TAIL
3233 regop in the way. Ie, the exacts will point to the
3234 thing following the TAIL, but the last branch will
3235 point at the TAIL. So we advance tail. If we
3236 have nested (?:) we may have to move through several
3240 while ( OP( tail ) == TAIL ) {
3241 /* this is the TAIL generated by (?:) */
3242 tail = regnext( tail );
3246 DEBUG_TRIE_COMPILE_r({
3247 regprop(RExC_rx, mysv, tail );
3248 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3249 (int)depth * 2 + 2, "",
3250 "Looking for TRIE'able sequences. Tail node is: ",
3251 SvPV_nolen_const( mysv )
3257 Step through the branches
3258 cur represents each branch,
3259 noper is the first thing to be matched as part of that branch
3260 noper_next is the regnext() of that node.
3262 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3263 via a "jump trie" but we also support building with NOJUMPTRIE,
3264 which restricts the trie logic to structures like /FOO|BAR/.
3266 If noper is a trieable nodetype then the branch is a possible optimization
3267 target. If we are building under NOJUMPTRIE then we require that noper_next
3268 is the same as scan (our current position in the regex program).
3270 Once we have two or more consecutive such branches we can create a
3271 trie of the EXACT's contents and stitch it in place into the program.
3273 If the sequence represents all of the branches in the alternation we
3274 replace the entire thing with a single TRIE node.
3276 Otherwise when it is a subsequence we need to stitch it in place and
3277 replace only the relevant branches. This means the first branch has
3278 to remain as it is used by the alternation logic, and its next pointer,
3279 and needs to be repointed at the item on the branch chain following
3280 the last branch we have optimized away.
3282 This could be either a BRANCH, in which case the subsequence is internal,
3283 or it could be the item following the branch sequence in which case the
3284 subsequence is at the end (which does not necessarily mean the first node
3285 is the start of the alternation).
3287 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3290 ----------------+-----------
3294 EXACTFU_SS | EXACTFU
3295 EXACTFU_TRICKYFOLD | EXACTFU
3300 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3301 ( EXACT == (X) ) ? EXACT : \
3302 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3305 /* dont use tail as the end marker for this traverse */
3306 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3307 regnode * const noper = NEXTOPER( cur );
3308 U8 noper_type = OP( noper );
3309 U8 noper_trietype = TRIE_TYPE( noper_type );
3310 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3311 regnode * const noper_next = regnext( noper );
3312 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3313 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3316 DEBUG_TRIE_COMPILE_r({
3317 regprop(RExC_rx, mysv, cur);
3318 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3319 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3321 regprop(RExC_rx, mysv, noper);
3322 PerlIO_printf( Perl_debug_log, " -> %s",
3323 SvPV_nolen_const(mysv));
3326 regprop(RExC_rx, mysv, noper_next );
3327 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3328 SvPV_nolen_const(mysv));
3330 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3331 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3332 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3336 /* Is noper a trieable nodetype that can be merged with the
3337 * current trie (if there is one)? */
3341 ( noper_trietype == NOTHING)
3342 || ( trietype == NOTHING )
3343 || ( trietype == noper_trietype )
3346 && noper_next == tail
3350 /* Handle mergable triable node
3351 * Either we are the first node in a new trieable sequence,
3352 * in which case we do some bookkeeping, otherwise we update
3353 * the end pointer. */
3356 if ( noper_trietype == NOTHING ) {
3357 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3358 regnode * const noper_next = regnext( noper );
3359 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3360 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3363 if ( noper_next_trietype ) {
3364 trietype = noper_next_trietype;
3365 } else if (noper_next_type) {
3366 /* a NOTHING regop is 1 regop wide. We need at least two
3367 * for a trie so we can't merge this in */
3371 trietype = noper_trietype;
3374 if ( trietype == NOTHING )
3375 trietype = noper_trietype;
3380 } /* end handle mergable triable node */
3382 /* handle unmergable node -
3383 * noper may either be a triable node which can not be tried
3384 * together with the current trie, or a non triable node */
3386 /* If last is set and trietype is not NOTHING then we have found
3387 * at least two triable branch sequences in a row of a similar
3388 * trietype so we can turn them into a trie. If/when we
3389 * allow NOTHING to start a trie sequence this condition will be
3390 * required, and it isn't expensive so we leave it in for now. */
3391 if ( trietype && trietype != NOTHING )
3392 make_trie( pRExC_state,
3393 startbranch, first, cur, tail, count,
3394 trietype, depth+1 );
3395 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3399 && noper_next == tail
3402 /* noper is triable, so we can start a new trie sequence */
3405 trietype = noper_trietype;
3407 /* if we already saw a first but the current node is not triable then we have
3408 * to reset the first information. */
3413 } /* end handle unmergable node */
3414 } /* loop over branches */
3415 DEBUG_TRIE_COMPILE_r({
3416 regprop(RExC_rx, mysv, cur);
3417 PerlIO_printf( Perl_debug_log,
3418 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3419 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3422 if ( last && trietype ) {
3423 if ( trietype != NOTHING ) {
3424 /* the last branch of the sequence was part of a trie,
3425 * so we have to construct it here outside of the loop
3427 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3428 #ifdef TRIE_STUDY_OPT
3429 if ( ((made == MADE_EXACT_TRIE &&
3430 startbranch == first)
3431 || ( first_non_open == first )) &&
3433 flags |= SCF_TRIE_RESTUDY;
3434 if ( startbranch == first
3437 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3442 /* at this point we know whatever we have is a NOTHING sequence/branch
3443 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3445 if ( startbranch == first ) {
3447 /* the entire thing is a NOTHING sequence, something like this:
3448 * (?:|) So we can turn it into a plain NOTHING op. */
3449 DEBUG_TRIE_COMPILE_r({
3450 regprop(RExC_rx, mysv, cur);
3451 PerlIO_printf( Perl_debug_log,
3452 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3453 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3456 OP(startbranch)= NOTHING;
3457 NEXT_OFF(startbranch)= tail - startbranch;
3458 for ( opt= startbranch + 1; opt < tail ; opt++ )
3462 } /* end if ( last) */
3463 } /* TRIE_MAXBUF is non zero */
3468 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3469 scan = NEXTOPER(NEXTOPER(scan));
3470 } else /* single branch is optimized. */
3471 scan = NEXTOPER(scan);
3473 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3474 scan_frame *newframe = NULL;
3479 if (OP(scan) != SUSPEND) {
3480 /* set the pointer */
3481 if (OP(scan) == GOSUB) {
3483 RExC_recurse[ARG2L(scan)] = scan;
3484 start = RExC_open_parens[paren-1];
3485 end = RExC_close_parens[paren-1];
3488 start = RExC_rxi->program + 1;
3492 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3493 SAVEFREEPV(recursed);
3495 if (!PAREN_TEST(recursed,paren+1)) {
3496 PAREN_SET(recursed,paren+1);
3497 Newx(newframe,1,scan_frame);
3499 if (flags & SCF_DO_SUBSTR) {
3500 SCAN_COMMIT(pRExC_state,data,minlenp);
3501 data->longest = &(data->longest_float);
3503 is_inf = is_inf_internal = 1;
3504 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3505 cl_anything(pRExC_state, data->start_class);
3506 flags &= ~SCF_DO_STCLASS;
3509 Newx(newframe,1,scan_frame);
3512 end = regnext(scan);
3517 SAVEFREEPV(newframe);
3518 newframe->next = regnext(scan);
3519 newframe->last = last;
3520 newframe->stop = stopparen;
3521 newframe->prev = frame;
3531 else if (OP(scan) == EXACT) {
3532 I32 l = STR_LEN(scan);
3535 const U8 * const s = (U8*)STRING(scan);
3536 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3537 l = utf8_length(s, s + l);
3539 uc = *((U8*)STRING(scan));
3542 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3543 /* The code below prefers earlier match for fixed
3544 offset, later match for variable offset. */
3545 if (data->last_end == -1) { /* Update the start info. */
3546 data->last_start_min = data->pos_min;
3547 data->last_start_max = is_inf
3548 ? I32_MAX : data->pos_min + data->pos_delta;
3550 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3552 SvUTF8_on(data->last_found);
3554 SV * const sv = data->last_found;
3555 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3556 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3557 if (mg && mg->mg_len >= 0)
3558 mg->mg_len += utf8_length((U8*)STRING(scan),
3559 (U8*)STRING(scan)+STR_LEN(scan));
3561 data->last_end = data->pos_min + l;
3562 data->pos_min += l; /* As in the first entry. */
3563 data->flags &= ~SF_BEFORE_EOL;
3565 if (flags & SCF_DO_STCLASS_AND) {
3566 /* Check whether it is compatible with what we know already! */
3570 /* If compatible, we or it in below. It is compatible if is
3571 * in the bitmp and either 1) its bit or its fold is set, or 2)
3572 * it's for a locale. Even if there isn't unicode semantics
3573 * here, at runtime there may be because of matching against a
3574 * utf8 string, so accept a possible false positive for
3575 * latin1-range folds */
3577 (!(data->start_class->flags & ANYOF_LOCALE)
3578 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3579 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3580 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3585 ANYOF_CLASS_ZERO(data->start_class);
3586 ANYOF_BITMAP_ZERO(data->start_class);
3588 ANYOF_BITMAP_SET(data->start_class, uc);
3589 else if (uc >= 0x100) {
3592 /* Some Unicode code points fold to the Latin1 range; as
3593 * XXX temporary code, instead of figuring out if this is
3594 * one, just assume it is and set all the start class bits
3595 * that could be some such above 255 code point's fold
3596 * which will generate fals positives. As the code
3597 * elsewhere that does compute the fold settles down, it
3598 * can be extracted out and re-used here */
3599 for (i = 0; i < 256; i++){
3600 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3601 ANYOF_BITMAP_SET(data->start_class, i);
3605 CLEAR_SSC_EOS(data->start_class);
3607 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3609 else if (flags & SCF_DO_STCLASS_OR) {
3610 /* false positive possible if the class is case-folded */
3612 ANYOF_BITMAP_SET(data->start_class, uc);
3614 data->start_class->flags |= ANYOF_UNICODE_ALL;
3615 CLEAR_SSC_EOS(data->start_class);
3616 cl_and(data->start_class, and_withp);
3618 flags &= ~SCF_DO_STCLASS;
3620 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3621 I32 l = STR_LEN(scan);
3622 UV uc = *((U8*)STRING(scan));
3624 /* Search for fixed substrings supports EXACT only. */
3625 if (flags & SCF_DO_SUBSTR) {
3627 SCAN_COMMIT(pRExC_state, data, minlenp);
3630 const U8 * const s = (U8 *)STRING(scan);
3631 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3632 l = utf8_length(s, s + l);
3634 if (has_exactf_sharp_s) {
3635 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3637 min += l - min_subtract;
3639 delta += min_subtract;
3640 if (flags & SCF_DO_SUBSTR) {
3641 data->pos_min += l - min_subtract;
3642 if (data->pos_min < 0) {
3645 data->pos_delta += min_subtract;
3647 data->longest = &(data->longest_float);
3650 if (flags & SCF_DO_STCLASS_AND) {
3651 /* Check whether it is compatible with what we know already! */
3654 (!(data->start_class->flags & ANYOF_LOCALE)
3655 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3656 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3660 ANYOF_CLASS_ZERO(data->start_class);
3661 ANYOF_BITMAP_ZERO(data->start_class);
3663 ANYOF_BITMAP_SET(data->start_class, uc);
3664 CLEAR_SSC_EOS(data->start_class);
3665 if (OP(scan) == EXACTFL) {
3666 /* XXX This set is probably no longer necessary, and
3667 * probably wrong as LOCALE now is on in the initial
3669 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3673 /* Also set the other member of the fold pair. In case
3674 * that unicode semantics is called for at runtime, use
3675 * the full latin1 fold. (Can't do this for locale,
3676 * because not known until runtime) */
3677 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3679 /* All other (EXACTFL handled above) folds except under
3680 * /iaa that include s, S, and sharp_s also may include
3682 if (OP(scan) != EXACTFA) {
3683 if (uc == 's' || uc == 'S') {
3684 ANYOF_BITMAP_SET(data->start_class,
3685 LATIN_SMALL_LETTER_SHARP_S);
3687 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3688 ANYOF_BITMAP_SET(data->start_class, 's');
3689 ANYOF_BITMAP_SET(data->start_class, 'S');
3694 else if (uc >= 0x100) {
3696 for (i = 0; i < 256; i++){
3697 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3698 ANYOF_BITMAP_SET(data->start_class, i);
3703 else if (flags & SCF_DO_STCLASS_OR) {
3704 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3705 /* false positive possible if the class is case-folded.
3706 Assume that the locale settings are the same... */
3708 ANYOF_BITMAP_SET(data->start_class, uc);
3709 if (OP(scan) != EXACTFL) {
3711 /* And set the other member of the fold pair, but
3712 * can't do that in locale because not known until
3714 ANYOF_BITMAP_SET(data->start_class,
3715 PL_fold_latin1[uc]);
3717 /* All folds except under /iaa that include s, S,
3718 * and sharp_s also may include the others */
3719 if (OP(scan) != EXACTFA) {
3720 if (uc == 's' || uc == 'S') {
3721 ANYOF_BITMAP_SET(data->start_class,
3722 LATIN_SMALL_LETTER_SHARP_S);
3724 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3725 ANYOF_BITMAP_SET(data->start_class, 's');
3726 ANYOF_BITMAP_SET(data->start_class, 'S');
3731 CLEAR_SSC_EOS(data->start_class);
3733 cl_and(data->start_class, and_withp);
3735 flags &= ~SCF_DO_STCLASS;
3737 else if (REGNODE_VARIES(OP(scan))) {
3738 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3739 I32 f = flags, pos_before = 0;
3740 regnode * const oscan = scan;
3741 struct regnode_charclass_class this_class;
3742 struct regnode_charclass_class *oclass = NULL;
3743 I32 next_is_eval = 0;
3745 switch (PL_regkind[OP(scan)]) {
3746 case WHILEM: /* End of (?:...)* . */
3747 scan = NEXTOPER(scan);
3750 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3751 next = NEXTOPER(scan);
3752 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3754 maxcount = REG_INFTY;
3755 next = regnext(scan);
3756 scan = NEXTOPER(scan);
3760 if (flags & SCF_DO_SUBSTR)
3765 if (flags & SCF_DO_STCLASS) {
3767 maxcount = REG_INFTY;
3768 next = regnext(scan);
3769 scan = NEXTOPER(scan);
3772 is_inf = is_inf_internal = 1;
3773 scan = regnext(scan);
3774 if (flags & SCF_DO_SUBSTR) {
3775 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3776 data->longest = &(data->longest_float);
3778 goto optimize_curly_tail;
3780 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3781 && (scan->flags == stopparen))
3786 mincount = ARG1(scan);
3787 maxcount = ARG2(scan);
3789 next = regnext(scan);
3790 if (OP(scan) == CURLYX) {
3791 I32 lp = (data ? *(data->last_closep) : 0);
3792 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3794 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3795 next_is_eval = (OP(scan) == EVAL);
3797 if (flags & SCF_DO_SUBSTR) {
3798 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3799 pos_before = data->pos_min;
3803 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3805 data->flags |= SF_IS_INF;
3807 if (flags & SCF_DO_STCLASS) {
3808 cl_init(pRExC_state, &this_class);
3809 oclass = data->start_class;
3810 data->start_class = &this_class;
3811 f |= SCF_DO_STCLASS_AND;
3812 f &= ~SCF_DO_STCLASS_OR;
3814 /* Exclude from super-linear cache processing any {n,m}
3815 regops for which the combination of input pos and regex
3816 pos is not enough information to determine if a match
3819 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3820 regex pos at the \s*, the prospects for a match depend not
3821 only on the input position but also on how many (bar\s*)
3822 repeats into the {4,8} we are. */
3823 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3824 f &= ~SCF_WHILEM_VISITED_POS;
3826 /* This will finish on WHILEM, setting scan, or on NULL: */
3827 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3828 last, data, stopparen, recursed, NULL,
3830 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3832 if (flags & SCF_DO_STCLASS)
3833 data->start_class = oclass;
3834 if (mincount == 0 || minnext == 0) {
3835 if (flags & SCF_DO_STCLASS_OR) {
3836 cl_or(pRExC_state, data->start_class, &this_class);
3838 else if (flags & SCF_DO_STCLASS_AND) {
3839 /* Switch to OR mode: cache the old value of
3840 * data->start_class */
3842 StructCopy(data->start_class, and_withp,
3843 struct regnode_charclass_class);
3844 flags &= ~SCF_DO_STCLASS_AND;
3845 StructCopy(&this_class, data->start_class,
3846 struct regnode_charclass_class);
3847 flags |= SCF_DO_STCLASS_OR;
3848 SET_SSC_EOS(data->start_class);
3850 } else { /* Non-zero len */
3851 if (flags & SCF_DO_STCLASS_OR) {
3852 cl_or(pRExC_state, data->start_class, &this_class);
3853 cl_and(data->start_class, and_withp);
3855 else if (flags & SCF_DO_STCLASS_AND)
3856 cl_and(data->start_class, &this_class);
3857 flags &= ~SCF_DO_STCLASS;
3859 if (!scan) /* It was not CURLYX, but CURLY. */
3861 if ( /* ? quantifier ok, except for (?{ ... }) */
3862 (next_is_eval || !(mincount == 0 && maxcount == 1))
3863 && (minnext == 0) && (deltanext == 0)
3864 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3865 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3867 /* Fatal warnings may leak the regexp without this: */
3868 SAVEFREESV(RExC_rx_sv);
3869 ckWARNreg(RExC_parse,
3870 "Quantifier unexpected on zero-length expression");
3871 (void)ReREFCNT_inc(RExC_rx_sv);
3874 min += minnext * mincount;
3875 is_inf_internal |= ((maxcount == REG_INFTY
3876 && (minnext + deltanext) > 0)
3877 || deltanext == I32_MAX);
3878 is_inf |= is_inf_internal;
3879 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3881 /* Try powerful optimization CURLYX => CURLYN. */
3882 if ( OP(oscan) == CURLYX && data
3883 && data->flags & SF_IN_PAR
3884 && !(data->flags & SF_HAS_EVAL)
3885 && !deltanext && minnext == 1 ) {
3886 /* Try to optimize to CURLYN. */
3887 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3888 regnode * const nxt1 = nxt;
3895 if (!REGNODE_SIMPLE(OP(nxt))
3896 && !(PL_regkind[OP(nxt)] == EXACT
3897 && STR_LEN(nxt) == 1))
3903 if (OP(nxt) != CLOSE)
3905 if (RExC_open_parens) {
3906 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3907 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3909 /* Now we know that nxt2 is the only contents: */
3910 oscan->flags = (U8)ARG(nxt);
3912 OP(nxt1) = NOTHING; /* was OPEN. */
3915 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3916 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3917 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3918 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3919 OP(nxt + 1) = OPTIMIZED; /* was count. */
3920 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3925 /* Try optimization CURLYX => CURLYM. */
3926 if ( OP(oscan) == CURLYX && data
3927 && !(data->flags & SF_HAS_PAR)
3928 && !(data->flags & SF_HAS_EVAL)
3929 && !deltanext /* atom is fixed width */
3930 && minnext != 0 /* CURLYM can't handle zero width */
3931 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3933 /* XXXX How to optimize if data == 0? */
3934 /* Optimize to a simpler form. */
3935 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3939 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3940 && (OP(nxt2) != WHILEM))
3942 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3943 /* Need to optimize away parenths. */
3944 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3945 /* Set the parenth number. */
3946 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3948 oscan->flags = (U8)ARG(nxt);
3949 if (RExC_open_parens) {
3950 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3951 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3953 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3954 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3957 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3958 OP(nxt + 1) = OPTIMIZED; /* was count. */
3959 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3960 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3963 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3964 regnode *nnxt = regnext(nxt1);
3966 if (reg_off_by_arg[OP(nxt1)])
3967 ARG_SET(nxt1, nxt2 - nxt1);
3968 else if (nxt2 - nxt1 < U16_MAX)
3969 NEXT_OFF(nxt1) = nxt2 - nxt1;
3971 OP(nxt) = NOTHING; /* Cannot beautify */
3976 /* Optimize again: */
3977 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3978 NULL, stopparen, recursed, NULL, 0,depth+1);
3983 else if ((OP(oscan) == CURLYX)
3984 && (flags & SCF_WHILEM_VISITED_POS)
3985 /* See the comment on a similar expression above.
3986 However, this time it's not a subexpression
3987 we care about, but the expression itself. */
3988 && (maxcount == REG_INFTY)
3989 && data && ++data->whilem_c < 16) {
3990 /* This stays as CURLYX, we can put the count/of pair. */
3991 /* Find WHILEM (as in regexec.c) */
3992 regnode *nxt = oscan + NEXT_OFF(oscan);
3994 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3996 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3997 | (RExC_whilem_seen << 4)); /* On WHILEM */
3999 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4001 if (flags & SCF_DO_SUBSTR) {
4002 SV *last_str = NULL;
4003 int counted = mincount != 0;
4005 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4006 #if defined(SPARC64_GCC_WORKAROUND)
4009 const char *s = NULL;
4012 if (pos_before >= data->last_start_min)
4015 b = data->last_start_min;
4018 s = SvPV_const(data->last_found, l);
4019 old = b - data->last_start_min;
4022 I32 b = pos_before >= data->last_start_min
4023 ? pos_before : data->last_start_min;
4025 const char * const s = SvPV_const(data->last_found, l);
4026 I32 old = b - data->last_start_min;
4030 old = utf8_hop((U8*)s, old) - (U8*)s;
4032 /* Get the added string: */
4033 last_str = newSVpvn_utf8(s + old, l, UTF);
4034 if (deltanext == 0 && pos_before == b) {
4035 /* What was added is a constant string */
4037 SvGROW(last_str, (mincount * l) + 1);
4038 repeatcpy(SvPVX(last_str) + l,
4039 SvPVX_const(last_str), l, mincount - 1);
4040 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4041 /* Add additional parts. */
4042 SvCUR_set(data->last_found,
4043 SvCUR(data->last_found) - l);
4044 sv_catsv(data->last_found, last_str);
4046 SV * sv = data->last_found;
4048 SvUTF8(sv) && SvMAGICAL(sv) ?
4049 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4050 if (mg && mg->mg_len >= 0)
4051 mg->mg_len += CHR_SVLEN(last_str) - l;
4053 data->last_end += l * (mincount - 1);
4056 /* start offset must point into the last copy */
4057 data->last_start_min += minnext * (mincount - 1);
4058 data->last_start_max += is_inf ? I32_MAX
4059 : (maxcount - 1) * (minnext + data->pos_delta);
4062 /* It is counted once already... */
4063 data->pos_min += minnext * (mincount - counted);
4064 data->pos_delta += - counted * deltanext +
4065 (minnext + deltanext) * maxcount - minnext * mincount;
4066 if (mincount != maxcount) {
4067 /* Cannot extend fixed substrings found inside
4069 SCAN_COMMIT(pRExC_state,data,minlenp);
4070 if (mincount && last_str) {
4071 SV * const sv = data->last_found;
4072 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4073 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4077 sv_setsv(sv, last_str);
4078 data->last_end = data->pos_min;
4079 data->last_start_min =
4080 data->pos_min - CHR_SVLEN(last_str);
4081 data->last_start_max = is_inf
4083 : data->pos_min + data->pos_delta
4084 - CHR_SVLEN(last_str);
4086 data->longest = &(data->longest_float);
4088 SvREFCNT_dec(last_str);
4090 if (data && (fl & SF_HAS_EVAL))
4091 data->flags |= SF_HAS_EVAL;
4092 optimize_curly_tail:
4093 if (OP(oscan) != CURLYX) {
4094 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4096 NEXT_OFF(oscan) += NEXT_OFF(next);
4099 default: /* REF, ANYOFV, and CLUMP only? */
4100 if (flags & SCF_DO_SUBSTR) {
4101 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4102 data->longest = &(data->longest_float);
4104 is_inf = is_inf_internal = 1;
4105 if (flags & SCF_DO_STCLASS_OR)
4106 cl_anything(pRExC_state, data->start_class);
4107 flags &= ~SCF_DO_STCLASS;
4111 else if (OP(scan) == LNBREAK) {
4112 if (flags & SCF_DO_STCLASS) {
4114 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4115 if (flags & SCF_DO_STCLASS_AND) {
4116 for (value = 0; value < 256; value++)
4117 if (!is_VERTWS_cp(value))
4118 ANYOF_BITMAP_CLEAR(data->start_class, value);
4121 for (value = 0; value < 256; value++)
4122 if (is_VERTWS_cp(value))
4123 ANYOF_BITMAP_SET(data->start_class, value);
4125 if (flags & SCF_DO_STCLASS_OR)
4126 cl_and(data->start_class, and_withp);
4127 flags &= ~SCF_DO_STCLASS;
4130 delta++; /* Because of the 2 char string cr-lf */
4131 if (flags & SCF_DO_SUBSTR) {
4132 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4134 data->pos_delta += 1;
4135 data->longest = &(data->longest_float);
4138 else if (REGNODE_SIMPLE(OP(scan))) {
4141 if (flags & SCF_DO_SUBSTR) {
4142 SCAN_COMMIT(pRExC_state,data,minlenp);
4146 if (flags & SCF_DO_STCLASS) {
4148 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4150 /* Some of the logic below assumes that switching
4151 locale on will only add false positives. */
4152 switch (PL_regkind[OP(scan)]) {
4158 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4161 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4162 cl_anything(pRExC_state, data->start_class);
4165 if (OP(scan) == SANY)
4167 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4168 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4169 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4170 cl_anything(pRExC_state, data->start_class);
4172 if (flags & SCF_DO_STCLASS_AND || !value)
4173 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4176 if (flags & SCF_DO_STCLASS_AND)
4177 cl_and(data->start_class,
4178 (struct regnode_charclass_class*)scan);
4180 cl_or(pRExC_state, data->start_class,
4181 (struct regnode_charclass_class*)scan);
4189 classnum = FLAGS(scan);
4190 if (flags & SCF_DO_STCLASS_AND) {
4191 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4192 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4193 for (value = 0; value < loop_max; value++) {
4194 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4195 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4201 if (data->start_class->flags & ANYOF_LOCALE) {
4202 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4206 /* Even if under locale, set the bits for non-locale
4207 * in case it isn't a true locale-node. This will
4208 * create false positives if it truly is locale */
4209 for (value = 0; value < loop_max; value++) {
4210 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4211 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4223 classnum = FLAGS(scan);
4224 if (flags & SCF_DO_STCLASS_AND) {
4225 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4226 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4227 for (value = 0; value < loop_max; value++) {
4228 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4229 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4235 if (data->start_class->flags & ANYOF_LOCALE) {
4236 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4240 /* Even if under locale, set the bits for non-locale in
4241 * case it isn't a true locale-node. This will create
4242 * false positives if it truly is locale */
4243 for (value = 0; value < loop_max; value++) {
4244 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4245 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4248 if (PL_regkind[OP(scan)] == NPOSIXD) {
4249 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4255 if (flags & SCF_DO_STCLASS_OR)
4256 cl_and(data->start_class, and_withp);
4257 flags &= ~SCF_DO_STCLASS;
4260 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4261 data->flags |= (OP(scan) == MEOL
4264 SCAN_COMMIT(pRExC_state, data, minlenp);
4267 else if ( PL_regkind[OP(scan)] == BRANCHJ
4268 /* Lookbehind, or need to calculate parens/evals/stclass: */
4269 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4270 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4271 if ( OP(scan) == UNLESSM &&
4273 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4274 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4277 regnode *upto= regnext(scan);
4279 SV * const mysv_val=sv_newmortal();
4280 DEBUG_STUDYDATA("OPFAIL",data,depth);
4282 /*DEBUG_PARSE_MSG("opfail");*/
4283 regprop(RExC_rx, mysv_val, upto);
4284 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4285 SvPV_nolen_const(mysv_val),
4286 (IV)REG_NODE_NUM(upto),
4291 NEXT_OFF(scan) = upto - scan;
4292 for (opt= scan + 1; opt < upto ; opt++)
4293 OP(opt) = OPTIMIZED;
4297 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4298 || OP(scan) == UNLESSM )
4300 /* Negative Lookahead/lookbehind
4301 In this case we can't do fixed string optimisation.
4304 I32 deltanext, minnext, fake = 0;
4306 struct regnode_charclass_class intrnl;
4309 data_fake.flags = 0;
4311 data_fake.whilem_c = data->whilem_c;
4312 data_fake.last_closep = data->last_closep;
4315 data_fake.last_closep = &fake;
4316 data_fake.pos_delta = delta;
4317 if ( flags & SCF_DO_STCLASS && !scan->flags
4318 && OP(scan) == IFMATCH ) { /* Lookahead */
4319 cl_init(pRExC_state, &intrnl);
4320 data_fake.start_class = &intrnl;
4321 f |= SCF_DO_STCLASS_AND;
4323 if (flags & SCF_WHILEM_VISITED_POS)
4324 f |= SCF_WHILEM_VISITED_POS;
4325 next = regnext(scan);
4326 nscan = NEXTOPER(NEXTOPER(scan));
4327 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4328 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4331 FAIL("Variable length lookbehind not implemented");
4333 else if (minnext > (I32)U8_MAX) {
4334 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4336 scan->flags = (U8)minnext;
4339 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4341 if (data_fake.flags & SF_HAS_EVAL)
4342 data->flags |= SF_HAS_EVAL;
4343 data->whilem_c = data_fake.whilem_c;
4345 if (f & SCF_DO_STCLASS_AND) {
4346 if (flags & SCF_DO_STCLASS_OR) {
4347 /* OR before, AND after: ideally we would recurse with
4348 * data_fake to get the AND applied by study of the
4349 * remainder of the pattern, and then derecurse;
4350 * *** HACK *** for now just treat as "no information".
4351 * See [perl #56690].
4353 cl_init(pRExC_state, data->start_class);
4355 /* AND before and after: combine and continue */
4356 const int was = TEST_SSC_EOS(data->start_class);
4358 cl_and(data->start_class, &intrnl);
4360 SET_SSC_EOS(data->start_class);
4364 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4366 /* Positive Lookahead/lookbehind
4367 In this case we can do fixed string optimisation,
4368 but we must be careful about it. Note in the case of
4369 lookbehind the positions will be offset by the minimum
4370 length of the pattern, something we won't know about
4371 until after the recurse.
4373 I32 deltanext, fake = 0;
4375 struct regnode_charclass_class intrnl;
4377 /* We use SAVEFREEPV so that when the full compile
4378 is finished perl will clean up the allocated
4379 minlens when it's all done. This way we don't
4380 have to worry about freeing them when we know
4381 they wont be used, which would be a pain.
4384 Newx( minnextp, 1, I32 );
4385 SAVEFREEPV(minnextp);
4388 StructCopy(data, &data_fake, scan_data_t);
4389 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4392 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4393 data_fake.last_found=newSVsv(data->last_found);
4397 data_fake.last_closep = &fake;
4398 data_fake.flags = 0;
4399 data_fake.pos_delta = delta;
4401 data_fake.flags |= SF_IS_INF;
4402 if ( flags & SCF_DO_STCLASS && !scan->flags
4403 && OP(scan) == IFMATCH ) { /* Lookahead */
4404 cl_init(pRExC_state, &intrnl);
4405 data_fake.start_class = &intrnl;
4406 f |= SCF_DO_STCLASS_AND;
4408 if (flags & SCF_WHILEM_VISITED_POS)
4409 f |= SCF_WHILEM_VISITED_POS;
4410 next = regnext(scan);
4411 nscan = NEXTOPER(NEXTOPER(scan));
4413 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4414 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4417 FAIL("Variable length lookbehind not implemented");
4419 else if (*minnextp > (I32)U8_MAX) {
4420 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4422 scan->flags = (U8)*minnextp;
4427 if (f & SCF_DO_STCLASS_AND) {
4428 const int was = TEST_SSC_EOS(data.start_class);
4430 cl_and(data->start_class, &intrnl);
4432 SET_SSC_EOS(data->start_class);
4435 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4437 if (data_fake.flags & SF_HAS_EVAL)
4438 data->flags |= SF_HAS_EVAL;
4439 data->whilem_c = data_fake.whilem_c;
4440 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4441 if (RExC_rx->minlen<*minnextp)
4442 RExC_rx->minlen=*minnextp;
4443 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4444 SvREFCNT_dec_NN(data_fake.last_found);
4446 if ( data_fake.minlen_fixed != minlenp )
4448 data->offset_fixed= data_fake.offset_fixed;
4449 data->minlen_fixed= data_fake.minlen_fixed;
4450 data->lookbehind_fixed+= scan->flags;
4452 if ( data_fake.minlen_float != minlenp )
4454 data->minlen_float= data_fake.minlen_float;
4455 data->offset_float_min=data_fake.offset_float_min;
4456 data->offset_float_max=data_fake.offset_float_max;
4457 data->lookbehind_float+= scan->flags;
4464 else if (OP(scan) == OPEN) {
4465 if (stopparen != (I32)ARG(scan))
4468 else if (OP(scan) == CLOSE) {
4469 if (stopparen == (I32)ARG(scan)) {
4472 if ((I32)ARG(scan) == is_par) {
4473 next = regnext(scan);
4475 if ( next && (OP(next) != WHILEM) && next < last)
4476 is_par = 0; /* Disable optimization */
4479 *(data->last_closep) = ARG(scan);
4481 else if (OP(scan) == EVAL) {
4483 data->flags |= SF_HAS_EVAL;
4485 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4486 if (flags & SCF_DO_SUBSTR) {
4487 SCAN_COMMIT(pRExC_state,data,minlenp);
4488 flags &= ~SCF_DO_SUBSTR;
4490 if (data && OP(scan)==ACCEPT) {
4491 data->flags |= SCF_SEEN_ACCEPT;
4496 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4498 if (flags & SCF_DO_SUBSTR) {
4499 SCAN_COMMIT(pRExC_state,data,minlenp);
4500 data->longest = &(data->longest_float);
4502 is_inf = is_inf_internal = 1;
4503 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4504 cl_anything(pRExC_state, data->start_class);
4505 flags &= ~SCF_DO_STCLASS;
4507 else if (OP(scan) == GPOS) {
4508 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4509 !(delta || is_inf || (data && data->pos_delta)))
4511 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4512 RExC_rx->extflags |= RXf_ANCH_GPOS;
4513 if (RExC_rx->gofs < (U32)min)
4514 RExC_rx->gofs = min;
4516 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4520 #ifdef TRIE_STUDY_OPT
4521 #ifdef FULL_TRIE_STUDY
4522 else if (PL_regkind[OP(scan)] == TRIE) {
4523 /* NOTE - There is similar code to this block above for handling
4524 BRANCH nodes on the initial study. If you change stuff here
4526 regnode *trie_node= scan;
4527 regnode *tail= regnext(scan);
4528 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4529 I32 max1 = 0, min1 = I32_MAX;
4530 struct regnode_charclass_class accum;
4532 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4533 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4534 if (flags & SCF_DO_STCLASS)
4535 cl_init_zero(pRExC_state, &accum);
4541 const regnode *nextbranch= NULL;
4544 for ( word=1 ; word <= trie->wordcount ; word++)
4546 I32 deltanext=0, minnext=0, f = 0, fake;
4547 struct regnode_charclass_class this_class;
4549 data_fake.flags = 0;
4551 data_fake.whilem_c = data->whilem_c;
4552 data_fake.last_closep = data->last_closep;
4555 data_fake.last_closep = &fake;
4556 data_fake.pos_delta = delta;
4557 if (flags & SCF_DO_STCLASS) {
4558 cl_init(pRExC_state, &this_class);
4559 data_fake.start_class = &this_class;
4560 f = SCF_DO_STCLASS_AND;
4562 if (flags & SCF_WHILEM_VISITED_POS)
4563 f |= SCF_WHILEM_VISITED_POS;
4565 if (trie->jump[word]) {
4567 nextbranch = trie_node + trie->jump[0];
4568 scan= trie_node + trie->jump[word];
4569 /* We go from the jump point to the branch that follows
4570 it. Note this means we need the vestigal unused branches
4571 even though they arent otherwise used.
4573 minnext = study_chunk(pRExC_state, &scan, minlenp,
4574 &deltanext, (regnode *)nextbranch, &data_fake,
4575 stopparen, recursed, NULL, f,depth+1);
4577 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4578 nextbranch= regnext((regnode*)nextbranch);
4580 if (min1 > (I32)(minnext + trie->minlen))
4581 min1 = minnext + trie->minlen;
4582 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4583 max1 = minnext + deltanext + trie->maxlen;
4584 if (deltanext == I32_MAX)
4585 is_inf = is_inf_internal = 1;
4587 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4589 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4590 if ( stopmin > min + min1)
4591 stopmin = min + min1;
4592 flags &= ~SCF_DO_SUBSTR;
4594 data->flags |= SCF_SEEN_ACCEPT;
4597 if (data_fake.flags & SF_HAS_EVAL)
4598 data->flags |= SF_HAS_EVAL;
4599 data->whilem_c = data_fake.whilem_c;
4601 if (flags & SCF_DO_STCLASS)
4602 cl_or(pRExC_state, &accum, &this_class);
4605 if (flags & SCF_DO_SUBSTR) {
4606 data->pos_min += min1;
4607 data->pos_delta += max1 - min1;
4608 if (max1 != min1 || is_inf)
4609 data->longest = &(data->longest_float);
4612 delta += max1 - min1;
4613 if (flags & SCF_DO_STCLASS_OR) {
4614 cl_or(pRExC_state, data->start_class, &accum);
4616 cl_and(data->start_class, and_withp);
4617 flags &= ~SCF_DO_STCLASS;
4620 else if (flags & SCF_DO_STCLASS_AND) {
4622 cl_and(data->start_class, &accum);
4623 flags &= ~SCF_DO_STCLASS;
4626 /* Switch to OR mode: cache the old value of
4627 * data->start_class */
4629 StructCopy(data->start_class, and_withp,
4630 struct regnode_charclass_class);
4631 flags &= ~SCF_DO_STCLASS_AND;
4632 StructCopy(&accum, data->start_class,
4633 struct regnode_charclass_class);
4634 flags |= SCF_DO_STCLASS_OR;
4635 SET_SSC_EOS(data->start_class);
4642 else if (PL_regkind[OP(scan)] == TRIE) {
4643 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4646 min += trie->minlen;
4647 delta += (trie->maxlen - trie->minlen);
4648 flags &= ~SCF_DO_STCLASS; /* xxx */
4649 if (flags & SCF_DO_SUBSTR) {
4650 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4651 data->pos_min += trie->minlen;
4652 data->pos_delta += (trie->maxlen - trie->minlen);
4653 if (trie->maxlen != trie->minlen)
4654 data->longest = &(data->longest_float);
4656 if (trie->jump) /* no more substrings -- for now /grr*/
4657 flags &= ~SCF_DO_SUBSTR;
4659 #endif /* old or new */
4660 #endif /* TRIE_STUDY_OPT */
4662 /* Else: zero-length, ignore. */
4663 scan = regnext(scan);
4668 stopparen = frame->stop;
4669 frame = frame->prev;
4670 goto fake_study_recurse;
4675 DEBUG_STUDYDATA("pre-fin:",data,depth);
4678 *deltap = is_inf_internal ? I32_MAX : delta;
4679 if (flags & SCF_DO_SUBSTR && is_inf)
4680 data->pos_delta = I32_MAX - data->pos_min;
4681 if (is_par > (I32)U8_MAX)
4683 if (is_par && pars==1 && data) {
4684 data->flags |= SF_IN_PAR;
4685 data->flags &= ~SF_HAS_PAR;
4687 else if (pars && data) {
4688 data->flags |= SF_HAS_PAR;
4689 data->flags &= ~SF_IN_PAR;
4691 if (flags & SCF_DO_STCLASS_OR)
4692 cl_and(data->start_class, and_withp);
4693 if (flags & SCF_TRIE_RESTUDY)
4694 data->flags |= SCF_TRIE_RESTUDY;
4696 DEBUG_STUDYDATA("post-fin:",data,depth);
4698 return min < stopmin ? min : stopmin;
4702 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4704 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4706 PERL_ARGS_ASSERT_ADD_DATA;
4708 Renewc(RExC_rxi->data,
4709 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4710 char, struct reg_data);
4712 Renew(RExC_rxi->data->what, count + n, U8);
4714 Newx(RExC_rxi->data->what, n, U8);
4715 RExC_rxi->data->count = count + n;
4716 Copy(s, RExC_rxi->data->what + count, n, U8);
4720 /*XXX: todo make this not included in a non debugging perl */
4721 #ifndef PERL_IN_XSUB_RE
4723 Perl_reginitcolors(pTHX)
4726 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4728 char *t = savepv(s);
4732 t = strchr(t, '\t');
4738 PL_colors[i] = t = (char *)"";
4743 PL_colors[i++] = (char *)"";
4750 #ifdef TRIE_STUDY_OPT
4751 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4754 (data.flags & SCF_TRIE_RESTUDY) \
4762 #define CHECK_RESTUDY_GOTO_butfirst
4766 * pregcomp - compile a regular expression into internal code
4768 * Decides which engine's compiler to call based on the hint currently in
4772 #ifndef PERL_IN_XSUB_RE
4774 /* return the currently in-scope regex engine (or the default if none) */
4776 regexp_engine const *
4777 Perl_current_re_engine(pTHX)
4781 if (IN_PERL_COMPILETIME) {
4782 HV * const table = GvHV(PL_hintgv);
4786 return &reh_regexp_engine;
4787 ptr = hv_fetchs(table, "regcomp", FALSE);
4788 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4789 return &reh_regexp_engine;
4790 return INT2PTR(regexp_engine*,SvIV(*ptr));
4794 if (!PL_curcop->cop_hints_hash)
4795 return &reh_regexp_engine;
4796 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4797 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4798 return &reh_regexp_engine;
4799 return INT2PTR(regexp_engine*,SvIV(ptr));
4805 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4808 regexp_engine const *eng = current_re_engine();
4809 GET_RE_DEBUG_FLAGS_DECL;
4811 PERL_ARGS_ASSERT_PREGCOMP;
4813 /* Dispatch a request to compile a regexp to correct regexp engine. */
4815 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4818 return CALLREGCOMP_ENG(eng, pattern, flags);
4822 /* public(ish) entry point for the perl core's own regex compiling code.
4823 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4824 * pattern rather than a list of OPs, and uses the internal engine rather
4825 * than the current one */
4828 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4830 SV *pat = pattern; /* defeat constness! */
4831 PERL_ARGS_ASSERT_RE_COMPILE;
4832 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4833 #ifdef PERL_IN_XSUB_RE
4838 NULL, NULL, rx_flags, 0);
4841 /* see if there are any run-time code blocks in the pattern.
4842 * False positives are allowed */
4845 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4846 U32 pm_flags, char *pat, STRLEN plen)
4851 /* avoid infinitely recursing when we recompile the pattern parcelled up
4852 * as qr'...'. A single constant qr// string can't have have any
4853 * run-time component in it, and thus, no runtime code. (A non-qr
4854 * string, however, can, e.g. $x =~ '(?{})') */
4855 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4858 for (s = 0; s < plen; s++) {
4859 if (n < pRExC_state->num_code_blocks
4860 && s == pRExC_state->code_blocks[n].start)
4862 s = pRExC_state->code_blocks[n].end;
4866 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4868 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4870 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4877 /* Handle run-time code blocks. We will already have compiled any direct
4878 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4879 * copy of it, but with any literal code blocks blanked out and
4880 * appropriate chars escaped; then feed it into
4882 * eval "qr'modified_pattern'"
4886 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4890 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4892 * After eval_sv()-ing that, grab any new code blocks from the returned qr
4893 * and merge them with any code blocks of the original regexp.
4895 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4896 * instead, just save the qr and return FALSE; this tells our caller that
4897 * the original pattern needs upgrading to utf8.
4901 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4902 char *pat, STRLEN plen)
4906 GET_RE_DEBUG_FLAGS_DECL;
4908 if (pRExC_state->runtime_code_qr) {
4909 /* this is the second time we've been called; this should
4910 * only happen if the main pattern got upgraded to utf8
4911 * during compilation; re-use the qr we compiled first time
4912 * round (which should be utf8 too)
4914 qr = pRExC_state->runtime_code_qr;
4915 pRExC_state->runtime_code_qr = NULL;
4916 assert(RExC_utf8 && SvUTF8(qr));
4922 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4926 /* determine how many extra chars we need for ' and \ escaping */
4927 for (s = 0; s < plen; s++) {
4928 if (pat[s] == '\'' || pat[s] == '\\')
4932 Newx(newpat, newlen, char);
4934 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4936 for (s = 0; s < plen; s++) {
4937 if (n < pRExC_state->num_code_blocks
4938 && s == pRExC_state->code_blocks[n].start)
4940 /* blank out literal code block */
4941 assert(pat[s] == '(');
4942 while (s <= pRExC_state->code_blocks[n].end) {
4950 if (pat[s] == '\'' || pat[s] == '\\')
4955 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4959 PerlIO_printf(Perl_debug_log,
4960 "%sre-parsing pattern for runtime code:%s %s\n",
4961 PL_colors[4],PL_colors[5],newpat);
4964 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4970 PUSHSTACKi(PERLSI_REQUIRE);
4971 /* this causes the toker to collapse \\ into \ when parsing
4972 * qr''; normally only q'' does this. It also alters hints
4974 PL_reg_state.re_reparsing = TRUE;
4975 eval_sv(sv, G_SCALAR);
4976 SvREFCNT_dec_NN(sv);
4981 SV * const errsv = ERRSV;
4982 if (SvTRUE_NN(errsv))
4984 Safefree(pRExC_state->code_blocks);
4985 /* use croak_sv ? */
4986 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
4989 assert(SvROK(qr_ref));
4991 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
4992 /* the leaving below frees the tmp qr_ref.
4993 * Give qr a life of its own */
5001 if (!RExC_utf8 && SvUTF8(qr)) {
5002 /* first time through; the pattern got upgraded; save the
5003 * qr for the next time through */
5004 assert(!pRExC_state->runtime_code_qr);
5005 pRExC_state->runtime_code_qr = qr;
5010 /* extract any code blocks within the returned qr// */
5013 /* merge the main (r1) and run-time (r2) code blocks into one */
5015 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5016 struct reg_code_block *new_block, *dst;
5017 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5020 if (!r2->num_code_blocks) /* we guessed wrong */
5022 SvREFCNT_dec_NN(qr);
5027 r1->num_code_blocks + r2->num_code_blocks,
5028 struct reg_code_block);
5031 while ( i1 < r1->num_code_blocks
5032 || i2 < r2->num_code_blocks)
5034 struct reg_code_block *src;
5037 if (i1 == r1->num_code_blocks) {
5038 src = &r2->code_blocks[i2++];
5041 else if (i2 == r2->num_code_blocks)
5042 src = &r1->code_blocks[i1++];
5043 else if ( r1->code_blocks[i1].start
5044 < r2->code_blocks[i2].start)
5046 src = &r1->code_blocks[i1++];
5047 assert(src->end < r2->code_blocks[i2].start);
5050 assert( r1->code_blocks[i1].start
5051 > r2->code_blocks[i2].start);
5052 src = &r2->code_blocks[i2++];
5054 assert(src->end < r1->code_blocks[i1].start);
5057 assert(pat[src->start] == '(');
5058 assert(pat[src->end] == ')');
5059 dst->start = src->start;
5060 dst->end = src->end;
5061 dst->block = src->block;
5062 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5066 r1->num_code_blocks += r2->num_code_blocks;
5067 Safefree(r1->code_blocks);
5068 r1->code_blocks = new_block;
5071 SvREFCNT_dec_NN(qr);
5077 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5079 /* This is the common code for setting up the floating and fixed length
5080 * string data extracted from Perlre_op_compile() below. Returns a boolean
5081 * as to whether succeeded or not */
5085 if (! (longest_length
5086 || (eol /* Can't have SEOL and MULTI */
5087 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5089 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5090 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5095 /* copy the information about the longest from the reg_scan_data
5096 over to the program. */
5097 if (SvUTF8(sv_longest)) {
5098 *rx_utf8 = sv_longest;
5101 *rx_substr = sv_longest;
5104 /* end_shift is how many chars that must be matched that
5105 follow this item. We calculate it ahead of time as once the
5106 lookbehind offset is added in we lose the ability to correctly
5108 ml = minlen ? *(minlen) : (I32)longest_length;
5109 *rx_end_shift = ml - offset
5110 - longest_length + (SvTAIL(sv_longest) != 0)
5113 t = (eol/* Can't have SEOL and MULTI */
5114 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5115 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5121 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5122 * regular expression into internal code.
5123 * The pattern may be passed either as:
5124 * a list of SVs (patternp plus pat_count)
5125 * a list of OPs (expr)
5126 * If both are passed, the SV list is used, but the OP list indicates
5127 * which SVs are actually pre-compiled code blocks
5129 * The SVs in the list have magic and qr overloading applied to them (and
5130 * the list may be modified in-place with replacement SVs in the latter
5133 * If the pattern hasn't changed from old_re, then old_re will be
5136 * eng is the current engine. If that engine has an op_comp method, then
5137 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5138 * do the initial concatenation of arguments and pass on to the external
5141 * If is_bare_re is not null, set it to a boolean indicating whether the
5142 * arg list reduced (after overloading) to a single bare regex which has
5143 * been returned (i.e. /$qr/).
5145 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5147 * pm_flags contains the PMf_* flags, typically based on those from the
5148 * pm_flags field of the related PMOP. Currently we're only interested in
5149 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5151 * We can't allocate space until we know how big the compiled form will be,
5152 * but we can't compile it (and thus know how big it is) until we've got a
5153 * place to put the code. So we cheat: we compile it twice, once with code
5154 * generation turned off and size counting turned on, and once "for real".
5155 * This also means that we don't allocate space until we are sure that the
5156 * thing really will compile successfully, and we never have to move the
5157 * code and thus invalidate pointers into it. (Note that it has to be in
5158 * one piece because free() must be able to free it all.) [NB: not true in perl]
5160 * Beware that the optimization-preparation code in here knows about some
5161 * of the structure of the compiled regexp. [I'll say.]
5165 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5166 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5167 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5172 regexp_internal *ri;
5181 SV * VOL code_blocksv = NULL;
5183 /* these are all flags - maybe they should be turned
5184 * into a single int with different bit masks */
5185 I32 sawlookahead = 0;
5188 bool used_setjump = FALSE;
5189 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5190 bool code_is_utf8 = 0;
5191 bool VOL recompile = 0;
5192 bool runtime_code = 0;
5196 RExC_state_t RExC_state;
5197 RExC_state_t * const pRExC_state = &RExC_state;
5198 #ifdef TRIE_STUDY_OPT
5200 RExC_state_t copyRExC_state;
5202 GET_RE_DEBUG_FLAGS_DECL;
5204 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5206 DEBUG_r(if (!PL_colorset) reginitcolors());
5208 #ifndef PERL_IN_XSUB_RE
5209 /* Initialize these here instead of as-needed, as is quick and avoids
5210 * having to test them each time otherwise */
5211 if (! PL_AboveLatin1) {
5212 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5213 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5214 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5216 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5217 = _new_invlist_C_array(L1PosixAlnum_invlist);
5218 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5219 = _new_invlist_C_array(PosixAlnum_invlist);
5221 PL_L1Posix_ptrs[_CC_ALPHA]
5222 = _new_invlist_C_array(L1PosixAlpha_invlist);
5223 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5225 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5226 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5228 /* Cased is the same as Alpha in the ASCII range */
5229 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5230 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5232 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5233 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5235 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5236 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5238 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5239 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5241 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5242 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5244 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5245 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5247 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5248 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5250 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5251 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5252 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5253 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5255 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5256 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5258 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5260 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5261 PL_L1Posix_ptrs[_CC_WORDCHAR]
5262 = _new_invlist_C_array(L1PosixWord_invlist);
5264 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5265 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5267 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5271 pRExC_state->code_blocks = NULL;
5272 pRExC_state->num_code_blocks = 0;
5275 *is_bare_re = FALSE;
5277 if (expr && (expr->op_type == OP_LIST ||
5278 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5280 /* is the source UTF8, and how many code blocks are there? */
5284 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5285 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5287 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5288 /* count of DO blocks */
5292 pRExC_state->num_code_blocks = ncode;
5293 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5298 /* handle a list of SVs */
5302 /* apply magic and RE overloading to each arg */
5303 for (svp = patternp; svp < patternp + pat_count; svp++) {
5306 if (SvROK(rx) && SvAMAGIC(rx)) {
5307 SV *sv = AMG_CALLunary(rx, regexp_amg);
5311 if (SvTYPE(sv) != SVt_REGEXP)
5312 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5318 if (pat_count > 1) {
5319 /* concat multiple args and find any code block indexes */
5324 STRLEN orig_patlen = 0;
5326 if (pRExC_state->num_code_blocks) {
5327 o = cLISTOPx(expr)->op_first;
5328 assert( o->op_type == OP_PUSHMARK
5329 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5330 || o->op_type == OP_PADRANGE);
5334 pat = newSVpvn("", 0);
5337 /* determine if the pattern is going to be utf8 (needed
5338 * in advance to align code block indices correctly).
5339 * XXX This could fail to be detected for an arg with
5340 * overloading but not concat overloading; but the main effect
5341 * in this obscure case is to need a 'use re eval' for a
5342 * literal code block */
5343 for (svp = patternp; svp < patternp + pat_count; svp++) {
5350 for (svp = patternp; svp < patternp + pat_count; svp++) {
5351 SV *sv, *msv = *svp;
5354 /* we make the assumption here that each op in the list of
5355 * op_siblings maps to one SV pushed onto the stack,
5356 * except for code blocks, with have both an OP_NULL and
5358 * This allows us to match up the list of SVs against the
5359 * list of OPs to find the next code block.
5361 * Note that PUSHMARK PADSV PADSV ..
5363 * PADRANGE NULL NULL ..
5364 * so the alignment still works. */
5366 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5367 assert(n < pRExC_state->num_code_blocks);
5368 pRExC_state->code_blocks[n].start = SvCUR(pat);
5369 pRExC_state->code_blocks[n].block = o;
5370 pRExC_state->code_blocks[n].src_regex = NULL;
5373 o = o->op_sibling; /* skip CONST */
5379 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5380 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5383 /* overloading involved: all bets are off over literal
5384 * code. Pretend we haven't seen it */
5385 pRExC_state->num_code_blocks -= n;
5391 while (SvAMAGIC(msv)
5392 && (sv = AMG_CALLunary(msv, string_amg))
5396 && SvRV(msv) == SvRV(sv))
5401 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5403 orig_patlen = SvCUR(pat);
5404 sv_catsv_nomg(pat, msv);
5407 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5410 /* extract any code blocks within any embedded qr//'s */
5411 if (rx && SvTYPE(rx) == SVt_REGEXP
5412 && RX_ENGINE((REGEXP*)rx)->op_comp)
5415 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5416 if (ri->num_code_blocks) {
5418 /* the presence of an embedded qr// with code means
5419 * we should always recompile: the text of the
5420 * qr// may not have changed, but it may be a
5421 * different closure than last time */
5423 Renew(pRExC_state->code_blocks,
5424 pRExC_state->num_code_blocks + ri->num_code_blocks,
5425 struct reg_code_block);
5426 pRExC_state->num_code_blocks += ri->num_code_blocks;
5427 for (i=0; i < ri->num_code_blocks; i++) {
5428 struct reg_code_block *src, *dst;
5429 STRLEN offset = orig_patlen
5430 + ReANY((REGEXP *)rx)->pre_prefix;
5431 assert(n < pRExC_state->num_code_blocks);
5432 src = &ri->code_blocks[i];
5433 dst = &pRExC_state->code_blocks[n];
5434 dst->start = src->start + offset;
5435 dst->end = src->end + offset;
5436 dst->block = src->block;
5437 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5451 while (SvAMAGIC(pat)
5452 && (sv = AMG_CALLunary(pat, string_amg))
5460 /* handle bare regex: foo =~ $re */
5465 if (SvTYPE(re) == SVt_REGEXP) {
5469 Safefree(pRExC_state->code_blocks);
5475 /* not a list of SVs, so must be a list of OPs */
5477 if (expr->op_type == OP_LIST) {
5482 pat = newSVpvn("", 0);
5487 /* given a list of CONSTs and DO blocks in expr, append all
5488 * the CONSTs to pat, and record the start and end of each
5489 * code block in code_blocks[] (each DO{} op is followed by an
5490 * OP_CONST containing the corresponding literal '(?{...})
5493 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5494 if (o->op_type == OP_CONST) {
5495 sv_catsv(pat, cSVOPo_sv);
5497 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5501 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5502 assert(i+1 < pRExC_state->num_code_blocks);
5503 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5504 pRExC_state->code_blocks[i].block = o;
5505 pRExC_state->code_blocks[i].src_regex = NULL;
5511 assert(expr->op_type == OP_CONST);
5512 pat = cSVOPx_sv(expr);
5516 exp = SvPV_nomg(pat, plen);
5518 if (!eng->op_comp) {
5519 if ((SvUTF8(pat) && IN_BYTES)
5520 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5522 /* make a temporary copy; either to convert to bytes,
5523 * or to avoid repeating get-magic / overloaded stringify */
5524 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5525 (IN_BYTES ? 0 : SvUTF8(pat)));
5527 Safefree(pRExC_state->code_blocks);
5528 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5531 /* ignore the utf8ness if the pattern is 0 length */
5532 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5533 RExC_uni_semantics = 0;
5534 RExC_contains_locale = 0;
5535 pRExC_state->runtime_code_qr = NULL;
5537 /****************** LONG JUMP TARGET HERE***********************/
5538 /* Longjmp back to here if have to switch in midstream to utf8 */
5539 if (! RExC_orig_utf8) {
5540 JMPENV_PUSH(jump_ret);
5541 used_setjump = TRUE;
5544 if (jump_ret == 0) { /* First time through */
5548 SV *dsv= sv_newmortal();
5549 RE_PV_QUOTED_DECL(s, RExC_utf8,
5550 dsv, exp, plen, 60);
5551 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5552 PL_colors[4],PL_colors[5],s);
5555 else { /* longjumped back */
5558 STRLEN s = 0, d = 0;
5561 /* If the cause for the longjmp was other than changing to utf8, pop
5562 * our own setjmp, and longjmp to the correct handler */
5563 if (jump_ret != UTF8_LONGJMP) {
5565 JMPENV_JUMP(jump_ret);
5570 /* It's possible to write a regexp in ascii that represents Unicode
5571 codepoints outside of the byte range, such as via \x{100}. If we
5572 detect such a sequence we have to convert the entire pattern to utf8
5573 and then recompile, as our sizing calculation will have been based
5574 on 1 byte == 1 character, but we will need to use utf8 to encode
5575 at least some part of the pattern, and therefore must convert the whole
5578 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5579 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5581 /* upgrade pattern to UTF8, and if there are code blocks,
5582 * recalculate the indices.
5583 * This is essentially an unrolled Perl_bytes_to_utf8() */
5585 src = (U8*)SvPV_nomg(pat, plen);
5586 Newx(dst, plen * 2 + 1, U8);
5589 const UV uv = NATIVE_TO_ASCII(src[s]);
5590 if (UNI_IS_INVARIANT(uv))
5591 dst[d] = (U8)UTF_TO_NATIVE(uv);
5593 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5594 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5596 if (n < pRExC_state->num_code_blocks) {
5597 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5598 pRExC_state->code_blocks[n].start = d;
5599 assert(dst[d] == '(');
5602 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5603 pRExC_state->code_blocks[n].end = d;
5604 assert(dst[d] == ')');
5617 RExC_orig_utf8 = RExC_utf8 = 1;
5620 /* return old regex if pattern hasn't changed */
5624 && !!RX_UTF8(old_re) == !!RExC_utf8
5625 && RX_PRECOMP(old_re)
5626 && RX_PRELEN(old_re) == plen
5627 && memEQ(RX_PRECOMP(old_re), exp, plen))
5629 /* with runtime code, always recompile */
5630 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5632 if (!runtime_code) {
5636 Safefree(pRExC_state->code_blocks);
5640 else if ((pm_flags & PMf_USE_RE_EVAL)
5641 /* this second condition covers the non-regex literal case,
5642 * i.e. $foo =~ '(?{})'. */
5643 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5644 && (PL_hints & HINT_RE_EVAL))
5646 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5649 #ifdef TRIE_STUDY_OPT
5653 rx_flags = orig_rx_flags;
5655 if (initial_charset == REGEX_LOCALE_CHARSET) {
5656 RExC_contains_locale = 1;
5658 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5660 /* Set to use unicode semantics if the pattern is in utf8 and has the
5661 * 'depends' charset specified, as it means unicode when utf8 */
5662 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5666 RExC_flags = rx_flags;
5667 RExC_pm_flags = pm_flags;
5670 if (TAINTING_get && TAINT_get)
5671 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5673 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5674 /* whoops, we have a non-utf8 pattern, whilst run-time code
5675 * got compiled as utf8. Try again with a utf8 pattern */
5676 JMPENV_JUMP(UTF8_LONGJMP);
5679 assert(!pRExC_state->runtime_code_qr);
5684 RExC_in_lookbehind = 0;
5685 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5687 RExC_override_recoding = 0;
5688 RExC_in_multi_char_class = 0;
5690 /* First pass: determine size, legality. */
5698 RExC_emit = &PL_regdummy;
5699 RExC_whilem_seen = 0;
5700 RExC_open_parens = NULL;
5701 RExC_close_parens = NULL;
5703 RExC_paren_names = NULL;
5705 RExC_paren_name_list = NULL;
5707 RExC_recurse = NULL;
5708 RExC_recurse_count = 0;
5709 pRExC_state->code_index = 0;
5711 #if 0 /* REGC() is (currently) a NOP at the first pass.
5712 * Clever compilers notice this and complain. --jhi */
5713 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5716 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5718 RExC_lastparse=NULL;
5720 /* reg may croak on us, not giving us a chance to free
5721 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5722 need it to survive as long as the regexp (qr/(?{})/).
5723 We must check that code_blocksv is not already set, because we may
5724 have longjmped back. */
5725 if (pRExC_state->code_blocks && !code_blocksv) {
5726 code_blocksv = newSV_type(SVt_PV);
5727 SAVEFREESV(code_blocksv);
5728 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5729 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5731 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5732 RExC_precomp = NULL;
5736 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5738 /* Here, finished first pass. Get rid of any added setjmp */
5744 PerlIO_printf(Perl_debug_log,
5745 "Required size %"IVdf" nodes\n"
5746 "Starting second pass (creation)\n",
5749 RExC_lastparse=NULL;
5752 /* The first pass could have found things that force Unicode semantics */
5753 if ((RExC_utf8 || RExC_uni_semantics)
5754 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5756 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5759 /* Small enough for pointer-storage convention?
5760 If extralen==0, this means that we will not need long jumps. */
5761 if (RExC_size >= 0x10000L && RExC_extralen)
5762 RExC_size += RExC_extralen;
5765 if (RExC_whilem_seen > 15)
5766 RExC_whilem_seen = 15;
5768 /* Allocate space and zero-initialize. Note, the two step process
5769 of zeroing when in debug mode, thus anything assigned has to
5770 happen after that */
5771 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5773 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5774 char, regexp_internal);
5775 if ( r == NULL || ri == NULL )
5776 FAIL("Regexp out of space");
5778 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5779 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5781 /* bulk initialize base fields with 0. */
5782 Zero(ri, sizeof(regexp_internal), char);
5785 /* non-zero initialization begins here */
5788 r->extflags = rx_flags;
5789 if (pm_flags & PMf_IS_QR) {
5790 ri->code_blocks = pRExC_state->code_blocks;
5791 ri->num_code_blocks = pRExC_state->num_code_blocks;
5796 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5797 if (pRExC_state->code_blocks[n].src_regex)
5798 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5799 SAVEFREEPV(pRExC_state->code_blocks);
5803 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5804 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5806 /* The caret is output if there are any defaults: if not all the STD
5807 * flags are set, or if no character set specifier is needed */
5809 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5811 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5812 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5813 >> RXf_PMf_STD_PMMOD_SHIFT);
5814 const char *fptr = STD_PAT_MODS; /*"msix"*/
5816 /* Allocate for the worst case, which is all the std flags are turned
5817 * on. If more precision is desired, we could do a population count of
5818 * the flags set. This could be done with a small lookup table, or by
5819 * shifting, masking and adding, or even, when available, assembly
5820 * language for a machine-language population count.
5821 * We never output a minus, as all those are defaults, so are
5822 * covered by the caret */
5823 const STRLEN wraplen = plen + has_p + has_runon
5824 + has_default /* If needs a caret */
5826 /* If needs a character set specifier */
5827 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5828 + (sizeof(STD_PAT_MODS) - 1)
5829 + (sizeof("(?:)") - 1);
5831 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5832 r->xpv_len_u.xpvlenu_pv = p;
5834 SvFLAGS(rx) |= SVf_UTF8;
5837 /* If a default, cover it using the caret */
5839 *p++= DEFAULT_PAT_MOD;
5843 const char* const name = get_regex_charset_name(r->extflags, &len);
5844 Copy(name, p, len, char);
5848 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5851 while((ch = *fptr++)) {
5859 Copy(RExC_precomp, p, plen, char);
5860 assert ((RX_WRAPPED(rx) - p) < 16);
5861 r->pre_prefix = p - RX_WRAPPED(rx);
5867 SvCUR_set(rx, p - RX_WRAPPED(rx));
5871 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5873 if (RExC_seen & REG_SEEN_RECURSE) {
5874 Newxz(RExC_open_parens, RExC_npar,regnode *);
5875 SAVEFREEPV(RExC_open_parens);
5876 Newxz(RExC_close_parens,RExC_npar,regnode *);
5877 SAVEFREEPV(RExC_close_parens);
5880 /* Useful during FAIL. */
5881 #ifdef RE_TRACK_PATTERN_OFFSETS
5882 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5883 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5884 "%s %"UVuf" bytes for offset annotations.\n",
5885 ri->u.offsets ? "Got" : "Couldn't get",
5886 (UV)((2*RExC_size+1) * sizeof(U32))));
5888 SetProgLen(ri,RExC_size);
5892 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5894 /* Second pass: emit code. */
5895 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5896 RExC_pm_flags = pm_flags;
5901 RExC_emit_start = ri->program;
5902 RExC_emit = ri->program;
5903 RExC_emit_bound = ri->program + RExC_size + 1;
5904 pRExC_state->code_index = 0;
5906 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5907 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5911 /* XXXX To minimize changes to RE engine we always allocate
5912 3-units-long substrs field. */
5913 Newx(r->substrs, 1, struct reg_substr_data);
5914 if (RExC_recurse_count) {
5915 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5916 SAVEFREEPV(RExC_recurse);
5920 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5921 Zero(r->substrs, 1, struct reg_substr_data);
5923 #ifdef TRIE_STUDY_OPT
5925 StructCopy(&zero_scan_data, &data, scan_data_t);
5926 copyRExC_state = RExC_state;
5929 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5931 RExC_state = copyRExC_state;
5932 if (seen & REG_TOP_LEVEL_BRANCHES)
5933 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5935 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5936 StructCopy(&zero_scan_data, &data, scan_data_t);
5939 StructCopy(&zero_scan_data, &data, scan_data_t);
5942 /* Dig out information for optimizations. */
5943 r->extflags = RExC_flags; /* was pm_op */
5944 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5947 SvUTF8_on(rx); /* Unicode in it? */
5948 ri->regstclass = NULL;
5949 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5950 r->intflags |= PREGf_NAUGHTY;
5951 scan = ri->program + 1; /* First BRANCH. */
5953 /* testing for BRANCH here tells us whether there is "must appear"
5954 data in the pattern. If there is then we can use it for optimisations */
5955 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5957 STRLEN longest_float_length, longest_fixed_length;
5958 struct regnode_charclass_class ch_class; /* pointed to by data */
5960 I32 last_close = 0; /* pointed to by data */
5961 regnode *first= scan;
5962 regnode *first_next= regnext(first);
5964 * Skip introductions and multiplicators >= 1
5965 * so that we can extract the 'meat' of the pattern that must
5966 * match in the large if() sequence following.
5967 * NOTE that EXACT is NOT covered here, as it is normally
5968 * picked up by the optimiser separately.
5970 * This is unfortunate as the optimiser isnt handling lookahead
5971 * properly currently.
5974 while ((OP(first) == OPEN && (sawopen = 1)) ||
5975 /* An OR of *one* alternative - should not happen now. */
5976 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5977 /* for now we can't handle lookbehind IFMATCH*/
5978 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5979 (OP(first) == PLUS) ||
5980 (OP(first) == MINMOD) ||
5981 /* An {n,m} with n>0 */
5982 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5983 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5986 * the only op that could be a regnode is PLUS, all the rest
5987 * will be regnode_1 or regnode_2.
5990 if (OP(first) == PLUS)
5993 first += regarglen[OP(first)];
5995 first = NEXTOPER(first);
5996 first_next= regnext(first);
5999 /* Starting-point info. */
6001 DEBUG_PEEP("first:",first,0);
6002 /* Ignore EXACT as we deal with it later. */
6003 if (PL_regkind[OP(first)] == EXACT) {
6004 if (OP(first) == EXACT)
6005 NOOP; /* Empty, get anchored substr later. */
6007 ri->regstclass = first;
6010 else if (PL_regkind[OP(first)] == TRIE &&
6011 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6014 /* this can happen only on restudy */
6015 if ( OP(first) == TRIE ) {
6016 struct regnode_1 *trieop = (struct regnode_1 *)
6017 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6018 StructCopy(first,trieop,struct regnode_1);
6019 trie_op=(regnode *)trieop;
6021 struct regnode_charclass *trieop = (struct regnode_charclass *)
6022 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6023 StructCopy(first,trieop,struct regnode_charclass);
6024 trie_op=(regnode *)trieop;
6027 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6028 ri->regstclass = trie_op;
6031 else if (REGNODE_SIMPLE(OP(first)))
6032 ri->regstclass = first;
6033 else if (PL_regkind[OP(first)] == BOUND ||
6034 PL_regkind[OP(first)] == NBOUND)
6035 ri->regstclass = first;
6036 else if (PL_regkind[OP(first)] == BOL) {
6037 r->extflags |= (OP(first) == MBOL
6039 : (OP(first) == SBOL
6042 first = NEXTOPER(first);
6045 else if (OP(first) == GPOS) {
6046 r->extflags |= RXf_ANCH_GPOS;
6047 first = NEXTOPER(first);
6050 else if ((!sawopen || !RExC_sawback) &&
6051 (OP(first) == STAR &&
6052 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6053 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6055 /* turn .* into ^.* with an implied $*=1 */
6057 (OP(NEXTOPER(first)) == REG_ANY)
6060 r->extflags |= type;
6061 r->intflags |= PREGf_IMPLICIT;
6062 first = NEXTOPER(first);
6065 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6066 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6067 /* x+ must match at the 1st pos of run of x's */
6068 r->intflags |= PREGf_SKIP;
6070 /* Scan is after the zeroth branch, first is atomic matcher. */
6071 #ifdef TRIE_STUDY_OPT
6074 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6075 (IV)(first - scan + 1))
6079 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6080 (IV)(first - scan + 1))
6086 * If there's something expensive in the r.e., find the
6087 * longest literal string that must appear and make it the
6088 * regmust. Resolve ties in favor of later strings, since
6089 * the regstart check works with the beginning of the r.e.
6090 * and avoiding duplication strengthens checking. Not a
6091 * strong reason, but sufficient in the absence of others.
6092 * [Now we resolve ties in favor of the earlier string if
6093 * it happens that c_offset_min has been invalidated, since the
6094 * earlier string may buy us something the later one won't.]
6097 data.longest_fixed = newSVpvs("");
6098 data.longest_float = newSVpvs("");
6099 data.last_found = newSVpvs("");
6100 data.longest = &(data.longest_fixed);
6101 ENTER_with_name("study_chunk");
6102 SAVEFREESV(data.longest_fixed);
6103 SAVEFREESV(data.longest_float);
6104 SAVEFREESV(data.last_found);
6106 if (!ri->regstclass) {
6107 cl_init(pRExC_state, &ch_class);
6108 data.start_class = &ch_class;
6109 stclass_flag = SCF_DO_STCLASS_AND;
6110 } else /* XXXX Check for BOUND? */
6112 data.last_closep = &last_close;
6114 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6115 &data, -1, NULL, NULL,
6116 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6119 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6122 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6123 && data.last_start_min == 0 && data.last_end > 0
6124 && !RExC_seen_zerolen
6125 && !(RExC_seen & REG_SEEN_VERBARG)
6126 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6127 r->extflags |= RXf_CHECK_ALL;
6128 scan_commit(pRExC_state, &data,&minlen,0);
6130 longest_float_length = CHR_SVLEN(data.longest_float);
6132 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6133 && data.offset_fixed == data.offset_float_min
6134 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6135 && S_setup_longest (aTHX_ pRExC_state,
6139 &(r->float_end_shift),
6140 data.lookbehind_float,
6141 data.offset_float_min,
6143 longest_float_length,
6144 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6145 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6147 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6148 r->float_max_offset = data.offset_float_max;
6149 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6150 r->float_max_offset -= data.lookbehind_float;
6151 SvREFCNT_inc_simple_void_NN(data.longest_float);
6154 r->float_substr = r->float_utf8 = NULL;
6155 longest_float_length = 0;
6158 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6160 if (S_setup_longest (aTHX_ pRExC_state,
6162 &(r->anchored_utf8),
6163 &(r->anchored_substr),
6164 &(r->anchored_end_shift),
6165 data.lookbehind_fixed,
6168 longest_fixed_length,
6169 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6170 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6172 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6173 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6176 r->anchored_substr = r->anchored_utf8 = NULL;
6177 longest_fixed_length = 0;
6179 LEAVE_with_name("study_chunk");
6182 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6183 ri->regstclass = NULL;
6185 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6187 && ! TEST_SSC_EOS(data.start_class)
6188 && !cl_is_anything(data.start_class))
6190 const U32 n = add_data(pRExC_state, 1, "f");
6191 OP(data.start_class) = ANYOF_SYNTHETIC;
6193 Newx(RExC_rxi->data->data[n], 1,
6194 struct regnode_charclass_class);
6195 StructCopy(data.start_class,
6196 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6197 struct regnode_charclass_class);
6198 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6199 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6200 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6201 regprop(r, sv, (regnode*)data.start_class);
6202 PerlIO_printf(Perl_debug_log,
6203 "synthetic stclass \"%s\".\n",
6204 SvPVX_const(sv));});
6207 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6208 if (longest_fixed_length > longest_float_length) {
6209 r->check_end_shift = r->anchored_end_shift;
6210 r->check_substr = r->anchored_substr;
6211 r->check_utf8 = r->anchored_utf8;
6212 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6213 if (r->extflags & RXf_ANCH_SINGLE)
6214 r->extflags |= RXf_NOSCAN;
6217 r->check_end_shift = r->float_end_shift;
6218 r->check_substr = r->float_substr;
6219 r->check_utf8 = r->float_utf8;
6220 r->check_offset_min = r->float_min_offset;
6221 r->check_offset_max = r->float_max_offset;
6223 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6224 This should be changed ASAP! */
6225 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6226 r->extflags |= RXf_USE_INTUIT;
6227 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6228 r->extflags |= RXf_INTUIT_TAIL;
6230 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6231 if ( (STRLEN)minlen < longest_float_length )
6232 minlen= longest_float_length;
6233 if ( (STRLEN)minlen < longest_fixed_length )
6234 minlen= longest_fixed_length;
6238 /* Several toplevels. Best we can is to set minlen. */
6240 struct regnode_charclass_class ch_class;
6243 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6245 scan = ri->program + 1;
6246 cl_init(pRExC_state, &ch_class);
6247 data.start_class = &ch_class;
6248 data.last_closep = &last_close;
6251 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6252 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6254 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6256 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6257 = r->float_substr = r->float_utf8 = NULL;
6259 if (! TEST_SSC_EOS(data.start_class)
6260 && !cl_is_anything(data.start_class))
6262 const U32 n = add_data(pRExC_state, 1, "f");
6263 OP(data.start_class) = ANYOF_SYNTHETIC;
6265 Newx(RExC_rxi->data->data[n], 1,
6266 struct regnode_charclass_class);
6267 StructCopy(data.start_class,
6268 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6269 struct regnode_charclass_class);
6270 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6271 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6272 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6273 regprop(r, sv, (regnode*)data.start_class);
6274 PerlIO_printf(Perl_debug_log,
6275 "synthetic stclass \"%s\".\n",
6276 SvPVX_const(sv));});
6280 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6281 the "real" pattern. */
6283 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6284 (IV)minlen, (IV)r->minlen);
6286 r->minlenret = minlen;
6287 if (r->minlen < minlen)
6290 if (RExC_seen & REG_SEEN_GPOS)
6291 r->extflags |= RXf_GPOS_SEEN;
6292 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6293 r->extflags |= RXf_LOOKBEHIND_SEEN;
6294 if (pRExC_state->num_code_blocks)
6295 r->extflags |= RXf_EVAL_SEEN;
6296 if (RExC_seen & REG_SEEN_CANY)
6297 r->extflags |= RXf_CANY_SEEN;
6298 if (RExC_seen & REG_SEEN_VERBARG)
6300 r->intflags |= PREGf_VERBARG_SEEN;
6301 r->extflags |= RXf_MODIFIES_VARS;
6303 if (RExC_seen & REG_SEEN_CUTGROUP)
6304 r->intflags |= PREGf_CUTGROUP_SEEN;
6305 if (pm_flags & PMf_USE_RE_EVAL)
6306 r->intflags |= PREGf_USE_RE_EVAL;
6307 if (RExC_paren_names)
6308 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6310 RXp_PAREN_NAMES(r) = NULL;
6312 #ifdef STUPID_PATTERN_CHECKS
6313 if (RX_PRELEN(rx) == 0)
6314 r->extflags |= RXf_NULL;
6315 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6316 r->extflags |= RXf_WHITE;
6317 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6318 r->extflags |= RXf_START_ONLY;
6321 regnode *first = ri->program + 1;
6324 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6325 r->extflags |= RXf_NULL;
6326 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6327 r->extflags |= RXf_START_ONLY;
6328 else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6329 && OP(regnext(first)) == END)
6330 r->extflags |= RXf_WHITE;
6334 if (RExC_paren_names) {
6335 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6336 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6339 ri->name_list_idx = 0;
6341 if (RExC_recurse_count) {
6342 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6343 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6344 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6347 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6348 /* assume we don't need to swap parens around before we match */
6351 PerlIO_printf(Perl_debug_log,"Final program:\n");
6354 #ifdef RE_TRACK_PATTERN_OFFSETS
6355 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6356 const U32 len = ri->u.offsets[0];
6358 GET_RE_DEBUG_FLAGS_DECL;
6359 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6360 for (i = 1; i <= len; i++) {
6361 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6362 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6363 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6365 PerlIO_printf(Perl_debug_log, "\n");
6370 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6371 * by setting the regexp SV to readonly-only instead. If the
6372 * pattern's been recompiled, the USEDness should remain. */
6373 if (old_re && SvREADONLY(old_re))
6381 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6384 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6386 PERL_UNUSED_ARG(value);
6388 if (flags & RXapif_FETCH) {
6389 return reg_named_buff_fetch(rx, key, flags);
6390 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6391 Perl_croak_no_modify();
6393 } else if (flags & RXapif_EXISTS) {
6394 return reg_named_buff_exists(rx, key, flags)
6397 } else if (flags & RXapif_REGNAMES) {
6398 return reg_named_buff_all(rx, flags);
6399 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6400 return reg_named_buff_scalar(rx, flags);
6402 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6408 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6411 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6412 PERL_UNUSED_ARG(lastkey);
6414 if (flags & RXapif_FIRSTKEY)
6415 return reg_named_buff_firstkey(rx, flags);
6416 else if (flags & RXapif_NEXTKEY)
6417 return reg_named_buff_nextkey(rx, flags);
6419 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6425 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6428 AV *retarray = NULL;
6430 struct regexp *const rx = ReANY(r);
6432 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6434 if (flags & RXapif_ALL)
6437 if (rx && RXp_PAREN_NAMES(rx)) {
6438 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6441 SV* sv_dat=HeVAL(he_str);
6442 I32 *nums=(I32*)SvPVX(sv_dat);
6443 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6444 if ((I32)(rx->nparens) >= nums[i]
6445 && rx->offs[nums[i]].start != -1
6446 && rx->offs[nums[i]].end != -1)
6449 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6454 ret = newSVsv(&PL_sv_undef);
6457 av_push(retarray, ret);
6460 return newRV_noinc(MUTABLE_SV(retarray));
6467 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6470 struct regexp *const rx = ReANY(r);
6472 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6474 if (rx && RXp_PAREN_NAMES(rx)) {
6475 if (flags & RXapif_ALL) {
6476 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6478 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6480 SvREFCNT_dec_NN(sv);
6492 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6494 struct regexp *const rx = ReANY(r);
6496 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6498 if ( rx && RXp_PAREN_NAMES(rx) ) {
6499 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6501 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6508 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6510 struct regexp *const rx = ReANY(r);
6511 GET_RE_DEBUG_FLAGS_DECL;
6513 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6515 if (rx && RXp_PAREN_NAMES(rx)) {
6516 HV *hv = RXp_PAREN_NAMES(rx);
6518 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6521 SV* sv_dat = HeVAL(temphe);
6522 I32 *nums = (I32*)SvPVX(sv_dat);
6523 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6524 if ((I32)(rx->lastparen) >= nums[i] &&
6525 rx->offs[nums[i]].start != -1 &&
6526 rx->offs[nums[i]].end != -1)
6532 if (parno || flags & RXapif_ALL) {
6533 return newSVhek(HeKEY_hek(temphe));
6541 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6546 struct regexp *const rx = ReANY(r);
6548 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6550 if (rx && RXp_PAREN_NAMES(rx)) {
6551 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6552 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6553 } else if (flags & RXapif_ONE) {
6554 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6555 av = MUTABLE_AV(SvRV(ret));
6556 length = av_len(av);
6557 SvREFCNT_dec_NN(ret);
6558 return newSViv(length + 1);
6560 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6564 return &PL_sv_undef;
6568 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6570 struct regexp *const rx = ReANY(r);
6573 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6575 if (rx && RXp_PAREN_NAMES(rx)) {
6576 HV *hv= RXp_PAREN_NAMES(rx);
6578 (void)hv_iterinit(hv);
6579 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6582 SV* sv_dat = HeVAL(temphe);
6583 I32 *nums = (I32*)SvPVX(sv_dat);
6584 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6585 if ((I32)(rx->lastparen) >= nums[i] &&
6586 rx->offs[nums[i]].start != -1 &&
6587 rx->offs[nums[i]].end != -1)
6593 if (parno || flags & RXapif_ALL) {
6594 av_push(av, newSVhek(HeKEY_hek(temphe)));
6599 return newRV_noinc(MUTABLE_SV(av));
6603 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6606 struct regexp *const rx = ReANY(r);
6612 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6614 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6615 || n == RX_BUFF_IDX_CARET_FULLMATCH
6616 || n == RX_BUFF_IDX_CARET_POSTMATCH
6618 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6625 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6626 /* no need to distinguish between them any more */
6627 n = RX_BUFF_IDX_FULLMATCH;
6629 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6630 && rx->offs[0].start != -1)
6632 /* $`, ${^PREMATCH} */
6633 i = rx->offs[0].start;
6637 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6638 && rx->offs[0].end != -1)
6640 /* $', ${^POSTMATCH} */
6641 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6642 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6645 if ( 0 <= n && n <= (I32)rx->nparens &&
6646 (s1 = rx->offs[n].start) != -1 &&
6647 (t1 = rx->offs[n].end) != -1)
6649 /* $&, ${^MATCH}, $1 ... */
6651 s = rx->subbeg + s1 - rx->suboffset;
6656 assert(s >= rx->subbeg);
6657 assert(rx->sublen >= (s - rx->subbeg) + i );
6659 #if NO_TAINT_SUPPORT
6660 sv_setpvn(sv, s, i);
6662 const int oldtainted = TAINT_get;
6664 sv_setpvn(sv, s, i);
6665 TAINT_set(oldtainted);
6667 if ( (rx->extflags & RXf_CANY_SEEN)
6668 ? (RXp_MATCH_UTF8(rx)
6669 && (!i || is_utf8_string((U8*)s, i)))
6670 : (RXp_MATCH_UTF8(rx)) )
6677 if (RXp_MATCH_TAINTED(rx)) {
6678 if (SvTYPE(sv) >= SVt_PVMG) {
6679 MAGIC* const mg = SvMAGIC(sv);
6682 SvMAGIC_set(sv, mg->mg_moremagic);
6684 if ((mgt = SvMAGIC(sv))) {
6685 mg->mg_moremagic = mgt;
6686 SvMAGIC_set(sv, mg);
6697 sv_setsv(sv,&PL_sv_undef);
6703 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6704 SV const * const value)
6706 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6708 PERL_UNUSED_ARG(rx);
6709 PERL_UNUSED_ARG(paren);
6710 PERL_UNUSED_ARG(value);
6713 Perl_croak_no_modify();
6717 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6720 struct regexp *const rx = ReANY(r);
6724 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6726 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6728 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6729 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6733 case RX_BUFF_IDX_PREMATCH: /* $` */
6734 if (rx->offs[0].start != -1) {
6735 i = rx->offs[0].start;
6744 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6745 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6747 case RX_BUFF_IDX_POSTMATCH: /* $' */
6748 if (rx->offs[0].end != -1) {
6749 i = rx->sublen - rx->offs[0].end;
6751 s1 = rx->offs[0].end;
6758 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6759 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6763 /* $& / ${^MATCH}, $1, $2, ... */
6765 if (paren <= (I32)rx->nparens &&
6766 (s1 = rx->offs[paren].start) != -1 &&
6767 (t1 = rx->offs[paren].end) != -1)
6773 if (ckWARN(WARN_UNINITIALIZED))
6774 report_uninit((const SV *)sv);
6779 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6780 const char * const s = rx->subbeg - rx->suboffset + s1;
6785 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6792 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6794 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6795 PERL_UNUSED_ARG(rx);
6799 return newSVpvs("Regexp");
6802 /* Scans the name of a named buffer from the pattern.
6803 * If flags is REG_RSN_RETURN_NULL returns null.
6804 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6805 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6806 * to the parsed name as looked up in the RExC_paren_names hash.
6807 * If there is an error throws a vFAIL().. type exception.
6810 #define REG_RSN_RETURN_NULL 0
6811 #define REG_RSN_RETURN_NAME 1
6812 #define REG_RSN_RETURN_DATA 2
6815 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6817 char *name_start = RExC_parse;
6819 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6821 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6822 /* skip IDFIRST by using do...while */
6825 RExC_parse += UTF8SKIP(RExC_parse);
6826 } while (isWORDCHAR_utf8((U8*)RExC_parse));
6830 } while (isWORDCHAR(*RExC_parse));
6832 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6833 vFAIL("Group name must start with a non-digit word character");
6837 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6838 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6839 if ( flags == REG_RSN_RETURN_NAME)
6841 else if (flags==REG_RSN_RETURN_DATA) {
6844 if ( ! sv_name ) /* should not happen*/
6845 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6846 if (RExC_paren_names)
6847 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6849 sv_dat = HeVAL(he_str);
6851 vFAIL("Reference to nonexistent named group");
6855 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6856 (unsigned long) flags);
6858 assert(0); /* NOT REACHED */
6863 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6864 int rem=(int)(RExC_end - RExC_parse); \
6873 if (RExC_lastparse!=RExC_parse) \
6874 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6877 iscut ? "..." : "<" \
6880 PerlIO_printf(Perl_debug_log,"%16s",""); \
6883 num = RExC_size + 1; \
6885 num=REG_NODE_NUM(RExC_emit); \
6886 if (RExC_lastnum!=num) \
6887 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6889 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6890 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6891 (int)((depth*2)), "", \
6895 RExC_lastparse=RExC_parse; \
6900 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6901 DEBUG_PARSE_MSG((funcname)); \
6902 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6904 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6905 DEBUG_PARSE_MSG((funcname)); \
6906 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6909 /* This section of code defines the inversion list object and its methods. The
6910 * interfaces are highly subject to change, so as much as possible is static to
6911 * this file. An inversion list is here implemented as a malloc'd C UV array
6912 * with some added info that is placed as UVs at the beginning in a header
6913 * portion. An inversion list for Unicode is an array of code points, sorted
6914 * by ordinal number. The zeroth element is the first code point in the list.
6915 * The 1th element is the first element beyond that not in the list. In other
6916 * words, the first range is
6917 * invlist[0]..(invlist[1]-1)
6918 * The other ranges follow. Thus every element whose index is divisible by two
6919 * marks the beginning of a range that is in the list, and every element not
6920 * divisible by two marks the beginning of a range not in the list. A single
6921 * element inversion list that contains the single code point N generally
6922 * consists of two elements
6925 * (The exception is when N is the highest representable value on the
6926 * machine, in which case the list containing just it would be a single
6927 * element, itself. By extension, if the last range in the list extends to
6928 * infinity, then the first element of that range will be in the inversion list
6929 * at a position that is divisible by two, and is the final element in the
6931 * Taking the complement (inverting) an inversion list is quite simple, if the
6932 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6933 * This implementation reserves an element at the beginning of each inversion
6934 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
6935 * actual beginning of the list is either that element if 0, or the next one if
6938 * More about inversion lists can be found in "Unicode Demystified"
6939 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6940 * More will be coming when functionality is added later.
6942 * The inversion list data structure is currently implemented as an SV pointing
6943 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6944 * array of UV whose memory management is automatically handled by the existing
6945 * facilities for SV's.
6947 * Some of the methods should always be private to the implementation, and some
6948 * should eventually be made public */
6950 /* The header definitions are in F<inline_invlist.c> */
6951 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6952 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6954 #define INVLIST_INITIAL_LEN 10
6956 PERL_STATIC_INLINE UV*
6957 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6959 /* Returns a pointer to the first element in the inversion list's array.
6960 * This is called upon initialization of an inversion list. Where the
6961 * array begins depends on whether the list has the code point U+0000
6962 * in it or not. The other parameter tells it whether the code that
6963 * follows this call is about to put a 0 in the inversion list or not.
6964 * The first element is either the element with 0, if 0, or the next one,
6967 UV* zero = get_invlist_zero_addr(invlist);
6969 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6972 assert(! *_get_invlist_len_addr(invlist));
6974 /* 1^1 = 0; 1^0 = 1 */
6975 *zero = 1 ^ will_have_0;
6976 return zero + *zero;
6979 PERL_STATIC_INLINE UV*
6980 S_invlist_array(pTHX_ SV* const invlist)
6982 /* Returns the pointer to the inversion list's array. Every time the
6983 * length changes, this needs to be called in case malloc or realloc moved
6986 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6988 /* Must not be empty. If these fail, you probably didn't check for <len>
6989 * being non-zero before trying to get the array */
6990 assert(*_get_invlist_len_addr(invlist));
6991 assert(*get_invlist_zero_addr(invlist) == 0
6992 || *get_invlist_zero_addr(invlist) == 1);
6994 /* The array begins either at the element reserved for zero if the
6995 * list contains 0 (that element will be set to 0), or otherwise the next
6996 * element (in which case the reserved element will be set to 1). */
6997 return (UV *) (get_invlist_zero_addr(invlist)
6998 + *get_invlist_zero_addr(invlist));
7001 PERL_STATIC_INLINE void
7002 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7004 /* Sets the current number of elements stored in the inversion list */
7006 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7008 *_get_invlist_len_addr(invlist) = len;
7010 assert(len <= SvLEN(invlist));
7012 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7013 /* If the list contains U+0000, that element is part of the header,
7014 * and should not be counted as part of the array. It will contain
7015 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7017 * SvCUR_set(invlist,
7018 * TO_INTERNAL_SIZE(len
7019 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7020 * But, this is only valid if len is not 0. The consequences of not doing
7021 * this is that the memory allocation code may think that 1 more UV is
7022 * being used than actually is, and so might do an unnecessary grow. That
7023 * seems worth not bothering to make this the precise amount.
7025 * Note that when inverting, SvCUR shouldn't change */
7028 PERL_STATIC_INLINE IV*
7029 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7031 /* Return the address of the UV that is reserved to hold the cached index
7034 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7036 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7039 PERL_STATIC_INLINE IV
7040 S_invlist_previous_index(pTHX_ SV* const invlist)
7042 /* Returns cached index of previous search */
7044 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7046 return *get_invlist_previous_index_addr(invlist);
7049 PERL_STATIC_INLINE void
7050 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7052 /* Caches <index> for later retrieval */
7054 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7056 assert(index == 0 || index < (int) _invlist_len(invlist));
7058 *get_invlist_previous_index_addr(invlist) = index;
7061 PERL_STATIC_INLINE UV
7062 S_invlist_max(pTHX_ SV* const invlist)
7064 /* Returns the maximum number of elements storable in the inversion list's
7065 * array, without having to realloc() */
7067 PERL_ARGS_ASSERT_INVLIST_MAX;
7069 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7070 ? _invlist_len(invlist)
7071 : FROM_INTERNAL_SIZE(SvLEN(invlist));
7074 PERL_STATIC_INLINE UV*
7075 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7077 /* Return the address of the UV that is reserved to hold 0 if the inversion
7078 * list contains 0. This has to be the last element of the heading, as the
7079 * list proper starts with either it if 0, or the next element if not.
7080 * (But we force it to contain either 0 or 1) */
7082 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7084 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7087 #ifndef PERL_IN_XSUB_RE
7089 Perl__new_invlist(pTHX_ IV initial_size)
7092 /* Return a pointer to a newly constructed inversion list, with enough
7093 * space to store 'initial_size' elements. If that number is negative, a
7094 * system default is used instead */
7098 if (initial_size < 0) {
7099 initial_size = INVLIST_INITIAL_LEN;
7102 /* Allocate the initial space */
7103 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7104 invlist_set_len(new_list, 0);
7106 /* Force iterinit() to be used to get iteration to work */
7107 *get_invlist_iter_addr(new_list) = UV_MAX;
7109 /* This should force a segfault if a method doesn't initialize this
7111 *get_invlist_zero_addr(new_list) = UV_MAX;
7113 *get_invlist_previous_index_addr(new_list) = 0;
7114 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7115 #if HEADER_LENGTH != 5
7116 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7124 S__new_invlist_C_array(pTHX_ UV* list)
7126 /* Return a pointer to a newly constructed inversion list, initialized to
7127 * point to <list>, which has to be in the exact correct inversion list
7128 * form, including internal fields. Thus this is a dangerous routine that
7129 * should not be used in the wrong hands */
7131 SV* invlist = newSV_type(SVt_PV);
7133 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7135 SvPV_set(invlist, (char *) list);
7136 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7137 shouldn't touch it */
7138 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7140 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7141 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7144 /* Initialize the iteration pointer.
7145 * XXX This could be done at compile time in charclass_invlists.h, but I
7146 * (khw) am not confident that the suffixes for specifying the C constant
7147 * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
7148 * to use 64 bits; might need a Configure probe */
7149 invlist_iterfinish(invlist);
7155 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7157 /* Grow the maximum size of an inversion list */
7159 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7161 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7164 PERL_STATIC_INLINE void
7165 S_invlist_trim(pTHX_ SV* const invlist)
7167 PERL_ARGS_ASSERT_INVLIST_TRIM;
7169 /* Change the length of the inversion list to how many entries it currently
7172 SvPV_shrink_to_cur((SV *) invlist);
7175 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7178 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7180 /* Subject to change or removal. Append the range from 'start' to 'end' at
7181 * the end of the inversion list. The range must be above any existing
7185 UV max = invlist_max(invlist);
7186 UV len = _invlist_len(invlist);
7188 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7190 if (len == 0) { /* Empty lists must be initialized */
7191 array = _invlist_array_init(invlist, start == 0);
7194 /* Here, the existing list is non-empty. The current max entry in the
7195 * list is generally the first value not in the set, except when the
7196 * set extends to the end of permissible values, in which case it is
7197 * the first entry in that final set, and so this call is an attempt to
7198 * append out-of-order */
7200 UV final_element = len - 1;
7201 array = invlist_array(invlist);
7202 if (array[final_element] > start
7203 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7205 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7206 array[final_element], start,
7207 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7210 /* Here, it is a legal append. If the new range begins with the first
7211 * value not in the set, it is extending the set, so the new first
7212 * value not in the set is one greater than the newly extended range.
7214 if (array[final_element] == start) {
7215 if (end != UV_MAX) {
7216 array[final_element] = end + 1;
7219 /* But if the end is the maximum representable on the machine,
7220 * just let the range that this would extend to have no end */
7221 invlist_set_len(invlist, len - 1);
7227 /* Here the new range doesn't extend any existing set. Add it */
7229 len += 2; /* Includes an element each for the start and end of range */
7231 /* If overflows the existing space, extend, which may cause the array to be
7234 invlist_extend(invlist, len);
7235 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7236 failure in invlist_array() */
7237 array = invlist_array(invlist);
7240 invlist_set_len(invlist, len);
7243 /* The next item on the list starts the range, the one after that is
7244 * one past the new range. */
7245 array[len - 2] = start;
7246 if (end != UV_MAX) {
7247 array[len - 1] = end + 1;
7250 /* But if the end is the maximum representable on the machine, just let
7251 * the range have no end */
7252 invlist_set_len(invlist, len - 1);
7256 #ifndef PERL_IN_XSUB_RE
7259 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7261 /* Searches the inversion list for the entry that contains the input code
7262 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7263 * return value is the index into the list's array of the range that
7268 IV high = _invlist_len(invlist);
7269 const IV highest_element = high - 1;
7272 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7274 /* If list is empty, return failure. */
7279 /* (We can't get the array unless we know the list is non-empty) */
7280 array = invlist_array(invlist);
7282 mid = invlist_previous_index(invlist);
7283 assert(mid >=0 && mid <= highest_element);
7285 /* <mid> contains the cache of the result of the previous call to this
7286 * function (0 the first time). See if this call is for the same result,
7287 * or if it is for mid-1. This is under the theory that calls to this
7288 * function will often be for related code points that are near each other.
7289 * And benchmarks show that caching gives better results. We also test
7290 * here if the code point is within the bounds of the list. These tests
7291 * replace others that would have had to be made anyway to make sure that
7292 * the array bounds were not exceeded, and these give us extra information
7293 * at the same time */
7294 if (cp >= array[mid]) {
7295 if (cp >= array[highest_element]) {
7296 return highest_element;
7299 /* Here, array[mid] <= cp < array[highest_element]. This means that
7300 * the final element is not the answer, so can exclude it; it also
7301 * means that <mid> is not the final element, so can refer to 'mid + 1'
7303 if (cp < array[mid + 1]) {
7309 else { /* cp < aray[mid] */
7310 if (cp < array[0]) { /* Fail if outside the array */
7314 if (cp >= array[mid - 1]) {
7319 /* Binary search. What we are looking for is <i> such that
7320 * array[i] <= cp < array[i+1]
7321 * The loop below converges on the i+1. Note that there may not be an
7322 * (i+1)th element in the array, and things work nonetheless */
7323 while (low < high) {
7324 mid = (low + high) / 2;
7325 assert(mid <= highest_element);
7326 if (array[mid] <= cp) { /* cp >= array[mid] */
7329 /* We could do this extra test to exit the loop early.
7330 if (cp < array[low]) {
7335 else { /* cp < array[mid] */
7342 invlist_set_previous_index(invlist, high);
7347 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7349 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7350 * but is used when the swash has an inversion list. This makes this much
7351 * faster, as it uses a binary search instead of a linear one. This is
7352 * intimately tied to that function, and perhaps should be in utf8.c,
7353 * except it is intimately tied to inversion lists as well. It assumes
7354 * that <swatch> is all 0's on input */
7357 const IV len = _invlist_len(invlist);
7361 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7363 if (len == 0) { /* Empty inversion list */
7367 array = invlist_array(invlist);
7369 /* Find which element it is */
7370 i = _invlist_search(invlist, start);
7372 /* We populate from <start> to <end> */
7373 while (current < end) {
7376 /* The inversion list gives the results for every possible code point
7377 * after the first one in the list. Only those ranges whose index is
7378 * even are ones that the inversion list matches. For the odd ones,
7379 * and if the initial code point is not in the list, we have to skip
7380 * forward to the next element */
7381 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7383 if (i >= len) { /* Finished if beyond the end of the array */
7387 if (current >= end) { /* Finished if beyond the end of what we
7389 if (LIKELY(end < UV_MAX)) {
7393 /* We get here when the upper bound is the maximum
7394 * representable on the machine, and we are looking for just
7395 * that code point. Have to special case it */
7397 goto join_end_of_list;
7400 assert(current >= start);
7402 /* The current range ends one below the next one, except don't go past
7405 upper = (i < len && array[i] < end) ? array[i] : end;
7407 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7408 * for each code point in it */
7409 for (; current < upper; current++) {
7410 const STRLEN offset = (STRLEN)(current - start);
7411 swatch[offset >> 3] |= 1 << (offset & 7);
7416 /* Quit if at the end of the list */
7419 /* But first, have to deal with the highest possible code point on
7420 * the platform. The previous code assumes that <end> is one
7421 * beyond where we want to populate, but that is impossible at the
7422 * platform's infinity, so have to handle it specially */
7423 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7425 const STRLEN offset = (STRLEN)(end - start);
7426 swatch[offset >> 3] |= 1 << (offset & 7);
7431 /* Advance to the next range, which will be for code points not in the
7440 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7442 /* Take the union of two inversion lists and point <output> to it. *output
7443 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7444 * the reference count to that list will be decremented. The first list,
7445 * <a>, may be NULL, in which case a copy of the second list is returned.
7446 * If <complement_b> is TRUE, the union is taken of the complement
7447 * (inversion) of <b> instead of b itself.
7449 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7450 * Richard Gillam, published by Addison-Wesley, and explained at some
7451 * length there. The preface says to incorporate its examples into your
7452 * code at your own risk.
7454 * The algorithm is like a merge sort.
7456 * XXX A potential performance improvement is to keep track as we go along
7457 * if only one of the inputs contributes to the result, meaning the other
7458 * is a subset of that one. In that case, we can skip the final copy and
7459 * return the larger of the input lists, but then outside code might need
7460 * to keep track of whether to free the input list or not */
7462 UV* array_a; /* a's array */
7464 UV len_a; /* length of a's array */
7467 SV* u; /* the resulting union */
7471 UV i_a = 0; /* current index into a's array */
7475 /* running count, as explained in the algorithm source book; items are
7476 * stopped accumulating and are output when the count changes to/from 0.
7477 * The count is incremented when we start a range that's in the set, and
7478 * decremented when we start a range that's not in the set. So its range
7479 * is 0 to 2. Only when the count is zero is something not in the set.
7483 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7486 /* If either one is empty, the union is the other one */
7487 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7494 *output = invlist_clone(b);
7496 _invlist_invert(*output);
7498 } /* else *output already = b; */
7501 else if ((len_b = _invlist_len(b)) == 0) {
7506 /* The complement of an empty list is a list that has everything in it,
7507 * so the union with <a> includes everything too */
7512 *output = _new_invlist(1);
7513 _append_range_to_invlist(*output, 0, UV_MAX);
7515 else if (*output != a) {
7516 *output = invlist_clone(a);
7518 /* else *output already = a; */
7522 /* Here both lists exist and are non-empty */
7523 array_a = invlist_array(a);
7524 array_b = invlist_array(b);
7526 /* If are to take the union of 'a' with the complement of b, set it
7527 * up so are looking at b's complement. */
7530 /* To complement, we invert: if the first element is 0, remove it. To
7531 * do this, we just pretend the array starts one later, and clear the
7532 * flag as we don't have to do anything else later */
7533 if (array_b[0] == 0) {
7536 complement_b = FALSE;
7540 /* But if the first element is not zero, we unshift a 0 before the
7541 * array. The data structure reserves a space for that 0 (which
7542 * should be a '1' right now), so physical shifting is unneeded,
7543 * but temporarily change that element to 0. Before exiting the
7544 * routine, we must restore the element to '1' */
7551 /* Size the union for the worst case: that the sets are completely
7553 u = _new_invlist(len_a + len_b);
7555 /* Will contain U+0000 if either component does */
7556 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7557 || (len_b > 0 && array_b[0] == 0));
7559 /* Go through each list item by item, stopping when exhausted one of
7561 while (i_a < len_a && i_b < len_b) {
7562 UV cp; /* The element to potentially add to the union's array */
7563 bool cp_in_set; /* is it in the the input list's set or not */
7565 /* We need to take one or the other of the two inputs for the union.
7566 * Since we are merging two sorted lists, we take the smaller of the
7567 * next items. In case of a tie, we take the one that is in its set
7568 * first. If we took one not in the set first, it would decrement the
7569 * count, possibly to 0 which would cause it to be output as ending the
7570 * range, and the next time through we would take the same number, and
7571 * output it again as beginning the next range. By doing it the
7572 * opposite way, there is no possibility that the count will be
7573 * momentarily decremented to 0, and thus the two adjoining ranges will
7574 * be seamlessly merged. (In a tie and both are in the set or both not
7575 * in the set, it doesn't matter which we take first.) */
7576 if (array_a[i_a] < array_b[i_b]
7577 || (array_a[i_a] == array_b[i_b]
7578 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7580 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7584 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7588 /* Here, have chosen which of the two inputs to look at. Only output
7589 * if the running count changes to/from 0, which marks the
7590 * beginning/end of a range in that's in the set */
7593 array_u[i_u++] = cp;
7600 array_u[i_u++] = cp;
7605 /* Here, we are finished going through at least one of the lists, which
7606 * means there is something remaining in at most one. We check if the list
7607 * that hasn't been exhausted is positioned such that we are in the middle
7608 * of a range in its set or not. (i_a and i_b point to the element beyond
7609 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7610 * is potentially more to output.
7611 * There are four cases:
7612 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7613 * in the union is entirely from the non-exhausted set.
7614 * 2) Both were in their sets, count is 2. Nothing further should
7615 * be output, as everything that remains will be in the exhausted
7616 * list's set, hence in the union; decrementing to 1 but not 0 insures
7618 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7619 * Nothing further should be output because the union includes
7620 * everything from the exhausted set. Not decrementing ensures that.
7621 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7622 * decrementing to 0 insures that we look at the remainder of the
7623 * non-exhausted set */
7624 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7625 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7630 /* The final length is what we've output so far, plus what else is about to
7631 * be output. (If 'count' is non-zero, then the input list we exhausted
7632 * has everything remaining up to the machine's limit in its set, and hence
7633 * in the union, so there will be no further output. */
7636 /* At most one of the subexpressions will be non-zero */
7637 len_u += (len_a - i_a) + (len_b - i_b);
7640 /* Set result to final length, which can change the pointer to array_u, so
7642 if (len_u != _invlist_len(u)) {
7643 invlist_set_len(u, len_u);
7645 array_u = invlist_array(u);
7648 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7649 * the other) ended with everything above it not in its set. That means
7650 * that the remaining part of the union is precisely the same as the
7651 * non-exhausted list, so can just copy it unchanged. (If both list were
7652 * exhausted at the same time, then the operations below will be both 0.)
7655 IV copy_count; /* At most one will have a non-zero copy count */
7656 if ((copy_count = len_a - i_a) > 0) {
7657 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7659 else if ((copy_count = len_b - i_b) > 0) {
7660 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7664 /* If we've changed b, restore it */
7669 /* We may be removing a reference to one of the inputs */
7670 if (a == *output || b == *output) {
7671 assert(! invlist_is_iterating(*output));
7672 SvREFCNT_dec_NN(*output);
7680 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7682 /* Take the intersection of two inversion lists and point <i> to it. *i
7683 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7684 * the reference count to that list will be decremented.
7685 * If <complement_b> is TRUE, the result will be the intersection of <a>
7686 * and the complement (or inversion) of <b> instead of <b> directly.
7688 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7689 * Richard Gillam, published by Addison-Wesley, and explained at some
7690 * length there. The preface says to incorporate its examples into your
7691 * code at your own risk. In fact, it had bugs
7693 * The algorithm is like a merge sort, and is essentially the same as the
7697 UV* array_a; /* a's array */
7699 UV len_a; /* length of a's array */
7702 SV* r; /* the resulting intersection */
7706 UV i_a = 0; /* current index into a's array */
7710 /* running count, as explained in the algorithm source book; items are
7711 * stopped accumulating and are output when the count changes to/from 2.
7712 * The count is incremented when we start a range that's in the set, and
7713 * decremented when we start a range that's not in the set. So its range
7714 * is 0 to 2. Only when the count is 2 is something in the intersection.
7718 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7721 /* Special case if either one is empty */
7722 len_a = _invlist_len(a);
7723 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7725 if (len_a != 0 && complement_b) {
7727 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7728 * be empty. Here, also we are using 'b's complement, which hence
7729 * must be every possible code point. Thus the intersection is
7732 *i = invlist_clone(a);
7738 /* else *i is already 'a' */
7742 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7743 * intersection must be empty */
7750 *i = _new_invlist(0);
7754 /* Here both lists exist and are non-empty */
7755 array_a = invlist_array(a);
7756 array_b = invlist_array(b);
7758 /* If are to take the intersection of 'a' with the complement of b, set it
7759 * up so are looking at b's complement. */
7762 /* To complement, we invert: if the first element is 0, remove it. To
7763 * do this, we just pretend the array starts one later, and clear the
7764 * flag as we don't have to do anything else later */
7765 if (array_b[0] == 0) {
7768 complement_b = FALSE;
7772 /* But if the first element is not zero, we unshift a 0 before the
7773 * array. The data structure reserves a space for that 0 (which
7774 * should be a '1' right now), so physical shifting is unneeded,
7775 * but temporarily change that element to 0. Before exiting the
7776 * routine, we must restore the element to '1' */
7783 /* Size the intersection for the worst case: that the intersection ends up
7784 * fragmenting everything to be completely disjoint */
7785 r= _new_invlist(len_a + len_b);
7787 /* Will contain U+0000 iff both components do */
7788 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7789 && len_b > 0 && array_b[0] == 0);
7791 /* Go through each list item by item, stopping when exhausted one of
7793 while (i_a < len_a && i_b < len_b) {
7794 UV cp; /* The element to potentially add to the intersection's
7796 bool cp_in_set; /* Is it in the input list's set or not */
7798 /* We need to take one or the other of the two inputs for the
7799 * intersection. Since we are merging two sorted lists, we take the
7800 * smaller of the next items. In case of a tie, we take the one that
7801 * is not in its set first (a difference from the union algorithm). If
7802 * we took one in the set first, it would increment the count, possibly
7803 * to 2 which would cause it to be output as starting a range in the
7804 * intersection, and the next time through we would take that same
7805 * number, and output it again as ending the set. By doing it the
7806 * opposite of this, there is no possibility that the count will be
7807 * momentarily incremented to 2. (In a tie and both are in the set or
7808 * both not in the set, it doesn't matter which we take first.) */
7809 if (array_a[i_a] < array_b[i_b]
7810 || (array_a[i_a] == array_b[i_b]
7811 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7813 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7817 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7821 /* Here, have chosen which of the two inputs to look at. Only output
7822 * if the running count changes to/from 2, which marks the
7823 * beginning/end of a range that's in the intersection */
7827 array_r[i_r++] = cp;
7832 array_r[i_r++] = cp;
7838 /* Here, we are finished going through at least one of the lists, which
7839 * means there is something remaining in at most one. We check if the list
7840 * that has been exhausted is positioned such that we are in the middle
7841 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7842 * the ones we care about.) There are four cases:
7843 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7844 * nothing left in the intersection.
7845 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7846 * above 2. What should be output is exactly that which is in the
7847 * non-exhausted set, as everything it has is also in the intersection
7848 * set, and everything it doesn't have can't be in the intersection
7849 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7850 * gets incremented to 2. Like the previous case, the intersection is
7851 * everything that remains in the non-exhausted set.
7852 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7853 * remains 1. And the intersection has nothing more. */
7854 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7855 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7860 /* The final length is what we've output so far plus what else is in the
7861 * intersection. At most one of the subexpressions below will be non-zero */
7864 len_r += (len_a - i_a) + (len_b - i_b);
7867 /* Set result to final length, which can change the pointer to array_r, so
7869 if (len_r != _invlist_len(r)) {
7870 invlist_set_len(r, len_r);
7872 array_r = invlist_array(r);
7875 /* Finish outputting any remaining */
7876 if (count >= 2) { /* At most one will have a non-zero copy count */
7878 if ((copy_count = len_a - i_a) > 0) {
7879 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7881 else if ((copy_count = len_b - i_b) > 0) {
7882 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7886 /* If we've changed b, restore it */
7891 /* We may be removing a reference to one of the inputs */
7892 if (a == *i || b == *i) {
7893 assert(! invlist_is_iterating(*i));
7894 SvREFCNT_dec_NN(*i);
7902 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7904 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7905 * set. A pointer to the inversion list is returned. This may actually be
7906 * a new list, in which case the passed in one has been destroyed. The
7907 * passed in inversion list can be NULL, in which case a new one is created
7908 * with just the one range in it */
7913 if (invlist == NULL) {
7914 invlist = _new_invlist(2);
7918 len = _invlist_len(invlist);
7921 /* If comes after the final entry actually in the list, can just append it
7924 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7925 && start >= invlist_array(invlist)[len - 1]))
7927 _append_range_to_invlist(invlist, start, end);
7931 /* Here, can't just append things, create and return a new inversion list
7932 * which is the union of this range and the existing inversion list */
7933 range_invlist = _new_invlist(2);
7934 _append_range_to_invlist(range_invlist, start, end);
7936 _invlist_union(invlist, range_invlist, &invlist);
7938 /* The temporary can be freed */
7939 SvREFCNT_dec_NN(range_invlist);
7946 PERL_STATIC_INLINE SV*
7947 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7948 return _add_range_to_invlist(invlist, cp, cp);
7951 #ifndef PERL_IN_XSUB_RE
7953 Perl__invlist_invert(pTHX_ SV* const invlist)
7955 /* Complement the input inversion list. This adds a 0 if the list didn't
7956 * have a zero; removes it otherwise. As described above, the data
7957 * structure is set up so that this is very efficient */
7959 UV* len_pos = _get_invlist_len_addr(invlist);
7961 PERL_ARGS_ASSERT__INVLIST_INVERT;
7963 assert(! invlist_is_iterating(invlist));
7965 /* The inverse of matching nothing is matching everything */
7966 if (*len_pos == 0) {
7967 _append_range_to_invlist(invlist, 0, UV_MAX);
7971 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7972 * zero element was a 0, so it is being removed, so the length decrements
7973 * by 1; and vice-versa. SvCUR is unaffected */
7974 if (*get_invlist_zero_addr(invlist) ^= 1) {
7983 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7985 /* Complement the input inversion list (which must be a Unicode property,
7986 * all of which don't match above the Unicode maximum code point.) And
7987 * Perl has chosen to not have the inversion match above that either. This
7988 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7994 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7996 _invlist_invert(invlist);
7998 len = _invlist_len(invlist);
8000 if (len != 0) { /* If empty do nothing */
8001 array = invlist_array(invlist);
8002 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8003 /* Add 0x110000. First, grow if necessary */
8005 if (invlist_max(invlist) < len) {
8006 invlist_extend(invlist, len);
8007 array = invlist_array(invlist);
8009 invlist_set_len(invlist, len);
8010 array[len - 1] = PERL_UNICODE_MAX + 1;
8012 else { /* Remove the 0x110000 */
8013 invlist_set_len(invlist, len - 1);
8021 PERL_STATIC_INLINE SV*
8022 S_invlist_clone(pTHX_ SV* const invlist)
8025 /* Return a new inversion list that is a copy of the input one, which is
8028 /* Need to allocate extra space to accommodate Perl's addition of a
8029 * trailing NUL to SvPV's, since it thinks they are always strings */
8030 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8031 STRLEN length = SvCUR(invlist);
8033 PERL_ARGS_ASSERT_INVLIST_CLONE;
8035 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8036 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8041 PERL_STATIC_INLINE UV*
8042 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8044 /* Return the address of the UV that contains the current iteration
8047 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8049 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8052 PERL_STATIC_INLINE UV*
8053 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8055 /* Return the address of the UV that contains the version id. */
8057 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8059 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8062 PERL_STATIC_INLINE void
8063 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8065 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8067 *get_invlist_iter_addr(invlist) = 0;
8070 PERL_STATIC_INLINE void
8071 S_invlist_iterfinish(pTHX_ SV* invlist)
8073 /* Terminate iterator for invlist. This is to catch development errors.
8074 * Any iteration that is interrupted before completed should call this
8075 * function. Functions that add code points anywhere else but to the end
8076 * of an inversion list assert that they are not in the middle of an
8077 * iteration. If they were, the addition would make the iteration
8078 * problematical: if the iteration hadn't reached the place where things
8079 * were being added, it would be ok */
8081 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8083 *get_invlist_iter_addr(invlist) = UV_MAX;
8087 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8089 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8090 * This call sets in <*start> and <*end>, the next range in <invlist>.
8091 * Returns <TRUE> if successful and the next call will return the next
8092 * range; <FALSE> if was already at the end of the list. If the latter,
8093 * <*start> and <*end> are unchanged, and the next call to this function
8094 * will start over at the beginning of the list */
8096 UV* pos = get_invlist_iter_addr(invlist);
8097 UV len = _invlist_len(invlist);
8100 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8103 *pos = UV_MAX; /* Force iterinit() to be required next time */
8107 array = invlist_array(invlist);
8109 *start = array[(*pos)++];
8115 *end = array[(*pos)++] - 1;
8121 PERL_STATIC_INLINE bool
8122 S_invlist_is_iterating(pTHX_ SV* const invlist)
8124 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8126 return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8129 PERL_STATIC_INLINE UV
8130 S_invlist_highest(pTHX_ SV* const invlist)
8132 /* Returns the highest code point that matches an inversion list. This API
8133 * has an ambiguity, as it returns 0 under either the highest is actually
8134 * 0, or if the list is empty. If this distinction matters to you, check
8135 * for emptiness before calling this function */
8137 UV len = _invlist_len(invlist);
8140 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8146 array = invlist_array(invlist);
8148 /* The last element in the array in the inversion list always starts a
8149 * range that goes to infinity. That range may be for code points that are
8150 * matched in the inversion list, or it may be for ones that aren't
8151 * matched. In the latter case, the highest code point in the set is one
8152 * less than the beginning of this range; otherwise it is the final element
8153 * of this range: infinity */
8154 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8156 : array[len - 1] - 1;
8159 #ifndef PERL_IN_XSUB_RE
8161 Perl__invlist_contents(pTHX_ SV* const invlist)
8163 /* Get the contents of an inversion list into a string SV so that they can
8164 * be printed out. It uses the format traditionally done for debug tracing
8168 SV* output = newSVpvs("\n");
8170 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8172 assert(! invlist_is_iterating(invlist));
8174 invlist_iterinit(invlist);
8175 while (invlist_iternext(invlist, &start, &end)) {
8176 if (end == UV_MAX) {
8177 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8179 else if (end != start) {
8180 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8184 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8192 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8194 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8196 /* Dumps out the ranges in an inversion list. The string 'header'
8197 * if present is output on a line before the first range */
8201 PERL_ARGS_ASSERT__INVLIST_DUMP;
8203 if (header && strlen(header)) {
8204 PerlIO_printf(Perl_debug_log, "%s\n", header);
8206 if (invlist_is_iterating(invlist)) {
8207 PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8211 invlist_iterinit(invlist);
8212 while (invlist_iternext(invlist, &start, &end)) {
8213 if (end == UV_MAX) {
8214 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8216 else if (end != start) {
8217 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8221 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8229 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8231 /* Return a boolean as to if the two passed in inversion lists are
8232 * identical. The final argument, if TRUE, says to take the complement of
8233 * the second inversion list before doing the comparison */
8235 UV* array_a = invlist_array(a);
8236 UV* array_b = invlist_array(b);
8237 UV len_a = _invlist_len(a);
8238 UV len_b = _invlist_len(b);
8240 UV i = 0; /* current index into the arrays */
8241 bool retval = TRUE; /* Assume are identical until proven otherwise */
8243 PERL_ARGS_ASSERT__INVLISTEQ;
8245 /* If are to compare 'a' with the complement of b, set it
8246 * up so are looking at b's complement. */
8249 /* The complement of nothing is everything, so <a> would have to have
8250 * just one element, starting at zero (ending at infinity) */
8252 return (len_a == 1 && array_a[0] == 0);
8254 else if (array_b[0] == 0) {
8256 /* Otherwise, to complement, we invert. Here, the first element is
8257 * 0, just remove it. To do this, we just pretend the array starts
8258 * one later, and clear the flag as we don't have to do anything
8263 complement_b = FALSE;
8267 /* But if the first element is not zero, we unshift a 0 before the
8268 * array. The data structure reserves a space for that 0 (which
8269 * should be a '1' right now), so physical shifting is unneeded,
8270 * but temporarily change that element to 0. Before exiting the
8271 * routine, we must restore the element to '1' */
8278 /* Make sure that the lengths are the same, as well as the final element
8279 * before looping through the remainder. (Thus we test the length, final,
8280 * and first elements right off the bat) */
8281 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8284 else for (i = 0; i < len_a - 1; i++) {
8285 if (array_a[i] != array_b[i]) {
8298 #undef HEADER_LENGTH
8299 #undef INVLIST_INITIAL_LENGTH
8300 #undef TO_INTERNAL_SIZE
8301 #undef FROM_INTERNAL_SIZE
8302 #undef INVLIST_LEN_OFFSET
8303 #undef INVLIST_ZERO_OFFSET
8304 #undef INVLIST_ITER_OFFSET
8305 #undef INVLIST_VERSION_ID
8306 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8308 /* End of inversion list object */
8311 - reg - regular expression, i.e. main body or parenthesized thing
8313 * Caller must absorb opening parenthesis.
8315 * Combining parenthesis handling with the base level of regular expression
8316 * is a trifle forced, but the need to tie the tails of the branches to what
8317 * follows makes it hard to avoid.
8319 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8321 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8323 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8327 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8328 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8331 regnode *ret; /* Will be the head of the group. */
8334 regnode *ender = NULL;
8337 U32 oregflags = RExC_flags;
8338 bool have_branch = 0;
8340 I32 freeze_paren = 0;
8341 I32 after_freeze = 0;
8343 /* for (?g), (?gc), and (?o) warnings; warning
8344 about (?c) will warn about (?g) -- japhy */
8346 #define WASTED_O 0x01
8347 #define WASTED_G 0x02
8348 #define WASTED_C 0x04
8349 #define WASTED_GC (0x02|0x04)
8350 I32 wastedflags = 0x00;
8352 char * parse_start = RExC_parse; /* MJD */
8353 char * const oregcomp_parse = RExC_parse;
8355 GET_RE_DEBUG_FLAGS_DECL;
8357 PERL_ARGS_ASSERT_REG;
8358 DEBUG_PARSE("reg ");
8360 *flagp = 0; /* Tentatively. */
8363 /* Make an OPEN node, if parenthesized. */
8365 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8366 char *start_verb = RExC_parse;
8367 STRLEN verb_len = 0;
8368 char *start_arg = NULL;
8369 unsigned char op = 0;
8371 int internal_argval = 0; /* internal_argval is only useful if !argok */
8372 while ( *RExC_parse && *RExC_parse != ')' ) {
8373 if ( *RExC_parse == ':' ) {
8374 start_arg = RExC_parse + 1;
8380 verb_len = RExC_parse - start_verb;
8383 while ( *RExC_parse && *RExC_parse != ')' )
8385 if ( *RExC_parse != ')' )
8386 vFAIL("Unterminated verb pattern argument");
8387 if ( RExC_parse == start_arg )
8390 if ( *RExC_parse != ')' )
8391 vFAIL("Unterminated verb pattern");
8394 switch ( *start_verb ) {
8395 case 'A': /* (*ACCEPT) */
8396 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8398 internal_argval = RExC_nestroot;
8401 case 'C': /* (*COMMIT) */
8402 if ( memEQs(start_verb,verb_len,"COMMIT") )
8405 case 'F': /* (*FAIL) */
8406 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8411 case ':': /* (*:NAME) */
8412 case 'M': /* (*MARK:NAME) */
8413 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8418 case 'P': /* (*PRUNE) */
8419 if ( memEQs(start_verb,verb_len,"PRUNE") )
8422 case 'S': /* (*SKIP) */
8423 if ( memEQs(start_verb,verb_len,"SKIP") )
8426 case 'T': /* (*THEN) */
8427 /* [19:06] <TimToady> :: is then */
8428 if ( memEQs(start_verb,verb_len,"THEN") ) {
8430 RExC_seen |= REG_SEEN_CUTGROUP;
8436 vFAIL3("Unknown verb pattern '%.*s'",
8437 verb_len, start_verb);
8440 if ( start_arg && internal_argval ) {
8441 vFAIL3("Verb pattern '%.*s' may not have an argument",
8442 verb_len, start_verb);
8443 } else if ( argok < 0 && !start_arg ) {
8444 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8445 verb_len, start_verb);
8447 ret = reganode(pRExC_state, op, internal_argval);
8448 if ( ! internal_argval && ! SIZE_ONLY ) {
8450 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8451 ARG(ret) = add_data( pRExC_state, 1, "S" );
8452 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8459 if (!internal_argval)
8460 RExC_seen |= REG_SEEN_VERBARG;
8461 } else if ( start_arg ) {
8462 vFAIL3("Verb pattern '%.*s' may not have an argument",
8463 verb_len, start_verb);
8465 ret = reg_node(pRExC_state, op);
8467 nextchar(pRExC_state);
8470 if (*RExC_parse == '?') { /* (?...) */
8471 bool is_logical = 0;
8472 const char * const seqstart = RExC_parse;
8473 bool has_use_defaults = FALSE;
8476 paren = *RExC_parse++;
8477 ret = NULL; /* For look-ahead/behind. */
8480 case 'P': /* (?P...) variants for those used to PCRE/Python */
8481 paren = *RExC_parse++;
8482 if ( paren == '<') /* (?P<...>) named capture */
8484 else if (paren == '>') { /* (?P>name) named recursion */
8485 goto named_recursion;
8487 else if (paren == '=') { /* (?P=...) named backref */
8488 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8489 you change this make sure you change that */
8490 char* name_start = RExC_parse;
8492 SV *sv_dat = reg_scan_name(pRExC_state,
8493 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8494 if (RExC_parse == name_start || *RExC_parse != ')')
8495 vFAIL2("Sequence %.3s... not terminated",parse_start);
8498 num = add_data( pRExC_state, 1, "S" );
8499 RExC_rxi->data->data[num]=(void*)sv_dat;
8500 SvREFCNT_inc_simple_void(sv_dat);
8503 ret = reganode(pRExC_state,
8506 : (ASCII_FOLD_RESTRICTED)
8508 : (AT_LEAST_UNI_SEMANTICS)
8516 Set_Node_Offset(ret, parse_start+1);
8517 Set_Node_Cur_Length(ret); /* MJD */
8519 nextchar(pRExC_state);
8523 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8525 case '<': /* (?<...) */
8526 if (*RExC_parse == '!')
8528 else if (*RExC_parse != '=')
8534 case '\'': /* (?'...') */
8535 name_start= RExC_parse;
8536 svname = reg_scan_name(pRExC_state,
8537 SIZE_ONLY ? /* reverse test from the others */
8538 REG_RSN_RETURN_NAME :
8539 REG_RSN_RETURN_NULL);
8540 if (RExC_parse == name_start) {
8542 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8545 if (*RExC_parse != paren)
8546 vFAIL2("Sequence (?%c... not terminated",
8547 paren=='>' ? '<' : paren);
8551 if (!svname) /* shouldn't happen */
8553 "panic: reg_scan_name returned NULL");
8554 if (!RExC_paren_names) {
8555 RExC_paren_names= newHV();
8556 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8558 RExC_paren_name_list= newAV();
8559 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8562 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8564 sv_dat = HeVAL(he_str);
8566 /* croak baby croak */
8568 "panic: paren_name hash element allocation failed");
8569 } else if ( SvPOK(sv_dat) ) {
8570 /* (?|...) can mean we have dupes so scan to check
8571 its already been stored. Maybe a flag indicating
8572 we are inside such a construct would be useful,
8573 but the arrays are likely to be quite small, so
8574 for now we punt -- dmq */
8575 IV count = SvIV(sv_dat);
8576 I32 *pv = (I32*)SvPVX(sv_dat);
8578 for ( i = 0 ; i < count ; i++ ) {
8579 if ( pv[i] == RExC_npar ) {
8585 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8586 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8587 pv[count] = RExC_npar;
8588 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8591 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8592 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8594 SvIV_set(sv_dat, 1);
8597 /* Yes this does cause a memory leak in debugging Perls */
8598 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8599 SvREFCNT_dec_NN(svname);
8602 /*sv_dump(sv_dat);*/
8604 nextchar(pRExC_state);
8606 goto capturing_parens;
8608 RExC_seen |= REG_SEEN_LOOKBEHIND;
8609 RExC_in_lookbehind++;
8611 case '=': /* (?=...) */
8612 RExC_seen_zerolen++;
8614 case '!': /* (?!...) */
8615 RExC_seen_zerolen++;
8616 if (*RExC_parse == ')') {
8617 ret=reg_node(pRExC_state, OPFAIL);
8618 nextchar(pRExC_state);
8622 case '|': /* (?|...) */
8623 /* branch reset, behave like a (?:...) except that
8624 buffers in alternations share the same numbers */
8626 after_freeze = freeze_paren = RExC_npar;
8628 case ':': /* (?:...) */
8629 case '>': /* (?>...) */
8631 case '$': /* (?$...) */
8632 case '@': /* (?@...) */
8633 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8635 case '#': /* (?#...) */
8636 while (*RExC_parse && *RExC_parse != ')')
8638 if (*RExC_parse != ')')
8639 FAIL("Sequence (?#... not terminated");
8640 nextchar(pRExC_state);
8643 case '0' : /* (?0) */
8644 case 'R' : /* (?R) */
8645 if (*RExC_parse != ')')
8646 FAIL("Sequence (?R) not terminated");
8647 ret = reg_node(pRExC_state, GOSTART);
8648 *flagp |= POSTPONED;
8649 nextchar(pRExC_state);
8652 { /* named and numeric backreferences */
8654 case '&': /* (?&NAME) */
8655 parse_start = RExC_parse - 1;
8658 SV *sv_dat = reg_scan_name(pRExC_state,
8659 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8660 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8662 goto gen_recurse_regop;
8663 assert(0); /* NOT REACHED */
8665 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8667 vFAIL("Illegal pattern");
8669 goto parse_recursion;
8671 case '-': /* (?-1) */
8672 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8673 RExC_parse--; /* rewind to let it be handled later */
8677 case '1': case '2': case '3': case '4': /* (?1) */
8678 case '5': case '6': case '7': case '8': case '9':
8681 num = atoi(RExC_parse);
8682 parse_start = RExC_parse - 1; /* MJD */
8683 if (*RExC_parse == '-')
8685 while (isDIGIT(*RExC_parse))
8687 if (*RExC_parse!=')')
8688 vFAIL("Expecting close bracket");
8691 if ( paren == '-' ) {
8693 Diagram of capture buffer numbering.
8694 Top line is the normal capture buffer numbers
8695 Bottom line is the negative indexing as from
8699 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8703 num = RExC_npar + num;
8706 vFAIL("Reference to nonexistent group");
8708 } else if ( paren == '+' ) {
8709 num = RExC_npar + num - 1;
8712 ret = reganode(pRExC_state, GOSUB, num);
8714 if (num > (I32)RExC_rx->nparens) {
8716 vFAIL("Reference to nonexistent group");
8718 ARG2L_SET( ret, RExC_recurse_count++);
8720 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8721 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8725 RExC_seen |= REG_SEEN_RECURSE;
8726 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8727 Set_Node_Offset(ret, parse_start); /* MJD */
8729 *flagp |= POSTPONED;
8730 nextchar(pRExC_state);
8732 } /* named and numeric backreferences */
8733 assert(0); /* NOT REACHED */
8735 case '?': /* (??...) */
8737 if (*RExC_parse != '{') {
8739 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8742 *flagp |= POSTPONED;
8743 paren = *RExC_parse++;
8745 case '{': /* (?{...}) */
8748 struct reg_code_block *cb;
8750 RExC_seen_zerolen++;
8752 if ( !pRExC_state->num_code_blocks
8753 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8754 || pRExC_state->code_blocks[pRExC_state->code_index].start
8755 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8758 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8759 FAIL("panic: Sequence (?{...}): no code block found\n");
8760 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8762 /* this is a pre-compiled code block (?{...}) */
8763 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8764 RExC_parse = RExC_start + cb->end;
8767 if (cb->src_regex) {
8768 n = add_data(pRExC_state, 2, "rl");
8769 RExC_rxi->data->data[n] =
8770 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8771 RExC_rxi->data->data[n+1] = (void*)o;
8774 n = add_data(pRExC_state, 1,
8775 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8776 RExC_rxi->data->data[n] = (void*)o;
8779 pRExC_state->code_index++;
8780 nextchar(pRExC_state);
8784 ret = reg_node(pRExC_state, LOGICAL);
8785 eval = reganode(pRExC_state, EVAL, n);
8788 /* for later propagation into (??{}) return value */
8789 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8791 REGTAIL(pRExC_state, ret, eval);
8792 /* deal with the length of this later - MJD */
8795 ret = reganode(pRExC_state, EVAL, n);
8796 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8797 Set_Node_Offset(ret, parse_start);
8800 case '(': /* (?(?{...})...) and (?(?=...)...) */
8803 if (RExC_parse[0] == '?') { /* (?(?...)) */
8804 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8805 || RExC_parse[1] == '<'
8806 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8809 ret = reg_node(pRExC_state, LOGICAL);
8812 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8816 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8817 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8819 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8820 char *name_start= RExC_parse++;
8822 SV *sv_dat=reg_scan_name(pRExC_state,
8823 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8824 if (RExC_parse == name_start || *RExC_parse != ch)
8825 vFAIL2("Sequence (?(%c... not terminated",
8826 (ch == '>' ? '<' : ch));
8829 num = add_data( pRExC_state, 1, "S" );
8830 RExC_rxi->data->data[num]=(void*)sv_dat;
8831 SvREFCNT_inc_simple_void(sv_dat);
8833 ret = reganode(pRExC_state,NGROUPP,num);
8834 goto insert_if_check_paren;
8836 else if (RExC_parse[0] == 'D' &&
8837 RExC_parse[1] == 'E' &&
8838 RExC_parse[2] == 'F' &&
8839 RExC_parse[3] == 'I' &&
8840 RExC_parse[4] == 'N' &&
8841 RExC_parse[5] == 'E')
8843 ret = reganode(pRExC_state,DEFINEP,0);
8846 goto insert_if_check_paren;
8848 else if (RExC_parse[0] == 'R') {
8851 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8852 parno = atoi(RExC_parse++);
8853 while (isDIGIT(*RExC_parse))
8855 } else if (RExC_parse[0] == '&') {
8858 sv_dat = reg_scan_name(pRExC_state,
8859 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8860 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8862 ret = reganode(pRExC_state,INSUBP,parno);
8863 goto insert_if_check_paren;
8865 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8868 parno = atoi(RExC_parse++);
8870 while (isDIGIT(*RExC_parse))
8872 ret = reganode(pRExC_state, GROUPP, parno);
8874 insert_if_check_paren:
8875 if ((c = *nextchar(pRExC_state)) != ')')
8876 vFAIL("Switch condition not recognized");
8878 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8879 br = regbranch(pRExC_state, &flags, 1,depth+1);
8881 br = reganode(pRExC_state, LONGJMP, 0);
8883 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8884 c = *nextchar(pRExC_state);
8889 vFAIL("(?(DEFINE)....) does not allow branches");
8890 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8891 regbranch(pRExC_state, &flags, 1,depth+1);
8892 REGTAIL(pRExC_state, ret, lastbr);
8895 c = *nextchar(pRExC_state);
8900 vFAIL("Switch (?(condition)... contains too many branches");
8901 ender = reg_node(pRExC_state, TAIL);
8902 REGTAIL(pRExC_state, br, ender);
8904 REGTAIL(pRExC_state, lastbr, ender);
8905 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8908 REGTAIL(pRExC_state, ret, ender);
8909 RExC_size++; /* XXX WHY do we need this?!!
8910 For large programs it seems to be required
8911 but I can't figure out why. -- dmq*/
8915 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8918 case '[': /* (?[ ... ]) */
8919 return handle_sets(pRExC_state, flagp, depth, oregcomp_parse);
8921 RExC_parse--; /* for vFAIL to print correctly */
8922 vFAIL("Sequence (? incomplete");
8924 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8926 has_use_defaults = TRUE;
8927 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8928 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8929 ? REGEX_UNICODE_CHARSET
8930 : REGEX_DEPENDS_CHARSET);
8934 parse_flags: /* (?i) */
8936 U32 posflags = 0, negflags = 0;
8937 U32 *flagsp = &posflags;
8938 char has_charset_modifier = '\0';
8939 regex_charset cs = get_regex_charset(RExC_flags);
8940 if (cs == REGEX_DEPENDS_CHARSET
8941 && (RExC_utf8 || RExC_uni_semantics))
8943 cs = REGEX_UNICODE_CHARSET;
8946 while (*RExC_parse) {
8947 /* && strchr("iogcmsx", *RExC_parse) */
8948 /* (?g), (?gc) and (?o) are useless here
8949 and must be globally applied -- japhy */
8950 switch (*RExC_parse) {
8951 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8952 case LOCALE_PAT_MOD:
8953 if (has_charset_modifier) {
8954 goto excess_modifier;
8956 else if (flagsp == &negflags) {
8959 cs = REGEX_LOCALE_CHARSET;
8960 has_charset_modifier = LOCALE_PAT_MOD;
8961 RExC_contains_locale = 1;
8963 case UNICODE_PAT_MOD:
8964 if (has_charset_modifier) {
8965 goto excess_modifier;
8967 else if (flagsp == &negflags) {
8970 cs = REGEX_UNICODE_CHARSET;
8971 has_charset_modifier = UNICODE_PAT_MOD;
8973 case ASCII_RESTRICT_PAT_MOD:
8974 if (flagsp == &negflags) {
8977 if (has_charset_modifier) {
8978 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8979 goto excess_modifier;
8981 /* Doubled modifier implies more restricted */
8982 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8985 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8987 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8989 case DEPENDS_PAT_MOD:
8990 if (has_use_defaults) {
8991 goto fail_modifiers;
8993 else if (flagsp == &negflags) {
8996 else if (has_charset_modifier) {
8997 goto excess_modifier;
9000 /* The dual charset means unicode semantics if the
9001 * pattern (or target, not known until runtime) are
9002 * utf8, or something in the pattern indicates unicode
9004 cs = (RExC_utf8 || RExC_uni_semantics)
9005 ? REGEX_UNICODE_CHARSET
9006 : REGEX_DEPENDS_CHARSET;
9007 has_charset_modifier = DEPENDS_PAT_MOD;
9011 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9012 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9014 else if (has_charset_modifier == *(RExC_parse - 1)) {
9015 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9018 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9023 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9025 case ONCE_PAT_MOD: /* 'o' */
9026 case GLOBAL_PAT_MOD: /* 'g' */
9027 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9028 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9029 if (! (wastedflags & wflagbit) ) {
9030 wastedflags |= wflagbit;
9033 "Useless (%s%c) - %suse /%c modifier",
9034 flagsp == &negflags ? "?-" : "?",
9036 flagsp == &negflags ? "don't " : "",
9043 case CONTINUE_PAT_MOD: /* 'c' */
9044 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9045 if (! (wastedflags & WASTED_C) ) {
9046 wastedflags |= WASTED_GC;
9049 "Useless (%sc) - %suse /gc modifier",
9050 flagsp == &negflags ? "?-" : "?",
9051 flagsp == &negflags ? "don't " : ""
9056 case KEEPCOPY_PAT_MOD: /* 'p' */
9057 if (flagsp == &negflags) {
9059 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9061 *flagsp |= RXf_PMf_KEEPCOPY;
9065 /* A flag is a default iff it is following a minus, so
9066 * if there is a minus, it means will be trying to
9067 * re-specify a default which is an error */
9068 if (has_use_defaults || flagsp == &negflags) {
9071 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9075 wastedflags = 0; /* reset so (?g-c) warns twice */
9081 RExC_flags |= posflags;
9082 RExC_flags &= ~negflags;
9083 set_regex_charset(&RExC_flags, cs);
9085 oregflags |= posflags;
9086 oregflags &= ~negflags;
9087 set_regex_charset(&oregflags, cs);
9089 nextchar(pRExC_state);
9100 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9105 }} /* one for the default block, one for the switch */
9112 ret = reganode(pRExC_state, OPEN, parno);
9115 RExC_nestroot = parno;
9116 if (RExC_seen & REG_SEEN_RECURSE
9117 && !RExC_open_parens[parno-1])
9119 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9120 "Setting open paren #%"IVdf" to %d\n",
9121 (IV)parno, REG_NODE_NUM(ret)));
9122 RExC_open_parens[parno-1]= ret;
9125 Set_Node_Length(ret, 1); /* MJD */
9126 Set_Node_Offset(ret, RExC_parse); /* MJD */
9134 /* Pick up the branches, linking them together. */
9135 parse_start = RExC_parse; /* MJD */
9136 br = regbranch(pRExC_state, &flags, 1,depth+1);
9138 /* branch_len = (paren != 0); */
9142 if (*RExC_parse == '|') {
9143 if (!SIZE_ONLY && RExC_extralen) {
9144 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9147 reginsert(pRExC_state, BRANCH, br, depth+1);
9148 Set_Node_Length(br, paren != 0);
9149 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9153 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9155 else if (paren == ':') {
9156 *flagp |= flags&SIMPLE;
9158 if (is_open) { /* Starts with OPEN. */
9159 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9161 else if (paren != '?') /* Not Conditional */
9163 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9165 while (*RExC_parse == '|') {
9166 if (!SIZE_ONLY && RExC_extralen) {
9167 ender = reganode(pRExC_state, LONGJMP,0);
9168 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9171 RExC_extralen += 2; /* Account for LONGJMP. */
9172 nextchar(pRExC_state);
9174 if (RExC_npar > after_freeze)
9175 after_freeze = RExC_npar;
9176 RExC_npar = freeze_paren;
9178 br = regbranch(pRExC_state, &flags, 0, depth+1);
9182 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9184 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9187 if (have_branch || paren != ':') {
9188 /* Make a closing node, and hook it on the end. */
9191 ender = reg_node(pRExC_state, TAIL);
9194 ender = reganode(pRExC_state, CLOSE, parno);
9195 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9196 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9197 "Setting close paren #%"IVdf" to %d\n",
9198 (IV)parno, REG_NODE_NUM(ender)));
9199 RExC_close_parens[parno-1]= ender;
9200 if (RExC_nestroot == parno)
9203 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9204 Set_Node_Length(ender,1); /* MJD */
9210 *flagp &= ~HASWIDTH;
9213 ender = reg_node(pRExC_state, SUCCEED);
9216 ender = reg_node(pRExC_state, END);
9218 assert(!RExC_opend); /* there can only be one! */
9223 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9224 SV * const mysv_val1=sv_newmortal();
9225 SV * const mysv_val2=sv_newmortal();
9226 DEBUG_PARSE_MSG("lsbr");
9227 regprop(RExC_rx, mysv_val1, lastbr);
9228 regprop(RExC_rx, mysv_val2, ender);
9229 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9230 SvPV_nolen_const(mysv_val1),
9231 (IV)REG_NODE_NUM(lastbr),
9232 SvPV_nolen_const(mysv_val2),
9233 (IV)REG_NODE_NUM(ender),
9234 (IV)(ender - lastbr)
9237 REGTAIL(pRExC_state, lastbr, ender);
9239 if (have_branch && !SIZE_ONLY) {
9242 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9244 /* Hook the tails of the branches to the closing node. */
9245 for (br = ret; br; br = regnext(br)) {
9246 const U8 op = PL_regkind[OP(br)];
9248 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9249 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9252 else if (op == BRANCHJ) {
9253 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9254 /* for now we always disable this optimisation * /
9255 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9261 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9262 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9263 SV * const mysv_val1=sv_newmortal();
9264 SV * const mysv_val2=sv_newmortal();
9265 DEBUG_PARSE_MSG("NADA");
9266 regprop(RExC_rx, mysv_val1, ret);
9267 regprop(RExC_rx, mysv_val2, ender);
9268 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9269 SvPV_nolen_const(mysv_val1),
9270 (IV)REG_NODE_NUM(ret),
9271 SvPV_nolen_const(mysv_val2),
9272 (IV)REG_NODE_NUM(ender),
9277 if (OP(ender) == TAIL) {
9282 for ( opt= br + 1; opt < ender ; opt++ )
9284 NEXT_OFF(br)= ender - br;
9292 static const char parens[] = "=!<,>";
9294 if (paren && (p = strchr(parens, paren))) {
9295 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9296 int flag = (p - parens) > 1;
9299 node = SUSPEND, flag = 0;
9300 reginsert(pRExC_state, node,ret, depth+1);
9301 Set_Node_Cur_Length(ret);
9302 Set_Node_Offset(ret, parse_start + 1);
9304 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9308 /* Check for proper termination. */
9310 RExC_flags = oregflags;
9311 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9312 RExC_parse = oregcomp_parse;
9313 vFAIL("Unmatched (");
9316 else if (!paren && RExC_parse < RExC_end) {
9317 if (*RExC_parse == ')') {
9319 vFAIL("Unmatched )");
9322 FAIL("Junk on end of regexp"); /* "Can't happen". */
9323 assert(0); /* NOTREACHED */
9326 if (RExC_in_lookbehind) {
9327 RExC_in_lookbehind--;
9329 if (after_freeze > RExC_npar)
9330 RExC_npar = after_freeze;
9335 - regbranch - one alternative of an | operator
9337 * Implements the concatenation operator.
9340 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9344 regnode *chain = NULL;
9346 I32 flags = 0, c = 0;
9347 GET_RE_DEBUG_FLAGS_DECL;
9349 PERL_ARGS_ASSERT_REGBRANCH;
9351 DEBUG_PARSE("brnc");
9356 if (!SIZE_ONLY && RExC_extralen)
9357 ret = reganode(pRExC_state, BRANCHJ,0);
9359 ret = reg_node(pRExC_state, BRANCH);
9360 Set_Node_Length(ret, 1);
9364 if (!first && SIZE_ONLY)
9365 RExC_extralen += 1; /* BRANCHJ */
9367 *flagp = WORST; /* Tentatively. */
9370 nextchar(pRExC_state);
9371 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9373 latest = regpiece(pRExC_state, &flags,depth+1);
9374 if (latest == NULL) {
9375 if (flags & TRYAGAIN)
9379 else if (ret == NULL)
9381 *flagp |= flags&(HASWIDTH|POSTPONED);
9382 if (chain == NULL) /* First piece. */
9383 *flagp |= flags&SPSTART;
9386 REGTAIL(pRExC_state, chain, latest);
9391 if (chain == NULL) { /* Loop ran zero times. */
9392 chain = reg_node(pRExC_state, NOTHING);
9397 *flagp |= flags&SIMPLE;
9404 - regpiece - something followed by possible [*+?]
9406 * Note that the branching code sequences used for ? and the general cases
9407 * of * and + are somewhat optimized: they use the same NOTHING node as
9408 * both the endmarker for their branch list and the body of the last branch.
9409 * It might seem that this node could be dispensed with entirely, but the
9410 * endmarker role is not redundant.
9413 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9420 const char * const origparse = RExC_parse;
9422 I32 max = REG_INFTY;
9423 #ifdef RE_TRACK_PATTERN_OFFSETS
9426 const char *maxpos = NULL;
9428 /* Save the original in case we change the emitted regop to a FAIL. */
9429 regnode * const orig_emit = RExC_emit;
9431 GET_RE_DEBUG_FLAGS_DECL;
9433 PERL_ARGS_ASSERT_REGPIECE;
9435 DEBUG_PARSE("piec");
9437 ret = regatom(pRExC_state, &flags,depth+1);
9439 if (flags & TRYAGAIN)
9446 if (op == '{' && regcurly(RExC_parse, FALSE)) {
9448 #ifdef RE_TRACK_PATTERN_OFFSETS
9449 parse_start = RExC_parse; /* MJD */
9451 next = RExC_parse + 1;
9452 while (isDIGIT(*next) || *next == ',') {
9461 if (*next == '}') { /* got one */
9465 min = atoi(RExC_parse);
9469 maxpos = RExC_parse;
9471 if (!max && *maxpos != '0')
9472 max = REG_INFTY; /* meaning "infinity" */
9473 else if (max >= REG_INFTY)
9474 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9476 nextchar(pRExC_state);
9477 if (max < min) { /* If can't match, warn and optimize to fail
9480 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9482 /* We can't back off the size because we have to reserve
9483 * enough space for all the things we are about to throw
9484 * away, but we can shrink it by the ammount we are about
9486 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9489 RExC_emit = orig_emit;
9491 ret = reg_node(pRExC_state, OPFAIL);
9494 else if (max == 0) { /* replace {0} with a nothing node */
9496 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9499 RExC_emit = orig_emit;
9501 ret = reg_node(pRExC_state, NOTHING);
9506 if ((flags&SIMPLE)) {
9507 RExC_naughty += 2 + RExC_naughty / 2;
9508 reginsert(pRExC_state, CURLY, ret, depth+1);
9509 Set_Node_Offset(ret, parse_start+1); /* MJD */
9510 Set_Node_Cur_Length(ret);
9513 regnode * const w = reg_node(pRExC_state, WHILEM);
9516 REGTAIL(pRExC_state, ret, w);
9517 if (!SIZE_ONLY && RExC_extralen) {
9518 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9519 reginsert(pRExC_state, NOTHING,ret, depth+1);
9520 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9522 reginsert(pRExC_state, CURLYX,ret, depth+1);
9524 Set_Node_Offset(ret, parse_start+1);
9525 Set_Node_Length(ret,
9526 op == '{' ? (RExC_parse - parse_start) : 1);
9528 if (!SIZE_ONLY && RExC_extralen)
9529 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9530 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9532 RExC_whilem_seen++, RExC_extralen += 3;
9533 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9542 ARG1_SET(ret, (U16)min);
9543 ARG2_SET(ret, (U16)max);
9555 #if 0 /* Now runtime fix should be reliable. */
9557 /* if this is reinstated, don't forget to put this back into perldiag:
9559 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9561 (F) The part of the regexp subject to either the * or + quantifier
9562 could match an empty string. The {#} shows in the regular
9563 expression about where the problem was discovered.
9567 if (!(flags&HASWIDTH) && op != '?')
9568 vFAIL("Regexp *+ operand could be empty");
9571 #ifdef RE_TRACK_PATTERN_OFFSETS
9572 parse_start = RExC_parse;
9574 nextchar(pRExC_state);
9576 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9578 if (op == '*' && (flags&SIMPLE)) {
9579 reginsert(pRExC_state, STAR, ret, depth+1);
9583 else if (op == '*') {
9587 else if (op == '+' && (flags&SIMPLE)) {
9588 reginsert(pRExC_state, PLUS, ret, depth+1);
9592 else if (op == '+') {
9596 else if (op == '?') {
9601 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9602 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9603 ckWARN3reg(RExC_parse,
9604 "%.*s matches null string many times",
9605 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9607 (void)ReREFCNT_inc(RExC_rx_sv);
9610 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9611 nextchar(pRExC_state);
9612 reginsert(pRExC_state, MINMOD, ret, depth+1);
9613 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9615 #ifndef REG_ALLOW_MINMOD_SUSPEND
9618 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9620 nextchar(pRExC_state);
9621 ender = reg_node(pRExC_state, SUCCEED);
9622 REGTAIL(pRExC_state, ret, ender);
9623 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9625 ender = reg_node(pRExC_state, TAIL);
9626 REGTAIL(pRExC_state, ret, ender);
9630 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9632 vFAIL("Nested quantifiers");
9639 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9640 const bool strict /* Apply stricter parsing rules? */
9644 /* This is expected to be called by a parser routine that has recognized '\N'
9645 and needs to handle the rest. RExC_parse is expected to point at the first
9646 char following the N at the time of the call. On successful return,
9647 RExC_parse has been updated to point to just after the sequence identified
9648 by this routine, and <*flagp> has been updated.
9650 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9653 \N may begin either a named sequence, or if outside a character class, mean
9654 to match a non-newline. For non single-quoted regexes, the tokenizer has
9655 attempted to decide which, and in the case of a named sequence, converted it
9656 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9657 where c1... are the characters in the sequence. For single-quoted regexes,
9658 the tokenizer passes the \N sequence through unchanged; this code will not
9659 attempt to determine this nor expand those, instead raising a syntax error.
9660 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9661 or there is no '}', it signals that this \N occurrence means to match a
9664 Only the \N{U+...} form should occur in a character class, for the same
9665 reason that '.' inside a character class means to just match a period: it
9666 just doesn't make sense.
9668 The function raises an error (via vFAIL), and doesn't return for various
9669 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9670 success; it returns FALSE otherwise.
9672 If <valuep> is non-null, it means the caller can accept an input sequence
9673 consisting of a just a single code point; <*valuep> is set to that value
9674 if the input is such.
9676 If <node_p> is non-null it signifies that the caller can accept any other
9677 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9679 1) \N means not-a-NL: points to a newly created REG_ANY node;
9680 2) \N{}: points to a new NOTHING node;
9681 3) otherwise: points to a new EXACT node containing the resolved
9683 Note that FALSE is returned for single code point sequences if <valuep> is
9687 char * endbrace; /* '}' following the name */
9689 char *endchar; /* Points to '.' or '}' ending cur char in the input
9691 bool has_multiple_chars; /* true if the input stream contains a sequence of
9692 more than one character */
9694 GET_RE_DEBUG_FLAGS_DECL;
9696 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9700 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9702 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9703 * modifier. The other meaning does not */
9704 p = (RExC_flags & RXf_PMf_EXTENDED)
9705 ? regwhite( pRExC_state, RExC_parse )
9708 /* Disambiguate between \N meaning a named character versus \N meaning
9709 * [^\n]. The former is assumed when it can't be the latter. */
9710 if (*p != '{' || regcurly(p, FALSE)) {
9713 /* no bare \N in a charclass */
9714 if (in_char_class) {
9715 vFAIL("\\N in a character class must be a named character: \\N{...}");
9719 nextchar(pRExC_state);
9720 *node_p = reg_node(pRExC_state, REG_ANY);
9721 *flagp |= HASWIDTH|SIMPLE;
9724 Set_Node_Length(*node_p, 1); /* MJD */
9728 /* Here, we have decided it should be a named character or sequence */
9730 /* The test above made sure that the next real character is a '{', but
9731 * under the /x modifier, it could be separated by space (or a comment and
9732 * \n) and this is not allowed (for consistency with \x{...} and the
9733 * tokenizer handling of \N{NAME}). */
9734 if (*RExC_parse != '{') {
9735 vFAIL("Missing braces on \\N{}");
9738 RExC_parse++; /* Skip past the '{' */
9740 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9741 || ! (endbrace == RExC_parse /* nothing between the {} */
9742 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9743 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9745 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9746 vFAIL("\\N{NAME} must be resolved by the lexer");
9749 if (endbrace == RExC_parse) { /* empty: \N{} */
9752 *node_p = reg_node(pRExC_state,NOTHING);
9754 else if (in_char_class) {
9755 if (SIZE_ONLY && in_char_class) {
9757 RExC_parse++; /* Position after the "}" */
9758 vFAIL("Zero length \\N{}");
9761 ckWARNreg(RExC_parse,
9762 "Ignoring zero length \\N{} in character class");
9770 nextchar(pRExC_state);
9774 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9775 RExC_parse += 2; /* Skip past the 'U+' */
9777 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9779 /* Code points are separated by dots. If none, there is only one code
9780 * point, and is terminated by the brace */
9781 has_multiple_chars = (endchar < endbrace);
9783 if (valuep && (! has_multiple_chars || in_char_class)) {
9784 /* We only pay attention to the first char of
9785 multichar strings being returned in char classes. I kinda wonder
9786 if this makes sense as it does change the behaviour
9787 from earlier versions, OTOH that behaviour was broken
9788 as well. XXX Solution is to recharacterize as
9789 [rest-of-class]|multi1|multi2... */
9791 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9792 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9793 | PERL_SCAN_DISALLOW_PREFIX
9794 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9796 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9798 /* The tokenizer should have guaranteed validity, but it's possible to
9799 * bypass it by using single quoting, so check */
9800 if (length_of_hex == 0
9801 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9803 RExC_parse += length_of_hex; /* Includes all the valid */
9804 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9805 ? UTF8SKIP(RExC_parse)
9807 /* Guard against malformed utf8 */
9808 if (RExC_parse >= endchar) {
9809 RExC_parse = endchar;
9811 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9814 if (in_char_class && has_multiple_chars) {
9816 RExC_parse = endbrace;
9817 vFAIL("\\N{} in character class restricted to one character");
9820 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9824 RExC_parse = endbrace + 1;
9826 else if (! node_p || ! has_multiple_chars) {
9828 /* Here, the input is legal, but not according to the caller's
9829 * options. We fail without advancing the parse, so that the
9830 * caller can try again */
9836 /* What is done here is to convert this to a sub-pattern of the form
9837 * (?:\x{char1}\x{char2}...)
9838 * and then call reg recursively. That way, it retains its atomicness,
9839 * while not having to worry about special handling that some code
9840 * points may have. toke.c has converted the original Unicode values
9841 * to native, so that we can just pass on the hex values unchanged. We
9842 * do have to set a flag to keep recoding from happening in the
9845 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9847 char *orig_end = RExC_end;
9850 while (RExC_parse < endbrace) {
9852 /* Convert to notation the rest of the code understands */
9853 sv_catpv(substitute_parse, "\\x{");
9854 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9855 sv_catpv(substitute_parse, "}");
9857 /* Point to the beginning of the next character in the sequence. */
9858 RExC_parse = endchar + 1;
9859 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9861 sv_catpv(substitute_parse, ")");
9863 RExC_parse = SvPV(substitute_parse, len);
9865 /* Don't allow empty number */
9867 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9869 RExC_end = RExC_parse + len;
9871 /* The values are Unicode, and therefore not subject to recoding */
9872 RExC_override_recoding = 1;
9874 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9875 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9877 RExC_parse = endbrace;
9878 RExC_end = orig_end;
9879 RExC_override_recoding = 0;
9881 nextchar(pRExC_state);
9891 * It returns the code point in utf8 for the value in *encp.
9892 * value: a code value in the source encoding
9893 * encp: a pointer to an Encode object
9895 * If the result from Encode is not a single character,
9896 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9899 S_reg_recode(pTHX_ const char value, SV **encp)
9902 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9903 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9904 const STRLEN newlen = SvCUR(sv);
9905 UV uv = UNICODE_REPLACEMENT;
9907 PERL_ARGS_ASSERT_REG_RECODE;
9911 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9914 if (!newlen || numlen != newlen) {
9915 uv = UNICODE_REPLACEMENT;
9921 PERL_STATIC_INLINE U8
9922 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9926 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9932 op = get_regex_charset(RExC_flags);
9933 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9934 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9935 been, so there is no hole */
9941 PERL_STATIC_INLINE void
9942 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9944 /* This knows the details about sizing an EXACTish node, setting flags for
9945 * it (by setting <*flagp>, and potentially populating it with a single
9948 * If <len> (the length in bytes) is non-zero, this function assumes that
9949 * the node has already been populated, and just does the sizing. In this
9950 * case <code_point> should be the final code point that has already been
9951 * placed into the node. This value will be ignored except that under some
9952 * circumstances <*flagp> is set based on it.
9954 * If <len> is zero, the function assumes that the node is to contain only
9955 * the single character given by <code_point> and calculates what <len>
9956 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9957 * additionally will populate the node's STRING with <code_point>, if <len>
9958 * is 0. In both cases <*flagp> is appropriately set
9960 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9961 * folded (the latter only when the rules indicate it can match 'ss') */
9963 bool len_passed_in = cBOOL(len != 0);
9964 U8 character[UTF8_MAXBYTES_CASE+1];
9966 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9968 if (! len_passed_in) {
9971 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9974 uvchr_to_utf8( character, code_point);
9975 len = UTF8SKIP(character);
9979 || code_point != LATIN_SMALL_LETTER_SHARP_S
9980 || ASCII_FOLD_RESTRICTED
9981 || ! AT_LEAST_UNI_SEMANTICS)
9983 *character = (U8) code_point;
9988 *(character + 1) = 's';
9994 RExC_size += STR_SZ(len);
9997 RExC_emit += STR_SZ(len);
9998 STR_LEN(node) = len;
9999 if (! len_passed_in) {
10000 Copy((char *) character, STRING(node), len, char);
10004 *flagp |= HASWIDTH;
10006 /* A single character node is SIMPLE, except for the special-cased SHARP S
10008 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10009 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10010 || ! FOLD || ! DEPENDS_SEMANTICS))
10017 - regatom - the lowest level
10019 Try to identify anything special at the start of the pattern. If there
10020 is, then handle it as required. This may involve generating a single regop,
10021 such as for an assertion; or it may involve recursing, such as to
10022 handle a () structure.
10024 If the string doesn't start with something special then we gobble up
10025 as much literal text as we can.
10027 Once we have been able to handle whatever type of thing started the
10028 sequence, we return.
10030 Note: we have to be careful with escapes, as they can be both literal
10031 and special, and in the case of \10 and friends, context determines which.
10033 A summary of the code structure is:
10035 switch (first_byte) {
10036 cases for each special:
10037 handle this special;
10040 switch (2nd byte) {
10041 cases for each unambiguous special:
10042 handle this special;
10044 cases for each ambigous special/literal:
10046 if (special) handle here
10048 default: // unambiguously literal:
10051 default: // is a literal char
10054 create EXACTish node for literal;
10055 while (more input and node isn't full) {
10056 switch (input_byte) {
10057 cases for each special;
10058 make sure parse pointer is set so that the next call to
10059 regatom will see this special first
10060 goto loopdone; // EXACTish node terminated by prev. char
10062 append char to EXACTISH node;
10064 get next input byte;
10068 return the generated node;
10070 Specifically there are two separate switches for handling
10071 escape sequences, with the one for handling literal escapes requiring
10072 a dummy entry for all of the special escapes that are actually handled
10077 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10080 regnode *ret = NULL;
10082 char *parse_start = RExC_parse;
10086 GET_RE_DEBUG_FLAGS_DECL;
10088 *flagp = WORST; /* Tentatively. */
10090 DEBUG_PARSE("atom");
10092 PERL_ARGS_ASSERT_REGATOM;
10095 switch ((U8)*RExC_parse) {
10097 RExC_seen_zerolen++;
10098 nextchar(pRExC_state);
10099 if (RExC_flags & RXf_PMf_MULTILINE)
10100 ret = reg_node(pRExC_state, MBOL);
10101 else if (RExC_flags & RXf_PMf_SINGLELINE)
10102 ret = reg_node(pRExC_state, SBOL);
10104 ret = reg_node(pRExC_state, BOL);
10105 Set_Node_Length(ret, 1); /* MJD */
10108 nextchar(pRExC_state);
10110 RExC_seen_zerolen++;
10111 if (RExC_flags & RXf_PMf_MULTILINE)
10112 ret = reg_node(pRExC_state, MEOL);
10113 else if (RExC_flags & RXf_PMf_SINGLELINE)
10114 ret = reg_node(pRExC_state, SEOL);
10116 ret = reg_node(pRExC_state, EOL);
10117 Set_Node_Length(ret, 1); /* MJD */
10120 nextchar(pRExC_state);
10121 if (RExC_flags & RXf_PMf_SINGLELINE)
10122 ret = reg_node(pRExC_state, SANY);
10124 ret = reg_node(pRExC_state, REG_ANY);
10125 *flagp |= HASWIDTH|SIMPLE;
10127 Set_Node_Length(ret, 1); /* MJD */
10131 char * const oregcomp_parse = ++RExC_parse;
10132 ret = regclass(pRExC_state, flagp,depth+1,
10133 FALSE, /* means parse the whole char class */
10134 TRUE, /* allow multi-char folds */
10135 FALSE, /* don't silence non-portable warnings. */
10137 if (*RExC_parse != ']') {
10138 RExC_parse = oregcomp_parse;
10139 vFAIL("Unmatched [");
10141 nextchar(pRExC_state);
10142 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10146 nextchar(pRExC_state);
10147 ret = reg(pRExC_state, 1, &flags,depth+1);
10149 if (flags & TRYAGAIN) {
10150 if (RExC_parse == RExC_end) {
10151 /* Make parent create an empty node if needed. */
10152 *flagp |= TRYAGAIN;
10159 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10163 if (flags & TRYAGAIN) {
10164 *flagp |= TRYAGAIN;
10167 vFAIL("Internal urp");
10168 /* Supposed to be caught earlier. */
10171 if (!regcurly(RExC_parse, FALSE)) {
10180 vFAIL("Quantifier follows nothing");
10185 This switch handles escape sequences that resolve to some kind
10186 of special regop and not to literal text. Escape sequnces that
10187 resolve to literal text are handled below in the switch marked
10190 Every entry in this switch *must* have a corresponding entry
10191 in the literal escape switch. However, the opposite is not
10192 required, as the default for this switch is to jump to the
10193 literal text handling code.
10195 switch ((U8)*++RExC_parse) {
10197 /* Special Escapes */
10199 RExC_seen_zerolen++;
10200 ret = reg_node(pRExC_state, SBOL);
10202 goto finish_meta_pat;
10204 ret = reg_node(pRExC_state, GPOS);
10205 RExC_seen |= REG_SEEN_GPOS;
10207 goto finish_meta_pat;
10209 RExC_seen_zerolen++;
10210 ret = reg_node(pRExC_state, KEEPS);
10212 /* XXX:dmq : disabling in-place substitution seems to
10213 * be necessary here to avoid cases of memory corruption, as
10214 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10216 RExC_seen |= REG_SEEN_LOOKBEHIND;
10217 goto finish_meta_pat;
10219 ret = reg_node(pRExC_state, SEOL);
10221 RExC_seen_zerolen++; /* Do not optimize RE away */
10222 goto finish_meta_pat;
10224 ret = reg_node(pRExC_state, EOS);
10226 RExC_seen_zerolen++; /* Do not optimize RE away */
10227 goto finish_meta_pat;
10229 ret = reg_node(pRExC_state, CANY);
10230 RExC_seen |= REG_SEEN_CANY;
10231 *flagp |= HASWIDTH|SIMPLE;
10232 goto finish_meta_pat;
10234 ret = reg_node(pRExC_state, CLUMP);
10235 *flagp |= HASWIDTH;
10236 goto finish_meta_pat;
10242 arg = ANYOF_WORDCHAR;
10246 RExC_seen_zerolen++;
10247 RExC_seen |= REG_SEEN_LOOKBEHIND;
10248 op = BOUND + get_regex_charset(RExC_flags);
10249 if (op > BOUNDA) { /* /aa is same as /a */
10252 ret = reg_node(pRExC_state, op);
10253 FLAGS(ret) = get_regex_charset(RExC_flags);
10255 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10256 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
10258 goto finish_meta_pat;
10260 RExC_seen_zerolen++;
10261 RExC_seen |= REG_SEEN_LOOKBEHIND;
10262 op = NBOUND + get_regex_charset(RExC_flags);
10263 if (op > NBOUNDA) { /* /aa is same as /a */
10266 ret = reg_node(pRExC_state, op);
10267 FLAGS(ret) = get_regex_charset(RExC_flags);
10269 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10270 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
10272 goto finish_meta_pat;
10282 ret = reg_node(pRExC_state, LNBREAK);
10283 *flagp |= HASWIDTH|SIMPLE;
10284 goto finish_meta_pat;
10292 goto join_posix_op_known;
10298 arg = ANYOF_VERTWS;
10300 goto join_posix_op_known;
10310 op = POSIXD + get_regex_charset(RExC_flags);
10311 if (op > POSIXA) { /* /aa is same as /a */
10315 join_posix_op_known:
10318 op += NPOSIXD - POSIXD;
10321 ret = reg_node(pRExC_state, op);
10323 FLAGS(ret) = namedclass_to_classnum(arg);
10326 *flagp |= HASWIDTH|SIMPLE;
10330 nextchar(pRExC_state);
10331 Set_Node_Length(ret, 2); /* MJD */
10337 char* parse_start = RExC_parse - 2;
10342 ret = regclass(pRExC_state, flagp,depth+1,
10343 TRUE, /* means just parse this element */
10344 FALSE, /* don't allow multi-char folds */
10345 FALSE, /* don't silence non-portable warnings.
10346 It would be a bug if these returned
10352 Set_Node_Offset(ret, parse_start + 2);
10353 Set_Node_Cur_Length(ret);
10354 nextchar(pRExC_state);
10358 /* Handle \N and \N{NAME} with multiple code points here and not
10359 * below because it can be multicharacter. join_exact() will join
10360 * them up later on. Also this makes sure that things like
10361 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10362 * The options to the grok function call causes it to fail if the
10363 * sequence is just a single code point. We then go treat it as
10364 * just another character in the current EXACT node, and hence it
10365 * gets uniform treatment with all the other characters. The
10366 * special treatment for quantifiers is not needed for such single
10367 * character sequences */
10369 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10370 FALSE /* not strict */ )) {
10375 case 'k': /* Handle \k<NAME> and \k'NAME' */
10378 char ch= RExC_parse[1];
10379 if (ch != '<' && ch != '\'' && ch != '{') {
10381 vFAIL2("Sequence %.2s... not terminated",parse_start);
10383 /* this pretty much dupes the code for (?P=...) in reg(), if
10384 you change this make sure you change that */
10385 char* name_start = (RExC_parse += 2);
10387 SV *sv_dat = reg_scan_name(pRExC_state,
10388 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10389 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10390 if (RExC_parse == name_start || *RExC_parse != ch)
10391 vFAIL2("Sequence %.3s... not terminated",parse_start);
10394 num = add_data( pRExC_state, 1, "S" );
10395 RExC_rxi->data->data[num]=(void*)sv_dat;
10396 SvREFCNT_inc_simple_void(sv_dat);
10400 ret = reganode(pRExC_state,
10403 : (ASCII_FOLD_RESTRICTED)
10405 : (AT_LEAST_UNI_SEMANTICS)
10411 *flagp |= HASWIDTH;
10413 /* override incorrect value set in reganode MJD */
10414 Set_Node_Offset(ret, parse_start+1);
10415 Set_Node_Cur_Length(ret); /* MJD */
10416 nextchar(pRExC_state);
10422 case '1': case '2': case '3': case '4':
10423 case '5': case '6': case '7': case '8': case '9':
10426 bool isg = *RExC_parse == 'g';
10431 if (*RExC_parse == '{') {
10435 if (*RExC_parse == '-') {
10439 if (hasbrace && !isDIGIT(*RExC_parse)) {
10440 if (isrel) RExC_parse--;
10442 goto parse_named_seq;
10444 num = atoi(RExC_parse);
10445 if (isg && num == 0)
10446 vFAIL("Reference to invalid group 0");
10448 num = RExC_npar - num;
10450 vFAIL("Reference to nonexistent or unclosed group");
10452 if (!isg && num > 9 && num >= RExC_npar)
10453 /* Probably a character specified in octal, e.g. \35 */
10456 char * const parse_start = RExC_parse - 1; /* MJD */
10457 while (isDIGIT(*RExC_parse))
10459 if (parse_start == RExC_parse - 1)
10460 vFAIL("Unterminated \\g... pattern");
10462 if (*RExC_parse != '}')
10463 vFAIL("Unterminated \\g{...} pattern");
10467 if (num > (I32)RExC_rx->nparens)
10468 vFAIL("Reference to nonexistent group");
10471 ret = reganode(pRExC_state,
10474 : (ASCII_FOLD_RESTRICTED)
10476 : (AT_LEAST_UNI_SEMANTICS)
10482 *flagp |= HASWIDTH;
10484 /* override incorrect value set in reganode MJD */
10485 Set_Node_Offset(ret, parse_start+1);
10486 Set_Node_Cur_Length(ret); /* MJD */
10488 nextchar(pRExC_state);
10493 if (RExC_parse >= RExC_end)
10494 FAIL("Trailing \\");
10497 /* Do not generate "unrecognized" warnings here, we fall
10498 back into the quick-grab loop below */
10505 if (RExC_flags & RXf_PMf_EXTENDED) {
10506 if ( reg_skipcomment( pRExC_state ) )
10513 parse_start = RExC_parse - 1;
10522 #define MAX_NODE_STRING_SIZE 127
10523 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10525 U8 upper_parse = MAX_NODE_STRING_SIZE;
10528 bool next_is_quantifier;
10529 char * oldp = NULL;
10531 /* If a folding node contains only code points that don't
10532 * participate in folds, it can be changed into an EXACT node,
10533 * which allows the optimizer more things to look for */
10537 node_type = compute_EXACTish(pRExC_state);
10538 ret = reg_node(pRExC_state, node_type);
10540 /* In pass1, folded, we use a temporary buffer instead of the
10541 * actual node, as the node doesn't exist yet */
10542 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10548 /* We do the EXACTFish to EXACT node only if folding, and not if in
10549 * locale, as whether a character folds or not isn't known until
10551 maybe_exact = FOLD && ! LOC;
10553 /* XXX The node can hold up to 255 bytes, yet this only goes to
10554 * 127. I (khw) do not know why. Keeping it somewhat less than
10555 * 255 allows us to not have to worry about overflow due to
10556 * converting to utf8 and fold expansion, but that value is
10557 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10558 * split up by this limit into a single one using the real max of
10559 * 255. Even at 127, this breaks under rare circumstances. If
10560 * folding, we do not want to split a node at a character that is a
10561 * non-final in a multi-char fold, as an input string could just
10562 * happen to want to match across the node boundary. The join
10563 * would solve that problem if the join actually happens. But a
10564 * series of more than two nodes in a row each of 127 would cause
10565 * the first join to succeed to get to 254, but then there wouldn't
10566 * be room for the next one, which could at be one of those split
10567 * multi-char folds. I don't know of any fool-proof solution. One
10568 * could back off to end with only a code point that isn't such a
10569 * non-final, but it is possible for there not to be any in the
10571 for (p = RExC_parse - 1;
10572 len < upper_parse && p < RExC_end;
10577 if (RExC_flags & RXf_PMf_EXTENDED)
10578 p = regwhite( pRExC_state, p );
10589 /* Literal Escapes Switch
10591 This switch is meant to handle escape sequences that
10592 resolve to a literal character.
10594 Every escape sequence that represents something
10595 else, like an assertion or a char class, is handled
10596 in the switch marked 'Special Escapes' above in this
10597 routine, but also has an entry here as anything that
10598 isn't explicitly mentioned here will be treated as
10599 an unescaped equivalent literal.
10602 switch ((U8)*++p) {
10603 /* These are all the special escapes. */
10604 case 'A': /* Start assertion */
10605 case 'b': case 'B': /* Word-boundary assertion*/
10606 case 'C': /* Single char !DANGEROUS! */
10607 case 'd': case 'D': /* digit class */
10608 case 'g': case 'G': /* generic-backref, pos assertion */
10609 case 'h': case 'H': /* HORIZWS */
10610 case 'k': case 'K': /* named backref, keep marker */
10611 case 'p': case 'P': /* Unicode property */
10612 case 'R': /* LNBREAK */
10613 case 's': case 'S': /* space class */
10614 case 'v': case 'V': /* VERTWS */
10615 case 'w': case 'W': /* word class */
10616 case 'X': /* eXtended Unicode "combining character sequence" */
10617 case 'z': case 'Z': /* End of line/string assertion */
10621 /* Anything after here is an escape that resolves to a
10622 literal. (Except digits, which may or may not)
10628 case 'N': /* Handle a single-code point named character. */
10629 /* The options cause it to fail if a multiple code
10630 * point sequence. Handle those in the switch() above
10632 RExC_parse = p + 1;
10633 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10634 flagp, depth, FALSE,
10635 FALSE /* not strict */ ))
10637 RExC_parse = p = oldp;
10641 if (ender > 0xff) {
10658 ender = ASCII_TO_NATIVE('\033');
10662 ender = ASCII_TO_NATIVE('\007');
10668 const char* error_msg;
10670 bool valid = grok_bslash_o(&p,
10673 TRUE, /* out warnings */
10674 FALSE, /* not strict */
10675 TRUE, /* Output warnings
10680 RExC_parse = p; /* going to die anyway; point
10681 to exact spot of failure */
10685 if (PL_encoding && ender < 0x100) {
10686 goto recode_encoding;
10688 if (ender > 0xff) {
10695 UV result = UV_MAX; /* initialize to erroneous
10697 const char* error_msg;
10699 bool valid = grok_bslash_x(&p,
10702 TRUE, /* out warnings */
10703 FALSE, /* not strict */
10704 TRUE, /* Output warnings
10709 RExC_parse = p; /* going to die anyway; point
10710 to exact spot of failure */
10715 if (PL_encoding && ender < 0x100) {
10716 goto recode_encoding;
10718 if (ender > 0xff) {
10725 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10727 case '0': case '1': case '2': case '3':case '4':
10728 case '5': case '6': case '7':
10730 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10732 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10734 ender = grok_oct(p, &numlen, &flags, NULL);
10735 if (ender > 0xff) {
10739 if (SIZE_ONLY /* like \08, \178 */
10742 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10744 reg_warn_non_literal_string(
10746 form_short_octal_warning(p, numlen));
10749 else { /* Not to be treated as an octal constant, go
10754 if (PL_encoding && ender < 0x100)
10755 goto recode_encoding;
10758 if (! RExC_override_recoding) {
10759 SV* enc = PL_encoding;
10760 ender = reg_recode((const char)(U8)ender, &enc);
10761 if (!enc && SIZE_ONLY)
10762 ckWARNreg(p, "Invalid escape in the specified encoding");
10768 FAIL("Trailing \\");
10771 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10772 /* Include any { following the alpha to emphasize
10773 * that it could be part of an escape at some point
10775 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10776 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10778 goto normal_default;
10783 if (UTF8_IS_START(*p) && UTF) {
10785 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10786 &numlen, UTF8_ALLOW_DEFAULT);
10792 } /* End of switch on the literal */
10794 /* Here, have looked at the literal character and <ender>
10795 * contains its ordinal, <p> points to the character after it
10798 if ( RExC_flags & RXf_PMf_EXTENDED)
10799 p = regwhite( pRExC_state, p );
10801 /* If the next thing is a quantifier, it applies to this
10802 * character only, which means that this character has to be in
10803 * its own node and can't just be appended to the string in an
10804 * existing node, so if there are already other characters in
10805 * the node, close the node with just them, and set up to do
10806 * this character again next time through, when it will be the
10807 * only thing in its new node */
10808 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10816 /* See comments for join_exact() as to why we fold
10817 * this non-UTF at compile time */
10818 || (node_type == EXACTFU
10819 && ender == LATIN_SMALL_LETTER_SHARP_S))
10823 /* Prime the casefolded buffer. Locale rules, which
10824 * apply only to code points < 256, aren't known until
10825 * execution, so for them, just output the original
10826 * character using utf8. If we start to fold non-UTF
10827 * patterns, be sure to update join_exact() */
10828 if (LOC && ender < 256) {
10829 if (UNI_IS_INVARIANT(ender)) {
10833 *s = UTF8_TWO_BYTE_HI(ender);
10834 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10839 UV folded = _to_uni_fold_flags(
10844 | ((LOC) ? FOLD_FLAGS_LOCALE
10845 : (ASCII_FOLD_RESTRICTED)
10846 ? FOLD_FLAGS_NOMIX_ASCII
10850 /* If this node only contains non-folding code
10851 * points so far, see if this new one is also
10854 if (folded != ender) {
10855 maybe_exact = FALSE;
10858 /* Here the fold is the original; we have
10859 * to check further to see if anything
10861 if (! PL_utf8_foldable) {
10862 SV* swash = swash_init("utf8",
10864 &PL_sv_undef, 1, 0);
10866 _get_swash_invlist(swash);
10867 SvREFCNT_dec_NN(swash);
10869 if (_invlist_contains_cp(PL_utf8_foldable,
10872 maybe_exact = FALSE;
10880 /* The loop increments <len> each time, as all but this
10881 * path (and the one just below for UTF) through it add
10882 * a single byte to the EXACTish node. But this one
10883 * has changed len to be the correct final value, so
10884 * subtract one to cancel out the increment that
10886 len += foldlen - 1;
10889 *(s++) = (char) ender;
10890 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10894 const STRLEN unilen = reguni(pRExC_state, ender, s);
10900 /* See comment just above for - 1 */
10904 REGC((char)ender, s++);
10907 if (next_is_quantifier) {
10909 /* Here, the next input is a quantifier, and to get here,
10910 * the current character is the only one in the node.
10911 * Also, here <len> doesn't include the final byte for this
10917 } /* End of loop through literal characters */
10919 /* Here we have either exhausted the input or ran out of room in
10920 * the node. (If we encountered a character that can't be in the
10921 * node, transfer is made directly to <loopdone>, and so we
10922 * wouldn't have fallen off the end of the loop.) In the latter
10923 * case, we artificially have to split the node into two, because
10924 * we just don't have enough space to hold everything. This
10925 * creates a problem if the final character participates in a
10926 * multi-character fold in the non-final position, as a match that
10927 * should have occurred won't, due to the way nodes are matched,
10928 * and our artificial boundary. So back off until we find a non-
10929 * problematic character -- one that isn't at the beginning or
10930 * middle of such a fold. (Either it doesn't participate in any
10931 * folds, or appears only in the final position of all the folds it
10932 * does participate in.) A better solution with far fewer false
10933 * positives, and that would fill the nodes more completely, would
10934 * be to actually have available all the multi-character folds to
10935 * test against, and to back-off only far enough to be sure that
10936 * this node isn't ending with a partial one. <upper_parse> is set
10937 * further below (if we need to reparse the node) to include just
10938 * up through that final non-problematic character that this code
10939 * identifies, so when it is set to less than the full node, we can
10940 * skip the rest of this */
10941 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10943 const STRLEN full_len = len;
10945 assert(len >= MAX_NODE_STRING_SIZE);
10947 /* Here, <s> points to the final byte of the final character.
10948 * Look backwards through the string until find a non-
10949 * problematic character */
10953 /* These two have no multi-char folds to non-UTF characters
10955 if (ASCII_FOLD_RESTRICTED || LOC) {
10959 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10963 if (! PL_NonL1NonFinalFold) {
10964 PL_NonL1NonFinalFold = _new_invlist_C_array(
10965 NonL1_Perl_Non_Final_Folds_invlist);
10968 /* Point to the first byte of the final character */
10969 s = (char *) utf8_hop((U8 *) s, -1);
10971 while (s >= s0) { /* Search backwards until find
10972 non-problematic char */
10973 if (UTF8_IS_INVARIANT(*s)) {
10975 /* There are no ascii characters that participate
10976 * in multi-char folds under /aa. In EBCDIC, the
10977 * non-ascii invariants are all control characters,
10978 * so don't ever participate in any folds. */
10979 if (ASCII_FOLD_RESTRICTED
10980 || ! IS_NON_FINAL_FOLD(*s))
10985 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10987 /* No Latin1 characters participate in multi-char
10988 * folds under /l */
10990 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10996 else if (! _invlist_contains_cp(
10997 PL_NonL1NonFinalFold,
10998 valid_utf8_to_uvchr((U8 *) s, NULL)))
11003 /* Here, the current character is problematic in that
11004 * it does occur in the non-final position of some
11005 * fold, so try the character before it, but have to
11006 * special case the very first byte in the string, so
11007 * we don't read outside the string */
11008 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11009 } /* End of loop backwards through the string */
11011 /* If there were only problematic characters in the string,
11012 * <s> will point to before s0, in which case the length
11013 * should be 0, otherwise include the length of the
11014 * non-problematic character just found */
11015 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11018 /* Here, have found the final character, if any, that is
11019 * non-problematic as far as ending the node without splitting
11020 * it across a potential multi-char fold. <len> contains the
11021 * number of bytes in the node up-to and including that
11022 * character, or is 0 if there is no such character, meaning
11023 * the whole node contains only problematic characters. In
11024 * this case, give up and just take the node as-is. We can't
11030 /* Here, the node does contain some characters that aren't
11031 * problematic. If one such is the final character in the
11032 * node, we are done */
11033 if (len == full_len) {
11036 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11038 /* If the final character is problematic, but the
11039 * penultimate is not, back-off that last character to
11040 * later start a new node with it */
11045 /* Here, the final non-problematic character is earlier
11046 * in the input than the penultimate character. What we do
11047 * is reparse from the beginning, going up only as far as
11048 * this final ok one, thus guaranteeing that the node ends
11049 * in an acceptable character. The reason we reparse is
11050 * that we know how far in the character is, but we don't
11051 * know how to correlate its position with the input parse.
11052 * An alternate implementation would be to build that
11053 * correlation as we go along during the original parse,
11054 * but that would entail extra work for every node, whereas
11055 * this code gets executed only when the string is too
11056 * large for the node, and the final two characters are
11057 * problematic, an infrequent occurrence. Yet another
11058 * possible strategy would be to save the tail of the
11059 * string, and the next time regatom is called, initialize
11060 * with that. The problem with this is that unless you
11061 * back off one more character, you won't be guaranteed
11062 * regatom will get called again, unless regbranch,
11063 * regpiece ... are also changed. If you do back off that
11064 * extra character, so that there is input guaranteed to
11065 * force calling regatom, you can't handle the case where
11066 * just the first character in the node is acceptable. I
11067 * (khw) decided to try this method which doesn't have that
11068 * pitfall; if performance issues are found, we can do a
11069 * combination of the current approach plus that one */
11075 } /* End of verifying node ends with an appropriate char */
11077 loopdone: /* Jumped to when encounters something that shouldn't be in
11080 /* If 'maybe_exact' is still set here, means there are no
11081 * code points in the node that participate in folds */
11082 if (FOLD && maybe_exact) {
11086 /* I (khw) don't know if you can get here with zero length, but the
11087 * old code handled this situation by creating a zero-length EXACT
11088 * node. Might as well be NOTHING instead */
11093 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11096 RExC_parse = p - 1;
11097 Set_Node_Cur_Length(ret); /* MJD */
11098 nextchar(pRExC_state);
11100 /* len is STRLEN which is unsigned, need to copy to signed */
11103 vFAIL("Internal disaster");
11106 } /* End of label 'defchar:' */
11108 } /* End of giant switch on input character */
11114 S_regwhite( RExC_state_t *pRExC_state, char *p )
11116 const char *e = RExC_end;
11118 PERL_ARGS_ASSERT_REGWHITE;
11123 else if (*p == '#') {
11126 if (*p++ == '\n') {
11132 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11141 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11143 /* Returns the next non-pattern-white space, non-comment character (the
11144 * latter only if 'recognize_comment is true) in the string p, which is
11145 * ended by RExC_end. If there is no line break ending a comment,
11146 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11147 const char *e = RExC_end;
11149 PERL_ARGS_ASSERT_REGPATWS;
11153 if ((len = is_PATWS_safe(p, e, UTF))) {
11156 else if (recognize_comment && *p == '#') {
11160 if (is_LNBREAK_safe(p, e, UTF)) {
11166 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11174 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11175 Character classes ([:foo:]) can also be negated ([:^foo:]).
11176 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11177 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11178 but trigger failures because they are currently unimplemented. */
11180 #define POSIXCC_DONE(c) ((c) == ':')
11181 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11182 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11184 PERL_STATIC_INLINE I32
11185 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
11189 I32 namedclass = OOB_NAMEDCLASS;
11191 PERL_ARGS_ASSERT_REGPPOSIXCC;
11193 if (value == '[' && RExC_parse + 1 < RExC_end &&
11194 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11195 POSIXCC(UCHARAT(RExC_parse)))
11197 const char c = UCHARAT(RExC_parse);
11198 char* const s = RExC_parse++;
11200 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11202 if (RExC_parse == RExC_end) {
11205 /* Try to give a better location for the error (than the end of
11206 * the string) by looking for the matching ']' */
11208 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11211 vFAIL2("Unmatched '%c' in POSIX class", c);
11213 /* Grandfather lone [:, [=, [. */
11217 const char* const t = RExC_parse++; /* skip over the c */
11220 if (UCHARAT(RExC_parse) == ']') {
11221 const char *posixcc = s + 1;
11222 RExC_parse++; /* skip over the ending ] */
11225 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11226 const I32 skip = t - posixcc;
11228 /* Initially switch on the length of the name. */
11231 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11232 this is the Perl \w
11234 namedclass = ANYOF_WORDCHAR;
11237 /* Names all of length 5. */
11238 /* alnum alpha ascii blank cntrl digit graph lower
11239 print punct space upper */
11240 /* Offset 4 gives the best switch position. */
11241 switch (posixcc[4]) {
11243 if (memEQ(posixcc, "alph", 4)) /* alpha */
11244 namedclass = ANYOF_ALPHA;
11247 if (memEQ(posixcc, "spac", 4)) /* space */
11248 namedclass = ANYOF_PSXSPC;
11251 if (memEQ(posixcc, "grap", 4)) /* graph */
11252 namedclass = ANYOF_GRAPH;
11255 if (memEQ(posixcc, "asci", 4)) /* ascii */
11256 namedclass = ANYOF_ASCII;
11259 if (memEQ(posixcc, "blan", 4)) /* blank */
11260 namedclass = ANYOF_BLANK;
11263 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11264 namedclass = ANYOF_CNTRL;
11267 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11268 namedclass = ANYOF_ALPHANUMERIC;
11271 if (memEQ(posixcc, "lowe", 4)) /* lower */
11272 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11273 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11274 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11277 if (memEQ(posixcc, "digi", 4)) /* digit */
11278 namedclass = ANYOF_DIGIT;
11279 else if (memEQ(posixcc, "prin", 4)) /* print */
11280 namedclass = ANYOF_PRINT;
11281 else if (memEQ(posixcc, "punc", 4)) /* punct */
11282 namedclass = ANYOF_PUNCT;
11287 if (memEQ(posixcc, "xdigit", 6))
11288 namedclass = ANYOF_XDIGIT;
11292 if (namedclass == OOB_NAMEDCLASS)
11293 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11296 /* The #defines are structured so each complement is +1 to
11297 * the normal one */
11301 assert (posixcc[skip] == ':');
11302 assert (posixcc[skip+1] == ']');
11303 } else if (!SIZE_ONLY) {
11304 /* [[=foo=]] and [[.foo.]] are still future. */
11306 /* adjust RExC_parse so the warning shows after
11307 the class closes */
11308 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11310 SvREFCNT_dec(free_me);
11311 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11314 /* Maternal grandfather:
11315 * "[:" ending in ":" but not in ":]" */
11317 vFAIL("Unmatched '[' in POSIX class");
11320 /* Grandfather lone [:, [=, [. */
11330 S_could_it_be_POSIX(pTHX_ RExC_state_t *pRExC_state)
11332 /* This applies some heuristics at the current parse position (which should
11333 * be at a '[') to see if what follows might be intended to be a [:posix:]
11334 * class. It returns true if it really is a posix class, of course, but it
11335 * also can return true if it thinks that what was intended was a posix
11336 * class that didn't quite make it.
11338 * It will return true for
11340 * [:alphanumerics] (as long as the ] isn't followed immediately by a
11341 * ')' indicating the end of the (?[
11342 * [:any garbage including %^&$ punctuation:]
11344 * This is designed to be called only from S_handle_sets; it could be
11345 * easily adapted to be called from the spot at the beginning of regclass()
11346 * that checks to see in a normal bracketed class if the surrounding []
11347 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11348 * change long-standing behavior, so I (khw) didn't do that */
11349 char* p = RExC_parse + 1;
11350 char first_char = *p;
11352 PERL_ARGS_ASSERT_COULD_IT_BE_POSIX;
11354 assert(*(p - 1) == '[');
11356 if (! POSIXCC(first_char)) {
11361 while (p < RExC_end && isWORDCHAR(*p)) p++;
11363 if (p >= RExC_end) {
11367 if (p - RExC_parse > 2 /* Got at least 1 word character */
11368 && (*p == first_char
11369 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11374 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11377 && p - RExC_parse > 2 /* [:] evaluates to colon;
11378 [::] is a bad posix class. */
11379 && first_char == *(p - 1));
11383 S_handle_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11384 char * const oregcomp_parse)
11386 /* Handle the (?[...]) construct to do set operations */
11389 UV start, end; /* End points of code point ranges */
11391 char *save_end, *save_parse;
11396 const bool save_fold = FOLD;
11398 GET_RE_DEBUG_FLAGS_DECL;
11400 PERL_ARGS_ASSERT_HANDLE_SETS;
11403 vFAIL("(?[...]) not valid in locale");
11405 RExC_uni_semantics = 1;
11407 /* This will return only an ANYOF regnode, or (unlikely) something smaller
11408 * (such as EXACT). Thus we can skip most everything if just sizing. We
11409 * call regclass to handle '[]' so as to not have to reinvent its parsing
11410 * rules here (throwing away the size it computes each time). And, we exit
11411 * upon an unescaped ']' that isn't one ending a regclass. To do both
11412 * these things, we need to realize that something preceded by a backslash
11413 * is escaped, so we have to keep track of backslashes */
11416 Perl_ck_warner_d(aTHX_
11417 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11418 "The regex_sets feature is experimental" REPORT_LOCATION,
11419 (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11421 while (RExC_parse < RExC_end) {
11422 SV* current = NULL;
11423 RExC_parse = regpatws(pRExC_state, RExC_parse,
11424 TRUE); /* means recognize comments */
11425 switch (*RExC_parse) {
11429 /* Skip the next byte. This would have to change to skip
11430 * the next character if we were to recognize and handle
11431 * specific non-ASCIIs */
11436 /* If this looks like it is a [:posix:] class, leave the
11437 * parse pointer at the '[' to fool regclass() into
11438 * thinking it is part of a '[[:posix]]'. That function
11439 * will use strict checking to force a syntax error if it
11440 * doesn't work out to a legitimate class */
11441 bool is_posix_class = could_it_be_POSIX(pRExC_state);
11442 if (! is_posix_class) {
11446 (void) regclass(pRExC_state, flagp,depth+1,
11447 is_posix_class, /* parse the whole char
11448 class only if not a
11450 FALSE, /* don't allow multi-char folds */
11451 TRUE, /* silence non-portable warnings. */
11453 /* function call leaves parse pointing to the ']', except
11454 * if we faked it */
11455 if (is_posix_class) {
11459 SvREFCNT_dec(current); /* In case it returned something */
11465 if (RExC_parse < RExC_end
11466 && *RExC_parse == ')')
11468 node = reganode(pRExC_state, ANYOF, 0);
11469 RExC_size += ANYOF_SKIP;
11470 nextchar(pRExC_state);
11471 Set_Node_Length(node,
11472 RExC_parse - oregcomp_parse + 1); /* MJD */
11481 FAIL("Syntax error in (?[...])");
11484 /* Pass 2 only after this. Everything in this construct is a
11485 * metacharacter. Operands begin with either a '\' (for an escape
11486 * sequence), or a '[' for a bracketed character class. Any other
11487 * character should be an operator, or parenthesis for grouping. Both
11488 * types of operands are handled by calling regclass() to parse them. It
11489 * is called with a parameter to indicate to return the computed inversion
11490 * list. The parsing here is implemented via a stack. Each entry on the
11491 * stack is a single character representing one of the operators, or the
11492 * '('; or else a pointer to an operand inversion list. */
11494 #define IS_OPERAND(a) (! SvIOK(a))
11496 /* The stack starts empty. It is a syntax error if the first thing parsed
11497 * is a binary operator; everything else is pushed on the stack. When an
11498 * operand is parsed, the top of the stack is examined. If it is a binary
11499 * operator, the item before it should be an operand, and both are replaced
11500 * by the result of doing that operation on the new operand and the one on
11501 * the stack. Thus a sequence of binary operands is reduced to a single
11502 * one before the next one is parsed.
11504 * A unary operator may immediately follow a binary in the input, for
11507 * When an operand is parsed and the top of the stack is a unary operator,
11508 * the operation is performed, and then the stack is rechecked to see if
11509 * this new operand is part of a binary operation; if so, it is handled as
11512 * A '(' is simply pushed on the stack; it is valid only if the stack is
11513 * empty, or the top element of the stack is an operator (for which the
11514 * parenthesized expression will become an operand). By the time the
11515 * corresponding ')' is parsed everything in between should have been
11516 * parsed and evaluated to a single operand (or else is a syntax error),
11517 * and is handled as a regular operand */
11521 while (RExC_parse < RExC_end) {
11522 I32 top_index = av_top(stack);
11524 SV* current = NULL;
11526 /* Skip white space */
11527 RExC_parse = regpatws(pRExC_state, RExC_parse,
11528 TRUE); /* means recognize comments */
11529 if (RExC_parse >= RExC_end
11530 || (curchar = UCHARAT(RExC_parse)) == ']')
11531 { /* Exit loop at the end */
11538 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11539 vFAIL("Unexpected character");
11542 (void) regclass(pRExC_state, flagp,depth+1,
11543 TRUE, /* means parse just the next thing */
11544 FALSE, /* don't allow multi-char folds */
11545 FALSE, /* don't silence non-portable warnings.
11548 /* regclass() will return with parsing just the \ sequence,
11549 * leaving the parse pointer at the next thing to parse */
11551 goto handle_operand;
11553 case '[': /* Is a bracketed character class */
11555 bool is_posix_class = could_it_be_POSIX(pRExC_state);
11557 if (! is_posix_class) {
11561 (void) regclass(pRExC_state, flagp,depth+1,
11562 is_posix_class, /* parse the whole char class
11563 only if not a posix class */
11564 FALSE, /* don't allow multi-char folds */
11565 FALSE, /* don't silence non-portable warnings.
11568 /* function call leaves parse pointing to the ']', except if we
11570 if (is_posix_class) {
11574 goto handle_operand;
11583 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11584 || ! IS_OPERAND(*top_ptr))
11587 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11589 av_push(stack, newSVuv(curchar));
11593 av_push(stack, newSVuv(curchar));
11597 if (top_index >= 0) {
11598 top_ptr = av_fetch(stack, top_index, FALSE);
11600 if (IS_OPERAND(*top_ptr)) {
11602 vFAIL("Unexpected '(' with no preceding operator");
11605 av_push(stack, newSVuv(curchar));
11612 || ! (current = av_pop(stack))
11613 || ! IS_OPERAND(current)
11614 || ! (lparen = av_pop(stack))
11615 || IS_OPERAND(lparen)
11616 || SvUV(lparen) != '(')
11619 vFAIL("Unexpected ')'");
11622 SvREFCNT_dec_NN(lparen);
11629 /* Here, we have an operand to process, in 'current' */
11631 if (top_index < 0) { /* Just push if stack is empty */
11632 av_push(stack, current);
11635 SV* top = av_pop(stack);
11636 char current_operator;
11638 if (IS_OPERAND(top)) {
11639 vFAIL("Operand with no preceding operator");
11641 current_operator = (char) SvUV(top);
11642 switch (current_operator) {
11643 case '(': /* Push the '(' back on followed by the new
11645 av_push(stack, top);
11646 av_push(stack, current);
11647 SvREFCNT_inc(top); /* Counters the '_dec' done
11648 just after the 'break', so
11649 it doesn't get wrongly freed
11654 _invlist_invert(current);
11656 /* Unlike binary operators, the top of the stack,
11657 * now that this unary one has been popped off, may
11658 * legally be an operator, and we now have operand
11661 SvREFCNT_dec_NN(top);
11662 goto handle_operand;
11665 _invlist_intersection(av_pop(stack),
11668 av_push(stack, current);
11673 _invlist_union(av_pop(stack), current, ¤t);
11674 av_push(stack, current);
11678 _invlist_subtract(av_pop(stack), current, ¤t);
11679 av_push(stack, current);
11682 case '^': /* The union minus the intersection */
11688 element = av_pop(stack);
11689 _invlist_union(element, current, &u);
11690 _invlist_intersection(element, current, &i);
11691 _invlist_subtract(u, i, ¤t);
11692 av_push(stack, current);
11693 SvREFCNT_dec_NN(i);
11694 SvREFCNT_dec_NN(u);
11695 SvREFCNT_dec_NN(element);
11700 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11702 SvREFCNT_dec_NN(top);
11706 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11709 if (av_top(stack) < 0 /* Was empty */
11710 || ((final = av_pop(stack)) == NULL)
11711 || ! IS_OPERAND(final)
11712 || av_top(stack) >= 0) /* More left on stack */
11714 vFAIL("Incomplete expression within '(?[ ])'");
11717 invlist_iterinit(final);
11719 /* Here, 'final' is the resultant inversion list of evaluating the
11720 * expression. Feed it to regclass() to generate the real resultant node.
11721 * regclass() is expecting a string of ranges and individual code points */
11722 result_string = newSVpvs("");
11723 while (invlist_iternext(final, &start, &end)) {
11724 if (start == end) {
11725 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11728 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11733 save_parse = RExC_parse;
11734 RExC_parse = SvPV(result_string, len);
11735 save_end = RExC_end;
11736 RExC_end = RExC_parse + len;
11738 /* We turn off folding around the call, as the class we have constructed
11739 * already has all folding taken into consideration, and we don't want
11740 * regclass() to add to that */
11741 RExC_flags &= ~RXf_PMf_FOLD;
11742 node = regclass(pRExC_state, flagp,depth+1,
11743 FALSE, /* means parse the whole char class */
11744 FALSE, /* don't allow multi-char folds */
11745 TRUE, /* silence non-portable warnings. The above may very
11746 well have generated non-portable code points, but
11747 they're valid on this machine */
11750 RExC_flags |= RXf_PMf_FOLD;
11752 RExC_parse = save_parse + 1;
11753 RExC_end = save_end;
11754 SvREFCNT_dec_NN(final);
11755 SvREFCNT_dec_NN(result_string);
11756 SvREFCNT_dec_NN(stack);
11758 nextchar(pRExC_state);
11759 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11764 /* The names of properties whose definitions are not known at compile time are
11765 * stored in this SV, after a constant heading. So if the length has been
11766 * changed since initialization, then there is a run-time definition. */
11767 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11770 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11771 const bool stop_at_1, /* Just parse the next thing, don't
11772 look for a full character class */
11773 bool allow_multi_folds,
11774 const bool silence_non_portable, /* Don't output warnings
11777 SV** ret_invlist) /* Return an inversion list, not a node */
11779 /* parse a bracketed class specification. Most of these will produce an
11780 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11781 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
11782 * under /i with multi-character folds: it will be rewritten following the
11783 * paradigm of this example, where the <multi-fold>s are characters which
11784 * fold to multiple character sequences:
11785 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11786 * gets effectively rewritten as:
11787 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11788 * reg() gets called (recursively) on the rewritten version, and this
11789 * function will return what it constructs. (Actually the <multi-fold>s
11790 * aren't physically removed from the [abcdefghi], it's just that they are
11791 * ignored in the recursion by means of a flag:
11792 * <RExC_in_multi_char_class>.)
11794 * ANYOF nodes contain a bit map for the first 256 characters, with the
11795 * corresponding bit set if that character is in the list. For characters
11796 * above 255, a range list or swash is used. There are extra bits for \w,
11797 * etc. in locale ANYOFs, as what these match is not determinable at
11801 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11803 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11806 IV namedclass = OOB_NAMEDCLASS;
11807 char *rangebegin = NULL;
11808 bool need_class = 0;
11810 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11811 than just initialized. */
11812 SV* properties = NULL; /* Code points that match \p{} \P{} */
11813 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11814 extended beyond the Latin1 range */
11815 UV element_count = 0; /* Number of distinct elements in the class.
11816 Optimizations may be possible if this is tiny */
11817 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11818 character; used under /i */
11820 char * stop_ptr = RExC_end; /* where to stop parsing */
11821 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
11823 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
11825 /* Unicode properties are stored in a swash; this holds the current one
11826 * being parsed. If this swash is the only above-latin1 component of the
11827 * character class, an optimization is to pass it directly on to the
11828 * execution engine. Otherwise, it is set to NULL to indicate that there
11829 * are other things in the class that have to be dealt with at execution
11831 SV* swash = NULL; /* Code points that match \p{} \P{} */
11833 /* Set if a component of this character class is user-defined; just passed
11834 * on to the engine */
11835 bool has_user_defined_property = FALSE;
11837 /* inversion list of code points this node matches only when the target
11838 * string is in UTF-8. (Because is under /d) */
11839 SV* depends_list = NULL;
11841 /* inversion list of code points this node matches. For much of the
11842 * function, it includes only those that match regardless of the utf8ness
11843 * of the target string */
11844 SV* cp_list = NULL;
11847 /* In a range, counts how many 0-2 of the ends of it came from literals,
11848 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11849 UV literal_endpoint = 0;
11851 bool invert = FALSE; /* Is this class to be complemented */
11853 /* Is there any thing like \W or [:^digit:] that matches above the legal
11854 * Unicode range? */
11855 bool runtime_posix_matches_above_Unicode = FALSE;
11857 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11858 case we need to change the emitted regop to an EXACT. */
11859 const char * orig_parse = RExC_parse;
11860 const I32 orig_size = RExC_size;
11861 GET_RE_DEBUG_FLAGS_DECL;
11863 PERL_ARGS_ASSERT_REGCLASS;
11865 PERL_UNUSED_ARG(depth);
11868 DEBUG_PARSE("clas");
11870 /* Assume we are going to generate an ANYOF node. */
11871 ret = reganode(pRExC_state, ANYOF, 0);
11874 RExC_size += ANYOF_SKIP;
11875 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11878 ANYOF_FLAGS(ret) = 0;
11880 RExC_emit += ANYOF_SKIP;
11882 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11884 listsv = newSVpvs("# comment\n");
11885 initial_listsv_len = SvCUR(listsv);
11889 RExC_parse = regpatws(pRExC_state, RExC_parse,
11890 FALSE /* means don't recognize comments */);
11893 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11896 allow_multi_folds = FALSE;
11899 RExC_parse = regpatws(pRExC_state, RExC_parse,
11900 FALSE /* means don't recognize comments */);
11904 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
11905 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
11906 const char *s = RExC_parse;
11907 const char c = *s++;
11909 while (isWORDCHAR(*s))
11911 if (*s && c == *s && s[1] == ']') {
11912 SAVEFREESV(RExC_rx_sv);
11913 SAVEFREESV(listsv);
11915 "POSIX syntax [%c %c] belongs inside character classes",
11917 (void)ReREFCNT_inc(RExC_rx_sv);
11918 SvREFCNT_inc_simple_void_NN(listsv);
11922 /* If the caller wants us to just parse a single element, accomplish this
11923 * by faking the loop ending condition */
11924 if (stop_at_1 && RExC_end > RExC_parse) {
11925 stop_ptr = RExC_parse + 1;
11928 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
11929 if (UCHARAT(RExC_parse) == ']')
11930 goto charclassloop;
11934 if (RExC_parse >= stop_ptr) {
11939 RExC_parse = regpatws(pRExC_state, RExC_parse,
11940 FALSE /* means don't recognize comments */);
11943 if (UCHARAT(RExC_parse) == ']') {
11949 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11950 save_value = value;
11951 save_prevvalue = prevvalue;
11954 rangebegin = RExC_parse;
11958 value = utf8n_to_uvchr((U8*)RExC_parse,
11959 RExC_end - RExC_parse,
11960 &numlen, UTF8_ALLOW_DEFAULT);
11961 RExC_parse += numlen;
11964 value = UCHARAT(RExC_parse++);
11967 && RExC_parse < RExC_end
11968 && POSIXCC(UCHARAT(RExC_parse)))
11970 namedclass = regpposixcc(pRExC_state, value, listsv, strict);
11972 else if (value == '\\') {
11974 value = utf8n_to_uvchr((U8*)RExC_parse,
11975 RExC_end - RExC_parse,
11976 &numlen, UTF8_ALLOW_DEFAULT);
11977 RExC_parse += numlen;
11980 value = UCHARAT(RExC_parse++);
11982 /* Some compilers cannot handle switching on 64-bit integer
11983 * values, therefore value cannot be an UV. Yes, this will
11984 * be a problem later if we want switch on Unicode.
11985 * A similar issue a little bit later when switching on
11986 * namedclass. --jhi */
11988 /* If the \ is escaping white space when white space is being
11989 * skipped, it means that that white space is wanted literally, and
11990 * is already in 'value'. Otherwise, need to translate the escape
11991 * into what it signifies. */
11992 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
11994 case 'w': namedclass = ANYOF_WORDCHAR; break;
11995 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11996 case 's': namedclass = ANYOF_SPACE; break;
11997 case 'S': namedclass = ANYOF_NSPACE; break;
11998 case 'd': namedclass = ANYOF_DIGIT; break;
11999 case 'D': namedclass = ANYOF_NDIGIT; break;
12000 case 'v': namedclass = ANYOF_VERTWS; break;
12001 case 'V': namedclass = ANYOF_NVERTWS; break;
12002 case 'h': namedclass = ANYOF_HORIZWS; break;
12003 case 'H': namedclass = ANYOF_NHORIZWS; break;
12004 case 'N': /* Handle \N{NAME} in class */
12006 /* We only pay attention to the first char of
12007 multichar strings being returned. I kinda wonder
12008 if this makes sense as it does change the behaviour
12009 from earlier versions, OTOH that behaviour was broken
12011 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12012 TRUE, /* => charclass */
12024 /* We will handle any undefined properties ourselves */
12025 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12027 if (RExC_parse >= RExC_end)
12028 vFAIL2("Empty \\%c{}", (U8)value);
12029 if (*RExC_parse == '{') {
12030 const U8 c = (U8)value;
12031 e = strchr(RExC_parse++, '}');
12033 vFAIL2("Missing right brace on \\%c{}", c);
12034 while (isSPACE(UCHARAT(RExC_parse)))
12036 if (e == RExC_parse)
12037 vFAIL2("Empty \\%c{}", c);
12038 n = e - RExC_parse;
12039 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12050 if (UCHARAT(RExC_parse) == '^') {
12053 /* toggle. (The rhs xor gets the single bit that
12054 * differs between P and p; the other xor inverts just
12056 value ^= 'P' ^ 'p';
12058 while (isSPACE(UCHARAT(RExC_parse))) {
12063 /* Try to get the definition of the property into
12064 * <invlist>. If /i is in effect, the effective property
12065 * will have its name be <__NAME_i>. The design is
12066 * discussed in commit
12067 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12068 Newx(name, n + sizeof("_i__\n"), char);
12070 sprintf(name, "%s%.*s%s\n",
12071 (FOLD) ? "__" : "",
12077 /* Look up the property name, and get its swash and
12078 * inversion list, if the property is found */
12080 SvREFCNT_dec_NN(swash);
12082 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12085 NULL, /* No inversion list */
12088 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12090 SvREFCNT_dec_NN(swash);
12094 /* Here didn't find it. It could be a user-defined
12095 * property that will be available at run-time. If we
12096 * accept only compile-time properties, is an error;
12097 * otherwise add it to the list for run-time look up */
12099 RExC_parse = e + 1;
12100 vFAIL3("Property '%.*s' is unknown", (int) n, name);
12102 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12103 (value == 'p' ? '+' : '!'),
12105 has_user_defined_property = TRUE;
12107 /* We don't know yet, so have to assume that the
12108 * property could match something in the Latin1 range,
12109 * hence something that isn't utf8. Note that this
12110 * would cause things in <depends_list> to match
12111 * inappropriately, except that any \p{}, including
12112 * this one forces Unicode semantics, which means there
12113 * is <no depends_list> */
12114 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12118 /* Here, did get the swash and its inversion list. If
12119 * the swash is from a user-defined property, then this
12120 * whole character class should be regarded as such */
12121 has_user_defined_property =
12123 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12125 /* Invert if asking for the complement */
12126 if (value == 'P') {
12127 _invlist_union_complement_2nd(properties,
12131 /* The swash can't be used as-is, because we've
12132 * inverted things; delay removing it to here after
12133 * have copied its invlist above */
12134 SvREFCNT_dec_NN(swash);
12138 _invlist_union(properties, invlist, &properties);
12143 RExC_parse = e + 1;
12144 namedclass = ANYOF_UNIPROP; /* no official name, but it's
12147 /* \p means they want Unicode semantics */
12148 RExC_uni_semantics = 1;
12151 case 'n': value = '\n'; break;
12152 case 'r': value = '\r'; break;
12153 case 't': value = '\t'; break;
12154 case 'f': value = '\f'; break;
12155 case 'b': value = '\b'; break;
12156 case 'e': value = ASCII_TO_NATIVE('\033');break;
12157 case 'a': value = ASCII_TO_NATIVE('\007');break;
12159 RExC_parse--; /* function expects to be pointed at the 'o' */
12161 const char* error_msg;
12162 bool valid = grok_bslash_o(&RExC_parse,
12165 SIZE_ONLY, /* warnings in pass
12168 silence_non_portable,
12174 if (PL_encoding && value < 0x100) {
12175 goto recode_encoding;
12179 RExC_parse--; /* function expects to be pointed at the 'x' */
12181 const char* error_msg;
12182 bool valid = grok_bslash_x(&RExC_parse,
12185 TRUE, /* Output warnings */
12187 silence_non_portable,
12193 if (PL_encoding && value < 0x100)
12194 goto recode_encoding;
12197 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12199 case '0': case '1': case '2': case '3': case '4':
12200 case '5': case '6': case '7':
12202 /* Take 1-3 octal digits */
12203 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12204 numlen = (strict) ? 4 : 3;
12205 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12206 RExC_parse += numlen;
12208 SAVEFREESV(listsv); /* In case warnings are fatalized */
12210 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12211 vFAIL("Need exactly 3 octal digits");
12213 else if (! SIZE_ONLY /* like \08, \178 */
12215 && RExC_parse < RExC_end
12216 && isDIGIT(*RExC_parse)
12217 && ckWARN(WARN_REGEXP))
12219 SAVEFREESV(RExC_rx_sv);
12220 reg_warn_non_literal_string(
12222 form_short_octal_warning(RExC_parse, numlen));
12223 (void)ReREFCNT_inc(RExC_rx_sv);
12225 SvREFCNT_inc_simple_void_NN(listsv);
12227 if (PL_encoding && value < 0x100)
12228 goto recode_encoding;
12232 if (! RExC_override_recoding) {
12233 SV* enc = PL_encoding;
12234 value = reg_recode((const char)(U8)value, &enc);
12237 vFAIL("Invalid escape in the specified encoding");
12239 else if (SIZE_ONLY) {
12240 ckWARNreg(RExC_parse,
12241 "Invalid escape in the specified encoding");
12247 /* Allow \_ to not give an error */
12248 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12249 SAVEFREESV(listsv);
12251 vFAIL2("Unrecognized escape \\%c in character class",
12255 SAVEFREESV(RExC_rx_sv);
12256 ckWARN2reg(RExC_parse,
12257 "Unrecognized escape \\%c in character class passed through",
12259 (void)ReREFCNT_inc(RExC_rx_sv);
12261 SvREFCNT_inc_simple_void_NN(listsv);
12264 } /* End of switch on char following backslash */
12265 } /* end of handling backslash escape sequences */
12268 literal_endpoint++;
12271 /* Here, we have the current token in 'value' */
12273 /* What matches in a locale is not known until runtime. This includes
12274 * what the Posix classes (like \w, [:space:]) match. Room must be
12275 * reserved (one time per class) to store such classes, either if Perl
12276 * is compiled so that locale nodes always should have this space, or
12277 * if there is such class info to be stored. The space will contain a
12278 * bit for each named class that is to be matched against. This isn't
12279 * needed for \p{} and pseudo-classes, as they are not affected by
12280 * locale, and hence are dealt with separately */
12283 && (ANYOF_LOCALE == ANYOF_CLASS
12284 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12288 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12291 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12292 ANYOF_CLASS_ZERO(ret);
12294 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12297 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12299 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12300 * literal, as is the character that began the false range, i.e.
12301 * the 'a' in the examples */
12304 const int w = (RExC_parse >= rangebegin)
12305 ? RExC_parse - rangebegin
12307 SAVEFREESV(listsv); /* in case of fatal warnings */
12309 vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12312 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12313 ckWARN4reg(RExC_parse,
12314 "False [] range \"%*.*s\"",
12316 (void)ReREFCNT_inc(RExC_rx_sv);
12317 cp_list = add_cp_to_invlist(cp_list, '-');
12318 cp_list = add_cp_to_invlist(cp_list, prevvalue);
12320 SvREFCNT_inc_simple_void_NN(listsv);
12323 range = 0; /* this was not a true range */
12324 element_count += 2; /* So counts for three values */
12328 U8 classnum = namedclass_to_classnum(namedclass);
12329 if (namedclass >= ANYOF_MAX) { /* If a special class */
12330 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12332 /* Here, should be \h, \H, \v, or \V. Neither /d nor
12333 * /l make a difference in what these match. There
12334 * would be problems if these characters had folds
12335 * other than themselves, as cp_list is subject to
12337 if (classnum != _CC_VERTSPACE) {
12338 assert( namedclass == ANYOF_HORIZWS
12339 || namedclass == ANYOF_NHORIZWS);
12341 /* It turns out that \h is just a synonym for
12343 classnum = _CC_BLANK;
12346 _invlist_union_maybe_complement_2nd(
12348 PL_XPosix_ptrs[classnum],
12349 cBOOL(namedclass % 2), /* Complement if odd
12350 (NHORIZWS, NVERTWS)
12355 else if (classnum == _CC_ASCII) {
12358 ANYOF_CLASS_SET(ret, namedclass);
12361 #endif /* Not isascii(); just use the hard-coded definition for it */
12362 _invlist_union_maybe_complement_2nd(
12365 cBOOL(namedclass % 2), /* Complement if odd
12369 else { /* Garden variety class */
12371 /* The ascii range inversion list */
12372 SV* ascii_source = PL_Posix_ptrs[classnum];
12374 /* The full Latin1 range inversion list */
12375 SV* l1_source = PL_L1Posix_ptrs[classnum];
12377 /* This code is structured into two major clauses. The
12378 * first is for classes whose complete definitions may not
12379 * already be known. It not, the Latin1 definition
12380 * (guaranteed to already known) is used plus code is
12381 * generated to load the rest at run-time (only if needed).
12382 * If the complete definition is known, it drops down to
12383 * the second clause, where the complete definition is
12386 if (classnum < _FIRST_NON_SWASH_CC) {
12388 /* Here, the class has a swash, which may or not
12389 * already be loaded */
12391 /* The name of the property to use to match the full
12392 * eXtended Unicode range swash for this character
12394 const char *Xname = swash_property_names[classnum];
12396 /* If returning the inversion list, we can't defer
12397 * getting this until runtime */
12398 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12399 PL_utf8_swash_ptrs[classnum] =
12400 _core_swash_init("utf8", Xname, &PL_sv_undef,
12403 NULL, /* No inversion list */
12404 NULL /* No flags */
12406 assert(PL_utf8_swash_ptrs[classnum]);
12408 if ( ! PL_utf8_swash_ptrs[classnum]) {
12409 if (namedclass % 2 == 0) { /* A non-complemented
12411 /* If not /a matching, there are code points we
12412 * don't know at compile time. Arrange for the
12413 * unknown matches to be loaded at run-time, if
12415 if (! AT_LEAST_ASCII_RESTRICTED) {
12416 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12419 if (LOC) { /* Under locale, set run-time
12421 ANYOF_CLASS_SET(ret, namedclass);
12424 /* Add the current class's code points to
12425 * the running total */
12426 _invlist_union(posixes,
12427 (AT_LEAST_ASCII_RESTRICTED)
12433 else { /* A complemented class */
12434 if (AT_LEAST_ASCII_RESTRICTED) {
12435 /* Under /a should match everything above
12436 * ASCII, plus the complement of the set's
12438 _invlist_union_complement_2nd(posixes,
12443 /* Arrange for the unknown matches to be
12444 * loaded at run-time, if needed */
12445 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12447 runtime_posix_matches_above_Unicode = TRUE;
12449 ANYOF_CLASS_SET(ret, namedclass);
12453 /* We want to match everything in
12454 * Latin1, except those things that
12455 * l1_source matches */
12456 SV* scratch_list = NULL;
12457 _invlist_subtract(PL_Latin1, l1_source,
12460 /* Add the list from this class to the
12463 posixes = scratch_list;
12466 _invlist_union(posixes,
12469 SvREFCNT_dec_NN(scratch_list);
12471 if (DEPENDS_SEMANTICS) {
12473 |= ANYOF_NON_UTF8_LATIN1_ALL;
12478 goto namedclass_done;
12481 /* Here, there is a swash loaded for the class. If no
12482 * inversion list for it yet, get it */
12483 if (! PL_XPosix_ptrs[classnum]) {
12484 PL_XPosix_ptrs[classnum]
12485 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12489 /* Here there is an inversion list already loaded for the
12492 if (namedclass % 2 == 0) { /* A non-complemented class,
12493 like ANYOF_PUNCT */
12495 /* For non-locale, just add it to any existing list
12497 _invlist_union(posixes,
12498 (AT_LEAST_ASCII_RESTRICTED)
12500 : PL_XPosix_ptrs[classnum],
12503 else { /* Locale */
12504 SV* scratch_list = NULL;
12506 /* For above Latin1 code points, we use the full
12508 _invlist_intersection(PL_AboveLatin1,
12509 PL_XPosix_ptrs[classnum],
12511 /* And set the output to it, adding instead if
12512 * there already is an output. Checking if
12513 * 'posixes' is NULL first saves an extra clone.
12514 * Its reference count will be decremented at the
12515 * next union, etc, or if this is the only
12516 * instance, at the end of the routine */
12518 posixes = scratch_list;
12521 _invlist_union(posixes, scratch_list, &posixes);
12522 SvREFCNT_dec_NN(scratch_list);
12525 #ifndef HAS_ISBLANK
12526 if (namedclass != ANYOF_BLANK) {
12528 /* Set this class in the node for runtime
12530 ANYOF_CLASS_SET(ret, namedclass);
12531 #ifndef HAS_ISBLANK
12534 /* No isblank(), use the hard-coded ASCII-range
12535 * blanks, adding them to the running total. */
12537 _invlist_union(posixes, ascii_source, &posixes);
12542 else { /* A complemented class, like ANYOF_NPUNCT */
12544 _invlist_union_complement_2nd(
12546 (AT_LEAST_ASCII_RESTRICTED)
12548 : PL_XPosix_ptrs[classnum],
12550 /* Under /d, everything in the upper half of the
12551 * Latin1 range matches this complement */
12552 if (DEPENDS_SEMANTICS) {
12553 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12556 else { /* Locale */
12557 SV* scratch_list = NULL;
12558 _invlist_subtract(PL_AboveLatin1,
12559 PL_XPosix_ptrs[classnum],
12562 posixes = scratch_list;
12565 _invlist_union(posixes, scratch_list, &posixes);
12566 SvREFCNT_dec_NN(scratch_list);
12568 #ifndef HAS_ISBLANK
12569 if (namedclass != ANYOF_NBLANK) {
12571 ANYOF_CLASS_SET(ret, namedclass);
12572 #ifndef HAS_ISBLANK
12575 /* Get the list of all code points in Latin1
12576 * that are not ASCII blanks, and add them to
12577 * the running total */
12578 _invlist_subtract(PL_Latin1, ascii_source,
12580 _invlist_union(posixes, scratch_list, &posixes);
12581 SvREFCNT_dec_NN(scratch_list);
12588 continue; /* Go get next character */
12590 } /* end of namedclass \blah */
12592 /* Here, we have a single value. If 'range' is set, it is the ending
12593 * of a range--check its validity. Later, we will handle each
12594 * individual code point in the range. If 'range' isn't set, this
12595 * could be the beginning of a range, so check for that by looking
12596 * ahead to see if the next real character to be processed is the range
12597 * indicator--the minus sign */
12600 RExC_parse = regpatws(pRExC_state, RExC_parse,
12601 FALSE /* means don't recognize comments */);
12605 if (prevvalue > value) /* b-a */ {
12606 const int w = RExC_parse - rangebegin;
12607 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12608 range = 0; /* not a valid range */
12612 prevvalue = value; /* save the beginning of the potential range */
12613 if (! stop_at_1 /* Can't be a range if parsing just one thing */
12614 && *RExC_parse == '-')
12616 char* next_char_ptr = RExC_parse + 1;
12617 if (skip_white) { /* Get the next real char after the '-' */
12618 next_char_ptr = regpatws(pRExC_state,
12620 FALSE); /* means don't recognize
12624 /* If the '-' is at the end of the class (just before the ']',
12625 * it is a literal minus; otherwise it is a range */
12626 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12627 RExC_parse = next_char_ptr;
12629 /* a bad range like \w-, [:word:]- ? */
12630 if (namedclass > OOB_NAMEDCLASS) {
12631 if (strict || ckWARN(WARN_REGEXP)) {
12633 RExC_parse >= rangebegin ?
12634 RExC_parse - rangebegin : 0;
12636 vFAIL4("False [] range \"%*.*s\"",
12641 "False [] range \"%*.*s\"",
12646 cp_list = add_cp_to_invlist(cp_list, '-');
12650 range = 1; /* yeah, it's a range! */
12651 continue; /* but do it the next time */
12656 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12659 /* non-Latin1 code point implies unicode semantics. Must be set in
12660 * pass1 so is there for the whole of pass 2 */
12662 RExC_uni_semantics = 1;
12665 /* Ready to process either the single value, or the completed range.
12666 * For single-valued non-inverted ranges, we consider the possibility
12667 * of multi-char folds. (We made a conscious decision to not do this
12668 * for the other cases because it can often lead to non-intuitive
12669 * results. For example, you have the peculiar case that:
12670 * "s s" =~ /^[^\xDF]+$/i => Y
12671 * "ss" =~ /^[^\xDF]+$/i => N
12673 * See [perl #89750] */
12674 if (FOLD && allow_multi_folds && value == prevvalue) {
12675 if (value == LATIN_SMALL_LETTER_SHARP_S
12676 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12679 /* Here <value> is indeed a multi-char fold. Get what it is */
12681 U8 foldbuf[UTF8_MAXBYTES_CASE];
12684 UV folded = _to_uni_fold_flags(
12689 | ((LOC) ? FOLD_FLAGS_LOCALE
12690 : (ASCII_FOLD_RESTRICTED)
12691 ? FOLD_FLAGS_NOMIX_ASCII
12695 /* Here, <folded> should be the first character of the
12696 * multi-char fold of <value>, with <foldbuf> containing the
12697 * whole thing. But, if this fold is not allowed (because of
12698 * the flags), <fold> will be the same as <value>, and should
12699 * be processed like any other character, so skip the special
12701 if (folded != value) {
12703 /* Skip if we are recursed, currently parsing the class
12704 * again. Otherwise add this character to the list of
12705 * multi-char folds. */
12706 if (! RExC_in_multi_char_class) {
12707 AV** this_array_ptr;
12709 STRLEN cp_count = utf8_length(foldbuf,
12710 foldbuf + foldlen);
12711 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12713 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12716 if (! multi_char_matches) {
12717 multi_char_matches = newAV();
12720 /* <multi_char_matches> is actually an array of arrays.
12721 * There will be one or two top-level elements: [2],
12722 * and/or [3]. The [2] element is an array, each
12723 * element thereof is a character which folds to two
12724 * characters; likewise for [3]. (Unicode guarantees a
12725 * maximum of 3 characters in any fold.) When we
12726 * rewrite the character class below, we will do so
12727 * such that the longest folds are written first, so
12728 * that it prefers the longest matching strings first.
12729 * This is done even if it turns out that any
12730 * quantifier is non-greedy, out of programmer
12731 * laziness. Tom Christiansen has agreed that this is
12732 * ok. This makes the test for the ligature 'ffi' come
12733 * before the test for 'ff' */
12734 if (av_exists(multi_char_matches, cp_count)) {
12735 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12737 this_array = *this_array_ptr;
12740 this_array = newAV();
12741 av_store(multi_char_matches, cp_count,
12744 av_push(this_array, multi_fold);
12747 /* This element should not be processed further in this
12750 value = save_value;
12751 prevvalue = save_prevvalue;
12757 /* Deal with this element of the class */
12760 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12762 UV* this_range = _new_invlist(1);
12763 _append_range_to_invlist(this_range, prevvalue, value);
12765 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12766 * If this range was specified using something like 'i-j', we want
12767 * to include only the 'i' and the 'j', and not anything in
12768 * between, so exclude non-ASCII, non-alphabetics from it.
12769 * However, if the range was specified with something like
12770 * [\x89-\x91] or [\x89-j], all code points within it should be
12771 * included. literal_endpoint==2 means both ends of the range used
12772 * a literal character, not \x{foo} */
12773 if (literal_endpoint == 2
12774 && (prevvalue >= 'a' && value <= 'z')
12775 || (prevvalue >= 'A' && value <= 'Z'))
12777 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12778 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12780 _invlist_union(cp_list, this_range, &cp_list);
12781 literal_endpoint = 0;
12785 range = 0; /* this range (if it was one) is done now */
12786 } /* End of loop through all the text within the brackets */
12788 /* If anything in the class expands to more than one character, we have to
12789 * deal with them by building up a substitute parse string, and recursively
12790 * calling reg() on it, instead of proceeding */
12791 if (multi_char_matches) {
12792 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12795 char *save_end = RExC_end;
12796 char *save_parse = RExC_parse;
12797 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12802 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12803 because too confusing */
12805 sv_catpv(substitute_parse, "(?:");
12809 /* Look at the longest folds first */
12810 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12812 if (av_exists(multi_char_matches, cp_count)) {
12813 AV** this_array_ptr;
12816 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12818 while ((this_sequence = av_pop(*this_array_ptr)) !=
12821 if (! first_time) {
12822 sv_catpv(substitute_parse, "|");
12824 first_time = FALSE;
12826 sv_catpv(substitute_parse, SvPVX(this_sequence));
12831 /* If the character class contains anything else besides these
12832 * multi-character folds, have to include it in recursive parsing */
12833 if (element_count) {
12834 sv_catpv(substitute_parse, "|[");
12835 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12836 sv_catpv(substitute_parse, "]");
12839 sv_catpv(substitute_parse, ")");
12842 /* This is a way to get the parse to skip forward a whole named
12843 * sequence instead of matching the 2nd character when it fails the
12845 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12849 RExC_parse = SvPV(substitute_parse, len);
12850 RExC_end = RExC_parse + len;
12851 RExC_in_multi_char_class = 1;
12852 RExC_emit = (regnode *)orig_emit;
12854 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12856 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12858 RExC_parse = save_parse;
12859 RExC_end = save_end;
12860 RExC_in_multi_char_class = 0;
12861 SvREFCNT_dec_NN(multi_char_matches);
12862 SvREFCNT_dec_NN(listsv);
12866 /* If the character class contains only a single element, it may be
12867 * optimizable into another node type which is smaller and runs faster.
12868 * Check if this is the case for this class */
12869 if (element_count == 1 && ! ret_invlist) {
12873 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12874 [:digit:] or \p{foo} */
12876 /* All named classes are mapped into POSIXish nodes, with its FLAG
12877 * argument giving which class it is */
12878 switch ((I32)namedclass) {
12879 case ANYOF_UNIPROP:
12882 /* These don't depend on the charset modifiers. They always
12883 * match under /u rules */
12884 case ANYOF_NHORIZWS:
12885 case ANYOF_HORIZWS:
12886 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
12889 case ANYOF_NVERTWS:
12894 /* The actual POSIXish node for all the rest depends on the
12895 * charset modifier. The ones in the first set depend only on
12896 * ASCII or, if available on this platform, locale */
12900 op = (LOC) ? POSIXL : POSIXA;
12911 /* under /a could be alpha */
12913 if (ASCII_RESTRICTED) {
12914 namedclass = ANYOF_ALPHA + (namedclass % 2);
12922 /* The rest have more possibilities depending on the charset.
12923 * We take advantage of the enum ordering of the charset
12924 * modifiers to get the exact node type, */
12926 op = POSIXD + get_regex_charset(RExC_flags);
12927 if (op > POSIXA) { /* /aa is same as /a */
12930 #ifndef HAS_ISBLANK
12932 && (namedclass == ANYOF_BLANK
12933 || namedclass == ANYOF_NBLANK))
12940 /* The odd numbered ones are the complements of the
12941 * next-lower even number one */
12942 if (namedclass % 2 == 1) {
12946 arg = namedclass_to_classnum(namedclass);
12950 else if (value == prevvalue) {
12952 /* Here, the class consists of just a single code point */
12955 if (! LOC && value == '\n') {
12956 op = REG_ANY; /* Optimize [^\n] */
12957 *flagp |= HASWIDTH|SIMPLE;
12961 else if (value < 256 || UTF) {
12963 /* Optimize a single value into an EXACTish node, but not if it
12964 * would require converting the pattern to UTF-8. */
12965 op = compute_EXACTish(pRExC_state);
12967 } /* Otherwise is a range */
12968 else if (! LOC) { /* locale could vary these */
12969 if (prevvalue == '0') {
12970 if (value == '9') {
12977 /* Here, we have changed <op> away from its initial value iff we found
12978 * an optimization */
12981 /* Throw away this ANYOF regnode, and emit the calculated one,
12982 * which should correspond to the beginning, not current, state of
12984 const char * cur_parse = RExC_parse;
12985 RExC_parse = (char *)orig_parse;
12989 /* To get locale nodes to not use the full ANYOF size would
12990 * require moving the code above that writes the portions
12991 * of it that aren't in other nodes to after this point.
12992 * e.g. ANYOF_CLASS_SET */
12993 RExC_size = orig_size;
12997 RExC_emit = (regnode *)orig_emit;
12998 if (PL_regkind[op] == POSIXD) {
13000 op += NPOSIXD - POSIXD;
13005 ret = reg_node(pRExC_state, op);
13007 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13011 *flagp |= HASWIDTH|SIMPLE;
13013 else if (PL_regkind[op] == EXACT) {
13014 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13017 RExC_parse = (char *) cur_parse;
13019 SvREFCNT_dec(posixes);
13020 SvREFCNT_dec_NN(listsv);
13021 SvREFCNT_dec(cp_list);
13028 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13030 /* If folding, we calculate all characters that could fold to or from the
13031 * ones already on the list */
13032 if (FOLD && cp_list) {
13033 UV start, end; /* End points of code point ranges */
13035 SV* fold_intersection = NULL;
13037 /* If the highest code point is within Latin1, we can use the
13038 * compiled-in Alphas list, and not have to go out to disk. This
13039 * yields two false positives, the masculine and feminine ordinal
13040 * indicators, which are weeded out below using the
13041 * IS_IN_SOME_FOLD_L1() macro */
13042 if (invlist_highest(cp_list) < 256) {
13043 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13044 &fold_intersection);
13048 /* Here, there are non-Latin1 code points, so we will have to go
13049 * fetch the list of all the characters that participate in folds
13051 if (! PL_utf8_foldable) {
13052 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13053 &PL_sv_undef, 1, 0);
13054 PL_utf8_foldable = _get_swash_invlist(swash);
13055 SvREFCNT_dec_NN(swash);
13058 /* This is a hash that for a particular fold gives all characters
13059 * that are involved in it */
13060 if (! PL_utf8_foldclosures) {
13062 /* If we were unable to find any folds, then we likely won't be
13063 * able to find the closures. So just create an empty list.
13064 * Folding will effectively be restricted to the non-Unicode
13065 * rules hard-coded into Perl. (This case happens legitimately
13066 * during compilation of Perl itself before the Unicode tables
13067 * are generated) */
13068 if (_invlist_len(PL_utf8_foldable) == 0) {
13069 PL_utf8_foldclosures = newHV();
13072 /* If the folds haven't been read in, call a fold function
13074 if (! PL_utf8_tofold) {
13075 U8 dummy[UTF8_MAXBYTES+1];
13077 /* This string is just a short named one above \xff */
13078 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13079 assert(PL_utf8_tofold); /* Verify that worked */
13081 PL_utf8_foldclosures =
13082 _swash_inversion_hash(PL_utf8_tofold);
13086 /* Only the characters in this class that participate in folds need
13087 * be checked. Get the intersection of this class and all the
13088 * possible characters that are foldable. This can quickly narrow
13089 * down a large class */
13090 _invlist_intersection(PL_utf8_foldable, cp_list,
13091 &fold_intersection);
13094 /* Now look at the foldable characters in this class individually */
13095 invlist_iterinit(fold_intersection);
13096 while (invlist_iternext(fold_intersection, &start, &end)) {
13099 /* Locale folding for Latin1 characters is deferred until runtime */
13100 if (LOC && start < 256) {
13104 /* Look at every character in the range */
13105 for (j = start; j <= end; j++) {
13107 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13113 /* We have the latin1 folding rules hard-coded here so that
13114 * an innocent-looking character class, like /[ks]/i won't
13115 * have to go out to disk to find the possible matches.
13116 * XXX It would be better to generate these via regen, in
13117 * case a new version of the Unicode standard adds new
13118 * mappings, though that is not really likely, and may be
13119 * caught by the default: case of the switch below. */
13121 if (IS_IN_SOME_FOLD_L1(j)) {
13123 /* ASCII is always matched; non-ASCII is matched only
13124 * under Unicode rules */
13125 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13127 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13131 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13135 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13136 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13138 /* Certain Latin1 characters have matches outside
13139 * Latin1. To get here, <j> is one of those
13140 * characters. None of these matches is valid for
13141 * ASCII characters under /aa, which is why the 'if'
13142 * just above excludes those. These matches only
13143 * happen when the target string is utf8. The code
13144 * below adds the single fold closures for <j> to the
13145 * inversion list. */
13150 add_cp_to_invlist(cp_list, KELVIN_SIGN);
13154 cp_list = add_cp_to_invlist(cp_list,
13155 LATIN_SMALL_LETTER_LONG_S);
13158 cp_list = add_cp_to_invlist(cp_list,
13159 GREEK_CAPITAL_LETTER_MU);
13160 cp_list = add_cp_to_invlist(cp_list,
13161 GREEK_SMALL_LETTER_MU);
13163 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13164 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13166 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13168 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13169 cp_list = add_cp_to_invlist(cp_list,
13170 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13172 case LATIN_SMALL_LETTER_SHARP_S:
13173 cp_list = add_cp_to_invlist(cp_list,
13174 LATIN_CAPITAL_LETTER_SHARP_S);
13176 case 'F': case 'f':
13177 case 'I': case 'i':
13178 case 'L': case 'l':
13179 case 'T': case 't':
13180 case 'A': case 'a':
13181 case 'H': case 'h':
13182 case 'J': case 'j':
13183 case 'N': case 'n':
13184 case 'W': case 'w':
13185 case 'Y': case 'y':
13186 /* These all are targets of multi-character
13187 * folds from code points that require UTF8 to
13188 * express, so they can't match unless the
13189 * target string is in UTF-8, so no action here
13190 * is necessary, as regexec.c properly handles
13191 * the general case for UTF-8 matching and
13192 * multi-char folds */
13195 /* Use deprecated warning to increase the
13196 * chances of this being output */
13197 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13204 /* Here is an above Latin1 character. We don't have the rules
13205 * hard-coded for it. First, get its fold. This is the simple
13206 * fold, as the multi-character folds have been handled earlier
13207 * and separated out */
13208 _to_uni_fold_flags(j, foldbuf, &foldlen,
13210 ? FOLD_FLAGS_LOCALE
13211 : (ASCII_FOLD_RESTRICTED)
13212 ? FOLD_FLAGS_NOMIX_ASCII
13215 /* Single character fold of above Latin1. Add everything in
13216 * its fold closure to the list that this node should match.
13217 * The fold closures data structure is a hash with the keys
13218 * being the UTF-8 of every character that is folded to, like
13219 * 'k', and the values each an array of all code points that
13220 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13221 * Multi-character folds are not included */
13222 if ((listp = hv_fetch(PL_utf8_foldclosures,
13223 (char *) foldbuf, foldlen, FALSE)))
13225 AV* list = (AV*) *listp;
13227 for (k = 0; k <= av_len(list); k++) {
13228 SV** c_p = av_fetch(list, k, FALSE);
13231 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13235 /* /aa doesn't allow folds between ASCII and non-; /l
13236 * doesn't allow them between above and below 256 */
13237 if ((ASCII_FOLD_RESTRICTED
13238 && (isASCII(c) != isASCII(j)))
13239 || (LOC && ((c < 256) != (j < 256))))
13244 /* Folds involving non-ascii Latin1 characters
13245 * under /d are added to a separate list */
13246 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13248 cp_list = add_cp_to_invlist(cp_list, c);
13251 depends_list = add_cp_to_invlist(depends_list, c);
13257 SvREFCNT_dec_NN(fold_intersection);
13260 /* And combine the result (if any) with any inversion list from posix
13261 * classes. The lists are kept separate up to now because we don't want to
13262 * fold the classes (folding of those is automatically handled by the swash
13263 * fetching code) */
13265 if (! DEPENDS_SEMANTICS) {
13267 _invlist_union(cp_list, posixes, &cp_list);
13268 SvREFCNT_dec_NN(posixes);
13275 /* Under /d, we put into a separate list the Latin1 things that
13276 * match only when the target string is utf8 */
13277 SV* nonascii_but_latin1_properties = NULL;
13278 _invlist_intersection(posixes, PL_Latin1,
13279 &nonascii_but_latin1_properties);
13280 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13281 &nonascii_but_latin1_properties);
13282 _invlist_subtract(posixes, nonascii_but_latin1_properties,
13285 _invlist_union(cp_list, posixes, &cp_list);
13286 SvREFCNT_dec_NN(posixes);
13292 if (depends_list) {
13293 _invlist_union(depends_list, nonascii_but_latin1_properties,
13295 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13298 depends_list = nonascii_but_latin1_properties;
13303 /* And combine the result (if any) with any inversion list from properties.
13304 * The lists are kept separate up to now so that we can distinguish the two
13305 * in regards to matching above-Unicode. A run-time warning is generated
13306 * if a Unicode property is matched against a non-Unicode code point. But,
13307 * we allow user-defined properties to match anything, without any warning,
13308 * and we also suppress the warning if there is a portion of the character
13309 * class that isn't a Unicode property, and which matches above Unicode, \W
13310 * or [\x{110000}] for example.
13311 * (Note that in this case, unlike the Posix one above, there is no
13312 * <depends_list>, because having a Unicode property forces Unicode
13315 bool warn_super = ! has_user_defined_property;
13318 /* If it matters to the final outcome, see if a non-property
13319 * component of the class matches above Unicode. If so, the
13320 * warning gets suppressed. This is true even if just a single
13321 * such code point is specified, as though not strictly correct if
13322 * another such code point is matched against, the fact that they
13323 * are using above-Unicode code points indicates they should know
13324 * the issues involved */
13326 bool non_prop_matches_above_Unicode =
13327 runtime_posix_matches_above_Unicode
13328 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13330 non_prop_matches_above_Unicode =
13331 ! non_prop_matches_above_Unicode;
13333 warn_super = ! non_prop_matches_above_Unicode;
13336 _invlist_union(properties, cp_list, &cp_list);
13337 SvREFCNT_dec_NN(properties);
13340 cp_list = properties;
13344 OP(ret) = ANYOF_WARN_SUPER;
13348 /* Here, we have calculated what code points should be in the character
13351 * Now we can see about various optimizations. Fold calculation (which we
13352 * did above) needs to take place before inversion. Otherwise /[^k]/i
13353 * would invert to include K, which under /i would match k, which it
13354 * shouldn't. Therefore we can't invert folded locale now, as it won't be
13355 * folded until runtime */
13357 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13358 * at compile time. Besides not inverting folded locale now, we can't
13359 * invert if there are things such as \w, which aren't known until runtime
13362 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13364 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13366 _invlist_invert(cp_list);
13368 /* Any swash can't be used as-is, because we've inverted things */
13370 SvREFCNT_dec_NN(swash);
13374 /* Clear the invert flag since have just done it here */
13379 *ret_invlist = cp_list;
13381 /* Discard the generated node */
13383 RExC_size = orig_size;
13386 RExC_emit = orig_emit;
13391 /* If we didn't do folding, it's because some information isn't available
13392 * until runtime; set the run-time fold flag for these. (We don't have to
13393 * worry about properties folding, as that is taken care of by the swash
13397 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13400 /* Some character classes are equivalent to other nodes. Such nodes take
13401 * up less room and generally fewer operations to execute than ANYOF nodes.
13402 * Above, we checked for and optimized into some such equivalents for
13403 * certain common classes that are easy to test. Getting to this point in
13404 * the code means that the class didn't get optimized there. Since this
13405 * code is only executed in Pass 2, it is too late to save space--it has
13406 * been allocated in Pass 1, and currently isn't given back. But turning
13407 * things into an EXACTish node can allow the optimizer to join it to any
13408 * adjacent such nodes. And if the class is equivalent to things like /./,
13409 * expensive run-time swashes can be avoided. Now that we have more
13410 * complete information, we can find things necessarily missed by the
13411 * earlier code. I (khw) am not sure how much to look for here. It would
13412 * be easy, but perhaps too slow, to check any candidates against all the
13413 * node types they could possibly match using _invlistEQ(). */
13418 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13419 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13422 U8 op = END; /* The optimzation node-type */
13423 const char * cur_parse= RExC_parse;
13425 invlist_iterinit(cp_list);
13426 if (! invlist_iternext(cp_list, &start, &end)) {
13428 /* Here, the list is empty. This happens, for example, when a
13429 * Unicode property is the only thing in the character class, and
13430 * it doesn't match anything. (perluniprops.pod notes such
13433 *flagp |= HASWIDTH|SIMPLE;
13435 else if (start == end) { /* The range is a single code point */
13436 if (! invlist_iternext(cp_list, &start, &end)
13438 /* Don't do this optimization if it would require changing
13439 * the pattern to UTF-8 */
13440 && (start < 256 || UTF))
13442 /* Here, the list contains a single code point. Can optimize
13443 * into an EXACT node */
13452 /* A locale node under folding with one code point can be
13453 * an EXACTFL, as its fold won't be calculated until
13459 /* Here, we are generally folding, but there is only one
13460 * code point to match. If we have to, we use an EXACT
13461 * node, but it would be better for joining with adjacent
13462 * nodes in the optimization pass if we used the same
13463 * EXACTFish node that any such are likely to be. We can
13464 * do this iff the code point doesn't participate in any
13465 * folds. For example, an EXACTF of a colon is the same as
13466 * an EXACT one, since nothing folds to or from a colon. */
13468 if (IS_IN_SOME_FOLD_L1(value)) {
13473 if (! PL_utf8_foldable) {
13474 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13475 &PL_sv_undef, 1, 0);
13476 PL_utf8_foldable = _get_swash_invlist(swash);
13477 SvREFCNT_dec_NN(swash);
13479 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13484 /* If we haven't found the node type, above, it means we
13485 * can use the prevailing one */
13487 op = compute_EXACTish(pRExC_state);
13492 else if (start == 0) {
13493 if (end == UV_MAX) {
13495 *flagp |= HASWIDTH|SIMPLE;
13498 else if (end == '\n' - 1
13499 && invlist_iternext(cp_list, &start, &end)
13500 && start == '\n' + 1 && end == UV_MAX)
13503 *flagp |= HASWIDTH|SIMPLE;
13507 invlist_iterfinish(cp_list);
13510 RExC_parse = (char *)orig_parse;
13511 RExC_emit = (regnode *)orig_emit;
13513 ret = reg_node(pRExC_state, op);
13515 RExC_parse = (char *)cur_parse;
13517 if (PL_regkind[op] == EXACT) {
13518 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13521 SvREFCNT_dec_NN(cp_list);
13522 SvREFCNT_dec_NN(listsv);
13527 /* Here, <cp_list> contains all the code points we can determine at
13528 * compile time that match under all conditions. Go through it, and
13529 * for things that belong in the bitmap, put them there, and delete from
13530 * <cp_list>. While we are at it, see if everything above 255 is in the
13531 * list, and if so, set a flag to speed up execution */
13532 ANYOF_BITMAP_ZERO(ret);
13535 /* This gets set if we actually need to modify things */
13536 bool change_invlist = FALSE;
13540 /* Start looking through <cp_list> */
13541 invlist_iterinit(cp_list);
13542 while (invlist_iternext(cp_list, &start, &end)) {
13546 if (end == UV_MAX && start <= 256) {
13547 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13550 /* Quit if are above what we should change */
13555 change_invlist = TRUE;
13557 /* Set all the bits in the range, up to the max that we are doing */
13558 high = (end < 255) ? end : 255;
13559 for (i = start; i <= (int) high; i++) {
13560 if (! ANYOF_BITMAP_TEST(ret, i)) {
13561 ANYOF_BITMAP_SET(ret, i);
13567 invlist_iterfinish(cp_list);
13569 /* Done with loop; remove any code points that are in the bitmap from
13571 if (change_invlist) {
13572 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13575 /* If have completely emptied it, remove it completely */
13576 if (_invlist_len(cp_list) == 0) {
13577 SvREFCNT_dec_NN(cp_list);
13583 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13586 /* Here, the bitmap has been populated with all the Latin1 code points that
13587 * always match. Can now add to the overall list those that match only
13588 * when the target string is UTF-8 (<depends_list>). */
13589 if (depends_list) {
13591 _invlist_union(cp_list, depends_list, &cp_list);
13592 SvREFCNT_dec_NN(depends_list);
13595 cp_list = depends_list;
13599 /* If there is a swash and more than one element, we can't use the swash in
13600 * the optimization below. */
13601 if (swash && element_count > 1) {
13602 SvREFCNT_dec_NN(swash);
13607 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13609 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13610 SvREFCNT_dec_NN(listsv);
13613 /* av[0] stores the character class description in its textual form:
13614 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13615 * appropriate swash, and is also useful for dumping the regnode.
13616 * av[1] if NULL, is a placeholder to later contain the swash computed
13617 * from av[0]. But if no further computation need be done, the
13618 * swash is stored there now.
13619 * av[2] stores the cp_list inversion list for use in addition or
13620 * instead of av[0]; used only if av[1] is NULL
13621 * av[3] is set if any component of the class is from a user-defined
13622 * property; used only if av[1] is NULL */
13623 AV * const av = newAV();
13626 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13628 : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
13630 av_store(av, 1, swash);
13631 SvREFCNT_dec_NN(cp_list);
13634 av_store(av, 1, NULL);
13636 av_store(av, 2, cp_list);
13637 av_store(av, 3, newSVuv(has_user_defined_property));
13641 rv = newRV_noinc(MUTABLE_SV(av));
13642 n = add_data(pRExC_state, 1, "s");
13643 RExC_rxi->data->data[n] = (void*)rv;
13647 *flagp |= HASWIDTH|SIMPLE;
13650 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13653 /* reg_skipcomment()
13655 Absorbs an /x style # comments from the input stream.
13656 Returns true if there is more text remaining in the stream.
13657 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13658 terminates the pattern without including a newline.
13660 Note its the callers responsibility to ensure that we are
13661 actually in /x mode
13666 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13670 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13672 while (RExC_parse < RExC_end)
13673 if (*RExC_parse++ == '\n') {
13678 /* we ran off the end of the pattern without ending
13679 the comment, so we have to add an \n when wrapping */
13680 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13688 Advances the parse position, and optionally absorbs
13689 "whitespace" from the inputstream.
13691 Without /x "whitespace" means (?#...) style comments only,
13692 with /x this means (?#...) and # comments and whitespace proper.
13694 Returns the RExC_parse point from BEFORE the scan occurs.
13696 This is the /x friendly way of saying RExC_parse++.
13700 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13702 char* const retval = RExC_parse++;
13704 PERL_ARGS_ASSERT_NEXTCHAR;
13707 if (RExC_end - RExC_parse >= 3
13708 && *RExC_parse == '('
13709 && RExC_parse[1] == '?'
13710 && RExC_parse[2] == '#')
13712 while (*RExC_parse != ')') {
13713 if (RExC_parse == RExC_end)
13714 FAIL("Sequence (?#... not terminated");
13720 if (RExC_flags & RXf_PMf_EXTENDED) {
13721 if (isSPACE(*RExC_parse)) {
13725 else if (*RExC_parse == '#') {
13726 if ( reg_skipcomment( pRExC_state ) )
13735 - reg_node - emit a node
13737 STATIC regnode * /* Location. */
13738 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13742 regnode * const ret = RExC_emit;
13743 GET_RE_DEBUG_FLAGS_DECL;
13745 PERL_ARGS_ASSERT_REG_NODE;
13748 SIZE_ALIGN(RExC_size);
13752 if (RExC_emit >= RExC_emit_bound)
13753 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13754 op, RExC_emit, RExC_emit_bound);
13756 NODE_ALIGN_FILL(ret);
13758 FILL_ADVANCE_NODE(ptr, op);
13759 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13760 #ifdef RE_TRACK_PATTERN_OFFSETS
13761 if (RExC_offsets) { /* MJD */
13762 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13763 "reg_node", __LINE__,
13765 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13766 ? "Overwriting end of array!\n" : "OK",
13767 (UV)(RExC_emit - RExC_emit_start),
13768 (UV)(RExC_parse - RExC_start),
13769 (UV)RExC_offsets[0]));
13770 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13778 - reganode - emit a node with an argument
13780 STATIC regnode * /* Location. */
13781 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13785 regnode * const ret = RExC_emit;
13786 GET_RE_DEBUG_FLAGS_DECL;
13788 PERL_ARGS_ASSERT_REGANODE;
13791 SIZE_ALIGN(RExC_size);
13796 assert(2==regarglen[op]+1);
13798 Anything larger than this has to allocate the extra amount.
13799 If we changed this to be:
13801 RExC_size += (1 + regarglen[op]);
13803 then it wouldn't matter. Its not clear what side effect
13804 might come from that so its not done so far.
13809 if (RExC_emit >= RExC_emit_bound)
13810 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13811 op, RExC_emit, RExC_emit_bound);
13813 NODE_ALIGN_FILL(ret);
13815 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13816 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13817 #ifdef RE_TRACK_PATTERN_OFFSETS
13818 if (RExC_offsets) { /* MJD */
13819 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13823 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13824 "Overwriting end of array!\n" : "OK",
13825 (UV)(RExC_emit - RExC_emit_start),
13826 (UV)(RExC_parse - RExC_start),
13827 (UV)RExC_offsets[0]));
13828 Set_Cur_Node_Offset;
13836 - reguni - emit (if appropriate) a Unicode character
13839 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13843 PERL_ARGS_ASSERT_REGUNI;
13845 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13849 - reginsert - insert an operator in front of already-emitted operand
13851 * Means relocating the operand.
13854 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13860 const int offset = regarglen[(U8)op];
13861 const int size = NODE_STEP_REGNODE + offset;
13862 GET_RE_DEBUG_FLAGS_DECL;
13864 PERL_ARGS_ASSERT_REGINSERT;
13865 PERL_UNUSED_ARG(depth);
13866 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13867 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13876 if (RExC_open_parens) {
13878 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13879 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13880 if ( RExC_open_parens[paren] >= opnd ) {
13881 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13882 RExC_open_parens[paren] += size;
13884 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13886 if ( RExC_close_parens[paren] >= opnd ) {
13887 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13888 RExC_close_parens[paren] += size;
13890 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13895 while (src > opnd) {
13896 StructCopy(--src, --dst, regnode);
13897 #ifdef RE_TRACK_PATTERN_OFFSETS
13898 if (RExC_offsets) { /* MJD 20010112 */
13899 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13903 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13904 ? "Overwriting end of array!\n" : "OK",
13905 (UV)(src - RExC_emit_start),
13906 (UV)(dst - RExC_emit_start),
13907 (UV)RExC_offsets[0]));
13908 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13909 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13915 place = opnd; /* Op node, where operand used to be. */
13916 #ifdef RE_TRACK_PATTERN_OFFSETS
13917 if (RExC_offsets) { /* MJD */
13918 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13922 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13923 ? "Overwriting end of array!\n" : "OK",
13924 (UV)(place - RExC_emit_start),
13925 (UV)(RExC_parse - RExC_start),
13926 (UV)RExC_offsets[0]));
13927 Set_Node_Offset(place, RExC_parse);
13928 Set_Node_Length(place, 1);
13931 src = NEXTOPER(place);
13932 FILL_ADVANCE_NODE(place, op);
13933 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13934 Zero(src, offset, regnode);
13938 - regtail - set the next-pointer at the end of a node chain of p to val.
13939 - SEE ALSO: regtail_study
13941 /* TODO: All three parms should be const */
13943 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13947 GET_RE_DEBUG_FLAGS_DECL;
13949 PERL_ARGS_ASSERT_REGTAIL;
13951 PERL_UNUSED_ARG(depth);
13957 /* Find last node. */
13960 regnode * const temp = regnext(scan);
13962 SV * const mysv=sv_newmortal();
13963 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13964 regprop(RExC_rx, mysv, scan);
13965 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13966 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13967 (temp == NULL ? "->" : ""),
13968 (temp == NULL ? PL_reg_name[OP(val)] : "")
13976 if (reg_off_by_arg[OP(scan)]) {
13977 ARG_SET(scan, val - scan);
13980 NEXT_OFF(scan) = val - scan;
13986 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13987 - Look for optimizable sequences at the same time.
13988 - currently only looks for EXACT chains.
13990 This is experimental code. The idea is to use this routine to perform
13991 in place optimizations on branches and groups as they are constructed,
13992 with the long term intention of removing optimization from study_chunk so
13993 that it is purely analytical.
13995 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13996 to control which is which.
13999 /* TODO: All four parms should be const */
14002 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14007 #ifdef EXPERIMENTAL_INPLACESCAN
14010 GET_RE_DEBUG_FLAGS_DECL;
14012 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14018 /* Find last node. */
14022 regnode * const temp = regnext(scan);
14023 #ifdef EXPERIMENTAL_INPLACESCAN
14024 if (PL_regkind[OP(scan)] == EXACT) {
14025 bool has_exactf_sharp_s; /* Unexamined in this routine */
14026 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14031 switch (OP(scan)) {
14037 case EXACTFU_TRICKYFOLD:
14039 if( exact == PSEUDO )
14041 else if ( exact != OP(scan) )
14050 SV * const mysv=sv_newmortal();
14051 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14052 regprop(RExC_rx, mysv, scan);
14053 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14054 SvPV_nolen_const(mysv),
14055 REG_NODE_NUM(scan),
14056 PL_reg_name[exact]);
14063 SV * const mysv_val=sv_newmortal();
14064 DEBUG_PARSE_MSG("");
14065 regprop(RExC_rx, mysv_val, val);
14066 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14067 SvPV_nolen_const(mysv_val),
14068 (IV)REG_NODE_NUM(val),
14072 if (reg_off_by_arg[OP(scan)]) {
14073 ARG_SET(scan, val - scan);
14076 NEXT_OFF(scan) = val - scan;
14084 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14088 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14094 for (bit=0; bit<32; bit++) {
14095 if (flags & (1<<bit)) {
14096 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14099 if (!set++ && lead)
14100 PerlIO_printf(Perl_debug_log, "%s",lead);
14101 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14104 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14105 if (!set++ && lead) {
14106 PerlIO_printf(Perl_debug_log, "%s",lead);
14109 case REGEX_UNICODE_CHARSET:
14110 PerlIO_printf(Perl_debug_log, "UNICODE");
14112 case REGEX_LOCALE_CHARSET:
14113 PerlIO_printf(Perl_debug_log, "LOCALE");
14115 case REGEX_ASCII_RESTRICTED_CHARSET:
14116 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14118 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14119 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14122 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14128 PerlIO_printf(Perl_debug_log, "\n");
14130 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14136 Perl_regdump(pTHX_ const regexp *r)
14140 SV * const sv = sv_newmortal();
14141 SV *dsv= sv_newmortal();
14142 RXi_GET_DECL(r,ri);
14143 GET_RE_DEBUG_FLAGS_DECL;
14145 PERL_ARGS_ASSERT_REGDUMP;
14147 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14149 /* Header fields of interest. */
14150 if (r->anchored_substr) {
14151 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14152 RE_SV_DUMPLEN(r->anchored_substr), 30);
14153 PerlIO_printf(Perl_debug_log,
14154 "anchored %s%s at %"IVdf" ",
14155 s, RE_SV_TAIL(r->anchored_substr),
14156 (IV)r->anchored_offset);
14157 } else if (r->anchored_utf8) {
14158 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14159 RE_SV_DUMPLEN(r->anchored_utf8), 30);
14160 PerlIO_printf(Perl_debug_log,
14161 "anchored utf8 %s%s at %"IVdf" ",
14162 s, RE_SV_TAIL(r->anchored_utf8),
14163 (IV)r->anchored_offset);
14165 if (r->float_substr) {
14166 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14167 RE_SV_DUMPLEN(r->float_substr), 30);
14168 PerlIO_printf(Perl_debug_log,
14169 "floating %s%s at %"IVdf"..%"UVuf" ",
14170 s, RE_SV_TAIL(r->float_substr),
14171 (IV)r->float_min_offset, (UV)r->float_max_offset);
14172 } else if (r->float_utf8) {
14173 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14174 RE_SV_DUMPLEN(r->float_utf8), 30);
14175 PerlIO_printf(Perl_debug_log,
14176 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14177 s, RE_SV_TAIL(r->float_utf8),
14178 (IV)r->float_min_offset, (UV)r->float_max_offset);
14180 if (r->check_substr || r->check_utf8)
14181 PerlIO_printf(Perl_debug_log,
14183 (r->check_substr == r->float_substr
14184 && r->check_utf8 == r->float_utf8
14185 ? "(checking floating" : "(checking anchored"));
14186 if (r->extflags & RXf_NOSCAN)
14187 PerlIO_printf(Perl_debug_log, " noscan");
14188 if (r->extflags & RXf_CHECK_ALL)
14189 PerlIO_printf(Perl_debug_log, " isall");
14190 if (r->check_substr || r->check_utf8)
14191 PerlIO_printf(Perl_debug_log, ") ");
14193 if (ri->regstclass) {
14194 regprop(r, sv, ri->regstclass);
14195 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14197 if (r->extflags & RXf_ANCH) {
14198 PerlIO_printf(Perl_debug_log, "anchored");
14199 if (r->extflags & RXf_ANCH_BOL)
14200 PerlIO_printf(Perl_debug_log, "(BOL)");
14201 if (r->extflags & RXf_ANCH_MBOL)
14202 PerlIO_printf(Perl_debug_log, "(MBOL)");
14203 if (r->extflags & RXf_ANCH_SBOL)
14204 PerlIO_printf(Perl_debug_log, "(SBOL)");
14205 if (r->extflags & RXf_ANCH_GPOS)
14206 PerlIO_printf(Perl_debug_log, "(GPOS)");
14207 PerlIO_putc(Perl_debug_log, ' ');
14209 if (r->extflags & RXf_GPOS_SEEN)
14210 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14211 if (r->intflags & PREGf_SKIP)
14212 PerlIO_printf(Perl_debug_log, "plus ");
14213 if (r->intflags & PREGf_IMPLICIT)
14214 PerlIO_printf(Perl_debug_log, "implicit ");
14215 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14216 if (r->extflags & RXf_EVAL_SEEN)
14217 PerlIO_printf(Perl_debug_log, "with eval ");
14218 PerlIO_printf(Perl_debug_log, "\n");
14219 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14221 PERL_ARGS_ASSERT_REGDUMP;
14222 PERL_UNUSED_CONTEXT;
14223 PERL_UNUSED_ARG(r);
14224 #endif /* DEBUGGING */
14228 - regprop - printable representation of opcode
14230 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14233 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14234 if (flags & ANYOF_INVERT) \
14235 /*make sure the invert info is in each */ \
14236 sv_catpvs(sv, "^"); \
14242 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14248 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14249 static const char * const anyofs[] = {
14250 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14251 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14252 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14253 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14254 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14255 || _CC_VERTSPACE != 16
14256 #error Need to adjust order of anyofs[]
14293 RXi_GET_DECL(prog,progi);
14294 GET_RE_DEBUG_FLAGS_DECL;
14296 PERL_ARGS_ASSERT_REGPROP;
14300 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14301 /* It would be nice to FAIL() here, but this may be called from
14302 regexec.c, and it would be hard to supply pRExC_state. */
14303 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14304 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14306 k = PL_regkind[OP(o)];
14309 sv_catpvs(sv, " ");
14310 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14311 * is a crude hack but it may be the best for now since
14312 * we have no flag "this EXACTish node was UTF-8"
14314 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14315 PERL_PV_ESCAPE_UNI_DETECT |
14316 PERL_PV_ESCAPE_NONASCII |
14317 PERL_PV_PRETTY_ELLIPSES |
14318 PERL_PV_PRETTY_LTGT |
14319 PERL_PV_PRETTY_NOCLEAR
14321 } else if (k == TRIE) {
14322 /* print the details of the trie in dumpuntil instead, as
14323 * progi->data isn't available here */
14324 const char op = OP(o);
14325 const U32 n = ARG(o);
14326 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14327 (reg_ac_data *)progi->data->data[n] :
14329 const reg_trie_data * const trie
14330 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14332 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14333 DEBUG_TRIE_COMPILE_r(
14334 Perl_sv_catpvf(aTHX_ sv,
14335 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14336 (UV)trie->startstate,
14337 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14338 (UV)trie->wordcount,
14341 (UV)TRIE_CHARCOUNT(trie),
14342 (UV)trie->uniquecharcount
14345 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14347 int rangestart = -1;
14348 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14349 sv_catpvs(sv, "[");
14350 for (i = 0; i <= 256; i++) {
14351 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14352 if (rangestart == -1)
14354 } else if (rangestart != -1) {
14355 if (i <= rangestart + 3)
14356 for (; rangestart < i; rangestart++)
14357 put_byte(sv, rangestart);
14359 put_byte(sv, rangestart);
14360 sv_catpvs(sv, "-");
14361 put_byte(sv, i - 1);
14366 sv_catpvs(sv, "]");
14369 } else if (k == CURLY) {
14370 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14371 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14372 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14374 else if (k == WHILEM && o->flags) /* Ordinal/of */
14375 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14376 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14377 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14378 if ( RXp_PAREN_NAMES(prog) ) {
14379 if ( k != REF || (OP(o) < NREF)) {
14380 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14381 SV **name= av_fetch(list, ARG(o), 0 );
14383 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14386 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14387 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14388 I32 *nums=(I32*)SvPVX(sv_dat);
14389 SV **name= av_fetch(list, nums[0], 0 );
14392 for ( n=0; n<SvIVX(sv_dat); n++ ) {
14393 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14394 (n ? "," : ""), (IV)nums[n]);
14396 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14400 } else if (k == GOSUB)
14401 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14402 else if (k == VERB) {
14404 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14405 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14406 } else if (k == LOGICAL)
14407 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14408 else if (k == ANYOF) {
14409 int i, rangestart = -1;
14410 const U8 flags = ANYOF_FLAGS(o);
14414 if (flags & ANYOF_LOCALE)
14415 sv_catpvs(sv, "{loc}");
14416 if (flags & ANYOF_LOC_FOLD)
14417 sv_catpvs(sv, "{i}");
14418 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14419 if (flags & ANYOF_INVERT)
14420 sv_catpvs(sv, "^");
14422 /* output what the standard cp 0-255 bitmap matches */
14423 for (i = 0; i <= 256; i++) {
14424 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14425 if (rangestart == -1)
14427 } else if (rangestart != -1) {
14428 if (i <= rangestart + 3)
14429 for (; rangestart < i; rangestart++)
14430 put_byte(sv, rangestart);
14432 put_byte(sv, rangestart);
14433 sv_catpvs(sv, "-");
14434 put_byte(sv, i - 1);
14441 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14442 /* output any special charclass tests (used entirely under use locale) */
14443 if (ANYOF_CLASS_TEST_ANY_SET(o))
14444 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14445 if (ANYOF_CLASS_TEST(o,i)) {
14446 sv_catpv(sv, anyofs[i]);
14450 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14452 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14453 sv_catpvs(sv, "{non-utf8-latin1-all}");
14456 /* output information about the unicode matching */
14457 if (flags & ANYOF_UNICODE_ALL)
14458 sv_catpvs(sv, "{unicode_all}");
14459 else if (ANYOF_NONBITMAP(o))
14460 sv_catpvs(sv, "{unicode}");
14461 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14462 sv_catpvs(sv, "{outside bitmap}");
14464 if (ANYOF_NONBITMAP(o)) {
14465 SV *lv; /* Set if there is something outside the bit map */
14466 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14467 bool byte_output = FALSE; /* If something in the bitmap has been
14470 if (lv && lv != &PL_sv_undef) {
14472 U8 s[UTF8_MAXBYTES_CASE+1];
14474 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14475 uvchr_to_utf8(s, i);
14478 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14482 && swash_fetch(sw, s, TRUE))
14484 if (rangestart == -1)
14486 } else if (rangestart != -1) {
14487 byte_output = TRUE;
14488 if (i <= rangestart + 3)
14489 for (; rangestart < i; rangestart++) {
14490 put_byte(sv, rangestart);
14493 put_byte(sv, rangestart);
14494 sv_catpvs(sv, "-");
14503 char *s = savesvpv(lv);
14504 char * const origs = s;
14506 while (*s && *s != '\n')
14510 const char * const t = ++s;
14513 sv_catpvs(sv, " ");
14519 /* Truncate very long output */
14520 if (s - origs > 256) {
14521 Perl_sv_catpvf(aTHX_ sv,
14523 (int) (s - origs - 1),
14529 else if (*s == '\t') {
14544 SvREFCNT_dec_NN(lv);
14548 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14550 else if (k == POSIXD || k == NPOSIXD) {
14551 U8 index = FLAGS(o) * 2;
14552 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14553 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14556 sv_catpv(sv, anyofs[index]);
14559 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14560 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14562 PERL_UNUSED_CONTEXT;
14563 PERL_UNUSED_ARG(sv);
14564 PERL_UNUSED_ARG(o);
14565 PERL_UNUSED_ARG(prog);
14566 #endif /* DEBUGGING */
14570 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14571 { /* Assume that RE_INTUIT is set */
14573 struct regexp *const prog = ReANY(r);
14574 GET_RE_DEBUG_FLAGS_DECL;
14576 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14577 PERL_UNUSED_CONTEXT;
14581 const char * const s = SvPV_nolen_const(prog->check_substr
14582 ? prog->check_substr : prog->check_utf8);
14584 if (!PL_colorset) reginitcolors();
14585 PerlIO_printf(Perl_debug_log,
14586 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14588 prog->check_substr ? "" : "utf8 ",
14589 PL_colors[5],PL_colors[0],
14592 (strlen(s) > 60 ? "..." : ""));
14595 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14601 handles refcounting and freeing the perl core regexp structure. When
14602 it is necessary to actually free the structure the first thing it
14603 does is call the 'free' method of the regexp_engine associated to
14604 the regexp, allowing the handling of the void *pprivate; member
14605 first. (This routine is not overridable by extensions, which is why
14606 the extensions free is called first.)
14608 See regdupe and regdupe_internal if you change anything here.
14610 #ifndef PERL_IN_XSUB_RE
14612 Perl_pregfree(pTHX_ REGEXP *r)
14618 Perl_pregfree2(pTHX_ REGEXP *rx)
14621 struct regexp *const r = ReANY(rx);
14622 GET_RE_DEBUG_FLAGS_DECL;
14624 PERL_ARGS_ASSERT_PREGFREE2;
14626 if (r->mother_re) {
14627 ReREFCNT_dec(r->mother_re);
14629 CALLREGFREE_PVT(rx); /* free the private data */
14630 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14631 Safefree(r->xpv_len_u.xpvlenu_pv);
14634 SvREFCNT_dec(r->anchored_substr);
14635 SvREFCNT_dec(r->anchored_utf8);
14636 SvREFCNT_dec(r->float_substr);
14637 SvREFCNT_dec(r->float_utf8);
14638 Safefree(r->substrs);
14640 RX_MATCH_COPY_FREE(rx);
14641 #ifdef PERL_ANY_COW
14642 SvREFCNT_dec(r->saved_copy);
14645 SvREFCNT_dec(r->qr_anoncv);
14646 rx->sv_u.svu_rx = 0;
14651 This is a hacky workaround to the structural issue of match results
14652 being stored in the regexp structure which is in turn stored in
14653 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14654 could be PL_curpm in multiple contexts, and could require multiple
14655 result sets being associated with the pattern simultaneously, such
14656 as when doing a recursive match with (??{$qr})
14658 The solution is to make a lightweight copy of the regexp structure
14659 when a qr// is returned from the code executed by (??{$qr}) this
14660 lightweight copy doesn't actually own any of its data except for
14661 the starp/end and the actual regexp structure itself.
14667 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14669 struct regexp *ret;
14670 struct regexp *const r = ReANY(rx);
14671 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14673 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14676 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14678 SvOK_off((SV *)ret_x);
14680 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14681 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14682 made both spots point to the same regexp body.) */
14683 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14684 assert(!SvPVX(ret_x));
14685 ret_x->sv_u.svu_rx = temp->sv_any;
14686 temp->sv_any = NULL;
14687 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14688 SvREFCNT_dec_NN(temp);
14689 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14690 ing below will not set it. */
14691 SvCUR_set(ret_x, SvCUR(rx));
14694 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14695 sv_force_normal(sv) is called. */
14697 ret = ReANY(ret_x);
14699 SvFLAGS(ret_x) |= SvUTF8(rx);
14700 /* We share the same string buffer as the original regexp, on which we
14701 hold a reference count, incremented when mother_re is set below.
14702 The string pointer is copied here, being part of the regexp struct.
14704 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14705 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14707 const I32 npar = r->nparens+1;
14708 Newx(ret->offs, npar, regexp_paren_pair);
14709 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14712 Newx(ret->substrs, 1, struct reg_substr_data);
14713 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14715 SvREFCNT_inc_void(ret->anchored_substr);
14716 SvREFCNT_inc_void(ret->anchored_utf8);
14717 SvREFCNT_inc_void(ret->float_substr);
14718 SvREFCNT_inc_void(ret->float_utf8);
14720 /* check_substr and check_utf8, if non-NULL, point to either their
14721 anchored or float namesakes, and don't hold a second reference. */
14723 RX_MATCH_COPIED_off(ret_x);
14724 #ifdef PERL_ANY_COW
14725 ret->saved_copy = NULL;
14727 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14728 SvREFCNT_inc_void(ret->qr_anoncv);
14734 /* regfree_internal()
14736 Free the private data in a regexp. This is overloadable by
14737 extensions. Perl takes care of the regexp structure in pregfree(),
14738 this covers the *pprivate pointer which technically perl doesn't
14739 know about, however of course we have to handle the
14740 regexp_internal structure when no extension is in use.
14742 Note this is called before freeing anything in the regexp
14747 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14750 struct regexp *const r = ReANY(rx);
14751 RXi_GET_DECL(r,ri);
14752 GET_RE_DEBUG_FLAGS_DECL;
14754 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14760 SV *dsv= sv_newmortal();
14761 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14762 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14763 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14764 PL_colors[4],PL_colors[5],s);
14767 #ifdef RE_TRACK_PATTERN_OFFSETS
14769 Safefree(ri->u.offsets); /* 20010421 MJD */
14771 if (ri->code_blocks) {
14773 for (n = 0; n < ri->num_code_blocks; n++)
14774 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14775 Safefree(ri->code_blocks);
14779 int n = ri->data->count;
14782 /* If you add a ->what type here, update the comment in regcomp.h */
14783 switch (ri->data->what[n]) {
14789 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14792 Safefree(ri->data->data[n]);
14798 { /* Aho Corasick add-on structure for a trie node.
14799 Used in stclass optimization only */
14801 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14803 refcount = --aho->refcount;
14806 PerlMemShared_free(aho->states);
14807 PerlMemShared_free(aho->fail);
14808 /* do this last!!!! */
14809 PerlMemShared_free(ri->data->data[n]);
14810 PerlMemShared_free(ri->regstclass);
14816 /* trie structure. */
14818 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14820 refcount = --trie->refcount;
14823 PerlMemShared_free(trie->charmap);
14824 PerlMemShared_free(trie->states);
14825 PerlMemShared_free(trie->trans);
14827 PerlMemShared_free(trie->bitmap);
14829 PerlMemShared_free(trie->jump);
14830 PerlMemShared_free(trie->wordinfo);
14831 /* do this last!!!! */
14832 PerlMemShared_free(ri->data->data[n]);
14837 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14840 Safefree(ri->data->what);
14841 Safefree(ri->data);
14847 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14848 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14849 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14852 re_dup - duplicate a regexp.
14854 This routine is expected to clone a given regexp structure. It is only
14855 compiled under USE_ITHREADS.
14857 After all of the core data stored in struct regexp is duplicated
14858 the regexp_engine.dupe method is used to copy any private data
14859 stored in the *pprivate pointer. This allows extensions to handle
14860 any duplication it needs to do.
14862 See pregfree() and regfree_internal() if you change anything here.
14864 #if defined(USE_ITHREADS)
14865 #ifndef PERL_IN_XSUB_RE
14867 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14871 const struct regexp *r = ReANY(sstr);
14872 struct regexp *ret = ReANY(dstr);
14874 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14876 npar = r->nparens+1;
14877 Newx(ret->offs, npar, regexp_paren_pair);
14878 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14880 /* no need to copy these */
14881 Newx(ret->swap, npar, regexp_paren_pair);
14884 if (ret->substrs) {
14885 /* Do it this way to avoid reading from *r after the StructCopy().
14886 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14887 cache, it doesn't matter. */
14888 const bool anchored = r->check_substr
14889 ? r->check_substr == r->anchored_substr
14890 : r->check_utf8 == r->anchored_utf8;
14891 Newx(ret->substrs, 1, struct reg_substr_data);
14892 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14894 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14895 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14896 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14897 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14899 /* check_substr and check_utf8, if non-NULL, point to either their
14900 anchored or float namesakes, and don't hold a second reference. */
14902 if (ret->check_substr) {
14904 assert(r->check_utf8 == r->anchored_utf8);
14905 ret->check_substr = ret->anchored_substr;
14906 ret->check_utf8 = ret->anchored_utf8;
14908 assert(r->check_substr == r->float_substr);
14909 assert(r->check_utf8 == r->float_utf8);
14910 ret->check_substr = ret->float_substr;
14911 ret->check_utf8 = ret->float_utf8;
14913 } else if (ret->check_utf8) {
14915 ret->check_utf8 = ret->anchored_utf8;
14917 ret->check_utf8 = ret->float_utf8;
14922 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14923 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14926 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14928 if (RX_MATCH_COPIED(dstr))
14929 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14931 ret->subbeg = NULL;
14932 #ifdef PERL_ANY_COW
14933 ret->saved_copy = NULL;
14936 /* Whether mother_re be set or no, we need to copy the string. We
14937 cannot refrain from copying it when the storage points directly to
14938 our mother regexp, because that's
14939 1: a buffer in a different thread
14940 2: something we no longer hold a reference on
14941 so we need to copy it locally. */
14942 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14943 ret->mother_re = NULL;
14946 #endif /* PERL_IN_XSUB_RE */
14951 This is the internal complement to regdupe() which is used to copy
14952 the structure pointed to by the *pprivate pointer in the regexp.
14953 This is the core version of the extension overridable cloning hook.
14954 The regexp structure being duplicated will be copied by perl prior
14955 to this and will be provided as the regexp *r argument, however
14956 with the /old/ structures pprivate pointer value. Thus this routine
14957 may override any copying normally done by perl.
14959 It returns a pointer to the new regexp_internal structure.
14963 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14966 struct regexp *const r = ReANY(rx);
14967 regexp_internal *reti;
14969 RXi_GET_DECL(r,ri);
14971 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14975 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14976 Copy(ri->program, reti->program, len+1, regnode);
14978 reti->num_code_blocks = ri->num_code_blocks;
14979 if (ri->code_blocks) {
14981 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14982 struct reg_code_block);
14983 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14984 struct reg_code_block);
14985 for (n = 0; n < ri->num_code_blocks; n++)
14986 reti->code_blocks[n].src_regex = (REGEXP*)
14987 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14990 reti->code_blocks = NULL;
14992 reti->regstclass = NULL;
14995 struct reg_data *d;
14996 const int count = ri->data->count;
14999 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15000 char, struct reg_data);
15001 Newx(d->what, count, U8);
15004 for (i = 0; i < count; i++) {
15005 d->what[i] = ri->data->what[i];
15006 switch (d->what[i]) {
15007 /* see also regcomp.h and regfree_internal() */
15008 case 'a': /* actually an AV, but the dup function is identical. */
15012 case 'u': /* actually an HV, but the dup function is identical. */
15013 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15016 /* This is cheating. */
15017 Newx(d->data[i], 1, struct regnode_charclass_class);
15018 StructCopy(ri->data->data[i], d->data[i],
15019 struct regnode_charclass_class);
15020 reti->regstclass = (regnode*)d->data[i];
15023 /* Trie stclasses are readonly and can thus be shared
15024 * without duplication. We free the stclass in pregfree
15025 * when the corresponding reg_ac_data struct is freed.
15027 reti->regstclass= ri->regstclass;
15031 ((reg_trie_data*)ri->data->data[i])->refcount++;
15036 d->data[i] = ri->data->data[i];
15039 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15048 reti->name_list_idx = ri->name_list_idx;
15050 #ifdef RE_TRACK_PATTERN_OFFSETS
15051 if (ri->u.offsets) {
15052 Newx(reti->u.offsets, 2*len+1, U32);
15053 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15056 SetProgLen(reti,len);
15059 return (void*)reti;
15062 #endif /* USE_ITHREADS */
15064 #ifndef PERL_IN_XSUB_RE
15067 - regnext - dig the "next" pointer out of a node
15070 Perl_regnext(pTHX_ regnode *p)
15078 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15079 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15082 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15091 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15094 STRLEN l1 = strlen(pat1);
15095 STRLEN l2 = strlen(pat2);
15098 const char *message;
15100 PERL_ARGS_ASSERT_RE_CROAK2;
15106 Copy(pat1, buf, l1 , char);
15107 Copy(pat2, buf + l1, l2 , char);
15108 buf[l1 + l2] = '\n';
15109 buf[l1 + l2 + 1] = '\0';
15111 /* ANSI variant takes additional second argument */
15112 va_start(args, pat2);
15116 msv = vmess(buf, &args);
15118 message = SvPV_const(msv,l1);
15121 Copy(message, buf, l1 , char);
15122 buf[l1-1] = '\0'; /* Overwrite \n */
15123 Perl_croak(aTHX_ "%s", buf);
15126 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15128 #ifndef PERL_IN_XSUB_RE
15130 Perl_save_re_context(pTHX)
15134 struct re_save_state *state;
15136 SAVEVPTR(PL_curcop);
15137 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15139 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15140 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15141 SSPUSHUV(SAVEt_RE_STATE);
15143 Copy(&PL_reg_state, state, 1, struct re_save_state);
15145 PL_reg_oldsaved = NULL;
15146 PL_reg_oldsavedlen = 0;
15147 PL_reg_oldsavedoffset = 0;
15148 PL_reg_oldsavedcoffset = 0;
15149 PL_reg_maxiter = 0;
15150 PL_reg_leftiter = 0;
15151 PL_reg_poscache = NULL;
15152 PL_reg_poscache_size = 0;
15153 #ifdef PERL_ANY_COW
15157 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15159 const REGEXP * const rx = PM_GETRE(PL_curpm);
15162 for (i = 1; i <= RX_NPARENS(rx); i++) {
15163 char digits[TYPE_CHARS(long)];
15164 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15165 GV *const *const gvp
15166 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15169 GV * const gv = *gvp;
15170 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15182 S_put_byte(pTHX_ SV *sv, int c)
15184 PERL_ARGS_ASSERT_PUT_BYTE;
15186 /* Our definition of isPRINT() ignores locales, so only bytes that are
15187 not part of UTF-8 are considered printable. I assume that the same
15188 holds for UTF-EBCDIC.
15189 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15190 which Wikipedia says:
15192 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15193 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15194 identical, to the ASCII delete (DEL) or rubout control character.
15195 ) So the old condition can be simplified to !isPRINT(c) */
15198 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15201 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15205 const char string = c;
15206 if (c == '-' || c == ']' || c == '\\' || c == '^')
15207 sv_catpvs(sv, "\\");
15208 sv_catpvn(sv, &string, 1);
15213 #define CLEAR_OPTSTART \
15214 if (optstart) STMT_START { \
15215 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15219 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15221 STATIC const regnode *
15222 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15223 const regnode *last, const regnode *plast,
15224 SV* sv, I32 indent, U32 depth)
15227 U8 op = PSEUDO; /* Arbitrary non-END op. */
15228 const regnode *next;
15229 const regnode *optstart= NULL;
15231 RXi_GET_DECL(r,ri);
15232 GET_RE_DEBUG_FLAGS_DECL;
15234 PERL_ARGS_ASSERT_DUMPUNTIL;
15236 #ifdef DEBUG_DUMPUNTIL
15237 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15238 last ? last-start : 0,plast ? plast-start : 0);
15241 if (plast && plast < last)
15244 while (PL_regkind[op] != END && (!last || node < last)) {
15245 /* While that wasn't END last time... */
15248 if (op == CLOSE || op == WHILEM)
15250 next = regnext((regnode *)node);
15253 if (OP(node) == OPTIMIZED) {
15254 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15261 regprop(r, sv, node);
15262 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15263 (int)(2*indent + 1), "", SvPVX_const(sv));
15265 if (OP(node) != OPTIMIZED) {
15266 if (next == NULL) /* Next ptr. */
15267 PerlIO_printf(Perl_debug_log, " (0)");
15268 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15269 PerlIO_printf(Perl_debug_log, " (FAIL)");
15271 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15272 (void)PerlIO_putc(Perl_debug_log, '\n');
15276 if (PL_regkind[(U8)op] == BRANCHJ) {
15279 const regnode *nnode = (OP(next) == LONGJMP
15280 ? regnext((regnode *)next)
15282 if (last && nnode > last)
15284 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15287 else if (PL_regkind[(U8)op] == BRANCH) {
15289 DUMPUNTIL(NEXTOPER(node), next);
15291 else if ( PL_regkind[(U8)op] == TRIE ) {
15292 const regnode *this_trie = node;
15293 const char op = OP(node);
15294 const U32 n = ARG(node);
15295 const reg_ac_data * const ac = op>=AHOCORASICK ?
15296 (reg_ac_data *)ri->data->data[n] :
15298 const reg_trie_data * const trie =
15299 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15301 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15303 const regnode *nextbranch= NULL;
15306 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15307 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15309 PerlIO_printf(Perl_debug_log, "%*s%s ",
15310 (int)(2*(indent+3)), "",
15311 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15312 PL_colors[0], PL_colors[1],
15313 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15314 PERL_PV_PRETTY_ELLIPSES |
15315 PERL_PV_PRETTY_LTGT
15320 U16 dist= trie->jump[word_idx+1];
15321 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15322 (UV)((dist ? this_trie + dist : next) - start));
15325 nextbranch= this_trie + trie->jump[0];
15326 DUMPUNTIL(this_trie + dist, nextbranch);
15328 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15329 nextbranch= regnext((regnode *)nextbranch);
15331 PerlIO_printf(Perl_debug_log, "\n");
15334 if (last && next > last)
15339 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15340 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15341 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15343 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15345 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15347 else if ( op == PLUS || op == STAR) {
15348 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15350 else if (PL_regkind[(U8)op] == ANYOF) {
15351 /* arglen 1 + class block */
15352 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15353 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15354 node = NEXTOPER(node);
15356 else if (PL_regkind[(U8)op] == EXACT) {
15357 /* Literal string, where present. */
15358 node += NODE_SZ_STR(node) - 1;
15359 node = NEXTOPER(node);
15362 node = NEXTOPER(node);
15363 node += regarglen[(U8)op];
15365 if (op == CURLYX || op == OPEN)
15369 #ifdef DEBUG_DUMPUNTIL
15370 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15375 #endif /* DEBUGGING */
15379 * c-indentation-style: bsd
15380 * c-basic-offset: 4
15381 * indent-tabs-mode: nil
15384 * ex: set ts=8 sts=4 sw=4 et: