5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
92 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
100 # if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102 # pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104 # pragma optimize("w",on )
105 # endif /* BUGGY_MSC6 */
109 #define STATIC static
113 typedef struct RExC_state_t {
114 U32 flags; /* RXf_* are we folding, multilining? */
115 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
116 char *precomp; /* uncompiled string. */
117 REGEXP *rx_sv; /* The SV that is the regexp. */
118 regexp *rx; /* perl core regexp structure */
119 regexp_internal *rxi; /* internal data for regexp object pprivate field */
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit_bound; /* First regnode outside of the allocated space */
126 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 I32 size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN). */
132 I32 cpar; /* Capture buffer count, (CLOSE). */
133 I32 nestroot; /* root parens we are in - used by accept */
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
152 I32 override_recoding;
153 struct reg_code_block *code_blocks; /* positions of literal (?{})
155 int num_code_blocks; /* size of code_blocks[] */
156 int code_index; /* next code_blocks[] slot */
158 char *starttry; /* -Dr: where regtry was called. */
159 #define RExC_starttry (pRExC_state->starttry)
161 SV *runtime_code_qr; /* qr with the runtime code blocks */
163 const char *lastparse;
165 AV *paren_name_list; /* idx -> name */
166 #define RExC_lastparse (pRExC_state->lastparse)
167 #define RExC_lastnum (pRExC_state->lastnum)
168 #define RExC_paren_name_list (pRExC_state->paren_name_list)
172 #define RExC_flags (pRExC_state->flags)
173 #define RExC_pm_flags (pRExC_state->pm_flags)
174 #define RExC_precomp (pRExC_state->precomp)
175 #define RExC_rx_sv (pRExC_state->rx_sv)
176 #define RExC_rx (pRExC_state->rx)
177 #define RExC_rxi (pRExC_state->rxi)
178 #define RExC_start (pRExC_state->start)
179 #define RExC_end (pRExC_state->end)
180 #define RExC_parse (pRExC_state->parse)
181 #define RExC_whilem_seen (pRExC_state->whilem_seen)
182 #ifdef RE_TRACK_PATTERN_OFFSETS
183 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
185 #define RExC_emit (pRExC_state->emit)
186 #define RExC_emit_start (pRExC_state->emit_start)
187 #define RExC_emit_bound (pRExC_state->emit_bound)
188 #define RExC_naughty (pRExC_state->naughty)
189 #define RExC_sawback (pRExC_state->sawback)
190 #define RExC_seen (pRExC_state->seen)
191 #define RExC_size (pRExC_state->size)
192 #define RExC_npar (pRExC_state->npar)
193 #define RExC_nestroot (pRExC_state->nestroot)
194 #define RExC_extralen (pRExC_state->extralen)
195 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
196 #define RExC_utf8 (pRExC_state->utf8)
197 #define RExC_uni_semantics (pRExC_state->uni_semantics)
198 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
199 #define RExC_open_parens (pRExC_state->open_parens)
200 #define RExC_close_parens (pRExC_state->close_parens)
201 #define RExC_opend (pRExC_state->opend)
202 #define RExC_paren_names (pRExC_state->paren_names)
203 #define RExC_recurse (pRExC_state->recurse)
204 #define RExC_recurse_count (pRExC_state->recurse_count)
205 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
206 #define RExC_contains_locale (pRExC_state->contains_locale)
207 #define RExC_override_recoding (pRExC_state->override_recoding)
210 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
212 ((*s) == '{' && regcurly(s)))
215 #undef SPSTART /* dratted cpp namespace... */
218 * Flags to be passed up and down.
220 #define WORST 0 /* Worst case. */
221 #define HASWIDTH 0x01 /* Known to match non-null strings. */
223 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
224 * character, and if utf8, must be invariant. Note that this is not the same
225 * thing as REGNODE_SIMPLE */
227 #define SPSTART 0x04 /* Starts with * or +. */
228 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
229 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
231 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
233 /* whether trie related optimizations are enabled */
234 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
235 #define TRIE_STUDY_OPT
236 #define FULL_TRIE_STUDY
242 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
243 #define PBITVAL(paren) (1 << ((paren) & 7))
244 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
245 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
246 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
248 /* If not already in utf8, do a longjmp back to the beginning */
249 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
250 #define REQUIRE_UTF8 STMT_START { \
251 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
254 /* About scan_data_t.
256 During optimisation we recurse through the regexp program performing
257 various inplace (keyhole style) optimisations. In addition study_chunk
258 and scan_commit populate this data structure with information about
259 what strings MUST appear in the pattern. We look for the longest
260 string that must appear at a fixed location, and we look for the
261 longest string that may appear at a floating location. So for instance
266 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
267 strings (because they follow a .* construct). study_chunk will identify
268 both FOO and BAR as being the longest fixed and floating strings respectively.
270 The strings can be composites, for instance
274 will result in a composite fixed substring 'foo'.
276 For each string some basic information is maintained:
278 - offset or min_offset
279 This is the position the string must appear at, or not before.
280 It also implicitly (when combined with minlenp) tells us how many
281 characters must match before the string we are searching for.
282 Likewise when combined with minlenp and the length of the string it
283 tells us how many characters must appear after the string we have
287 Only used for floating strings. This is the rightmost point that
288 the string can appear at. If set to I32 max it indicates that the
289 string can occur infinitely far to the right.
292 A pointer to the minimum length of the pattern that the string
293 was found inside. This is important as in the case of positive
294 lookahead or positive lookbehind we can have multiple patterns
299 The minimum length of the pattern overall is 3, the minimum length
300 of the lookahead part is 3, but the minimum length of the part that
301 will actually match is 1. So 'FOO's minimum length is 3, but the
302 minimum length for the F is 1. This is important as the minimum length
303 is used to determine offsets in front of and behind the string being
304 looked for. Since strings can be composites this is the length of the
305 pattern at the time it was committed with a scan_commit. Note that
306 the length is calculated by study_chunk, so that the minimum lengths
307 are not known until the full pattern has been compiled, thus the
308 pointer to the value.
312 In the case of lookbehind the string being searched for can be
313 offset past the start point of the final matching string.
314 If this value was just blithely removed from the min_offset it would
315 invalidate some of the calculations for how many chars must match
316 before or after (as they are derived from min_offset and minlen and
317 the length of the string being searched for).
318 When the final pattern is compiled and the data is moved from the
319 scan_data_t structure into the regexp structure the information
320 about lookbehind is factored in, with the information that would
321 have been lost precalculated in the end_shift field for the
324 The fields pos_min and pos_delta are used to store the minimum offset
325 and the delta to the maximum offset at the current point in the pattern.
329 typedef struct scan_data_t {
330 /*I32 len_min; unused */
331 /*I32 len_delta; unused */
335 I32 last_end; /* min value, <0 unless valid. */
338 SV **longest; /* Either &l_fixed, or &l_float. */
339 SV *longest_fixed; /* longest fixed string found in pattern */
340 I32 offset_fixed; /* offset where it starts */
341 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
342 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
343 SV *longest_float; /* longest floating string found in pattern */
344 I32 offset_float_min; /* earliest point in string it can appear */
345 I32 offset_float_max; /* latest point in string it can appear */
346 I32 *minlen_float; /* pointer to the minlen relevant to the string */
347 I32 lookbehind_float; /* is the position of the string modified by LB */
351 struct regnode_charclass_class *start_class;
355 * Forward declarations for pregcomp()'s friends.
358 static const scan_data_t zero_scan_data =
359 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
361 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
362 #define SF_BEFORE_SEOL 0x0001
363 #define SF_BEFORE_MEOL 0x0002
364 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
365 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
368 # define SF_FIX_SHIFT_EOL (0+2)
369 # define SF_FL_SHIFT_EOL (0+4)
371 # define SF_FIX_SHIFT_EOL (+2)
372 # define SF_FL_SHIFT_EOL (+4)
375 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
376 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
378 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
379 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
380 #define SF_IS_INF 0x0040
381 #define SF_HAS_PAR 0x0080
382 #define SF_IN_PAR 0x0100
383 #define SF_HAS_EVAL 0x0200
384 #define SCF_DO_SUBSTR 0x0400
385 #define SCF_DO_STCLASS_AND 0x0800
386 #define SCF_DO_STCLASS_OR 0x1000
387 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
388 #define SCF_WHILEM_VISITED_POS 0x2000
390 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
391 #define SCF_SEEN_ACCEPT 0x8000
393 #define UTF cBOOL(RExC_utf8)
395 /* The enums for all these are ordered so things work out correctly */
396 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
397 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
398 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
399 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
400 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
401 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
402 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
404 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
406 #define OOB_NAMEDCLASS -1
408 /* There is no code point that is out-of-bounds, so this is problematic. But
409 * its only current use is to initialize a variable that is always set before
411 #define OOB_UNICODE 0xDEADBEEF
413 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
414 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
417 /* length of regex to show in messages that don't mark a position within */
418 #define RegexLengthToShowInErrorMessages 127
421 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
422 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
423 * op/pragma/warn/regcomp.
425 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
426 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
428 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
431 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
432 * arg. Show regex, up to a maximum length. If it's too long, chop and add
435 #define _FAIL(code) STMT_START { \
436 const char *ellipses = ""; \
437 IV len = RExC_end - RExC_precomp; \
440 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
441 if (len > RegexLengthToShowInErrorMessages) { \
442 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
443 len = RegexLengthToShowInErrorMessages - 10; \
449 #define FAIL(msg) _FAIL( \
450 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
451 msg, (int)len, RExC_precomp, ellipses))
453 #define FAIL2(msg,arg) _FAIL( \
454 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
455 arg, (int)len, RExC_precomp, ellipses))
458 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
460 #define Simple_vFAIL(m) STMT_START { \
461 const IV offset = RExC_parse - RExC_precomp; \
462 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
463 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
467 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
469 #define vFAIL(m) STMT_START { \
471 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
476 * Like Simple_vFAIL(), but accepts two arguments.
478 #define Simple_vFAIL2(m,a1) STMT_START { \
479 const IV offset = RExC_parse - RExC_precomp; \
480 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
485 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
487 #define vFAIL2(m,a1) STMT_START { \
489 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
490 Simple_vFAIL2(m, a1); \
495 * Like Simple_vFAIL(), but accepts three arguments.
497 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
498 const IV offset = RExC_parse - RExC_precomp; \
499 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
500 (int)offset, RExC_precomp, RExC_precomp + offset); \
504 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
506 #define vFAIL3(m,a1,a2) STMT_START { \
508 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
509 Simple_vFAIL3(m, a1, a2); \
513 * Like Simple_vFAIL(), but accepts four arguments.
515 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
516 const IV offset = RExC_parse - RExC_precomp; \
517 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
518 (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define ckWARNreg(loc,m) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 (int)offset, RExC_precomp, RExC_precomp + offset); \
527 #define ckWARNregdep(loc,m) STMT_START { \
528 const IV offset = loc - RExC_precomp; \
529 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
531 (int)offset, RExC_precomp, RExC_precomp + offset); \
534 #define ckWARN2regdep(loc,m, a1) STMT_START { \
535 const IV offset = loc - RExC_precomp; \
536 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
538 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
541 #define ckWARN2reg(loc, m, a1) STMT_START { \
542 const IV offset = loc - RExC_precomp; \
543 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
544 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
547 #define vWARN3(loc, m, a1, a2) STMT_START { \
548 const IV offset = loc - RExC_precomp; \
549 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
550 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
553 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
554 const IV offset = loc - RExC_precomp; \
555 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
559 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
560 const IV offset = loc - RExC_precomp; \
561 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
562 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
566 const IV offset = loc - RExC_precomp; \
567 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
568 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
574 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
578 /* Allow for side effects in s */
579 #define REGC(c,s) STMT_START { \
580 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
583 /* Macros for recording node offsets. 20001227 mjd@plover.com
584 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
585 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
586 * Element 0 holds the number n.
587 * Position is 1 indexed.
589 #ifndef RE_TRACK_PATTERN_OFFSETS
590 #define Set_Node_Offset_To_R(node,byte)
591 #define Set_Node_Offset(node,byte)
592 #define Set_Cur_Node_Offset
593 #define Set_Node_Length_To_R(node,len)
594 #define Set_Node_Length(node,len)
595 #define Set_Node_Cur_Length(node)
596 #define Node_Offset(n)
597 #define Node_Length(n)
598 #define Set_Node_Offset_Length(node,offset,len)
599 #define ProgLen(ri) ri->u.proglen
600 #define SetProgLen(ri,x) ri->u.proglen = x
602 #define ProgLen(ri) ri->u.offsets[0]
603 #define SetProgLen(ri,x) ri->u.offsets[0] = x
604 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
606 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
607 __LINE__, (int)(node), (int)(byte))); \
609 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
611 RExC_offsets[2*(node)-1] = (byte); \
616 #define Set_Node_Offset(node,byte) \
617 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
618 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
620 #define Set_Node_Length_To_R(node,len) STMT_START { \
622 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
623 __LINE__, (int)(node), (int)(len))); \
625 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
627 RExC_offsets[2*(node)] = (len); \
632 #define Set_Node_Length(node,len) \
633 Set_Node_Length_To_R((node)-RExC_emit_start, len)
634 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
635 #define Set_Node_Cur_Length(node) \
636 Set_Node_Length(node, RExC_parse - parse_start)
638 /* Get offsets and lengths */
639 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
640 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
642 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
643 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
644 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
648 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
649 #define EXPERIMENTAL_INPLACESCAN
650 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
652 #define DEBUG_STUDYDATA(str,data,depth) \
653 DEBUG_OPTIMISE_MORE_r(if(data){ \
654 PerlIO_printf(Perl_debug_log, \
655 "%*s" str "Pos:%"IVdf"/%"IVdf \
656 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
657 (int)(depth)*2, "", \
658 (IV)((data)->pos_min), \
659 (IV)((data)->pos_delta), \
660 (UV)((data)->flags), \
661 (IV)((data)->whilem_c), \
662 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
663 is_inf ? "INF " : "" \
665 if ((data)->last_found) \
666 PerlIO_printf(Perl_debug_log, \
667 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
668 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
669 SvPVX_const((data)->last_found), \
670 (IV)((data)->last_end), \
671 (IV)((data)->last_start_min), \
672 (IV)((data)->last_start_max), \
673 ((data)->longest && \
674 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
675 SvPVX_const((data)->longest_fixed), \
676 (IV)((data)->offset_fixed), \
677 ((data)->longest && \
678 (data)->longest==&((data)->longest_float)) ? "*" : "", \
679 SvPVX_const((data)->longest_float), \
680 (IV)((data)->offset_float_min), \
681 (IV)((data)->offset_float_max) \
683 PerlIO_printf(Perl_debug_log,"\n"); \
686 static void clear_re(pTHX_ void *r);
688 /* Mark that we cannot extend a found fixed substring at this point.
689 Update the longest found anchored substring and the longest found
690 floating substrings if needed. */
693 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
695 const STRLEN l = CHR_SVLEN(data->last_found);
696 const STRLEN old_l = CHR_SVLEN(*data->longest);
697 GET_RE_DEBUG_FLAGS_DECL;
699 PERL_ARGS_ASSERT_SCAN_COMMIT;
701 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
702 SvSetMagicSV(*data->longest, data->last_found);
703 if (*data->longest == data->longest_fixed) {
704 data->offset_fixed = l ? data->last_start_min : data->pos_min;
705 if (data->flags & SF_BEFORE_EOL)
707 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
709 data->flags &= ~SF_FIX_BEFORE_EOL;
710 data->minlen_fixed=minlenp;
711 data->lookbehind_fixed=0;
713 else { /* *data->longest == data->longest_float */
714 data->offset_float_min = l ? data->last_start_min : data->pos_min;
715 data->offset_float_max = (l
716 ? data->last_start_max
717 : data->pos_min + data->pos_delta);
718 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
719 data->offset_float_max = I32_MAX;
720 if (data->flags & SF_BEFORE_EOL)
722 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
724 data->flags &= ~SF_FL_BEFORE_EOL;
725 data->minlen_float=minlenp;
726 data->lookbehind_float=0;
729 SvCUR_set(data->last_found, 0);
731 SV * const sv = data->last_found;
732 if (SvUTF8(sv) && SvMAGICAL(sv)) {
733 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
739 data->flags &= ~SF_BEFORE_EOL;
740 DEBUG_STUDYDATA("commit: ",data,0);
743 /* Can match anything (initialization) */
745 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
747 PERL_ARGS_ASSERT_CL_ANYTHING;
749 ANYOF_BITMAP_SETALL(cl);
750 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
751 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
753 /* If any portion of the regex is to operate under locale rules,
754 * initialization includes it. The reason this isn't done for all regexes
755 * is that the optimizer was written under the assumption that locale was
756 * all-or-nothing. Given the complexity and lack of documentation in the
757 * optimizer, and that there are inadequate test cases for locale, so many
758 * parts of it may not work properly, it is safest to avoid locale unless
760 if (RExC_contains_locale) {
761 ANYOF_CLASS_SETALL(cl); /* /l uses class */
762 cl->flags |= ANYOF_LOCALE;
765 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
769 /* Can match anything (initialization) */
771 S_cl_is_anything(const struct regnode_charclass_class *cl)
775 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
777 for (value = 0; value <= ANYOF_MAX; value += 2)
778 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
780 if (!(cl->flags & ANYOF_UNICODE_ALL))
782 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
787 /* Can match anything (initialization) */
789 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
791 PERL_ARGS_ASSERT_CL_INIT;
793 Zero(cl, 1, struct regnode_charclass_class);
795 cl_anything(pRExC_state, cl);
796 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
799 /* These two functions currently do the exact same thing */
800 #define cl_init_zero S_cl_init
802 /* 'AND' a given class with another one. Can create false positives. 'cl'
803 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
804 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
806 S_cl_and(struct regnode_charclass_class *cl,
807 const struct regnode_charclass_class *and_with)
809 PERL_ARGS_ASSERT_CL_AND;
811 assert(and_with->type == ANYOF);
813 /* I (khw) am not sure all these restrictions are necessary XXX */
814 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
815 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
816 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
817 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
818 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
821 if (and_with->flags & ANYOF_INVERT)
822 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
823 cl->bitmap[i] &= ~and_with->bitmap[i];
825 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
826 cl->bitmap[i] &= and_with->bitmap[i];
827 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
829 if (and_with->flags & ANYOF_INVERT) {
831 /* Here, the and'ed node is inverted. Get the AND of the flags that
832 * aren't affected by the inversion. Those that are affected are
833 * handled individually below */
834 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
835 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
836 cl->flags |= affected_flags;
838 /* We currently don't know how to deal with things that aren't in the
839 * bitmap, but we know that the intersection is no greater than what
840 * is already in cl, so let there be false positives that get sorted
841 * out after the synthetic start class succeeds, and the node is
842 * matched for real. */
844 /* The inversion of these two flags indicate that the resulting
845 * intersection doesn't have them */
846 if (and_with->flags & ANYOF_UNICODE_ALL) {
847 cl->flags &= ~ANYOF_UNICODE_ALL;
849 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
850 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
853 else { /* and'd node is not inverted */
854 U8 outside_bitmap_but_not_utf8; /* Temp variable */
856 if (! ANYOF_NONBITMAP(and_with)) {
858 /* Here 'and_with' doesn't match anything outside the bitmap
859 * (except possibly ANYOF_UNICODE_ALL), which means the
860 * intersection can't either, except for ANYOF_UNICODE_ALL, in
861 * which case we don't know what the intersection is, but it's no
862 * greater than what cl already has, so can just leave it alone,
863 * with possible false positives */
864 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
865 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
866 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
869 else if (! ANYOF_NONBITMAP(cl)) {
871 /* Here, 'and_with' does match something outside the bitmap, and cl
872 * doesn't have a list of things to match outside the bitmap. If
873 * cl can match all code points above 255, the intersection will
874 * be those above-255 code points that 'and_with' matches. If cl
875 * can't match all Unicode code points, it means that it can't
876 * match anything outside the bitmap (since the 'if' that got us
877 * into this block tested for that), so we leave the bitmap empty.
879 if (cl->flags & ANYOF_UNICODE_ALL) {
880 ARG_SET(cl, ARG(and_with));
882 /* and_with's ARG may match things that don't require UTF8.
883 * And now cl's will too, in spite of this being an 'and'. See
884 * the comments below about the kludge */
885 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
889 /* Here, both 'and_with' and cl match something outside the
890 * bitmap. Currently we do not do the intersection, so just match
891 * whatever cl had at the beginning. */
895 /* Take the intersection of the two sets of flags. However, the
896 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
897 * kludge around the fact that this flag is not treated like the others
898 * which are initialized in cl_anything(). The way the optimizer works
899 * is that the synthetic start class (SSC) is initialized to match
900 * anything, and then the first time a real node is encountered, its
901 * values are AND'd with the SSC's with the result being the values of
902 * the real node. However, there are paths through the optimizer where
903 * the AND never gets called, so those initialized bits are set
904 * inappropriately, which is not usually a big deal, as they just cause
905 * false positives in the SSC, which will just mean a probably
906 * imperceptible slow down in execution. However this bit has a
907 * higher false positive consequence in that it can cause utf8.pm,
908 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
909 * bigger slowdown and also causes significant extra memory to be used.
910 * In order to prevent this, the code now takes a different tack. The
911 * bit isn't set unless some part of the regular expression needs it,
912 * but once set it won't get cleared. This means that these extra
913 * modules won't get loaded unless there was some path through the
914 * pattern that would have required them anyway, and so any false
915 * positives that occur by not ANDing them out when they could be
916 * aren't as severe as they would be if we treated this bit like all
918 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
919 & ANYOF_NONBITMAP_NON_UTF8;
920 cl->flags &= and_with->flags;
921 cl->flags |= outside_bitmap_but_not_utf8;
925 /* 'OR' a given class with another one. Can create false positives. 'cl'
926 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
927 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
929 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
931 PERL_ARGS_ASSERT_CL_OR;
933 if (or_with->flags & ANYOF_INVERT) {
935 /* Here, the or'd node is to be inverted. This means we take the
936 * complement of everything not in the bitmap, but currently we don't
937 * know what that is, so give up and match anything */
938 if (ANYOF_NONBITMAP(or_with)) {
939 cl_anything(pRExC_state, cl);
942 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
943 * <= (B1 | !B2) | (CL1 | !CL2)
944 * which is wasteful if CL2 is small, but we ignore CL2:
945 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
946 * XXXX Can we handle case-fold? Unclear:
947 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
948 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
950 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
951 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
952 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
955 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
956 cl->bitmap[i] |= ~or_with->bitmap[i];
957 } /* XXXX: logic is complicated otherwise */
959 cl_anything(pRExC_state, cl);
962 /* And, we can just take the union of the flags that aren't affected
963 * by the inversion */
964 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
966 /* For the remaining flags:
967 ANYOF_UNICODE_ALL and inverted means to not match anything above
968 255, which means that the union with cl should just be
969 what cl has in it, so can ignore this flag
970 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
971 is 127-255 to match them, but then invert that, so the
972 union with cl should just be what cl has in it, so can
975 } else { /* 'or_with' is not inverted */
976 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
977 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
978 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
979 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
982 /* OR char bitmap and class bitmap separately */
983 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
984 cl->bitmap[i] |= or_with->bitmap[i];
985 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
986 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
987 cl->classflags[i] |= or_with->classflags[i];
988 cl->flags |= ANYOF_CLASS;
991 else { /* XXXX: logic is complicated, leave it along for a moment. */
992 cl_anything(pRExC_state, cl);
995 if (ANYOF_NONBITMAP(or_with)) {
997 /* Use the added node's outside-the-bit-map match if there isn't a
998 * conflict. If there is a conflict (both nodes match something
999 * outside the bitmap, but what they match outside is not the same
1000 * pointer, and hence not easily compared until XXX we extend
1001 * inversion lists this far), give up and allow the start class to
1002 * match everything outside the bitmap. If that stuff is all above
1003 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1004 if (! ANYOF_NONBITMAP(cl)) {
1005 ARG_SET(cl, ARG(or_with));
1007 else if (ARG(cl) != ARG(or_with)) {
1009 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1010 cl_anything(pRExC_state, cl);
1013 cl->flags |= ANYOF_UNICODE_ALL;
1018 /* Take the union */
1019 cl->flags |= or_with->flags;
1023 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1024 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1025 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1026 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1031 dump_trie(trie,widecharmap,revcharmap)
1032 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1033 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1035 These routines dump out a trie in a somewhat readable format.
1036 The _interim_ variants are used for debugging the interim
1037 tables that are used to generate the final compressed
1038 representation which is what dump_trie expects.
1040 Part of the reason for their existence is to provide a form
1041 of documentation as to how the different representations function.
1046 Dumps the final compressed table form of the trie to Perl_debug_log.
1047 Used for debugging make_trie().
1051 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1052 AV *revcharmap, U32 depth)
1055 SV *sv=sv_newmortal();
1056 int colwidth= widecharmap ? 6 : 4;
1058 GET_RE_DEBUG_FLAGS_DECL;
1060 PERL_ARGS_ASSERT_DUMP_TRIE;
1062 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1063 (int)depth * 2 + 2,"",
1064 "Match","Base","Ofs" );
1066 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1067 SV ** const tmp = av_fetch( revcharmap, state, 0);
1069 PerlIO_printf( Perl_debug_log, "%*s",
1071 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1072 PL_colors[0], PL_colors[1],
1073 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1074 PERL_PV_ESCAPE_FIRSTCHAR
1079 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1080 (int)depth * 2 + 2,"");
1082 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1083 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1084 PerlIO_printf( Perl_debug_log, "\n");
1086 for( state = 1 ; state < trie->statecount ; state++ ) {
1087 const U32 base = trie->states[ state ].trans.base;
1089 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1091 if ( trie->states[ state ].wordnum ) {
1092 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1094 PerlIO_printf( Perl_debug_log, "%6s", "" );
1097 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1102 while( ( base + ofs < trie->uniquecharcount ) ||
1103 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1104 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1107 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1109 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1110 if ( ( base + ofs >= trie->uniquecharcount ) &&
1111 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1112 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1114 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1116 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1118 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1122 PerlIO_printf( Perl_debug_log, "]");
1125 PerlIO_printf( Perl_debug_log, "\n" );
1127 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1128 for (word=1; word <= trie->wordcount; word++) {
1129 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1130 (int)word, (int)(trie->wordinfo[word].prev),
1131 (int)(trie->wordinfo[word].len));
1133 PerlIO_printf(Perl_debug_log, "\n" );
1136 Dumps a fully constructed but uncompressed trie in list form.
1137 List tries normally only are used for construction when the number of
1138 possible chars (trie->uniquecharcount) is very high.
1139 Used for debugging make_trie().
1142 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1143 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1147 SV *sv=sv_newmortal();
1148 int colwidth= widecharmap ? 6 : 4;
1149 GET_RE_DEBUG_FLAGS_DECL;
1151 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1153 /* print out the table precompression. */
1154 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1155 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1156 "------:-----+-----------------\n" );
1158 for( state=1 ; state < next_alloc ; state ++ ) {
1161 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1162 (int)depth * 2 + 2,"", (UV)state );
1163 if ( ! trie->states[ state ].wordnum ) {
1164 PerlIO_printf( Perl_debug_log, "%5s| ","");
1166 PerlIO_printf( Perl_debug_log, "W%4x| ",
1167 trie->states[ state ].wordnum
1170 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1171 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1173 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1175 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1176 PL_colors[0], PL_colors[1],
1177 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1178 PERL_PV_ESCAPE_FIRSTCHAR
1180 TRIE_LIST_ITEM(state,charid).forid,
1181 (UV)TRIE_LIST_ITEM(state,charid).newstate
1184 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1185 (int)((depth * 2) + 14), "");
1188 PerlIO_printf( Perl_debug_log, "\n");
1193 Dumps a fully constructed but uncompressed trie in table form.
1194 This is the normal DFA style state transition table, with a few
1195 twists to facilitate compression later.
1196 Used for debugging make_trie().
1199 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1200 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1205 SV *sv=sv_newmortal();
1206 int colwidth= widecharmap ? 6 : 4;
1207 GET_RE_DEBUG_FLAGS_DECL;
1209 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1212 print out the table precompression so that we can do a visual check
1213 that they are identical.
1216 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1218 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1219 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1221 PerlIO_printf( Perl_debug_log, "%*s",
1223 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1224 PL_colors[0], PL_colors[1],
1225 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1226 PERL_PV_ESCAPE_FIRSTCHAR
1232 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1234 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1235 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1238 PerlIO_printf( Perl_debug_log, "\n" );
1240 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1242 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1243 (int)depth * 2 + 2,"",
1244 (UV)TRIE_NODENUM( state ) );
1246 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1247 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1249 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1251 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1253 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1254 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1256 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1257 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1265 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1266 startbranch: the first branch in the whole branch sequence
1267 first : start branch of sequence of branch-exact nodes.
1268 May be the same as startbranch
1269 last : Thing following the last branch.
1270 May be the same as tail.
1271 tail : item following the branch sequence
1272 count : words in the sequence
1273 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1274 depth : indent depth
1276 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1278 A trie is an N'ary tree where the branches are determined by digital
1279 decomposition of the key. IE, at the root node you look up the 1st character and
1280 follow that branch repeat until you find the end of the branches. Nodes can be
1281 marked as "accepting" meaning they represent a complete word. Eg:
1285 would convert into the following structure. Numbers represent states, letters
1286 following numbers represent valid transitions on the letter from that state, if
1287 the number is in square brackets it represents an accepting state, otherwise it
1288 will be in parenthesis.
1290 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1294 (1) +-i->(6)-+-s->[7]
1296 +-s->(3)-+-h->(4)-+-e->[5]
1298 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1300 This shows that when matching against the string 'hers' we will begin at state 1
1301 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1302 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1303 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1304 single traverse. We store a mapping from accepting to state to which word was
1305 matched, and then when we have multiple possibilities we try to complete the
1306 rest of the regex in the order in which they occured in the alternation.
1308 The only prior NFA like behaviour that would be changed by the TRIE support is
1309 the silent ignoring of duplicate alternations which are of the form:
1311 / (DUPE|DUPE) X? (?{ ... }) Y /x
1313 Thus EVAL blocks following a trie may be called a different number of times with
1314 and without the optimisation. With the optimisations dupes will be silently
1315 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1316 the following demonstrates:
1318 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1320 which prints out 'word' three times, but
1322 'words'=~/(word|word|word)(?{ print $1 })S/
1324 which doesnt print it out at all. This is due to other optimisations kicking in.
1326 Example of what happens on a structural level:
1328 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1330 1: CURLYM[1] {1,32767}(18)
1341 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1342 and should turn into:
1344 1: CURLYM[1] {1,32767}(18)
1346 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1354 Cases where tail != last would be like /(?foo|bar)baz/:
1364 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1365 and would end up looking like:
1368 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1375 d = uvuni_to_utf8_flags(d, uv, 0);
1377 is the recommended Unicode-aware way of saying
1382 #define TRIE_STORE_REVCHAR(val) \
1385 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1386 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1387 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1388 SvCUR_set(zlopp, kapow - flrbbbbb); \
1391 av_push(revcharmap, zlopp); \
1393 char ooooff = (char)val; \
1394 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1398 #define TRIE_READ_CHAR STMT_START { \
1401 /* if it is UTF then it is either already folded, or does not need folding */ \
1402 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1404 else if (folder == PL_fold_latin1) { \
1405 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1406 if ( foldlen > 0 ) { \
1407 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1413 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1414 skiplen = UNISKIP(uvc); \
1415 foldlen -= skiplen; \
1416 scan = foldbuf + skiplen; \
1419 /* raw data, will be folded later if needed */ \
1427 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1428 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1429 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1430 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1432 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1433 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1434 TRIE_LIST_CUR( state )++; \
1437 #define TRIE_LIST_NEW(state) STMT_START { \
1438 Newxz( trie->states[ state ].trans.list, \
1439 4, reg_trie_trans_le ); \
1440 TRIE_LIST_CUR( state ) = 1; \
1441 TRIE_LIST_LEN( state ) = 4; \
1444 #define TRIE_HANDLE_WORD(state) STMT_START { \
1445 U16 dupe= trie->states[ state ].wordnum; \
1446 regnode * const noper_next = regnext( noper ); \
1449 /* store the word for dumping */ \
1451 if (OP(noper) != NOTHING) \
1452 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1454 tmp = newSVpvn_utf8( "", 0, UTF ); \
1455 av_push( trie_words, tmp ); \
1459 trie->wordinfo[curword].prev = 0; \
1460 trie->wordinfo[curword].len = wordlen; \
1461 trie->wordinfo[curword].accept = state; \
1463 if ( noper_next < tail ) { \
1465 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1466 trie->jump[curword] = (U16)(noper_next - convert); \
1468 jumper = noper_next; \
1470 nextbranch= regnext(cur); \
1474 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1475 /* chain, so that when the bits of chain are later */\
1476 /* linked together, the dups appear in the chain */\
1477 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1478 trie->wordinfo[dupe].prev = curword; \
1480 /* we haven't inserted this word yet. */ \
1481 trie->states[ state ].wordnum = curword; \
1486 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1487 ( ( base + charid >= ucharcount \
1488 && base + charid < ubound \
1489 && state == trie->trans[ base - ucharcount + charid ].check \
1490 && trie->trans[ base - ucharcount + charid ].next ) \
1491 ? trie->trans[ base - ucharcount + charid ].next \
1492 : ( state==1 ? special : 0 ) \
1496 #define MADE_JUMP_TRIE 2
1497 #define MADE_EXACT_TRIE 4
1500 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1503 /* first pass, loop through and scan words */
1504 reg_trie_data *trie;
1505 HV *widecharmap = NULL;
1506 AV *revcharmap = newAV();
1508 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1513 regnode *jumper = NULL;
1514 regnode *nextbranch = NULL;
1515 regnode *convert = NULL;
1516 U32 *prev_states; /* temp array mapping each state to previous one */
1517 /* we just use folder as a flag in utf8 */
1518 const U8 * folder = NULL;
1521 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1522 AV *trie_words = NULL;
1523 /* along with revcharmap, this only used during construction but both are
1524 * useful during debugging so we store them in the struct when debugging.
1527 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1528 STRLEN trie_charcount=0;
1530 SV *re_trie_maxbuff;
1531 GET_RE_DEBUG_FLAGS_DECL;
1533 PERL_ARGS_ASSERT_MAKE_TRIE;
1535 PERL_UNUSED_ARG(depth);
1542 case EXACTFU_TRICKYFOLD:
1543 case EXACTFU: folder = PL_fold_latin1; break;
1544 case EXACTF: folder = PL_fold; break;
1545 case EXACTFL: folder = PL_fold_locale; break;
1546 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1549 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1551 trie->startstate = 1;
1552 trie->wordcount = word_count;
1553 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1554 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1556 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1557 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1558 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1561 trie_words = newAV();
1564 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1565 if (!SvIOK(re_trie_maxbuff)) {
1566 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1568 DEBUG_TRIE_COMPILE_r({
1569 PerlIO_printf( Perl_debug_log,
1570 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1571 (int)depth * 2 + 2, "",
1572 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1573 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1577 /* Find the node we are going to overwrite */
1578 if ( first == startbranch && OP( last ) != BRANCH ) {
1579 /* whole branch chain */
1582 /* branch sub-chain */
1583 convert = NEXTOPER( first );
1586 /* -- First loop and Setup --
1588 We first traverse the branches and scan each word to determine if it
1589 contains widechars, and how many unique chars there are, this is
1590 important as we have to build a table with at least as many columns as we
1593 We use an array of integers to represent the character codes 0..255
1594 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1595 native representation of the character value as the key and IV's for the
1598 *TODO* If we keep track of how many times each character is used we can
1599 remap the columns so that the table compression later on is more
1600 efficient in terms of memory by ensuring the most common value is in the
1601 middle and the least common are on the outside. IMO this would be better
1602 than a most to least common mapping as theres a decent chance the most
1603 common letter will share a node with the least common, meaning the node
1604 will not be compressible. With a middle is most common approach the worst
1605 case is when we have the least common nodes twice.
1609 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1610 regnode *noper = NEXTOPER( cur );
1611 const U8 *uc = (U8*)STRING( noper );
1612 const U8 *e = uc + STR_LEN( noper );
1614 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1616 const U8 *scan = (U8*)NULL;
1617 U32 wordlen = 0; /* required init */
1619 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1621 if (OP(noper) == NOTHING) {
1622 regnode *noper_next= regnext(noper);
1623 if (noper_next != tail && OP(noper_next) == flags) {
1625 uc= (U8*)STRING(noper);
1626 e= uc + STR_LEN(noper);
1627 trie->minlen= STR_LEN(noper);
1634 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1635 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1636 regardless of encoding */
1637 if (OP( noper ) == EXACTFU_SS) {
1638 /* false positives are ok, so just set this */
1639 TRIE_BITMAP_SET(trie,0xDF);
1642 for ( ; uc < e ; uc += len ) {
1643 TRIE_CHARCOUNT(trie)++;
1648 U8 folded= folder[ (U8) uvc ];
1649 if ( !trie->charmap[ folded ] ) {
1650 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1651 TRIE_STORE_REVCHAR( folded );
1654 if ( !trie->charmap[ uvc ] ) {
1655 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1656 TRIE_STORE_REVCHAR( uvc );
1659 /* store the codepoint in the bitmap, and its folded
1661 TRIE_BITMAP_SET(trie, uvc);
1663 /* store the folded codepoint */
1664 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1667 /* store first byte of utf8 representation of
1668 variant codepoints */
1669 if (! UNI_IS_INVARIANT(uvc)) {
1670 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1673 set_bit = 0; /* We've done our bit :-) */
1678 widecharmap = newHV();
1680 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1683 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1685 if ( !SvTRUE( *svpp ) ) {
1686 sv_setiv( *svpp, ++trie->uniquecharcount );
1687 TRIE_STORE_REVCHAR(uvc);
1691 if( cur == first ) {
1692 trie->minlen = chars;
1693 trie->maxlen = chars;
1694 } else if (chars < trie->minlen) {
1695 trie->minlen = chars;
1696 } else if (chars > trie->maxlen) {
1697 trie->maxlen = chars;
1699 if (OP( noper ) == EXACTFU_SS) {
1700 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1701 if (trie->minlen > 1)
1704 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1705 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1706 * - We assume that any such sequence might match a 2 byte string */
1707 if (trie->minlen > 2 )
1711 } /* end first pass */
1712 DEBUG_TRIE_COMPILE_r(
1713 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1714 (int)depth * 2 + 2,"",
1715 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1716 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1717 (int)trie->minlen, (int)trie->maxlen )
1721 We now know what we are dealing with in terms of unique chars and
1722 string sizes so we can calculate how much memory a naive
1723 representation using a flat table will take. If it's over a reasonable
1724 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1725 conservative but potentially much slower representation using an array
1728 At the end we convert both representations into the same compressed
1729 form that will be used in regexec.c for matching with. The latter
1730 is a form that cannot be used to construct with but has memory
1731 properties similar to the list form and access properties similar
1732 to the table form making it both suitable for fast searches and
1733 small enough that its feasable to store for the duration of a program.
1735 See the comment in the code where the compressed table is produced
1736 inplace from the flat tabe representation for an explanation of how
1737 the compression works.
1742 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1745 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1747 Second Pass -- Array Of Lists Representation
1749 Each state will be represented by a list of charid:state records
1750 (reg_trie_trans_le) the first such element holds the CUR and LEN
1751 points of the allocated array. (See defines above).
1753 We build the initial structure using the lists, and then convert
1754 it into the compressed table form which allows faster lookups
1755 (but cant be modified once converted).
1758 STRLEN transcount = 1;
1760 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1761 "%*sCompiling trie using list compiler\n",
1762 (int)depth * 2 + 2, ""));
1764 trie->states = (reg_trie_state *)
1765 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1766 sizeof(reg_trie_state) );
1770 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1772 regnode *noper = NEXTOPER( cur );
1773 U8 *uc = (U8*)STRING( noper );
1774 const U8 *e = uc + STR_LEN( noper );
1775 U32 state = 1; /* required init */
1776 U16 charid = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1778 STRLEN foldlen = 0; /* required init */
1779 U32 wordlen = 0; /* required init */
1780 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1783 if (OP(noper) == NOTHING) {
1784 regnode *noper_next= regnext(noper);
1785 if (noper_next != tail && OP(noper_next) == flags) {
1787 uc= (U8*)STRING(noper);
1788 e= uc + STR_LEN(noper);
1792 if (OP(noper) != NOTHING) {
1793 for ( ; uc < e ; uc += len ) {
1798 charid = trie->charmap[ uvc ];
1800 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1804 charid=(U16)SvIV( *svpp );
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1814 if ( !trie->states[ state ].trans.list ) {
1815 TRIE_LIST_NEW( state );
1817 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1818 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1819 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1824 newstate = next_alloc++;
1825 prev_states[newstate] = state;
1826 TRIE_LIST_PUSH( state, charid, newstate );
1831 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1835 TRIE_HANDLE_WORD(state);
1837 } /* end second pass */
1839 /* next alloc is the NEXT state to be allocated */
1840 trie->statecount = next_alloc;
1841 trie->states = (reg_trie_state *)
1842 PerlMemShared_realloc( trie->states,
1844 * sizeof(reg_trie_state) );
1846 /* and now dump it out before we compress it */
1847 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1848 revcharmap, next_alloc,
1852 trie->trans = (reg_trie_trans *)
1853 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1860 for( state=1 ; state < next_alloc ; state ++ ) {
1864 DEBUG_TRIE_COMPILE_MORE_r(
1865 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1869 if (trie->states[state].trans.list) {
1870 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1874 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1875 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1876 if ( forid < minid ) {
1878 } else if ( forid > maxid ) {
1882 if ( transcount < tp + maxid - minid + 1) {
1884 trie->trans = (reg_trie_trans *)
1885 PerlMemShared_realloc( trie->trans,
1887 * sizeof(reg_trie_trans) );
1888 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1890 base = trie->uniquecharcount + tp - minid;
1891 if ( maxid == minid ) {
1893 for ( ; zp < tp ; zp++ ) {
1894 if ( ! trie->trans[ zp ].next ) {
1895 base = trie->uniquecharcount + zp - minid;
1896 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1897 trie->trans[ zp ].check = state;
1903 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1904 trie->trans[ tp ].check = state;
1909 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1910 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1911 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1912 trie->trans[ tid ].check = state;
1914 tp += ( maxid - minid + 1 );
1916 Safefree(trie->states[ state ].trans.list);
1919 DEBUG_TRIE_COMPILE_MORE_r(
1920 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1923 trie->states[ state ].trans.base=base;
1925 trie->lasttrans = tp + 1;
1929 Second Pass -- Flat Table Representation.
1931 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1932 We know that we will need Charcount+1 trans at most to store the data
1933 (one row per char at worst case) So we preallocate both structures
1934 assuming worst case.
1936 We then construct the trie using only the .next slots of the entry
1939 We use the .check field of the first entry of the node temporarily to
1940 make compression both faster and easier by keeping track of how many non
1941 zero fields are in the node.
1943 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1946 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1947 number representing the first entry of the node, and state as a
1948 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1949 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1950 are 2 entrys per node. eg:
1958 The table is internally in the right hand, idx form. However as we also
1959 have to deal with the states array which is indexed by nodenum we have to
1960 use TRIE_NODENUM() to convert.
1963 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1964 "%*sCompiling trie using table compiler\n",
1965 (int)depth * 2 + 2, ""));
1967 trie->trans = (reg_trie_trans *)
1968 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1969 * trie->uniquecharcount + 1,
1970 sizeof(reg_trie_trans) );
1971 trie->states = (reg_trie_state *)
1972 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1973 sizeof(reg_trie_state) );
1974 next_alloc = trie->uniquecharcount + 1;
1977 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1979 regnode *noper = NEXTOPER( cur );
1980 const U8 *uc = (U8*)STRING( noper );
1981 const U8 *e = uc + STR_LEN( noper );
1983 U32 state = 1; /* required init */
1985 U16 charid = 0; /* sanity init */
1986 U32 accept_state = 0; /* sanity init */
1987 U8 *scan = (U8*)NULL; /* sanity init */
1989 STRLEN foldlen = 0; /* required init */
1990 U32 wordlen = 0; /* required init */
1992 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1994 if (OP(noper) == NOTHING) {
1995 regnode *noper_next= regnext(noper);
1996 if (noper_next != tail && OP(noper_next) == flags) {
1998 uc= (U8*)STRING(noper);
1999 e= uc + STR_LEN(noper);
2003 if ( OP(noper) != NOTHING ) {
2004 for ( ; uc < e ; uc += len ) {
2009 charid = trie->charmap[ uvc ];
2011 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2012 charid = svpp ? (U16)SvIV(*svpp) : 0;
2016 if ( !trie->trans[ state + charid ].next ) {
2017 trie->trans[ state + charid ].next = next_alloc;
2018 trie->trans[ state ].check++;
2019 prev_states[TRIE_NODENUM(next_alloc)]
2020 = TRIE_NODENUM(state);
2021 next_alloc += trie->uniquecharcount;
2023 state = trie->trans[ state + charid ].next;
2025 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2027 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2030 accept_state = TRIE_NODENUM( state );
2031 TRIE_HANDLE_WORD(accept_state);
2033 } /* end second pass */
2035 /* and now dump it out before we compress it */
2036 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2038 next_alloc, depth+1));
2042 * Inplace compress the table.*
2044 For sparse data sets the table constructed by the trie algorithm will
2045 be mostly 0/FAIL transitions or to put it another way mostly empty.
2046 (Note that leaf nodes will not contain any transitions.)
2048 This algorithm compresses the tables by eliminating most such
2049 transitions, at the cost of a modest bit of extra work during lookup:
2051 - Each states[] entry contains a .base field which indicates the
2052 index in the state[] array wheres its transition data is stored.
2054 - If .base is 0 there are no valid transitions from that node.
2056 - If .base is nonzero then charid is added to it to find an entry in
2059 -If trans[states[state].base+charid].check!=state then the
2060 transition is taken to be a 0/Fail transition. Thus if there are fail
2061 transitions at the front of the node then the .base offset will point
2062 somewhere inside the previous nodes data (or maybe even into a node
2063 even earlier), but the .check field determines if the transition is
2067 The following process inplace converts the table to the compressed
2068 table: We first do not compress the root node 1,and mark all its
2069 .check pointers as 1 and set its .base pointer as 1 as well. This
2070 allows us to do a DFA construction from the compressed table later,
2071 and ensures that any .base pointers we calculate later are greater
2074 - We set 'pos' to indicate the first entry of the second node.
2076 - We then iterate over the columns of the node, finding the first and
2077 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2078 and set the .check pointers accordingly, and advance pos
2079 appropriately and repreat for the next node. Note that when we copy
2080 the next pointers we have to convert them from the original
2081 NODEIDX form to NODENUM form as the former is not valid post
2084 - If a node has no transitions used we mark its base as 0 and do not
2085 advance the pos pointer.
2087 - If a node only has one transition we use a second pointer into the
2088 structure to fill in allocated fail transitions from other states.
2089 This pointer is independent of the main pointer and scans forward
2090 looking for null transitions that are allocated to a state. When it
2091 finds one it writes the single transition into the "hole". If the
2092 pointer doesnt find one the single transition is appended as normal.
2094 - Once compressed we can Renew/realloc the structures to release the
2097 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2098 specifically Fig 3.47 and the associated pseudocode.
2102 const U32 laststate = TRIE_NODENUM( next_alloc );
2105 trie->statecount = laststate;
2107 for ( state = 1 ; state < laststate ; state++ ) {
2109 const U32 stateidx = TRIE_NODEIDX( state );
2110 const U32 o_used = trie->trans[ stateidx ].check;
2111 U32 used = trie->trans[ stateidx ].check;
2112 trie->trans[ stateidx ].check = 0;
2114 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2115 if ( flag || trie->trans[ stateidx + charid ].next ) {
2116 if ( trie->trans[ stateidx + charid ].next ) {
2118 for ( ; zp < pos ; zp++ ) {
2119 if ( ! trie->trans[ zp ].next ) {
2123 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2124 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2125 trie->trans[ zp ].check = state;
2126 if ( ++zp > pos ) pos = zp;
2133 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2135 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2136 trie->trans[ pos ].check = state;
2141 trie->lasttrans = pos + 1;
2142 trie->states = (reg_trie_state *)
2143 PerlMemShared_realloc( trie->states, laststate
2144 * sizeof(reg_trie_state) );
2145 DEBUG_TRIE_COMPILE_MORE_r(
2146 PerlIO_printf( Perl_debug_log,
2147 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2148 (int)depth * 2 + 2,"",
2149 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2152 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2155 } /* end table compress */
2157 DEBUG_TRIE_COMPILE_MORE_r(
2158 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2159 (int)depth * 2 + 2, "",
2160 (UV)trie->statecount,
2161 (UV)trie->lasttrans)
2163 /* resize the trans array to remove unused space */
2164 trie->trans = (reg_trie_trans *)
2165 PerlMemShared_realloc( trie->trans, trie->lasttrans
2166 * sizeof(reg_trie_trans) );
2168 { /* Modify the program and insert the new TRIE node */
2169 U8 nodetype =(U8)(flags & 0xFF);
2173 regnode *optimize = NULL;
2174 #ifdef RE_TRACK_PATTERN_OFFSETS
2177 U32 mjd_nodelen = 0;
2178 #endif /* RE_TRACK_PATTERN_OFFSETS */
2179 #endif /* DEBUGGING */
2181 This means we convert either the first branch or the first Exact,
2182 depending on whether the thing following (in 'last') is a branch
2183 or not and whther first is the startbranch (ie is it a sub part of
2184 the alternation or is it the whole thing.)
2185 Assuming its a sub part we convert the EXACT otherwise we convert
2186 the whole branch sequence, including the first.
2188 /* Find the node we are going to overwrite */
2189 if ( first != startbranch || OP( last ) == BRANCH ) {
2190 /* branch sub-chain */
2191 NEXT_OFF( first ) = (U16)(last - first);
2192 #ifdef RE_TRACK_PATTERN_OFFSETS
2194 mjd_offset= Node_Offset((convert));
2195 mjd_nodelen= Node_Length((convert));
2198 /* whole branch chain */
2200 #ifdef RE_TRACK_PATTERN_OFFSETS
2203 const regnode *nop = NEXTOPER( convert );
2204 mjd_offset= Node_Offset((nop));
2205 mjd_nodelen= Node_Length((nop));
2209 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2210 (int)depth * 2 + 2, "",
2211 (UV)mjd_offset, (UV)mjd_nodelen)
2214 /* But first we check to see if there is a common prefix we can
2215 split out as an EXACT and put in front of the TRIE node. */
2216 trie->startstate= 1;
2217 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2219 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2223 const U32 base = trie->states[ state ].trans.base;
2225 if ( trie->states[state].wordnum )
2228 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2229 if ( ( base + ofs >= trie->uniquecharcount ) &&
2230 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2231 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2233 if ( ++count > 1 ) {
2234 SV **tmp = av_fetch( revcharmap, ofs, 0);
2235 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2236 if ( state == 1 ) break;
2238 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2240 PerlIO_printf(Perl_debug_log,
2241 "%*sNew Start State=%"UVuf" Class: [",
2242 (int)depth * 2 + 2, "",
2245 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2246 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2248 TRIE_BITMAP_SET(trie,*ch);
2250 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2252 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2256 TRIE_BITMAP_SET(trie,*ch);
2258 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2259 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2265 SV **tmp = av_fetch( revcharmap, idx, 0);
2267 char *ch = SvPV( *tmp, len );
2269 SV *sv=sv_newmortal();
2270 PerlIO_printf( Perl_debug_log,
2271 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2272 (int)depth * 2 + 2, "",
2274 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2275 PL_colors[0], PL_colors[1],
2276 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2277 PERL_PV_ESCAPE_FIRSTCHAR
2282 OP( convert ) = nodetype;
2283 str=STRING(convert);
2286 STR_LEN(convert) += len;
2292 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2297 trie->prefixlen = (state-1);
2299 regnode *n = convert+NODE_SZ_STR(convert);
2300 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2301 trie->startstate = state;
2302 trie->minlen -= (state - 1);
2303 trie->maxlen -= (state - 1);
2305 /* At least the UNICOS C compiler choked on this
2306 * being argument to DEBUG_r(), so let's just have
2309 #ifdef PERL_EXT_RE_BUILD
2315 regnode *fix = convert;
2316 U32 word = trie->wordcount;
2318 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2319 while( ++fix < n ) {
2320 Set_Node_Offset_Length(fix, 0, 0);
2323 SV ** const tmp = av_fetch( trie_words, word, 0 );
2325 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2326 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2328 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2336 NEXT_OFF(convert) = (U16)(tail - convert);
2337 DEBUG_r(optimize= n);
2343 if ( trie->maxlen ) {
2344 NEXT_OFF( convert ) = (U16)(tail - convert);
2345 ARG_SET( convert, data_slot );
2346 /* Store the offset to the first unabsorbed branch in
2347 jump[0], which is otherwise unused by the jump logic.
2348 We use this when dumping a trie and during optimisation. */
2350 trie->jump[0] = (U16)(nextbranch - convert);
2352 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2353 * and there is a bitmap
2354 * and the first "jump target" node we found leaves enough room
2355 * then convert the TRIE node into a TRIEC node, with the bitmap
2356 * embedded inline in the opcode - this is hypothetically faster.
2358 if ( !trie->states[trie->startstate].wordnum
2360 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2362 OP( convert ) = TRIEC;
2363 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2364 PerlMemShared_free(trie->bitmap);
2367 OP( convert ) = TRIE;
2369 /* store the type in the flags */
2370 convert->flags = nodetype;
2374 + regarglen[ OP( convert ) ];
2376 /* XXX We really should free up the resource in trie now,
2377 as we won't use them - (which resources?) dmq */
2379 /* needed for dumping*/
2380 DEBUG_r(if (optimize) {
2381 regnode *opt = convert;
2383 while ( ++opt < optimize) {
2384 Set_Node_Offset_Length(opt,0,0);
2387 Try to clean up some of the debris left after the
2390 while( optimize < jumper ) {
2391 mjd_nodelen += Node_Length((optimize));
2392 OP( optimize ) = OPTIMIZED;
2393 Set_Node_Offset_Length(optimize,0,0);
2396 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2398 } /* end node insert */
2400 /* Finish populating the prev field of the wordinfo array. Walk back
2401 * from each accept state until we find another accept state, and if
2402 * so, point the first word's .prev field at the second word. If the
2403 * second already has a .prev field set, stop now. This will be the
2404 * case either if we've already processed that word's accept state,
2405 * or that state had multiple words, and the overspill words were
2406 * already linked up earlier.
2413 for (word=1; word <= trie->wordcount; word++) {
2415 if (trie->wordinfo[word].prev)
2417 state = trie->wordinfo[word].accept;
2419 state = prev_states[state];
2422 prev = trie->states[state].wordnum;
2426 trie->wordinfo[word].prev = prev;
2428 Safefree(prev_states);
2432 /* and now dump out the compressed format */
2433 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2435 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2437 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2438 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2440 SvREFCNT_dec(revcharmap);
2444 : trie->startstate>1
2450 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2452 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2454 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2455 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2458 We find the fail state for each state in the trie, this state is the longest proper
2459 suffix of the current state's 'word' that is also a proper prefix of another word in our
2460 trie. State 1 represents the word '' and is thus the default fail state. This allows
2461 the DFA not to have to restart after its tried and failed a word at a given point, it
2462 simply continues as though it had been matching the other word in the first place.
2464 'abcdgu'=~/abcdefg|cdgu/
2465 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2466 fail, which would bring us to the state representing 'd' in the second word where we would
2467 try 'g' and succeed, proceeding to match 'cdgu'.
2469 /* add a fail transition */
2470 const U32 trie_offset = ARG(source);
2471 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2473 const U32 ucharcount = trie->uniquecharcount;
2474 const U32 numstates = trie->statecount;
2475 const U32 ubound = trie->lasttrans + ucharcount;
2479 U32 base = trie->states[ 1 ].trans.base;
2482 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2483 GET_RE_DEBUG_FLAGS_DECL;
2485 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2487 PERL_UNUSED_ARG(depth);
2491 ARG_SET( stclass, data_slot );
2492 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2493 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2494 aho->trie=trie_offset;
2495 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2496 Copy( trie->states, aho->states, numstates, reg_trie_state );
2497 Newxz( q, numstates, U32);
2498 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2501 /* initialize fail[0..1] to be 1 so that we always have
2502 a valid final fail state */
2503 fail[ 0 ] = fail[ 1 ] = 1;
2505 for ( charid = 0; charid < ucharcount ; charid++ ) {
2506 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2508 q[ q_write ] = newstate;
2509 /* set to point at the root */
2510 fail[ q[ q_write++ ] ]=1;
2513 while ( q_read < q_write) {
2514 const U32 cur = q[ q_read++ % numstates ];
2515 base = trie->states[ cur ].trans.base;
2517 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2518 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2520 U32 fail_state = cur;
2523 fail_state = fail[ fail_state ];
2524 fail_base = aho->states[ fail_state ].trans.base;
2525 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2527 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2528 fail[ ch_state ] = fail_state;
2529 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2531 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2533 q[ q_write++ % numstates] = ch_state;
2537 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2538 when we fail in state 1, this allows us to use the
2539 charclass scan to find a valid start char. This is based on the principle
2540 that theres a good chance the string being searched contains lots of stuff
2541 that cant be a start char.
2543 fail[ 0 ] = fail[ 1 ] = 0;
2544 DEBUG_TRIE_COMPILE_r({
2545 PerlIO_printf(Perl_debug_log,
2546 "%*sStclass Failtable (%"UVuf" states): 0",
2547 (int)(depth * 2), "", (UV)numstates
2549 for( q_read=1; q_read<numstates; q_read++ ) {
2550 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2552 PerlIO_printf(Perl_debug_log, "\n");
2555 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2560 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2561 * These need to be revisited when a newer toolchain becomes available.
2563 #if defined(__sparc64__) && defined(__GNUC__)
2564 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2565 # undef SPARC64_GCC_WORKAROUND
2566 # define SPARC64_GCC_WORKAROUND 1
2570 #define DEBUG_PEEP(str,scan,depth) \
2571 DEBUG_OPTIMISE_r({if (scan){ \
2572 SV * const mysv=sv_newmortal(); \
2573 regnode *Next = regnext(scan); \
2574 regprop(RExC_rx, mysv, scan); \
2575 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2576 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2577 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2581 /* The below joins as many adjacent EXACTish nodes as possible into a single
2582 * one, and looks for problematic sequences of characters whose folds vs.
2583 * non-folds have sufficiently different lengths, that the optimizer would be
2584 * fooled into rejecting legitimate matches of them, and the trie construction
2585 * code needs to handle specially. The joining is only done if:
2586 * 1) there is room in the current conglomerated node to entirely contain the
2588 * 2) they are the exact same node type
2590 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2591 * these get optimized out
2593 * If there are problematic code sequences, *min_subtract is set to the delta
2594 * that the minimum size of the node can be less than its actual size. And,
2595 * the node type of the result is changed to reflect that it contains these
2598 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2599 * and contains LATIN SMALL LETTER SHARP S
2601 * This is as good a place as any to discuss the design of handling these
2602 * problematic sequences. It's been wrong in Perl for a very long time. There
2603 * are three code points currently in Unicode whose folded lengths differ so
2604 * much from the un-folded lengths that it causes problems for the optimizer
2605 * and trie construction. Why only these are problematic, and not others where
2606 * lengths also differ is something I (khw) do not understand. New versions of
2607 * Unicode might add more such code points. Hopefully the logic in
2608 * fold_grind.t that figures out what to test (in part by verifying that each
2609 * size-combination gets tested) will catch any that do come along, so they can
2610 * be added to the special handling below. The chances of new ones are
2611 * actually rather small, as most, if not all, of the world's scripts that have
2612 * casefolding have already been encoded by Unicode. Also, a number of
2613 * Unicode's decisions were made to allow compatibility with pre-existing
2614 * standards, and almost all of those have already been dealt with. These
2615 * would otherwise be the most likely candidates for generating further tricky
2616 * sequences. In other words, Unicode by itself is unlikely to add new ones
2617 * unless it is for compatibility with pre-existing standards, and there aren't
2618 * many of those left.
2620 * The previous designs for dealing with these involved assigning a special
2621 * node for them. This approach doesn't work, as evidenced by this example:
2622 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2623 * Both these fold to "sss", but if the pattern is parsed to create a node
2624 * that would match just the \xDF, it won't be able to handle the case where a
2625 * successful match would have to cross the node's boundary. The new approach
2626 * that hopefully generally solves the problem generates an EXACTFU_SS node
2629 * There are a number of components to the approach (a lot of work for just
2630 * three code points!):
2631 * 1) This routine examines each EXACTFish node that could contain the
2632 * problematic sequences. It returns in *min_subtract how much to
2633 * subtract from the the actual length of the string to get a real minimum
2634 * for one that could match it. This number is usually 0 except for the
2635 * problematic sequences. This delta is used by the caller to adjust the
2636 * min length of the match, and the delta between min and max, so that the
2637 * optimizer doesn't reject these possibilities based on size constraints.
2638 * 2) These sequences require special handling by the trie code, so this code
2639 * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
2641 * 3) This is sufficient for the two Greek sequences (described below), but
2642 * the one involving the Sharp s (\xDF) needs more. The node type
2643 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2644 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2645 * case where there is a possible fold length change. That means that a
2646 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2647 * itself with length changes, and so can be processed faster. regexec.c
2648 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2649 * is pre-folded by regcomp.c. This saves effort in regex matching.
2650 * However, the pre-folding isn't done for non-UTF8 patterns because the
2651 * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL
2652 * nodes fold to isn't known until runtime. The fold possibilities for
2653 * the non-UTF8 patterns are quite simple, except for the sharp s. All
2654 * the ones that don't involve a UTF-8 target string are members of a
2655 * fold-pair, and arrays are set up for all of them so that the other
2656 * member of the pair can be found quickly. Code elsewhere in this file
2657 * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
2658 * if the pattern isn't UTF-8. This avoids the issues described in the
2660 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2661 * 'ss' or not is not knowable at compile time. It will match iff the
2662 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2663 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2664 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2665 * described in item 3). An assumption that the optimizer part of
2666 * regexec.c (probably unwittingly) makes is that a character in the
2667 * pattern corresponds to at most a single character in the target string.
2668 * (And I do mean character, and not byte here, unlike other parts of the
2669 * documentation that have never been updated to account for multibyte
2670 * Unicode.) This assumption is wrong only in this case, as all other
2671 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2672 * virtue of having this file pre-fold UTF-8 patterns. I'm
2673 * reluctant to try to change this assumption, so instead the code punts.
2674 * This routine examines EXACTF nodes for the sharp s, and returns a
2675 * boolean indicating whether or not the node is an EXACTF node that
2676 * contains a sharp s. When it is true, the caller sets a flag that later
2677 * causes the optimizer in this file to not set values for the floating
2678 * and fixed string lengths, and thus avoids the optimizer code in
2679 * regexec.c that makes the invalid assumption. Thus, there is no
2680 * optimization based on string lengths for EXACTF nodes that contain the
2681 * sharp s. This only happens for /id rules (which means the pattern
2685 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2686 if (PL_regkind[OP(scan)] == EXACT) \
2687 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2690 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2691 /* Merge several consecutive EXACTish nodes into one. */
2692 regnode *n = regnext(scan);
2694 regnode *next = scan + NODE_SZ_STR(scan);
2698 regnode *stop = scan;
2699 GET_RE_DEBUG_FLAGS_DECL;
2701 PERL_UNUSED_ARG(depth);
2704 PERL_ARGS_ASSERT_JOIN_EXACT;
2705 #ifndef EXPERIMENTAL_INPLACESCAN
2706 PERL_UNUSED_ARG(flags);
2707 PERL_UNUSED_ARG(val);
2709 DEBUG_PEEP("join",scan,depth);
2711 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2712 * EXACT ones that are mergeable to the current one. */
2714 && (PL_regkind[OP(n)] == NOTHING
2715 || (stringok && OP(n) == OP(scan)))
2717 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2720 if (OP(n) == TAIL || n > next)
2722 if (PL_regkind[OP(n)] == NOTHING) {
2723 DEBUG_PEEP("skip:",n,depth);
2724 NEXT_OFF(scan) += NEXT_OFF(n);
2725 next = n + NODE_STEP_REGNODE;
2732 else if (stringok) {
2733 const unsigned int oldl = STR_LEN(scan);
2734 regnode * const nnext = regnext(n);
2736 /* XXX I (khw) kind of doubt that this works on platforms where
2737 * U8_MAX is above 255 because of lots of other assumptions */
2738 if (oldl + STR_LEN(n) > U8_MAX)
2741 DEBUG_PEEP("merg",n,depth);
2744 NEXT_OFF(scan) += NEXT_OFF(n);
2745 STR_LEN(scan) += STR_LEN(n);
2746 next = n + NODE_SZ_STR(n);
2747 /* Now we can overwrite *n : */
2748 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2756 #ifdef EXPERIMENTAL_INPLACESCAN
2757 if (flags && !NEXT_OFF(n)) {
2758 DEBUG_PEEP("atch", val, depth);
2759 if (reg_off_by_arg[OP(n)]) {
2760 ARG_SET(n, val - n);
2763 NEXT_OFF(n) = val - n;
2771 *has_exactf_sharp_s = FALSE;
2773 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2774 * can now analyze for sequences of problematic code points. (Prior to
2775 * this final joining, sequences could have been split over boundaries, and
2776 * hence missed). The sequences only happen in folding, hence for any
2777 * non-EXACT EXACTish node */
2778 if (OP(scan) != EXACT) {
2780 U8 * s0 = (U8*) STRING(scan);
2781 U8 * const s_end = s0 + STR_LEN(scan);
2783 /* The below is perhaps overboard, but this allows us to save a test
2784 * each time through the loop at the expense of a mask. This is
2785 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2786 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2787 * This uses an exclusive 'or' to find that bit and then inverts it to
2788 * form a mask, with just a single 0, in the bit position where 'S' and
2790 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2791 const U8 s_masked = 's' & S_or_s_mask;
2793 /* One pass is made over the node's string looking for all the
2794 * possibilities. to avoid some tests in the loop, there are two main
2795 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2799 /* There are two problematic Greek code points in Unicode
2802 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2803 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2809 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2810 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2812 * This means that in case-insensitive matching (or "loose
2813 * matching", as Unicode calls it), an EXACTF of length six (the
2814 * UTF-8 encoded byte length of the above casefolded versions) can
2815 * match a target string of length two (the byte length of UTF-8
2816 * encoded U+0390 or U+03B0). This would rather mess up the
2817 * minimum length computation. (there are other code points that
2818 * also fold to these two sequences, but the delta is smaller)
2820 * If these sequences are found, the minimum length is decreased by
2821 * four (six minus two).
2823 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2824 * LETTER SHARP S. We decrease the min length by 1 for each
2825 * occurrence of 'ss' found */
2827 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2828 # define U390_first_byte 0xb4
2829 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2830 # define U3B0_first_byte 0xb5
2831 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2833 # define U390_first_byte 0xce
2834 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2835 # define U3B0_first_byte 0xcf
2836 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2838 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2839 yields a net of 0 */
2840 /* Examine the string for one of the problematic sequences */
2842 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2843 * sequence we are looking for is 2 */
2847 /* Look for the first byte in each problematic sequence */
2849 /* We don't have to worry about other things that fold to
2850 * 's' (such as the long s, U+017F), as all above-latin1
2851 * code points have been pre-folded */
2855 /* Current character is an 's' or 'S'. If next one is
2856 * as well, we have the dreaded sequence */
2857 if (((*(s+1) & S_or_s_mask) == s_masked)
2858 /* These two node types don't have special handling
2860 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2863 OP(scan) = EXACTFU_SS;
2864 s++; /* No need to look at this character again */
2868 case U390_first_byte:
2869 if (s_end - s >= len
2871 /* The 1's are because are skipping comparing the
2873 && memEQ(s + 1, U390_tail, len - 1))
2875 goto greek_sequence;
2879 case U3B0_first_byte:
2880 if (! (s_end - s >= len
2881 && memEQ(s + 1, U3B0_tail, len - 1)))
2888 /* This requires special handling by trie's, so change
2889 * the node type to indicate this. If EXACTFA and
2890 * EXACTFL were ever to be handled by trie's, this
2891 * would have to be changed. If this node has already
2892 * been changed to EXACTFU_SS in this loop, leave it as
2893 * is. (I (khw) think it doesn't matter in regexec.c
2894 * for UTF patterns, but no need to change it */
2895 if (OP(scan) == EXACTFU) {
2896 OP(scan) = EXACTFU_TRICKYFOLD;
2898 s += 6; /* We already know what this sequence is. Skip
2904 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2906 /* Here, the pattern is not UTF-8. We need to look only for the
2907 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2908 * in the final position. Otherwise we can stop looking 1 byte
2909 * earlier because have to find both the first and second 's' */
2910 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2912 for (s = s0; s < upper; s++) {
2917 && ((*(s+1) & S_or_s_mask) == s_masked))
2921 /* EXACTF nodes need to know that the minimum
2922 * length changed so that a sharp s in the string
2923 * can match this ss in the pattern, but they
2924 * remain EXACTF nodes, as they won't match this
2925 * unless the target string is is UTF-8, which we
2926 * don't know until runtime */
2927 if (OP(scan) != EXACTF) {
2928 OP(scan) = EXACTFU_SS;
2933 case LATIN_SMALL_LETTER_SHARP_S:
2934 if (OP(scan) == EXACTF) {
2935 *has_exactf_sharp_s = TRUE;
2944 /* Allow dumping but overwriting the collection of skipped
2945 * ops and/or strings with fake optimized ops */
2946 n = scan + NODE_SZ_STR(scan);
2954 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2958 /* REx optimizer. Converts nodes into quicker variants "in place".
2959 Finds fixed substrings. */
2961 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2962 to the position after last scanned or to NULL. */
2964 #define INIT_AND_WITHP \
2965 assert(!and_withp); \
2966 Newx(and_withp,1,struct regnode_charclass_class); \
2967 SAVEFREEPV(and_withp)
2969 /* this is a chain of data about sub patterns we are processing that
2970 need to be handled separately/specially in study_chunk. Its so
2971 we can simulate recursion without losing state. */
2973 typedef struct scan_frame {
2974 regnode *last; /* last node to process in this frame */
2975 regnode *next; /* next node to process when last is reached */
2976 struct scan_frame *prev; /*previous frame*/
2977 I32 stop; /* what stopparen do we use */
2981 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2983 #define CASE_SYNST_FNC(nAmE) \
2985 if (flags & SCF_DO_STCLASS_AND) { \
2986 for (value = 0; value < 256; value++) \
2987 if (!is_ ## nAmE ## _cp(value)) \
2988 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2991 for (value = 0; value < 256; value++) \
2992 if (is_ ## nAmE ## _cp(value)) \
2993 ANYOF_BITMAP_SET(data->start_class, value); \
2997 if (flags & SCF_DO_STCLASS_AND) { \
2998 for (value = 0; value < 256; value++) \
2999 if (is_ ## nAmE ## _cp(value)) \
3000 ANYOF_BITMAP_CLEAR(data->start_class, value); \
3003 for (value = 0; value < 256; value++) \
3004 if (!is_ ## nAmE ## _cp(value)) \
3005 ANYOF_BITMAP_SET(data->start_class, value); \
3012 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3013 I32 *minlenp, I32 *deltap,
3018 struct regnode_charclass_class *and_withp,
3019 U32 flags, U32 depth)
3020 /* scanp: Start here (read-write). */
3021 /* deltap: Write maxlen-minlen here. */
3022 /* last: Stop before this one. */
3023 /* data: string data about the pattern */
3024 /* stopparen: treat close N as END */
3025 /* recursed: which subroutines have we recursed into */
3026 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3029 I32 min = 0, pars = 0, code;
3030 regnode *scan = *scanp, *next;
3032 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3033 int is_inf_internal = 0; /* The studied chunk is infinite */
3034 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3035 scan_data_t data_fake;
3036 SV *re_trie_maxbuff = NULL;
3037 regnode *first_non_open = scan;
3038 I32 stopmin = I32_MAX;
3039 scan_frame *frame = NULL;
3040 GET_RE_DEBUG_FLAGS_DECL;
3042 PERL_ARGS_ASSERT_STUDY_CHUNK;
3045 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3049 while (first_non_open && OP(first_non_open) == OPEN)
3050 first_non_open=regnext(first_non_open);
3055 while ( scan && OP(scan) != END && scan < last ){
3056 UV min_subtract = 0; /* How much to subtract from the minimum node
3057 length to get a real minimum (because the
3058 folded version may be shorter) */
3059 bool has_exactf_sharp_s = FALSE;
3060 /* Peephole optimizer: */
3061 DEBUG_STUDYDATA("Peep:", data,depth);
3062 DEBUG_PEEP("Peep",scan,depth);
3064 /* Its not clear to khw or hv why this is done here, and not in the
3065 * clauses that deal with EXACT nodes. khw's guess is that it's
3066 * because of a previous design */
3067 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3069 /* Follow the next-chain of the current node and optimize
3070 away all the NOTHINGs from it. */
3071 if (OP(scan) != CURLYX) {
3072 const int max = (reg_off_by_arg[OP(scan)]
3074 /* I32 may be smaller than U16 on CRAYs! */
3075 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3076 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3080 /* Skip NOTHING and LONGJMP. */
3081 while ((n = regnext(n))
3082 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3083 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3084 && off + noff < max)
3086 if (reg_off_by_arg[OP(scan)])
3089 NEXT_OFF(scan) = off;
3094 /* The principal pseudo-switch. Cannot be a switch, since we
3095 look into several different things. */
3096 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3097 || OP(scan) == IFTHEN) {
3098 next = regnext(scan);
3100 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3102 if (OP(next) == code || code == IFTHEN) {
3103 /* NOTE - There is similar code to this block below for handling
3104 TRIE nodes on a re-study. If you change stuff here check there
3106 I32 max1 = 0, min1 = I32_MAX, num = 0;
3107 struct regnode_charclass_class accum;
3108 regnode * const startbranch=scan;
3110 if (flags & SCF_DO_SUBSTR)
3111 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3112 if (flags & SCF_DO_STCLASS)
3113 cl_init_zero(pRExC_state, &accum);
3115 while (OP(scan) == code) {
3116 I32 deltanext, minnext, f = 0, fake;
3117 struct regnode_charclass_class this_class;
3120 data_fake.flags = 0;
3122 data_fake.whilem_c = data->whilem_c;
3123 data_fake.last_closep = data->last_closep;
3126 data_fake.last_closep = &fake;
3128 data_fake.pos_delta = delta;
3129 next = regnext(scan);
3130 scan = NEXTOPER(scan);
3132 scan = NEXTOPER(scan);
3133 if (flags & SCF_DO_STCLASS) {
3134 cl_init(pRExC_state, &this_class);
3135 data_fake.start_class = &this_class;
3136 f = SCF_DO_STCLASS_AND;
3138 if (flags & SCF_WHILEM_VISITED_POS)
3139 f |= SCF_WHILEM_VISITED_POS;
3141 /* we suppose the run is continuous, last=next...*/
3142 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3144 stopparen, recursed, NULL, f,depth+1);
3147 if (max1 < minnext + deltanext)
3148 max1 = minnext + deltanext;
3149 if (deltanext == I32_MAX)
3150 is_inf = is_inf_internal = 1;
3152 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3154 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3155 if ( stopmin > minnext)
3156 stopmin = min + min1;
3157 flags &= ~SCF_DO_SUBSTR;
3159 data->flags |= SCF_SEEN_ACCEPT;
3162 if (data_fake.flags & SF_HAS_EVAL)
3163 data->flags |= SF_HAS_EVAL;
3164 data->whilem_c = data_fake.whilem_c;
3166 if (flags & SCF_DO_STCLASS)
3167 cl_or(pRExC_state, &accum, &this_class);
3169 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3171 if (flags & SCF_DO_SUBSTR) {
3172 data->pos_min += min1;
3173 data->pos_delta += max1 - min1;
3174 if (max1 != min1 || is_inf)
3175 data->longest = &(data->longest_float);
3178 delta += max1 - min1;
3179 if (flags & SCF_DO_STCLASS_OR) {
3180 cl_or(pRExC_state, data->start_class, &accum);
3182 cl_and(data->start_class, and_withp);
3183 flags &= ~SCF_DO_STCLASS;
3186 else if (flags & SCF_DO_STCLASS_AND) {
3188 cl_and(data->start_class, &accum);
3189 flags &= ~SCF_DO_STCLASS;
3192 /* Switch to OR mode: cache the old value of
3193 * data->start_class */
3195 StructCopy(data->start_class, and_withp,
3196 struct regnode_charclass_class);
3197 flags &= ~SCF_DO_STCLASS_AND;
3198 StructCopy(&accum, data->start_class,
3199 struct regnode_charclass_class);
3200 flags |= SCF_DO_STCLASS_OR;
3201 data->start_class->flags |= ANYOF_EOS;
3205 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3208 Assuming this was/is a branch we are dealing with: 'scan' now
3209 points at the item that follows the branch sequence, whatever
3210 it is. We now start at the beginning of the sequence and look
3217 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3219 If we can find such a subsequence we need to turn the first
3220 element into a trie and then add the subsequent branch exact
3221 strings to the trie.
3225 1. patterns where the whole set of branches can be converted.
3227 2. patterns where only a subset can be converted.
3229 In case 1 we can replace the whole set with a single regop
3230 for the trie. In case 2 we need to keep the start and end
3233 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3234 becomes BRANCH TRIE; BRANCH X;
3236 There is an additional case, that being where there is a
3237 common prefix, which gets split out into an EXACT like node
3238 preceding the TRIE node.
3240 If x(1..n)==tail then we can do a simple trie, if not we make
3241 a "jump" trie, such that when we match the appropriate word
3242 we "jump" to the appropriate tail node. Essentially we turn
3243 a nested if into a case structure of sorts.
3248 if (!re_trie_maxbuff) {
3249 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3250 if (!SvIOK(re_trie_maxbuff))
3251 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3253 if ( SvIV(re_trie_maxbuff)>=0 ) {
3255 regnode *first = (regnode *)NULL;
3256 regnode *last = (regnode *)NULL;
3257 regnode *tail = scan;
3262 SV * const mysv = sv_newmortal(); /* for dumping */
3264 /* var tail is used because there may be a TAIL
3265 regop in the way. Ie, the exacts will point to the
3266 thing following the TAIL, but the last branch will
3267 point at the TAIL. So we advance tail. If we
3268 have nested (?:) we may have to move through several
3272 while ( OP( tail ) == TAIL ) {
3273 /* this is the TAIL generated by (?:) */
3274 tail = regnext( tail );
3278 DEBUG_TRIE_COMPILE_r({
3279 regprop(RExC_rx, mysv, tail );
3280 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3281 (int)depth * 2 + 2, "",
3282 "Looking for TRIE'able sequences. Tail node is: ",
3283 SvPV_nolen_const( mysv )
3289 Step through the branches
3290 cur represents each branch,
3291 noper is the first thing to be matched as part of that branch
3292 noper_next is the regnext() of that node.
3294 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3295 via a "jump trie" but we also support building with NOJUMPTRIE,
3296 which restricts the trie logic to structures like /FOO|BAR/.
3298 If noper is a trieable nodetype then the branch is a possible optimization
3299 target. If we are building under NOJUMPTRIE then we require that noper_next
3300 is the same as scan (our current position in the regex program).
3302 Once we have two or more consecutive such branches we can create a
3303 trie of the EXACT's contents and stitch it in place into the program.
3305 If the sequence represents all of the branches in the alternation we
3306 replace the entire thing with a single TRIE node.
3308 Otherwise when it is a subsequence we need to stitch it in place and
3309 replace only the relevant branches. This means the first branch has
3310 to remain as it is used by the alternation logic, and its next pointer,
3311 and needs to be repointed at the item on the branch chain following
3312 the last branch we have optimized away.
3314 This could be either a BRANCH, in which case the subsequence is internal,
3315 or it could be the item following the branch sequence in which case the
3316 subsequence is at the end (which does not necessarily mean the first node
3317 is the start of the alternation).
3319 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3322 ----------------+-----------
3326 EXACTFU_SS | EXACTFU
3327 EXACTFU_TRICKYFOLD | EXACTFU
3332 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3333 ( EXACT == (X) ) ? EXACT : \
3334 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3337 /* dont use tail as the end marker for this traverse */
3338 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3339 regnode * const noper = NEXTOPER( cur );
3340 U8 noper_type = OP( noper );
3341 U8 noper_trietype = TRIE_TYPE( noper_type );
3342 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3343 regnode * const noper_next = regnext( noper );
3344 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3345 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3348 DEBUG_TRIE_COMPILE_r({
3349 regprop(RExC_rx, mysv, cur);
3350 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3351 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3353 regprop(RExC_rx, mysv, noper);
3354 PerlIO_printf( Perl_debug_log, " -> %s",
3355 SvPV_nolen_const(mysv));
3358 regprop(RExC_rx, mysv, noper_next );
3359 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3360 SvPV_nolen_const(mysv));
3362 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3363 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3364 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3368 /* Is noper a trieable nodetype that can be merged with the
3369 * current trie (if there is one)? */
3373 ( noper_trietype == NOTHING)
3374 || ( trietype == NOTHING )
3375 || ( trietype == noper_trietype )
3378 && noper_next == tail
3382 /* Handle mergable triable node
3383 * Either we are the first node in a new trieable sequence,
3384 * in which case we do some bookkeeping, otherwise we update
3385 * the end pointer. */
3388 if ( noper_trietype == NOTHING ) {
3389 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3390 regnode * const noper_next = regnext( noper );
3391 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3392 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3395 if ( noper_next_trietype ) {
3396 trietype = noper_next_trietype;
3397 } else if (noper_next_type) {
3398 /* a NOTHING regop is 1 regop wide. We need at least two
3399 * for a trie so we can't merge this in */
3403 trietype = noper_trietype;
3406 if ( trietype == NOTHING )
3407 trietype = noper_trietype;
3412 } /* end handle mergable triable node */
3414 /* handle unmergable node -
3415 * noper may either be a triable node which can not be tried
3416 * together with the current trie, or a non triable node */
3418 /* If last is set and trietype is not NOTHING then we have found
3419 * at least two triable branch sequences in a row of a similar
3420 * trietype so we can turn them into a trie. If/when we
3421 * allow NOTHING to start a trie sequence this condition will be
3422 * required, and it isn't expensive so we leave it in for now. */
3423 if ( trietype != NOTHING )
3424 make_trie( pRExC_state,
3425 startbranch, first, cur, tail, count,
3426 trietype, depth+1 );
3427 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3431 && noper_next == tail
3434 /* noper is triable, so we can start a new trie sequence */
3437 trietype = noper_trietype;
3439 /* if we already saw a first but the current node is not triable then we have
3440 * to reset the first information. */
3445 } /* end handle unmergable node */
3446 } /* loop over branches */
3447 DEBUG_TRIE_COMPILE_r({
3448 regprop(RExC_rx, mysv, cur);
3449 PerlIO_printf( Perl_debug_log,
3450 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3451 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3455 if ( trietype != NOTHING ) {
3456 /* the last branch of the sequence was part of a trie,
3457 * so we have to construct it here outside of the loop
3459 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3460 #ifdef TRIE_STUDY_OPT
3461 if ( ((made == MADE_EXACT_TRIE &&
3462 startbranch == first)
3463 || ( first_non_open == first )) &&
3465 flags |= SCF_TRIE_RESTUDY;
3466 if ( startbranch == first
3469 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3474 /* at this point we know whatever we have is a NOTHING sequence/branch
3475 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3477 if ( startbranch == first ) {
3479 /* the entire thing is a NOTHING sequence, something like this:
3480 * (?:|) So we can turn it into a plain NOTHING op. */
3481 DEBUG_TRIE_COMPILE_r({
3482 regprop(RExC_rx, mysv, cur);
3483 PerlIO_printf( Perl_debug_log,
3484 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3485 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3488 OP(startbranch)= NOTHING;
3489 NEXT_OFF(startbranch)= tail - startbranch;
3490 for ( opt= startbranch + 1; opt < tail ; opt++ )
3494 } /* end if ( last) */
3495 } /* TRIE_MAXBUF is non zero */
3500 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3501 scan = NEXTOPER(NEXTOPER(scan));
3502 } else /* single branch is optimized. */
3503 scan = NEXTOPER(scan);
3505 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3506 scan_frame *newframe = NULL;
3511 if (OP(scan) != SUSPEND) {
3512 /* set the pointer */
3513 if (OP(scan) == GOSUB) {
3515 RExC_recurse[ARG2L(scan)] = scan;
3516 start = RExC_open_parens[paren-1];
3517 end = RExC_close_parens[paren-1];
3520 start = RExC_rxi->program + 1;
3524 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3525 SAVEFREEPV(recursed);
3527 if (!PAREN_TEST(recursed,paren+1)) {
3528 PAREN_SET(recursed,paren+1);
3529 Newx(newframe,1,scan_frame);
3531 if (flags & SCF_DO_SUBSTR) {
3532 SCAN_COMMIT(pRExC_state,data,minlenp);
3533 data->longest = &(data->longest_float);
3535 is_inf = is_inf_internal = 1;
3536 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3537 cl_anything(pRExC_state, data->start_class);
3538 flags &= ~SCF_DO_STCLASS;
3541 Newx(newframe,1,scan_frame);
3544 end = regnext(scan);
3549 SAVEFREEPV(newframe);
3550 newframe->next = regnext(scan);
3551 newframe->last = last;
3552 newframe->stop = stopparen;
3553 newframe->prev = frame;
3563 else if (OP(scan) == EXACT) {
3564 I32 l = STR_LEN(scan);
3567 const U8 * const s = (U8*)STRING(scan);
3568 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3569 l = utf8_length(s, s + l);
3571 uc = *((U8*)STRING(scan));
3574 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3575 /* The code below prefers earlier match for fixed
3576 offset, later match for variable offset. */
3577 if (data->last_end == -1) { /* Update the start info. */
3578 data->last_start_min = data->pos_min;
3579 data->last_start_max = is_inf
3580 ? I32_MAX : data->pos_min + data->pos_delta;
3582 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3584 SvUTF8_on(data->last_found);
3586 SV * const sv = data->last_found;
3587 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3588 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3589 if (mg && mg->mg_len >= 0)
3590 mg->mg_len += utf8_length((U8*)STRING(scan),
3591 (U8*)STRING(scan)+STR_LEN(scan));
3593 data->last_end = data->pos_min + l;
3594 data->pos_min += l; /* As in the first entry. */
3595 data->flags &= ~SF_BEFORE_EOL;
3597 if (flags & SCF_DO_STCLASS_AND) {
3598 /* Check whether it is compatible with what we know already! */
3602 /* If compatible, we or it in below. It is compatible if is
3603 * in the bitmp and either 1) its bit or its fold is set, or 2)
3604 * it's for a locale. Even if there isn't unicode semantics
3605 * here, at runtime there may be because of matching against a
3606 * utf8 string, so accept a possible false positive for
3607 * latin1-range folds */
3609 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3610 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3611 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3612 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3617 ANYOF_CLASS_ZERO(data->start_class);
3618 ANYOF_BITMAP_ZERO(data->start_class);
3620 ANYOF_BITMAP_SET(data->start_class, uc);
3621 else if (uc >= 0x100) {
3624 /* Some Unicode code points fold to the Latin1 range; as
3625 * XXX temporary code, instead of figuring out if this is
3626 * one, just assume it is and set all the start class bits
3627 * that could be some such above 255 code point's fold
3628 * which will generate fals positives. As the code
3629 * elsewhere that does compute the fold settles down, it
3630 * can be extracted out and re-used here */
3631 for (i = 0; i < 256; i++){
3632 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3633 ANYOF_BITMAP_SET(data->start_class, i);
3637 data->start_class->flags &= ~ANYOF_EOS;
3639 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3641 else if (flags & SCF_DO_STCLASS_OR) {
3642 /* false positive possible if the class is case-folded */
3644 ANYOF_BITMAP_SET(data->start_class, uc);
3646 data->start_class->flags |= ANYOF_UNICODE_ALL;
3647 data->start_class->flags &= ~ANYOF_EOS;
3648 cl_and(data->start_class, and_withp);
3650 flags &= ~SCF_DO_STCLASS;
3652 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3653 I32 l = STR_LEN(scan);
3654 UV uc = *((U8*)STRING(scan));
3656 /* Search for fixed substrings supports EXACT only. */
3657 if (flags & SCF_DO_SUBSTR) {
3659 SCAN_COMMIT(pRExC_state, data, minlenp);
3662 const U8 * const s = (U8 *)STRING(scan);
3663 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3664 l = utf8_length(s, s + l);
3666 if (has_exactf_sharp_s) {
3667 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3669 min += l - min_subtract;
3673 delta += min_subtract;
3674 if (flags & SCF_DO_SUBSTR) {
3675 data->pos_min += l - min_subtract;
3676 if (data->pos_min < 0) {
3679 data->pos_delta += min_subtract;
3681 data->longest = &(data->longest_float);
3684 if (flags & SCF_DO_STCLASS_AND) {
3685 /* Check whether it is compatible with what we know already! */
3688 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3689 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3690 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3694 ANYOF_CLASS_ZERO(data->start_class);
3695 ANYOF_BITMAP_ZERO(data->start_class);
3697 ANYOF_BITMAP_SET(data->start_class, uc);
3698 data->start_class->flags &= ~ANYOF_EOS;
3699 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3700 if (OP(scan) == EXACTFL) {
3701 /* XXX This set is probably no longer necessary, and
3702 * probably wrong as LOCALE now is on in the initial
3704 data->start_class->flags |= ANYOF_LOCALE;
3708 /* Also set the other member of the fold pair. In case
3709 * that unicode semantics is called for at runtime, use
3710 * the full latin1 fold. (Can't do this for locale,
3711 * because not known until runtime) */
3712 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3714 /* All other (EXACTFL handled above) folds except under
3715 * /iaa that include s, S, and sharp_s also may include
3717 if (OP(scan) != EXACTFA) {
3718 if (uc == 's' || uc == 'S') {
3719 ANYOF_BITMAP_SET(data->start_class,
3720 LATIN_SMALL_LETTER_SHARP_S);
3722 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3723 ANYOF_BITMAP_SET(data->start_class, 's');
3724 ANYOF_BITMAP_SET(data->start_class, 'S');
3729 else if (uc >= 0x100) {
3731 for (i = 0; i < 256; i++){
3732 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3733 ANYOF_BITMAP_SET(data->start_class, i);
3738 else if (flags & SCF_DO_STCLASS_OR) {
3739 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3740 /* false positive possible if the class is case-folded.
3741 Assume that the locale settings are the same... */
3743 ANYOF_BITMAP_SET(data->start_class, uc);
3744 if (OP(scan) != EXACTFL) {
3746 /* And set the other member of the fold pair, but
3747 * can't do that in locale because not known until
3749 ANYOF_BITMAP_SET(data->start_class,
3750 PL_fold_latin1[uc]);
3752 /* All folds except under /iaa that include s, S,
3753 * and sharp_s also may include the others */
3754 if (OP(scan) != EXACTFA) {
3755 if (uc == 's' || uc == 'S') {
3756 ANYOF_BITMAP_SET(data->start_class,
3757 LATIN_SMALL_LETTER_SHARP_S);
3759 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3760 ANYOF_BITMAP_SET(data->start_class, 's');
3761 ANYOF_BITMAP_SET(data->start_class, 'S');
3766 data->start_class->flags &= ~ANYOF_EOS;
3768 cl_and(data->start_class, and_withp);
3770 flags &= ~SCF_DO_STCLASS;
3772 else if (REGNODE_VARIES(OP(scan))) {
3773 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3774 I32 f = flags, pos_before = 0;
3775 regnode * const oscan = scan;
3776 struct regnode_charclass_class this_class;
3777 struct regnode_charclass_class *oclass = NULL;
3778 I32 next_is_eval = 0;
3780 switch (PL_regkind[OP(scan)]) {
3781 case WHILEM: /* End of (?:...)* . */
3782 scan = NEXTOPER(scan);
3785 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3786 next = NEXTOPER(scan);
3787 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3789 maxcount = REG_INFTY;
3790 next = regnext(scan);
3791 scan = NEXTOPER(scan);
3795 if (flags & SCF_DO_SUBSTR)
3800 if (flags & SCF_DO_STCLASS) {
3802 maxcount = REG_INFTY;
3803 next = regnext(scan);
3804 scan = NEXTOPER(scan);
3807 is_inf = is_inf_internal = 1;
3808 scan = regnext(scan);
3809 if (flags & SCF_DO_SUBSTR) {
3810 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3811 data->longest = &(data->longest_float);
3813 goto optimize_curly_tail;
3815 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3816 && (scan->flags == stopparen))
3821 mincount = ARG1(scan);
3822 maxcount = ARG2(scan);
3824 next = regnext(scan);
3825 if (OP(scan) == CURLYX) {
3826 I32 lp = (data ? *(data->last_closep) : 0);
3827 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3829 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3830 next_is_eval = (OP(scan) == EVAL);
3832 if (flags & SCF_DO_SUBSTR) {
3833 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3834 pos_before = data->pos_min;
3838 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3840 data->flags |= SF_IS_INF;
3842 if (flags & SCF_DO_STCLASS) {
3843 cl_init(pRExC_state, &this_class);
3844 oclass = data->start_class;
3845 data->start_class = &this_class;
3846 f |= SCF_DO_STCLASS_AND;
3847 f &= ~SCF_DO_STCLASS_OR;
3849 /* Exclude from super-linear cache processing any {n,m}
3850 regops for which the combination of input pos and regex
3851 pos is not enough information to determine if a match
3854 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3855 regex pos at the \s*, the prospects for a match depend not
3856 only on the input position but also on how many (bar\s*)
3857 repeats into the {4,8} we are. */
3858 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3859 f &= ~SCF_WHILEM_VISITED_POS;
3861 /* This will finish on WHILEM, setting scan, or on NULL: */
3862 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3863 last, data, stopparen, recursed, NULL,
3865 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3867 if (flags & SCF_DO_STCLASS)
3868 data->start_class = oclass;
3869 if (mincount == 0 || minnext == 0) {
3870 if (flags & SCF_DO_STCLASS_OR) {
3871 cl_or(pRExC_state, data->start_class, &this_class);
3873 else if (flags & SCF_DO_STCLASS_AND) {
3874 /* Switch to OR mode: cache the old value of
3875 * data->start_class */
3877 StructCopy(data->start_class, and_withp,
3878 struct regnode_charclass_class);
3879 flags &= ~SCF_DO_STCLASS_AND;
3880 StructCopy(&this_class, data->start_class,
3881 struct regnode_charclass_class);
3882 flags |= SCF_DO_STCLASS_OR;
3883 data->start_class->flags |= ANYOF_EOS;
3885 } else { /* Non-zero len */
3886 if (flags & SCF_DO_STCLASS_OR) {
3887 cl_or(pRExC_state, data->start_class, &this_class);
3888 cl_and(data->start_class, and_withp);
3890 else if (flags & SCF_DO_STCLASS_AND)
3891 cl_and(data->start_class, &this_class);
3892 flags &= ~SCF_DO_STCLASS;
3894 if (!scan) /* It was not CURLYX, but CURLY. */
3896 if ( /* ? quantifier ok, except for (?{ ... }) */
3897 (next_is_eval || !(mincount == 0 && maxcount == 1))
3898 && (minnext == 0) && (deltanext == 0)
3899 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3900 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3902 ckWARNreg(RExC_parse,
3903 "Quantifier unexpected on zero-length expression");
3906 min += minnext * mincount;
3907 is_inf_internal |= ((maxcount == REG_INFTY
3908 && (minnext + deltanext) > 0)
3909 || deltanext == I32_MAX);
3910 is_inf |= is_inf_internal;
3911 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3913 /* Try powerful optimization CURLYX => CURLYN. */
3914 if ( OP(oscan) == CURLYX && data
3915 && data->flags & SF_IN_PAR
3916 && !(data->flags & SF_HAS_EVAL)
3917 && !deltanext && minnext == 1 ) {
3918 /* Try to optimize to CURLYN. */
3919 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3920 regnode * const nxt1 = nxt;
3927 if (!REGNODE_SIMPLE(OP(nxt))
3928 && !(PL_regkind[OP(nxt)] == EXACT
3929 && STR_LEN(nxt) == 1))
3935 if (OP(nxt) != CLOSE)
3937 if (RExC_open_parens) {
3938 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3939 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3941 /* Now we know that nxt2 is the only contents: */
3942 oscan->flags = (U8)ARG(nxt);
3944 OP(nxt1) = NOTHING; /* was OPEN. */
3947 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3948 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3949 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3950 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3951 OP(nxt + 1) = OPTIMIZED; /* was count. */
3952 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3957 /* Try optimization CURLYX => CURLYM. */
3958 if ( OP(oscan) == CURLYX && data
3959 && !(data->flags & SF_HAS_PAR)
3960 && !(data->flags & SF_HAS_EVAL)
3961 && !deltanext /* atom is fixed width */
3962 && minnext != 0 /* CURLYM can't handle zero width */
3963 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3965 /* XXXX How to optimize if data == 0? */
3966 /* Optimize to a simpler form. */
3967 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3971 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3972 && (OP(nxt2) != WHILEM))
3974 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3975 /* Need to optimize away parenths. */
3976 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3977 /* Set the parenth number. */
3978 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3980 oscan->flags = (U8)ARG(nxt);
3981 if (RExC_open_parens) {
3982 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3983 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3985 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3986 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3989 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3990 OP(nxt + 1) = OPTIMIZED; /* was count. */
3991 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3992 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3995 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3996 regnode *nnxt = regnext(nxt1);
3998 if (reg_off_by_arg[OP(nxt1)])
3999 ARG_SET(nxt1, nxt2 - nxt1);
4000 else if (nxt2 - nxt1 < U16_MAX)
4001 NEXT_OFF(nxt1) = nxt2 - nxt1;
4003 OP(nxt) = NOTHING; /* Cannot beautify */
4008 /* Optimize again: */
4009 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4010 NULL, stopparen, recursed, NULL, 0,depth+1);
4015 else if ((OP(oscan) == CURLYX)
4016 && (flags & SCF_WHILEM_VISITED_POS)
4017 /* See the comment on a similar expression above.
4018 However, this time it's not a subexpression
4019 we care about, but the expression itself. */
4020 && (maxcount == REG_INFTY)
4021 && data && ++data->whilem_c < 16) {
4022 /* This stays as CURLYX, we can put the count/of pair. */
4023 /* Find WHILEM (as in regexec.c) */
4024 regnode *nxt = oscan + NEXT_OFF(oscan);
4026 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4028 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4029 | (RExC_whilem_seen << 4)); /* On WHILEM */
4031 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4033 if (flags & SCF_DO_SUBSTR) {
4034 SV *last_str = NULL;
4035 int counted = mincount != 0;
4037 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4038 #if defined(SPARC64_GCC_WORKAROUND)
4041 const char *s = NULL;
4044 if (pos_before >= data->last_start_min)
4047 b = data->last_start_min;
4050 s = SvPV_const(data->last_found, l);
4051 old = b - data->last_start_min;
4054 I32 b = pos_before >= data->last_start_min
4055 ? pos_before : data->last_start_min;
4057 const char * const s = SvPV_const(data->last_found, l);
4058 I32 old = b - data->last_start_min;
4062 old = utf8_hop((U8*)s, old) - (U8*)s;
4064 /* Get the added string: */
4065 last_str = newSVpvn_utf8(s + old, l, UTF);
4066 if (deltanext == 0 && pos_before == b) {
4067 /* What was added is a constant string */
4069 SvGROW(last_str, (mincount * l) + 1);
4070 repeatcpy(SvPVX(last_str) + l,
4071 SvPVX_const(last_str), l, mincount - 1);
4072 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4073 /* Add additional parts. */
4074 SvCUR_set(data->last_found,
4075 SvCUR(data->last_found) - l);
4076 sv_catsv(data->last_found, last_str);
4078 SV * sv = data->last_found;
4080 SvUTF8(sv) && SvMAGICAL(sv) ?
4081 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4082 if (mg && mg->mg_len >= 0)
4083 mg->mg_len += CHR_SVLEN(last_str) - l;
4085 data->last_end += l * (mincount - 1);
4088 /* start offset must point into the last copy */
4089 data->last_start_min += minnext * (mincount - 1);
4090 data->last_start_max += is_inf ? I32_MAX
4091 : (maxcount - 1) * (minnext + data->pos_delta);
4094 /* It is counted once already... */
4095 data->pos_min += minnext * (mincount - counted);
4096 data->pos_delta += - counted * deltanext +
4097 (minnext + deltanext) * maxcount - minnext * mincount;
4098 if (mincount != maxcount) {
4099 /* Cannot extend fixed substrings found inside
4101 SCAN_COMMIT(pRExC_state,data,minlenp);
4102 if (mincount && last_str) {
4103 SV * const sv = data->last_found;
4104 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4105 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4109 sv_setsv(sv, last_str);
4110 data->last_end = data->pos_min;
4111 data->last_start_min =
4112 data->pos_min - CHR_SVLEN(last_str);
4113 data->last_start_max = is_inf
4115 : data->pos_min + data->pos_delta
4116 - CHR_SVLEN(last_str);
4118 data->longest = &(data->longest_float);
4120 SvREFCNT_dec(last_str);
4122 if (data && (fl & SF_HAS_EVAL))
4123 data->flags |= SF_HAS_EVAL;
4124 optimize_curly_tail:
4125 if (OP(oscan) != CURLYX) {
4126 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4128 NEXT_OFF(oscan) += NEXT_OFF(next);
4131 default: /* REF, ANYOFV, and CLUMP only? */
4132 if (flags & SCF_DO_SUBSTR) {
4133 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4134 data->longest = &(data->longest_float);
4136 is_inf = is_inf_internal = 1;
4137 if (flags & SCF_DO_STCLASS_OR)
4138 cl_anything(pRExC_state, data->start_class);
4139 flags &= ~SCF_DO_STCLASS;
4143 else if (OP(scan) == LNBREAK) {
4144 if (flags & SCF_DO_STCLASS) {
4146 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4147 if (flags & SCF_DO_STCLASS_AND) {
4148 for (value = 0; value < 256; value++)
4149 if (!is_VERTWS_cp(value))
4150 ANYOF_BITMAP_CLEAR(data->start_class, value);
4153 for (value = 0; value < 256; value++)
4154 if (is_VERTWS_cp(value))
4155 ANYOF_BITMAP_SET(data->start_class, value);
4157 if (flags & SCF_DO_STCLASS_OR)
4158 cl_and(data->start_class, and_withp);
4159 flags &= ~SCF_DO_STCLASS;
4163 if (flags & SCF_DO_SUBSTR) {
4164 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4166 data->pos_delta += 1;
4167 data->longest = &(data->longest_float);
4170 else if (REGNODE_SIMPLE(OP(scan))) {
4173 if (flags & SCF_DO_SUBSTR) {
4174 SCAN_COMMIT(pRExC_state,data,minlenp);
4178 if (flags & SCF_DO_STCLASS) {
4179 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4181 /* Some of the logic below assumes that switching
4182 locale on will only add false positives. */
4183 switch (PL_regkind[OP(scan)]) {
4187 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4188 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4189 cl_anything(pRExC_state, data->start_class);
4192 if (OP(scan) == SANY)
4194 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4195 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4196 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4197 cl_anything(pRExC_state, data->start_class);
4199 if (flags & SCF_DO_STCLASS_AND || !value)
4200 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4203 if (flags & SCF_DO_STCLASS_AND)
4204 cl_and(data->start_class,
4205 (struct regnode_charclass_class*)scan);
4207 cl_or(pRExC_state, data->start_class,
4208 (struct regnode_charclass_class*)scan);
4211 if (flags & SCF_DO_STCLASS_AND) {
4212 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4213 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4214 if (OP(scan) == ALNUMU) {
4215 for (value = 0; value < 256; value++) {
4216 if (!isWORDCHAR_L1(value)) {
4217 ANYOF_BITMAP_CLEAR(data->start_class, value);
4221 for (value = 0; value < 256; value++) {
4222 if (!isALNUM(value)) {
4223 ANYOF_BITMAP_CLEAR(data->start_class, value);
4230 if (data->start_class->flags & ANYOF_LOCALE)
4231 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4233 /* Even if under locale, set the bits for non-locale
4234 * in case it isn't a true locale-node. This will
4235 * create false positives if it truly is locale */
4236 if (OP(scan) == ALNUMU) {
4237 for (value = 0; value < 256; value++) {
4238 if (isWORDCHAR_L1(value)) {
4239 ANYOF_BITMAP_SET(data->start_class, value);
4243 for (value = 0; value < 256; value++) {
4244 if (isALNUM(value)) {
4245 ANYOF_BITMAP_SET(data->start_class, value);
4252 if (flags & SCF_DO_STCLASS_AND) {
4253 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4254 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4255 if (OP(scan) == NALNUMU) {
4256 for (value = 0; value < 256; value++) {
4257 if (isWORDCHAR_L1(value)) {
4258 ANYOF_BITMAP_CLEAR(data->start_class, value);
4262 for (value = 0; value < 256; value++) {
4263 if (isALNUM(value)) {
4264 ANYOF_BITMAP_CLEAR(data->start_class, value);
4271 if (data->start_class->flags & ANYOF_LOCALE)
4272 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4274 /* Even if under locale, set the bits for non-locale in
4275 * case it isn't a true locale-node. This will create
4276 * false positives if it truly is locale */
4277 if (OP(scan) == NALNUMU) {
4278 for (value = 0; value < 256; value++) {
4279 if (! isWORDCHAR_L1(value)) {
4280 ANYOF_BITMAP_SET(data->start_class, value);
4284 for (value = 0; value < 256; value++) {
4285 if (! isALNUM(value)) {
4286 ANYOF_BITMAP_SET(data->start_class, value);
4293 if (flags & SCF_DO_STCLASS_AND) {
4294 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4295 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4296 if (OP(scan) == SPACEU) {
4297 for (value = 0; value < 256; value++) {
4298 if (!isSPACE_L1(value)) {
4299 ANYOF_BITMAP_CLEAR(data->start_class, value);
4303 for (value = 0; value < 256; value++) {
4304 if (!isSPACE(value)) {
4305 ANYOF_BITMAP_CLEAR(data->start_class, value);
4312 if (data->start_class->flags & ANYOF_LOCALE) {
4313 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4315 if (OP(scan) == SPACEU) {
4316 for (value = 0; value < 256; value++) {
4317 if (isSPACE_L1(value)) {
4318 ANYOF_BITMAP_SET(data->start_class, value);
4322 for (value = 0; value < 256; value++) {
4323 if (isSPACE(value)) {
4324 ANYOF_BITMAP_SET(data->start_class, value);
4331 if (flags & SCF_DO_STCLASS_AND) {
4332 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4334 if (OP(scan) == NSPACEU) {
4335 for (value = 0; value < 256; value++) {
4336 if (isSPACE_L1(value)) {
4337 ANYOF_BITMAP_CLEAR(data->start_class, value);
4341 for (value = 0; value < 256; value++) {
4342 if (isSPACE(value)) {
4343 ANYOF_BITMAP_CLEAR(data->start_class, value);
4350 if (data->start_class->flags & ANYOF_LOCALE)
4351 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4352 if (OP(scan) == NSPACEU) {
4353 for (value = 0; value < 256; value++) {
4354 if (!isSPACE_L1(value)) {
4355 ANYOF_BITMAP_SET(data->start_class, value);
4360 for (value = 0; value < 256; value++) {
4361 if (!isSPACE(value)) {
4362 ANYOF_BITMAP_SET(data->start_class, value);
4369 if (flags & SCF_DO_STCLASS_AND) {
4370 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4371 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4372 for (value = 0; value < 256; value++)
4373 if (!isDIGIT(value))
4374 ANYOF_BITMAP_CLEAR(data->start_class, value);
4378 if (data->start_class->flags & ANYOF_LOCALE)
4379 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4380 for (value = 0; value < 256; value++)
4382 ANYOF_BITMAP_SET(data->start_class, value);
4386 if (flags & SCF_DO_STCLASS_AND) {
4387 if (!(data->start_class->flags & ANYOF_LOCALE))
4388 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4389 for (value = 0; value < 256; value++)
4391 ANYOF_BITMAP_CLEAR(data->start_class, value);
4394 if (data->start_class->flags & ANYOF_LOCALE)
4395 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4396 for (value = 0; value < 256; value++)
4397 if (!isDIGIT(value))
4398 ANYOF_BITMAP_SET(data->start_class, value);
4401 CASE_SYNST_FNC(VERTWS);
4402 CASE_SYNST_FNC(HORIZWS);
4405 if (flags & SCF_DO_STCLASS_OR)
4406 cl_and(data->start_class, and_withp);
4407 flags &= ~SCF_DO_STCLASS;
4410 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4411 data->flags |= (OP(scan) == MEOL
4414 SCAN_COMMIT(pRExC_state, data, minlenp);
4417 else if ( PL_regkind[OP(scan)] == BRANCHJ
4418 /* Lookbehind, or need to calculate parens/evals/stclass: */
4419 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4420 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4421 if ( OP(scan) == UNLESSM &&
4423 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4424 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4427 regnode *upto= regnext(scan);
4429 SV * const mysv_val=sv_newmortal();
4430 DEBUG_STUDYDATA("OPFAIL",data,depth);
4432 /*DEBUG_PARSE_MSG("opfail");*/
4433 regprop(RExC_rx, mysv_val, upto);
4434 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4435 SvPV_nolen_const(mysv_val),
4436 (IV)REG_NODE_NUM(upto),
4441 NEXT_OFF(scan) = upto - scan;
4442 for (opt= scan + 1; opt < upto ; opt++)
4443 OP(opt) = OPTIMIZED;
4447 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4448 || OP(scan) == UNLESSM )
4450 /* Negative Lookahead/lookbehind
4451 In this case we can't do fixed string optimisation.
4454 I32 deltanext, minnext, fake = 0;
4456 struct regnode_charclass_class intrnl;
4459 data_fake.flags = 0;
4461 data_fake.whilem_c = data->whilem_c;
4462 data_fake.last_closep = data->last_closep;
4465 data_fake.last_closep = &fake;
4466 data_fake.pos_delta = delta;
4467 if ( flags & SCF_DO_STCLASS && !scan->flags
4468 && OP(scan) == IFMATCH ) { /* Lookahead */
4469 cl_init(pRExC_state, &intrnl);
4470 data_fake.start_class = &intrnl;
4471 f |= SCF_DO_STCLASS_AND;
4473 if (flags & SCF_WHILEM_VISITED_POS)
4474 f |= SCF_WHILEM_VISITED_POS;
4475 next = regnext(scan);
4476 nscan = NEXTOPER(NEXTOPER(scan));
4477 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4478 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4481 FAIL("Variable length lookbehind not implemented");
4483 else if (minnext > (I32)U8_MAX) {
4484 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4486 scan->flags = (U8)minnext;
4489 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4491 if (data_fake.flags & SF_HAS_EVAL)
4492 data->flags |= SF_HAS_EVAL;
4493 data->whilem_c = data_fake.whilem_c;
4495 if (f & SCF_DO_STCLASS_AND) {
4496 if (flags & SCF_DO_STCLASS_OR) {
4497 /* OR before, AND after: ideally we would recurse with
4498 * data_fake to get the AND applied by study of the
4499 * remainder of the pattern, and then derecurse;
4500 * *** HACK *** for now just treat as "no information".
4501 * See [perl #56690].
4503 cl_init(pRExC_state, data->start_class);
4505 /* AND before and after: combine and continue */
4506 const int was = (data->start_class->flags & ANYOF_EOS);
4508 cl_and(data->start_class, &intrnl);
4510 data->start_class->flags |= ANYOF_EOS;
4514 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4516 /* Positive Lookahead/lookbehind
4517 In this case we can do fixed string optimisation,
4518 but we must be careful about it. Note in the case of
4519 lookbehind the positions will be offset by the minimum
4520 length of the pattern, something we won't know about
4521 until after the recurse.
4523 I32 deltanext, fake = 0;
4525 struct regnode_charclass_class intrnl;
4527 /* We use SAVEFREEPV so that when the full compile
4528 is finished perl will clean up the allocated
4529 minlens when it's all done. This way we don't
4530 have to worry about freeing them when we know
4531 they wont be used, which would be a pain.
4534 Newx( minnextp, 1, I32 );
4535 SAVEFREEPV(minnextp);
4538 StructCopy(data, &data_fake, scan_data_t);
4539 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4542 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4543 data_fake.last_found=newSVsv(data->last_found);
4547 data_fake.last_closep = &fake;
4548 data_fake.flags = 0;
4549 data_fake.pos_delta = delta;
4551 data_fake.flags |= SF_IS_INF;
4552 if ( flags & SCF_DO_STCLASS && !scan->flags
4553 && OP(scan) == IFMATCH ) { /* Lookahead */
4554 cl_init(pRExC_state, &intrnl);
4555 data_fake.start_class = &intrnl;
4556 f |= SCF_DO_STCLASS_AND;
4558 if (flags & SCF_WHILEM_VISITED_POS)
4559 f |= SCF_WHILEM_VISITED_POS;
4560 next = regnext(scan);
4561 nscan = NEXTOPER(NEXTOPER(scan));
4563 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4564 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4567 FAIL("Variable length lookbehind not implemented");
4569 else if (*minnextp > (I32)U8_MAX) {
4570 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4572 scan->flags = (U8)*minnextp;
4577 if (f & SCF_DO_STCLASS_AND) {
4578 const int was = (data->start_class->flags & ANYOF_EOS);
4580 cl_and(data->start_class, &intrnl);
4582 data->start_class->flags |= ANYOF_EOS;
4585 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4587 if (data_fake.flags & SF_HAS_EVAL)
4588 data->flags |= SF_HAS_EVAL;
4589 data->whilem_c = data_fake.whilem_c;
4590 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4591 if (RExC_rx->minlen<*minnextp)
4592 RExC_rx->minlen=*minnextp;
4593 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4594 SvREFCNT_dec(data_fake.last_found);
4596 if ( data_fake.minlen_fixed != minlenp )
4598 data->offset_fixed= data_fake.offset_fixed;
4599 data->minlen_fixed= data_fake.minlen_fixed;
4600 data->lookbehind_fixed+= scan->flags;
4602 if ( data_fake.minlen_float != minlenp )
4604 data->minlen_float= data_fake.minlen_float;
4605 data->offset_float_min=data_fake.offset_float_min;
4606 data->offset_float_max=data_fake.offset_float_max;
4607 data->lookbehind_float+= scan->flags;
4614 else if (OP(scan) == OPEN) {
4615 if (stopparen != (I32)ARG(scan))
4618 else if (OP(scan) == CLOSE) {
4619 if (stopparen == (I32)ARG(scan)) {
4622 if ((I32)ARG(scan) == is_par) {
4623 next = regnext(scan);
4625 if ( next && (OP(next) != WHILEM) && next < last)
4626 is_par = 0; /* Disable optimization */
4629 *(data->last_closep) = ARG(scan);
4631 else if (OP(scan) == EVAL) {
4633 data->flags |= SF_HAS_EVAL;
4635 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4636 if (flags & SCF_DO_SUBSTR) {
4637 SCAN_COMMIT(pRExC_state,data,minlenp);
4638 flags &= ~SCF_DO_SUBSTR;
4640 if (data && OP(scan)==ACCEPT) {
4641 data->flags |= SCF_SEEN_ACCEPT;
4646 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4648 if (flags & SCF_DO_SUBSTR) {
4649 SCAN_COMMIT(pRExC_state,data,minlenp);
4650 data->longest = &(data->longest_float);
4652 is_inf = is_inf_internal = 1;
4653 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4654 cl_anything(pRExC_state, data->start_class);
4655 flags &= ~SCF_DO_STCLASS;
4657 else if (OP(scan) == GPOS) {
4658 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4659 !(delta || is_inf || (data && data->pos_delta)))
4661 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4662 RExC_rx->extflags |= RXf_ANCH_GPOS;
4663 if (RExC_rx->gofs < (U32)min)
4664 RExC_rx->gofs = min;
4666 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4670 #ifdef TRIE_STUDY_OPT
4671 #ifdef FULL_TRIE_STUDY
4672 else if (PL_regkind[OP(scan)] == TRIE) {
4673 /* NOTE - There is similar code to this block above for handling
4674 BRANCH nodes on the initial study. If you change stuff here
4676 regnode *trie_node= scan;
4677 regnode *tail= regnext(scan);
4678 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4679 I32 max1 = 0, min1 = I32_MAX;
4680 struct regnode_charclass_class accum;
4682 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4683 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4684 if (flags & SCF_DO_STCLASS)
4685 cl_init_zero(pRExC_state, &accum);
4691 const regnode *nextbranch= NULL;
4694 for ( word=1 ; word <= trie->wordcount ; word++)
4696 I32 deltanext=0, minnext=0, f = 0, fake;
4697 struct regnode_charclass_class this_class;
4699 data_fake.flags = 0;
4701 data_fake.whilem_c = data->whilem_c;
4702 data_fake.last_closep = data->last_closep;
4705 data_fake.last_closep = &fake;
4706 data_fake.pos_delta = delta;
4707 if (flags & SCF_DO_STCLASS) {
4708 cl_init(pRExC_state, &this_class);
4709 data_fake.start_class = &this_class;
4710 f = SCF_DO_STCLASS_AND;
4712 if (flags & SCF_WHILEM_VISITED_POS)
4713 f |= SCF_WHILEM_VISITED_POS;
4715 if (trie->jump[word]) {
4717 nextbranch = trie_node + trie->jump[0];
4718 scan= trie_node + trie->jump[word];
4719 /* We go from the jump point to the branch that follows
4720 it. Note this means we need the vestigal unused branches
4721 even though they arent otherwise used.
4723 minnext = study_chunk(pRExC_state, &scan, minlenp,
4724 &deltanext, (regnode *)nextbranch, &data_fake,
4725 stopparen, recursed, NULL, f,depth+1);
4727 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4728 nextbranch= regnext((regnode*)nextbranch);
4730 if (min1 > (I32)(minnext + trie->minlen))
4731 min1 = minnext + trie->minlen;
4732 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4733 max1 = minnext + deltanext + trie->maxlen;
4734 if (deltanext == I32_MAX)
4735 is_inf = is_inf_internal = 1;
4737 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4739 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4740 if ( stopmin > min + min1)
4741 stopmin = min + min1;
4742 flags &= ~SCF_DO_SUBSTR;
4744 data->flags |= SCF_SEEN_ACCEPT;
4747 if (data_fake.flags & SF_HAS_EVAL)
4748 data->flags |= SF_HAS_EVAL;
4749 data->whilem_c = data_fake.whilem_c;
4751 if (flags & SCF_DO_STCLASS)
4752 cl_or(pRExC_state, &accum, &this_class);
4755 if (flags & SCF_DO_SUBSTR) {
4756 data->pos_min += min1;
4757 data->pos_delta += max1 - min1;
4758 if (max1 != min1 || is_inf)
4759 data->longest = &(data->longest_float);
4762 delta += max1 - min1;
4763 if (flags & SCF_DO_STCLASS_OR) {
4764 cl_or(pRExC_state, data->start_class, &accum);
4766 cl_and(data->start_class, and_withp);
4767 flags &= ~SCF_DO_STCLASS;
4770 else if (flags & SCF_DO_STCLASS_AND) {
4772 cl_and(data->start_class, &accum);
4773 flags &= ~SCF_DO_STCLASS;
4776 /* Switch to OR mode: cache the old value of
4777 * data->start_class */
4779 StructCopy(data->start_class, and_withp,
4780 struct regnode_charclass_class);
4781 flags &= ~SCF_DO_STCLASS_AND;
4782 StructCopy(&accum, data->start_class,
4783 struct regnode_charclass_class);
4784 flags |= SCF_DO_STCLASS_OR;
4785 data->start_class->flags |= ANYOF_EOS;
4792 else if (PL_regkind[OP(scan)] == TRIE) {
4793 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4796 min += trie->minlen;
4797 delta += (trie->maxlen - trie->minlen);
4798 flags &= ~SCF_DO_STCLASS; /* xxx */
4799 if (flags & SCF_DO_SUBSTR) {
4800 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4801 data->pos_min += trie->minlen;
4802 data->pos_delta += (trie->maxlen - trie->minlen);
4803 if (trie->maxlen != trie->minlen)
4804 data->longest = &(data->longest_float);
4806 if (trie->jump) /* no more substrings -- for now /grr*/
4807 flags &= ~SCF_DO_SUBSTR;
4809 #endif /* old or new */
4810 #endif /* TRIE_STUDY_OPT */
4812 /* Else: zero-length, ignore. */
4813 scan = regnext(scan);
4818 stopparen = frame->stop;
4819 frame = frame->prev;
4820 goto fake_study_recurse;
4825 DEBUG_STUDYDATA("pre-fin:",data,depth);
4828 *deltap = is_inf_internal ? I32_MAX : delta;
4829 if (flags & SCF_DO_SUBSTR && is_inf)
4830 data->pos_delta = I32_MAX - data->pos_min;
4831 if (is_par > (I32)U8_MAX)
4833 if (is_par && pars==1 && data) {
4834 data->flags |= SF_IN_PAR;
4835 data->flags &= ~SF_HAS_PAR;
4837 else if (pars && data) {
4838 data->flags |= SF_HAS_PAR;
4839 data->flags &= ~SF_IN_PAR;
4841 if (flags & SCF_DO_STCLASS_OR)
4842 cl_and(data->start_class, and_withp);
4843 if (flags & SCF_TRIE_RESTUDY)
4844 data->flags |= SCF_TRIE_RESTUDY;
4846 DEBUG_STUDYDATA("post-fin:",data,depth);
4848 return min < stopmin ? min : stopmin;
4852 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4854 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4856 PERL_ARGS_ASSERT_ADD_DATA;
4858 Renewc(RExC_rxi->data,
4859 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4860 char, struct reg_data);
4862 Renew(RExC_rxi->data->what, count + n, U8);
4864 Newx(RExC_rxi->data->what, n, U8);
4865 RExC_rxi->data->count = count + n;
4866 Copy(s, RExC_rxi->data->what + count, n, U8);
4870 /*XXX: todo make this not included in a non debugging perl */
4871 #ifndef PERL_IN_XSUB_RE
4873 Perl_reginitcolors(pTHX)
4876 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4878 char *t = savepv(s);
4882 t = strchr(t, '\t');
4888 PL_colors[i] = t = (char *)"";
4893 PL_colors[i++] = (char *)"";
4900 #ifdef TRIE_STUDY_OPT
4901 #define CHECK_RESTUDY_GOTO \
4903 (data.flags & SCF_TRIE_RESTUDY) \
4907 #define CHECK_RESTUDY_GOTO
4911 * pregcomp - compile a regular expression into internal code
4913 * Decides which engine's compiler to call based on the hint currently in
4917 #ifndef PERL_IN_XSUB_RE
4919 /* return the currently in-scope regex engine (or the default if none) */
4921 regexp_engine const *
4922 Perl_current_re_engine(pTHX)
4926 if (IN_PERL_COMPILETIME) {
4927 HV * const table = GvHV(PL_hintgv);
4931 return &PL_core_reg_engine;
4932 ptr = hv_fetchs(table, "regcomp", FALSE);
4933 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4934 return &PL_core_reg_engine;
4935 return INT2PTR(regexp_engine*,SvIV(*ptr));
4939 if (!PL_curcop->cop_hints_hash)
4940 return &PL_core_reg_engine;
4941 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4942 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4943 return &PL_core_reg_engine;
4944 return INT2PTR(regexp_engine*,SvIV(ptr));
4950 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4953 regexp_engine const *eng = current_re_engine();
4954 GET_RE_DEBUG_FLAGS_DECL;
4956 PERL_ARGS_ASSERT_PREGCOMP;
4958 /* Dispatch a request to compile a regexp to correct regexp engine. */
4960 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4963 return CALLREGCOMP_ENG(eng, pattern, flags);
4967 /* public(ish) entry point for the perl core's own regex compiling code.
4968 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4969 * pattern rather than a list of OPs, and uses the internal engine rather
4970 * than the current one */
4973 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4975 SV *pat = pattern; /* defeat constness! */
4976 PERL_ARGS_ASSERT_RE_COMPILE;
4977 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4978 #ifdef PERL_IN_XSUB_RE
4981 &PL_core_reg_engine,
4983 NULL, NULL, rx_flags, 0);
4986 /* see if there are any run-time code blocks in the pattern.
4987 * False positives are allowed */
4990 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4991 U32 pm_flags, char *pat, STRLEN plen)
4996 /* avoid infinitely recursing when we recompile the pattern parcelled up
4997 * as qr'...'. A single constant qr// string can't have have any
4998 * run-time component in it, and thus, no runtime code. (A non-qr
4999 * string, however, can, e.g. $x =~ '(?{})') */
5000 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
5003 for (s = 0; s < plen; s++) {
5004 if (n < pRExC_state->num_code_blocks
5005 && s == pRExC_state->code_blocks[n].start)
5007 s = pRExC_state->code_blocks[n].end;
5011 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5013 if (pat[s] == '(' && pat[s+1] == '?' &&
5014 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
5021 /* Handle run-time code blocks. We will already have compiled any direct
5022 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5023 * copy of it, but with any literal code blocks blanked out and
5024 * appropriate chars escaped; then feed it into
5026 * eval "qr'modified_pattern'"
5030 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5034 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5036 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5037 * and merge them with any code blocks of the original regexp.
5039 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5040 * instead, just save the qr and return FALSE; this tells our caller that
5041 * the original pattern needs upgrading to utf8.
5045 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5046 char *pat, STRLEN plen)
5050 GET_RE_DEBUG_FLAGS_DECL;
5052 if (pRExC_state->runtime_code_qr) {
5053 /* this is the second time we've been called; this should
5054 * only happen if the main pattern got upgraded to utf8
5055 * during compilation; re-use the qr we compiled first time
5056 * round (which should be utf8 too)
5058 qr = pRExC_state->runtime_code_qr;
5059 pRExC_state->runtime_code_qr = NULL;
5060 assert(RExC_utf8 && SvUTF8(qr));
5066 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5070 /* determine how many extra chars we need for ' and \ escaping */
5071 for (s = 0; s < plen; s++) {
5072 if (pat[s] == '\'' || pat[s] == '\\')
5076 Newx(newpat, newlen, char);
5078 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5080 for (s = 0; s < plen; s++) {
5081 if (n < pRExC_state->num_code_blocks
5082 && s == pRExC_state->code_blocks[n].start)
5084 /* blank out literal code block */
5085 assert(pat[s] == '(');
5086 while (s <= pRExC_state->code_blocks[n].end) {
5094 if (pat[s] == '\'' || pat[s] == '\\')
5099 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5103 PerlIO_printf(Perl_debug_log,
5104 "%sre-parsing pattern for runtime code:%s %s\n",
5105 PL_colors[4],PL_colors[5],newpat);
5108 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5114 PUSHSTACKi(PERLSI_REQUIRE);
5115 /* this causes the toker to collapse \\ into \ when parsing
5116 * qr''; normally only q'' does this. It also alters hints
5118 PL_reg_state.re_reparsing = TRUE;
5119 eval_sv(sv, G_SCALAR);
5125 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5126 assert(SvROK(qr_ref));
5128 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5129 /* the leaving below frees the tmp qr_ref.
5130 * Give qr a life of its own */
5138 if (!RExC_utf8 && SvUTF8(qr)) {
5139 /* first time through; the pattern got upgraded; save the
5140 * qr for the next time through */
5141 assert(!pRExC_state->runtime_code_qr);
5142 pRExC_state->runtime_code_qr = qr;
5147 /* extract any code blocks within the returned qr// */
5150 /* merge the main (r1) and run-time (r2) code blocks into one */
5152 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5153 struct reg_code_block *new_block, *dst;
5154 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5157 if (!r2->num_code_blocks) /* we guessed wrong */
5161 r1->num_code_blocks + r2->num_code_blocks,
5162 struct reg_code_block);
5165 while ( i1 < r1->num_code_blocks
5166 || i2 < r2->num_code_blocks)
5168 struct reg_code_block *src;
5171 if (i1 == r1->num_code_blocks) {
5172 src = &r2->code_blocks[i2++];
5175 else if (i2 == r2->num_code_blocks)
5176 src = &r1->code_blocks[i1++];
5177 else if ( r1->code_blocks[i1].start
5178 < r2->code_blocks[i2].start)
5180 src = &r1->code_blocks[i1++];
5181 assert(src->end < r2->code_blocks[i2].start);
5184 assert( r1->code_blocks[i1].start
5185 > r2->code_blocks[i2].start);
5186 src = &r2->code_blocks[i2++];
5188 assert(src->end < r1->code_blocks[i1].start);
5191 assert(pat[src->start] == '(');
5192 assert(pat[src->end] == ')');
5193 dst->start = src->start;
5194 dst->end = src->end;
5195 dst->block = src->block;
5196 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5200 r1->num_code_blocks += r2->num_code_blocks;
5201 Safefree(r1->code_blocks);
5202 r1->code_blocks = new_block;
5211 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5213 /* This is the common code for setting up the floating and fixed length
5214 * string data extracted from Perlre_op_compile() below. Returns a boolean
5215 * as to whether succeeded or not */
5219 if (! (longest_length
5220 || (eol /* Can't have SEOL and MULTI */
5221 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5223 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5224 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5229 /* copy the information about the longest from the reg_scan_data
5230 over to the program. */
5231 if (SvUTF8(sv_longest)) {
5232 *rx_utf8 = sv_longest;
5235 *rx_substr = sv_longest;
5238 /* end_shift is how many chars that must be matched that
5239 follow this item. We calculate it ahead of time as once the
5240 lookbehind offset is added in we lose the ability to correctly
5242 ml = minlen ? *(minlen) : (I32)longest_length;
5243 *rx_end_shift = ml - offset
5244 - longest_length + (SvTAIL(sv_longest) != 0)
5247 t = (eol/* Can't have SEOL and MULTI */
5248 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5249 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5255 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5256 * regular expression into internal code.
5257 * The pattern may be passed either as:
5258 * a list of SVs (patternp plus pat_count)
5259 * a list of OPs (expr)
5260 * If both are passed, the SV list is used, but the OP list indicates
5261 * which SVs are actually pre-compiled code blocks
5263 * The SVs in the list have magic and qr overloading applied to them (and
5264 * the list may be modified in-place with replacement SVs in the latter
5267 * If the pattern hasn't changed from old_re, then old_re will be
5270 * eng is the current engine. If that engine has an op_comp method, then
5271 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5272 * do the initial concatenation of arguments and pass on to the external
5275 * If is_bare_re is not null, set it to a boolean indicating whether the
5276 * arg list reduced (after overloading) to a single bare regex which has
5277 * been returned (i.e. /$qr/).
5279 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5281 * pm_flags contains the PMf_* flags, typically based on those from the
5282 * pm_flags field of the related PMOP. Currently we're only interested in
5283 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5285 * We can't allocate space until we know how big the compiled form will be,
5286 * but we can't compile it (and thus know how big it is) until we've got a
5287 * place to put the code. So we cheat: we compile it twice, once with code
5288 * generation turned off and size counting turned on, and once "for real".
5289 * This also means that we don't allocate space until we are sure that the
5290 * thing really will compile successfully, and we never have to move the
5291 * code and thus invalidate pointers into it. (Note that it has to be in
5292 * one piece because free() must be able to free it all.) [NB: not true in perl]
5294 * Beware that the optimization-preparation code in here knows about some
5295 * of the structure of the compiled regexp. [I'll say.]
5299 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5300 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5301 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5306 regexp_internal *ri;
5316 /* these are all flags - maybe they should be turned
5317 * into a single int with different bit masks */
5318 I32 sawlookahead = 0;
5321 bool used_setjump = FALSE;
5322 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5323 bool code_is_utf8 = 0;
5324 bool VOL recompile = 0;
5325 bool runtime_code = 0;
5329 RExC_state_t RExC_state;
5330 RExC_state_t * const pRExC_state = &RExC_state;
5331 #ifdef TRIE_STUDY_OPT
5333 RExC_state_t copyRExC_state;
5335 GET_RE_DEBUG_FLAGS_DECL;
5337 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5339 DEBUG_r(if (!PL_colorset) reginitcolors());
5341 #ifndef PERL_IN_XSUB_RE
5342 /* Initialize these here instead of as-needed, as is quick and avoids
5343 * having to test them each time otherwise */
5344 if (! PL_AboveLatin1) {
5345 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5346 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5347 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5349 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5350 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5352 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5353 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5355 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5356 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5358 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5360 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5361 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5363 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5365 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5366 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5368 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5369 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5371 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5372 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5374 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5375 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5377 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5378 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5380 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5381 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5383 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5384 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5386 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5388 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5389 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5391 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5392 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5396 pRExC_state->code_blocks = NULL;
5397 pRExC_state->num_code_blocks = 0;
5400 *is_bare_re = FALSE;
5402 if (expr && (expr->op_type == OP_LIST ||
5403 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5405 /* is the source UTF8, and how many code blocks are there? */
5409 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5410 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5412 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5413 /* count of DO blocks */
5417 pRExC_state->num_code_blocks = ncode;
5418 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5423 /* handle a list of SVs */
5427 /* apply magic and RE overloading to each arg */
5428 for (svp = patternp; svp < patternp + pat_count; svp++) {
5431 if (SvROK(rx) && SvAMAGIC(rx)) {
5432 SV *sv = AMG_CALLunary(rx, regexp_amg);
5436 if (SvTYPE(sv) != SVt_REGEXP)
5437 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5443 if (pat_count > 1) {
5444 /* concat multiple args and find any code block indexes */
5449 STRLEN orig_patlen = 0;
5451 if (pRExC_state->num_code_blocks) {
5452 o = cLISTOPx(expr)->op_first;
5453 assert(o->op_type == OP_PUSHMARK);
5457 pat = newSVpvn("", 0);
5460 /* determine if the pattern is going to be utf8 (needed
5461 * in advance to align code block indices correctly).
5462 * XXX This could fail to be detected for an arg with
5463 * overloading but not concat overloading; but the main effect
5464 * in this obscure case is to need a 'use re eval' for a
5465 * literal code block */
5466 for (svp = patternp; svp < patternp + pat_count; svp++) {
5473 for (svp = patternp; svp < patternp + pat_count; svp++) {
5474 SV *sv, *msv = *svp;
5478 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5479 assert(n < pRExC_state->num_code_blocks);
5480 pRExC_state->code_blocks[n].start = SvCUR(pat);
5481 pRExC_state->code_blocks[n].block = o;
5482 pRExC_state->code_blocks[n].src_regex = NULL;
5485 o = o->op_sibling; /* skip CONST */
5491 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5492 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5495 /* overloading involved: all bets are off over literal
5496 * code. Pretend we haven't seen it */
5497 pRExC_state->num_code_blocks -= n;
5503 while (SvAMAGIC(msv)
5504 && (sv = AMG_CALLunary(msv, string_amg))
5510 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5512 orig_patlen = SvCUR(pat);
5513 sv_catsv_nomg(pat, msv);
5516 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5519 /* extract any code blocks within any embedded qr//'s */
5520 if (rx && SvTYPE(rx) == SVt_REGEXP
5521 && RX_ENGINE((REGEXP*)rx)->op_comp)
5524 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5525 if (ri->num_code_blocks) {
5527 /* the presence of an embedded qr// with code means
5528 * we should always recompile: the text of the
5529 * qr// may not have changed, but it may be a
5530 * different closure than last time */
5532 Renew(pRExC_state->code_blocks,
5533 pRExC_state->num_code_blocks + ri->num_code_blocks,
5534 struct reg_code_block);
5535 pRExC_state->num_code_blocks += ri->num_code_blocks;
5536 for (i=0; i < ri->num_code_blocks; i++) {
5537 struct reg_code_block *src, *dst;
5538 STRLEN offset = orig_patlen
5539 + ((struct regexp *)SvANY(rx))->pre_prefix;
5540 assert(n < pRExC_state->num_code_blocks);
5541 src = &ri->code_blocks[i];
5542 dst = &pRExC_state->code_blocks[n];
5543 dst->start = src->start + offset;
5544 dst->end = src->end + offset;
5545 dst->block = src->block;
5546 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5560 while (SvAMAGIC(pat)
5561 && (sv = AMG_CALLunary(pat, string_amg))
5569 /* handle bare regex: foo =~ $re */
5574 if (SvTYPE(re) == SVt_REGEXP) {
5578 Safefree(pRExC_state->code_blocks);
5584 /* not a list of SVs, so must be a list of OPs */
5586 if (expr->op_type == OP_LIST) {
5591 pat = newSVpvn("", 0);
5596 /* given a list of CONSTs and DO blocks in expr, append all
5597 * the CONSTs to pat, and record the start and end of each
5598 * code block in code_blocks[] (each DO{} op is followed by an
5599 * OP_CONST containing the corresponding literal '(?{...})
5602 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5603 if (o->op_type == OP_CONST) {
5604 sv_catsv(pat, cSVOPo_sv);
5606 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5610 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5611 assert(i+1 < pRExC_state->num_code_blocks);
5612 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5613 pRExC_state->code_blocks[i].block = o;
5614 pRExC_state->code_blocks[i].src_regex = NULL;
5620 assert(expr->op_type == OP_CONST);
5621 pat = cSVOPx_sv(expr);
5625 exp = SvPV_nomg(pat, plen);
5627 if (!eng->op_comp) {
5628 if ((SvUTF8(pat) && IN_BYTES)
5629 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5631 /* make a temporary copy; either to convert to bytes,
5632 * or to avoid repeating get-magic / overloaded stringify */
5633 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5634 (IN_BYTES ? 0 : SvUTF8(pat)));
5636 Safefree(pRExC_state->code_blocks);
5637 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5640 /* ignore the utf8ness if the pattern is 0 length */
5641 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5642 RExC_uni_semantics = 0;
5643 RExC_contains_locale = 0;
5644 pRExC_state->runtime_code_qr = NULL;
5646 /****************** LONG JUMP TARGET HERE***********************/
5647 /* Longjmp back to here if have to switch in midstream to utf8 */
5648 if (! RExC_orig_utf8) {
5649 JMPENV_PUSH(jump_ret);
5650 used_setjump = TRUE;
5653 if (jump_ret == 0) { /* First time through */
5657 SV *dsv= sv_newmortal();
5658 RE_PV_QUOTED_DECL(s, RExC_utf8,
5659 dsv, exp, plen, 60);
5660 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5661 PL_colors[4],PL_colors[5],s);
5664 else { /* longjumped back */
5667 STRLEN s = 0, d = 0;
5670 /* If the cause for the longjmp was other than changing to utf8, pop
5671 * our own setjmp, and longjmp to the correct handler */
5672 if (jump_ret != UTF8_LONGJMP) {
5674 JMPENV_JUMP(jump_ret);
5679 /* It's possible to write a regexp in ascii that represents Unicode
5680 codepoints outside of the byte range, such as via \x{100}. If we
5681 detect such a sequence we have to convert the entire pattern to utf8
5682 and then recompile, as our sizing calculation will have been based
5683 on 1 byte == 1 character, but we will need to use utf8 to encode
5684 at least some part of the pattern, and therefore must convert the whole
5687 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5688 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5690 /* upgrade pattern to UTF8, and if there are code blocks,
5691 * recalculate the indices.
5692 * This is essentially an unrolled Perl_bytes_to_utf8() */
5694 src = (U8*)SvPV_nomg(pat, plen);
5695 Newx(dst, plen * 2 + 1, U8);
5698 const UV uv = NATIVE_TO_ASCII(src[s]);
5699 if (UNI_IS_INVARIANT(uv))
5700 dst[d] = (U8)UTF_TO_NATIVE(uv);
5702 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5703 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5705 if (n < pRExC_state->num_code_blocks) {
5706 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5707 pRExC_state->code_blocks[n].start = d;
5708 assert(dst[d] == '(');
5711 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5712 pRExC_state->code_blocks[n].end = d;
5713 assert(dst[d] == ')');
5726 RExC_orig_utf8 = RExC_utf8 = 1;
5729 /* return old regex if pattern hasn't changed */
5733 && !!RX_UTF8(old_re) == !!RExC_utf8
5734 && RX_PRECOMP(old_re)
5735 && RX_PRELEN(old_re) == plen
5736 && memEQ(RX_PRECOMP(old_re), exp, plen))
5738 /* with runtime code, always recompile */
5739 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5741 if (!runtime_code) {
5745 Safefree(pRExC_state->code_blocks);
5749 else if ((pm_flags & PMf_USE_RE_EVAL)
5750 /* this second condition covers the non-regex literal case,
5751 * i.e. $foo =~ '(?{})'. */
5752 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5753 && (PL_hints & HINT_RE_EVAL))
5755 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5758 #ifdef TRIE_STUDY_OPT
5762 rx_flags = orig_rx_flags;
5764 if (initial_charset == REGEX_LOCALE_CHARSET) {
5765 RExC_contains_locale = 1;
5767 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5769 /* Set to use unicode semantics if the pattern is in utf8 and has the
5770 * 'depends' charset specified, as it means unicode when utf8 */
5771 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5775 RExC_flags = rx_flags;
5776 RExC_pm_flags = pm_flags;
5779 if (PL_tainting && PL_tainted)
5780 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5782 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5783 /* whoops, we have a non-utf8 pattern, whilst run-time code
5784 * got compiled as utf8. Try again with a utf8 pattern */
5785 JMPENV_JUMP(UTF8_LONGJMP);
5788 assert(!pRExC_state->runtime_code_qr);
5793 RExC_in_lookbehind = 0;
5794 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5796 RExC_override_recoding = 0;
5798 /* First pass: determine size, legality. */
5806 RExC_emit = &PL_regdummy;
5807 RExC_whilem_seen = 0;
5808 RExC_open_parens = NULL;
5809 RExC_close_parens = NULL;
5811 RExC_paren_names = NULL;
5813 RExC_paren_name_list = NULL;
5815 RExC_recurse = NULL;
5816 RExC_recurse_count = 0;
5817 pRExC_state->code_index = 0;
5819 #if 0 /* REGC() is (currently) a NOP at the first pass.
5820 * Clever compilers notice this and complain. --jhi */
5821 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5824 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5826 RExC_lastparse=NULL;
5828 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5829 RExC_precomp = NULL;
5830 Safefree(pRExC_state->code_blocks);
5834 /* Here, finished first pass. Get rid of any added setjmp */
5840 PerlIO_printf(Perl_debug_log,
5841 "Required size %"IVdf" nodes\n"
5842 "Starting second pass (creation)\n",
5845 RExC_lastparse=NULL;
5848 /* The first pass could have found things that force Unicode semantics */
5849 if ((RExC_utf8 || RExC_uni_semantics)
5850 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5852 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5855 /* Small enough for pointer-storage convention?
5856 If extralen==0, this means that we will not need long jumps. */
5857 if (RExC_size >= 0x10000L && RExC_extralen)
5858 RExC_size += RExC_extralen;
5861 if (RExC_whilem_seen > 15)
5862 RExC_whilem_seen = 15;
5864 /* Allocate space and zero-initialize. Note, the two step process
5865 of zeroing when in debug mode, thus anything assigned has to
5866 happen after that */
5867 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5868 r = (struct regexp*)SvANY(rx);
5869 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5870 char, regexp_internal);
5871 if ( r == NULL || ri == NULL )
5872 FAIL("Regexp out of space");
5874 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5875 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5877 /* bulk initialize base fields with 0. */
5878 Zero(ri, sizeof(regexp_internal), char);
5881 /* non-zero initialization begins here */
5884 r->extflags = rx_flags;
5885 if (pm_flags & PMf_IS_QR) {
5886 ri->code_blocks = pRExC_state->code_blocks;
5887 ri->num_code_blocks = pRExC_state->num_code_blocks;
5890 SAVEFREEPV(pRExC_state->code_blocks);
5893 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5894 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5896 /* The caret is output if there are any defaults: if not all the STD
5897 * flags are set, or if no character set specifier is needed */
5899 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5901 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5902 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5903 >> RXf_PMf_STD_PMMOD_SHIFT);
5904 const char *fptr = STD_PAT_MODS; /*"msix"*/
5906 /* Allocate for the worst case, which is all the std flags are turned
5907 * on. If more precision is desired, we could do a population count of
5908 * the flags set. This could be done with a small lookup table, or by
5909 * shifting, masking and adding, or even, when available, assembly
5910 * language for a machine-language population count.
5911 * We never output a minus, as all those are defaults, so are
5912 * covered by the caret */
5913 const STRLEN wraplen = plen + has_p + has_runon
5914 + has_default /* If needs a caret */
5916 /* If needs a character set specifier */
5917 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5918 + (sizeof(STD_PAT_MODS) - 1)
5919 + (sizeof("(?:)") - 1);
5921 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5924 SvFLAGS(rx) |= SVf_UTF8;
5927 /* If a default, cover it using the caret */
5929 *p++= DEFAULT_PAT_MOD;
5933 const char* const name = get_regex_charset_name(r->extflags, &len);
5934 Copy(name, p, len, char);
5938 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5941 while((ch = *fptr++)) {
5949 Copy(RExC_precomp, p, plen, char);
5950 assert ((RX_WRAPPED(rx) - p) < 16);
5951 r->pre_prefix = p - RX_WRAPPED(rx);
5957 SvCUR_set(rx, p - SvPVX_const(rx));
5961 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5963 if (RExC_seen & REG_SEEN_RECURSE) {
5964 Newxz(RExC_open_parens, RExC_npar,regnode *);
5965 SAVEFREEPV(RExC_open_parens);
5966 Newxz(RExC_close_parens,RExC_npar,regnode *);
5967 SAVEFREEPV(RExC_close_parens);
5970 /* Useful during FAIL. */
5971 #ifdef RE_TRACK_PATTERN_OFFSETS
5972 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5973 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5974 "%s %"UVuf" bytes for offset annotations.\n",
5975 ri->u.offsets ? "Got" : "Couldn't get",
5976 (UV)((2*RExC_size+1) * sizeof(U32))));
5978 SetProgLen(ri,RExC_size);
5983 /* Second pass: emit code. */
5984 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5985 RExC_pm_flags = pm_flags;
5990 RExC_emit_start = ri->program;
5991 RExC_emit = ri->program;
5992 RExC_emit_bound = ri->program + RExC_size + 1;
5993 pRExC_state->code_index = 0;
5995 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5996 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6000 /* XXXX To minimize changes to RE engine we always allocate
6001 3-units-long substrs field. */
6002 Newx(r->substrs, 1, struct reg_substr_data);
6003 if (RExC_recurse_count) {
6004 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6005 SAVEFREEPV(RExC_recurse);
6009 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6010 Zero(r->substrs, 1, struct reg_substr_data);
6012 #ifdef TRIE_STUDY_OPT
6014 StructCopy(&zero_scan_data, &data, scan_data_t);
6015 copyRExC_state = RExC_state;
6018 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6020 RExC_state = copyRExC_state;
6021 if (seen & REG_TOP_LEVEL_BRANCHES)
6022 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6024 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6025 if (data.last_found) {
6026 SvREFCNT_dec(data.longest_fixed);
6027 SvREFCNT_dec(data.longest_float);
6028 SvREFCNT_dec(data.last_found);
6030 StructCopy(&zero_scan_data, &data, scan_data_t);
6033 StructCopy(&zero_scan_data, &data, scan_data_t);
6036 /* Dig out information for optimizations. */
6037 r->extflags = RExC_flags; /* was pm_op */
6038 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6041 SvUTF8_on(rx); /* Unicode in it? */
6042 ri->regstclass = NULL;
6043 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6044 r->intflags |= PREGf_NAUGHTY;
6045 scan = ri->program + 1; /* First BRANCH. */
6047 /* testing for BRANCH here tells us whether there is "must appear"
6048 data in the pattern. If there is then we can use it for optimisations */
6049 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6051 STRLEN longest_float_length, longest_fixed_length;
6052 struct regnode_charclass_class ch_class; /* pointed to by data */
6054 I32 last_close = 0; /* pointed to by data */
6055 regnode *first= scan;
6056 regnode *first_next= regnext(first);
6058 * Skip introductions and multiplicators >= 1
6059 * so that we can extract the 'meat' of the pattern that must
6060 * match in the large if() sequence following.
6061 * NOTE that EXACT is NOT covered here, as it is normally
6062 * picked up by the optimiser separately.
6064 * This is unfortunate as the optimiser isnt handling lookahead
6065 * properly currently.
6068 while ((OP(first) == OPEN && (sawopen = 1)) ||
6069 /* An OR of *one* alternative - should not happen now. */
6070 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6071 /* for now we can't handle lookbehind IFMATCH*/
6072 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6073 (OP(first) == PLUS) ||
6074 (OP(first) == MINMOD) ||
6075 /* An {n,m} with n>0 */
6076 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6077 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6080 * the only op that could be a regnode is PLUS, all the rest
6081 * will be regnode_1 or regnode_2.
6084 if (OP(first) == PLUS)
6087 first += regarglen[OP(first)];
6089 first = NEXTOPER(first);
6090 first_next= regnext(first);
6093 /* Starting-point info. */
6095 DEBUG_PEEP("first:",first,0);
6096 /* Ignore EXACT as we deal with it later. */
6097 if (PL_regkind[OP(first)] == EXACT) {
6098 if (OP(first) == EXACT)
6099 NOOP; /* Empty, get anchored substr later. */
6101 ri->regstclass = first;
6104 else if (PL_regkind[OP(first)] == TRIE &&
6105 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6108 /* this can happen only on restudy */
6109 if ( OP(first) == TRIE ) {
6110 struct regnode_1 *trieop = (struct regnode_1 *)
6111 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6112 StructCopy(first,trieop,struct regnode_1);
6113 trie_op=(regnode *)trieop;
6115 struct regnode_charclass *trieop = (struct regnode_charclass *)
6116 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6117 StructCopy(first,trieop,struct regnode_charclass);
6118 trie_op=(regnode *)trieop;
6121 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6122 ri->regstclass = trie_op;
6125 else if (REGNODE_SIMPLE(OP(first)))
6126 ri->regstclass = first;
6127 else if (PL_regkind[OP(first)] == BOUND ||
6128 PL_regkind[OP(first)] == NBOUND)
6129 ri->regstclass = first;
6130 else if (PL_regkind[OP(first)] == BOL) {
6131 r->extflags |= (OP(first) == MBOL
6133 : (OP(first) == SBOL
6136 first = NEXTOPER(first);
6139 else if (OP(first) == GPOS) {
6140 r->extflags |= RXf_ANCH_GPOS;
6141 first = NEXTOPER(first);
6144 else if ((!sawopen || !RExC_sawback) &&
6145 (OP(first) == STAR &&
6146 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6147 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6149 /* turn .* into ^.* with an implied $*=1 */
6151 (OP(NEXTOPER(first)) == REG_ANY)
6154 r->extflags |= type;
6155 r->intflags |= PREGf_IMPLICIT;
6156 first = NEXTOPER(first);
6159 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6160 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6161 /* x+ must match at the 1st pos of run of x's */
6162 r->intflags |= PREGf_SKIP;
6164 /* Scan is after the zeroth branch, first is atomic matcher. */
6165 #ifdef TRIE_STUDY_OPT
6168 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6169 (IV)(first - scan + 1))
6173 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6174 (IV)(first - scan + 1))
6180 * If there's something expensive in the r.e., find the
6181 * longest literal string that must appear and make it the
6182 * regmust. Resolve ties in favor of later strings, since
6183 * the regstart check works with the beginning of the r.e.
6184 * and avoiding duplication strengthens checking. Not a
6185 * strong reason, but sufficient in the absence of others.
6186 * [Now we resolve ties in favor of the earlier string if
6187 * it happens that c_offset_min has been invalidated, since the
6188 * earlier string may buy us something the later one won't.]
6191 data.longest_fixed = newSVpvs("");
6192 data.longest_float = newSVpvs("");
6193 data.last_found = newSVpvs("");
6194 data.longest = &(data.longest_fixed);
6196 if (!ri->regstclass) {
6197 cl_init(pRExC_state, &ch_class);
6198 data.start_class = &ch_class;
6199 stclass_flag = SCF_DO_STCLASS_AND;
6200 } else /* XXXX Check for BOUND? */
6202 data.last_closep = &last_close;
6204 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6205 &data, -1, NULL, NULL,
6206 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6212 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6213 && data.last_start_min == 0 && data.last_end > 0
6214 && !RExC_seen_zerolen
6215 && !(RExC_seen & REG_SEEN_VERBARG)
6216 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6217 r->extflags |= RXf_CHECK_ALL;
6218 scan_commit(pRExC_state, &data,&minlen,0);
6219 SvREFCNT_dec(data.last_found);
6221 longest_float_length = CHR_SVLEN(data.longest_float);
6223 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6224 && data.offset_fixed == data.offset_float_min
6225 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6226 && S_setup_longest (aTHX_ pRExC_state,
6230 &(r->float_end_shift),
6231 data.lookbehind_float,
6232 data.offset_float_min,
6234 longest_float_length,
6235 data.flags & SF_FL_BEFORE_EOL,
6236 data.flags & SF_FL_BEFORE_MEOL))
6238 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6239 r->float_max_offset = data.offset_float_max;
6240 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6241 r->float_max_offset -= data.lookbehind_float;
6244 r->float_substr = r->float_utf8 = NULL;
6245 SvREFCNT_dec(data.longest_float);
6246 longest_float_length = 0;
6249 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6251 if (S_setup_longest (aTHX_ pRExC_state,
6253 &(r->anchored_utf8),
6254 &(r->anchored_substr),
6255 &(r->anchored_end_shift),
6256 data.lookbehind_fixed,
6259 longest_fixed_length,
6260 data.flags & SF_FIX_BEFORE_EOL,
6261 data.flags & SF_FIX_BEFORE_MEOL))
6263 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6266 r->anchored_substr = r->anchored_utf8 = NULL;
6267 SvREFCNT_dec(data.longest_fixed);
6268 longest_fixed_length = 0;
6272 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6273 ri->regstclass = NULL;
6275 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6277 && !(data.start_class->flags & ANYOF_EOS)
6278 && !cl_is_anything(data.start_class))
6280 const U32 n = add_data(pRExC_state, 1, "f");
6281 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6283 Newx(RExC_rxi->data->data[n], 1,
6284 struct regnode_charclass_class);
6285 StructCopy(data.start_class,
6286 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6287 struct regnode_charclass_class);
6288 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6289 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6290 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6291 regprop(r, sv, (regnode*)data.start_class);
6292 PerlIO_printf(Perl_debug_log,
6293 "synthetic stclass \"%s\".\n",
6294 SvPVX_const(sv));});
6297 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6298 if (longest_fixed_length > longest_float_length) {
6299 r->check_end_shift = r->anchored_end_shift;
6300 r->check_substr = r->anchored_substr;
6301 r->check_utf8 = r->anchored_utf8;
6302 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6303 if (r->extflags & RXf_ANCH_SINGLE)
6304 r->extflags |= RXf_NOSCAN;
6307 r->check_end_shift = r->float_end_shift;
6308 r->check_substr = r->float_substr;
6309 r->check_utf8 = r->float_utf8;
6310 r->check_offset_min = r->float_min_offset;
6311 r->check_offset_max = r->float_max_offset;
6313 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6314 This should be changed ASAP! */
6315 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6316 r->extflags |= RXf_USE_INTUIT;
6317 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6318 r->extflags |= RXf_INTUIT_TAIL;
6320 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6321 if ( (STRLEN)minlen < longest_float_length )
6322 minlen= longest_float_length;
6323 if ( (STRLEN)minlen < longest_fixed_length )
6324 minlen= longest_fixed_length;
6328 /* Several toplevels. Best we can is to set minlen. */
6330 struct regnode_charclass_class ch_class;
6333 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6335 scan = ri->program + 1;
6336 cl_init(pRExC_state, &ch_class);
6337 data.start_class = &ch_class;
6338 data.last_closep = &last_close;
6341 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6342 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6346 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6347 = r->float_substr = r->float_utf8 = NULL;
6349 if (!(data.start_class->flags & ANYOF_EOS)
6350 && !cl_is_anything(data.start_class))
6352 const U32 n = add_data(pRExC_state, 1, "f");
6353 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6355 Newx(RExC_rxi->data->data[n], 1,
6356 struct regnode_charclass_class);
6357 StructCopy(data.start_class,
6358 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6359 struct regnode_charclass_class);
6360 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6361 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6362 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6363 regprop(r, sv, (regnode*)data.start_class);
6364 PerlIO_printf(Perl_debug_log,
6365 "synthetic stclass \"%s\".\n",
6366 SvPVX_const(sv));});
6370 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6371 the "real" pattern. */
6373 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6374 (IV)minlen, (IV)r->minlen);
6376 r->minlenret = minlen;
6377 if (r->minlen < minlen)
6380 if (RExC_seen & REG_SEEN_GPOS)
6381 r->extflags |= RXf_GPOS_SEEN;
6382 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6383 r->extflags |= RXf_LOOKBEHIND_SEEN;
6384 if (pRExC_state->num_code_blocks)
6385 r->extflags |= RXf_EVAL_SEEN;
6386 if (RExC_seen & REG_SEEN_CANY)
6387 r->extflags |= RXf_CANY_SEEN;
6388 if (RExC_seen & REG_SEEN_VERBARG)
6389 r->intflags |= PREGf_VERBARG_SEEN;
6390 if (RExC_seen & REG_SEEN_CUTGROUP)
6391 r->intflags |= PREGf_CUTGROUP_SEEN;
6392 if (pm_flags & PMf_USE_RE_EVAL)
6393 r->intflags |= PREGf_USE_RE_EVAL;
6394 if (RExC_paren_names)
6395 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6397 RXp_PAREN_NAMES(r) = NULL;
6399 #ifdef STUPID_PATTERN_CHECKS
6400 if (RX_PRELEN(rx) == 0)
6401 r->extflags |= RXf_NULL;
6402 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6403 /* XXX: this should happen BEFORE we compile */
6404 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6405 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6406 r->extflags |= RXf_WHITE;
6407 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6408 r->extflags |= RXf_START_ONLY;
6410 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6411 /* XXX: this should happen BEFORE we compile */
6412 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6414 regnode *first = ri->program + 1;
6417 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6418 r->extflags |= RXf_NULL;
6419 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6420 r->extflags |= RXf_START_ONLY;
6421 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6422 && OP(regnext(first)) == END)
6423 r->extflags |= RXf_WHITE;
6427 if (RExC_paren_names) {
6428 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6429 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6432 ri->name_list_idx = 0;
6434 if (RExC_recurse_count) {
6435 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6436 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6437 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6440 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6441 /* assume we don't need to swap parens around before we match */
6444 PerlIO_printf(Perl_debug_log,"Final program:\n");
6447 #ifdef RE_TRACK_PATTERN_OFFSETS
6448 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6449 const U32 len = ri->u.offsets[0];
6451 GET_RE_DEBUG_FLAGS_DECL;
6452 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6453 for (i = 1; i <= len; i++) {
6454 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6455 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6456 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6458 PerlIO_printf(Perl_debug_log, "\n");
6466 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6469 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6471 PERL_UNUSED_ARG(value);
6473 if (flags & RXapif_FETCH) {
6474 return reg_named_buff_fetch(rx, key, flags);
6475 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6476 Perl_croak_no_modify(aTHX);
6478 } else if (flags & RXapif_EXISTS) {
6479 return reg_named_buff_exists(rx, key, flags)
6482 } else if (flags & RXapif_REGNAMES) {
6483 return reg_named_buff_all(rx, flags);
6484 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6485 return reg_named_buff_scalar(rx, flags);
6487 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6493 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6496 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6497 PERL_UNUSED_ARG(lastkey);
6499 if (flags & RXapif_FIRSTKEY)
6500 return reg_named_buff_firstkey(rx, flags);
6501 else if (flags & RXapif_NEXTKEY)
6502 return reg_named_buff_nextkey(rx, flags);
6504 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6510 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6513 AV *retarray = NULL;
6515 struct regexp *const rx = (struct regexp *)SvANY(r);
6517 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6519 if (flags & RXapif_ALL)
6522 if (rx && RXp_PAREN_NAMES(rx)) {
6523 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6526 SV* sv_dat=HeVAL(he_str);
6527 I32 *nums=(I32*)SvPVX(sv_dat);
6528 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6529 if ((I32)(rx->nparens) >= nums[i]
6530 && rx->offs[nums[i]].start != -1
6531 && rx->offs[nums[i]].end != -1)
6534 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6539 ret = newSVsv(&PL_sv_undef);
6542 av_push(retarray, ret);
6545 return newRV_noinc(MUTABLE_SV(retarray));
6552 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6555 struct regexp *const rx = (struct regexp *)SvANY(r);
6557 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6559 if (rx && RXp_PAREN_NAMES(rx)) {
6560 if (flags & RXapif_ALL) {
6561 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6563 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6577 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6579 struct regexp *const rx = (struct regexp *)SvANY(r);
6581 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6583 if ( rx && RXp_PAREN_NAMES(rx) ) {
6584 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6586 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6593 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6595 struct regexp *const rx = (struct regexp *)SvANY(r);
6596 GET_RE_DEBUG_FLAGS_DECL;
6598 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6600 if (rx && RXp_PAREN_NAMES(rx)) {
6601 HV *hv = RXp_PAREN_NAMES(rx);
6603 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6606 SV* sv_dat = HeVAL(temphe);
6607 I32 *nums = (I32*)SvPVX(sv_dat);
6608 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6609 if ((I32)(rx->lastparen) >= nums[i] &&
6610 rx->offs[nums[i]].start != -1 &&
6611 rx->offs[nums[i]].end != -1)
6617 if (parno || flags & RXapif_ALL) {
6618 return newSVhek(HeKEY_hek(temphe));
6626 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6631 struct regexp *const rx = (struct regexp *)SvANY(r);
6633 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6635 if (rx && RXp_PAREN_NAMES(rx)) {
6636 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6637 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6638 } else if (flags & RXapif_ONE) {
6639 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6640 av = MUTABLE_AV(SvRV(ret));
6641 length = av_len(av);
6643 return newSViv(length + 1);
6645 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6649 return &PL_sv_undef;
6653 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6655 struct regexp *const rx = (struct regexp *)SvANY(r);
6658 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6660 if (rx && RXp_PAREN_NAMES(rx)) {
6661 HV *hv= RXp_PAREN_NAMES(rx);
6663 (void)hv_iterinit(hv);
6664 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6667 SV* sv_dat = HeVAL(temphe);
6668 I32 *nums = (I32*)SvPVX(sv_dat);
6669 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6670 if ((I32)(rx->lastparen) >= nums[i] &&
6671 rx->offs[nums[i]].start != -1 &&
6672 rx->offs[nums[i]].end != -1)
6678 if (parno || flags & RXapif_ALL) {
6679 av_push(av, newSVhek(HeKEY_hek(temphe)));
6684 return newRV_noinc(MUTABLE_SV(av));
6688 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6691 struct regexp *const rx = (struct regexp *)SvANY(r);
6696 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6699 sv_setsv(sv,&PL_sv_undef);
6703 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6705 i = rx->offs[0].start;
6709 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6711 s = rx->subbeg + rx->offs[0].end;
6712 i = rx->sublen - rx->offs[0].end;
6715 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6716 (s1 = rx->offs[paren].start) != -1 &&
6717 (t1 = rx->offs[paren].end) != -1)
6721 s = rx->subbeg + s1;
6723 sv_setsv(sv,&PL_sv_undef);
6726 assert(rx->sublen >= (s - rx->subbeg) + i );
6728 const int oldtainted = PL_tainted;
6730 sv_setpvn(sv, s, i);
6731 PL_tainted = oldtainted;
6732 if ( (rx->extflags & RXf_CANY_SEEN)
6733 ? (RXp_MATCH_UTF8(rx)
6734 && (!i || is_utf8_string((U8*)s, i)))
6735 : (RXp_MATCH_UTF8(rx)) )
6742 if (RXp_MATCH_TAINTED(rx)) {
6743 if (SvTYPE(sv) >= SVt_PVMG) {
6744 MAGIC* const mg = SvMAGIC(sv);
6747 SvMAGIC_set(sv, mg->mg_moremagic);
6749 if ((mgt = SvMAGIC(sv))) {
6750 mg->mg_moremagic = mgt;
6751 SvMAGIC_set(sv, mg);
6761 sv_setsv(sv,&PL_sv_undef);
6767 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6768 SV const * const value)
6770 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6772 PERL_UNUSED_ARG(rx);
6773 PERL_UNUSED_ARG(paren);
6774 PERL_UNUSED_ARG(value);
6777 Perl_croak_no_modify(aTHX);
6781 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6784 struct regexp *const rx = (struct regexp *)SvANY(r);
6788 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6790 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6792 /* $` / ${^PREMATCH} */
6793 case RX_BUFF_IDX_PREMATCH:
6794 if (rx->offs[0].start != -1) {
6795 i = rx->offs[0].start;
6803 /* $' / ${^POSTMATCH} */
6804 case RX_BUFF_IDX_POSTMATCH:
6805 if (rx->offs[0].end != -1) {
6806 i = rx->sublen - rx->offs[0].end;
6808 s1 = rx->offs[0].end;
6814 /* $& / ${^MATCH}, $1, $2, ... */
6816 if (paren <= (I32)rx->nparens &&
6817 (s1 = rx->offs[paren].start) != -1 &&
6818 (t1 = rx->offs[paren].end) != -1)
6823 if (ckWARN(WARN_UNINITIALIZED))
6824 report_uninit((const SV *)sv);
6829 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6830 const char * const s = rx->subbeg + s1;
6835 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6842 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6844 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6845 PERL_UNUSED_ARG(rx);
6849 return newSVpvs("Regexp");
6852 /* Scans the name of a named buffer from the pattern.
6853 * If flags is REG_RSN_RETURN_NULL returns null.
6854 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6855 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6856 * to the parsed name as looked up in the RExC_paren_names hash.
6857 * If there is an error throws a vFAIL().. type exception.
6860 #define REG_RSN_RETURN_NULL 0
6861 #define REG_RSN_RETURN_NAME 1
6862 #define REG_RSN_RETURN_DATA 2
6865 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6867 char *name_start = RExC_parse;
6869 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6871 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6872 /* skip IDFIRST by using do...while */
6875 RExC_parse += UTF8SKIP(RExC_parse);
6876 } while (isALNUM_utf8((U8*)RExC_parse));
6880 } while (isALNUM(*RExC_parse));
6882 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6883 vFAIL("Group name must start with a non-digit word character");
6887 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6888 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6889 if ( flags == REG_RSN_RETURN_NAME)
6891 else if (flags==REG_RSN_RETURN_DATA) {
6894 if ( ! sv_name ) /* should not happen*/
6895 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6896 if (RExC_paren_names)
6897 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6899 sv_dat = HeVAL(he_str);
6901 vFAIL("Reference to nonexistent named group");
6905 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6906 (unsigned long) flags);
6908 assert(0); /* NOT REACHED */
6913 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6914 int rem=(int)(RExC_end - RExC_parse); \
6923 if (RExC_lastparse!=RExC_parse) \
6924 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6927 iscut ? "..." : "<" \
6930 PerlIO_printf(Perl_debug_log,"%16s",""); \
6933 num = RExC_size + 1; \
6935 num=REG_NODE_NUM(RExC_emit); \
6936 if (RExC_lastnum!=num) \
6937 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6939 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6940 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6941 (int)((depth*2)), "", \
6945 RExC_lastparse=RExC_parse; \
6950 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6951 DEBUG_PARSE_MSG((funcname)); \
6952 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6954 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6955 DEBUG_PARSE_MSG((funcname)); \
6956 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6959 /* This section of code defines the inversion list object and its methods. The
6960 * interfaces are highly subject to change, so as much as possible is static to
6961 * this file. An inversion list is here implemented as a malloc'd C UV array
6962 * with some added info that is placed as UVs at the beginning in a header
6963 * portion. An inversion list for Unicode is an array of code points, sorted
6964 * by ordinal number. The zeroth element is the first code point in the list.
6965 * The 1th element is the first element beyond that not in the list. In other
6966 * words, the first range is
6967 * invlist[0]..(invlist[1]-1)
6968 * The other ranges follow. Thus every element whose index is divisible by two
6969 * marks the beginning of a range that is in the list, and every element not
6970 * divisible by two marks the beginning of a range not in the list. A single
6971 * element inversion list that contains the single code point N generally
6972 * consists of two elements
6975 * (The exception is when N is the highest representable value on the
6976 * machine, in which case the list containing just it would be a single
6977 * element, itself. By extension, if the last range in the list extends to
6978 * infinity, then the first element of that range will be in the inversion list
6979 * at a position that is divisible by two, and is the final element in the
6981 * Taking the complement (inverting) an inversion list is quite simple, if the
6982 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6983 * This implementation reserves an element at the beginning of each inversion list
6984 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6985 * beginning of the list is either that element if 0, or the next one if 1.
6987 * More about inversion lists can be found in "Unicode Demystified"
6988 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6989 * More will be coming when functionality is added later.
6991 * The inversion list data structure is currently implemented as an SV pointing
6992 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6993 * array of UV whose memory management is automatically handled by the existing
6994 * facilities for SV's.
6996 * Some of the methods should always be private to the implementation, and some
6997 * should eventually be made public */
6999 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
7000 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
7002 /* This is a combination of a version and data structure type, so that one
7003 * being passed in can be validated to be an inversion list of the correct
7004 * vintage. When the structure of the header is changed, a new random number
7005 * in the range 2**31-1 should be generated and the new() method changed to
7006 * insert that at this location. Then, if an auxiliary program doesn't change
7007 * correspondingly, it will be discovered immediately */
7008 #define INVLIST_VERSION_ID_OFFSET 2
7009 #define INVLIST_VERSION_ID 1064334010
7011 /* For safety, when adding new elements, remember to #undef them at the end of
7012 * the inversion list code section */
7014 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
7015 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
7016 * contains the code point U+00000, and begins here. If 1, the inversion list
7017 * doesn't contain U+0000, and it begins at the next UV in the array.
7018 * Inverting an inversion list consists of adding or removing the 0 at the
7019 * beginning of it. By reserving a space for that 0, inversion can be made
7022 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7024 /* Internally things are UVs */
7025 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7026 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7028 #define INVLIST_INITIAL_LEN 10
7030 PERL_STATIC_INLINE UV*
7031 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7033 /* Returns a pointer to the first element in the inversion list's array.
7034 * This is called upon initialization of an inversion list. Where the
7035 * array begins depends on whether the list has the code point U+0000
7036 * in it or not. The other parameter tells it whether the code that
7037 * follows this call is about to put a 0 in the inversion list or not.
7038 * The first element is either the element with 0, if 0, or the next one,
7041 UV* zero = get_invlist_zero_addr(invlist);
7043 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7046 assert(! *get_invlist_len_addr(invlist));
7048 /* 1^1 = 0; 1^0 = 1 */
7049 *zero = 1 ^ will_have_0;
7050 return zero + *zero;
7053 PERL_STATIC_INLINE UV*
7054 S_invlist_array(pTHX_ SV* const invlist)
7056 /* Returns the pointer to the inversion list's array. Every time the
7057 * length changes, this needs to be called in case malloc or realloc moved
7060 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7062 /* Must not be empty. If these fail, you probably didn't check for <len>
7063 * being non-zero before trying to get the array */
7064 assert(*get_invlist_len_addr(invlist));
7065 assert(*get_invlist_zero_addr(invlist) == 0
7066 || *get_invlist_zero_addr(invlist) == 1);
7068 /* The array begins either at the element reserved for zero if the
7069 * list contains 0 (that element will be set to 0), or otherwise the next
7070 * element (in which case the reserved element will be set to 1). */
7071 return (UV *) (get_invlist_zero_addr(invlist)
7072 + *get_invlist_zero_addr(invlist));
7075 PERL_STATIC_INLINE UV*
7076 S_get_invlist_len_addr(pTHX_ SV* invlist)
7078 /* Return the address of the UV that contains the current number
7079 * of used elements in the inversion list */
7081 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7083 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7086 PERL_STATIC_INLINE UV
7087 S_invlist_len(pTHX_ SV* const invlist)
7089 /* Returns the current number of elements stored in the inversion list's
7092 PERL_ARGS_ASSERT_INVLIST_LEN;
7094 return *get_invlist_len_addr(invlist);
7097 PERL_STATIC_INLINE void
7098 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7100 /* Sets the current number of elements stored in the inversion list */
7102 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7104 *get_invlist_len_addr(invlist) = len;
7106 assert(len <= SvLEN(invlist));
7108 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7109 /* If the list contains U+0000, that element is part of the header,
7110 * and should not be counted as part of the array. It will contain
7111 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7113 * SvCUR_set(invlist,
7114 * TO_INTERNAL_SIZE(len
7115 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7116 * But, this is only valid if len is not 0. The consequences of not doing
7117 * this is that the memory allocation code may think that 1 more UV is
7118 * being used than actually is, and so might do an unnecessary grow. That
7119 * seems worth not bothering to make this the precise amount.
7121 * Note that when inverting, SvCUR shouldn't change */
7124 PERL_STATIC_INLINE UV
7125 S_invlist_max(pTHX_ SV* const invlist)
7127 /* Returns the maximum number of elements storable in the inversion list's
7128 * array, without having to realloc() */
7130 PERL_ARGS_ASSERT_INVLIST_MAX;
7132 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7135 PERL_STATIC_INLINE UV*
7136 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7138 /* Return the address of the UV that is reserved to hold 0 if the inversion
7139 * list contains 0. This has to be the last element of the heading, as the
7140 * list proper starts with either it if 0, or the next element if not.
7141 * (But we force it to contain either 0 or 1) */
7143 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7145 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7148 #ifndef PERL_IN_XSUB_RE
7150 Perl__new_invlist(pTHX_ IV initial_size)
7153 /* Return a pointer to a newly constructed inversion list, with enough
7154 * space to store 'initial_size' elements. If that number is negative, a
7155 * system default is used instead */
7159 if (initial_size < 0) {
7160 initial_size = INVLIST_INITIAL_LEN;
7163 /* Allocate the initial space */
7164 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7165 invlist_set_len(new_list, 0);
7167 /* Force iterinit() to be used to get iteration to work */
7168 *get_invlist_iter_addr(new_list) = UV_MAX;
7170 /* This should force a segfault if a method doesn't initialize this
7172 *get_invlist_zero_addr(new_list) = UV_MAX;
7174 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7175 #if HEADER_LENGTH != 4
7176 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7184 S__new_invlist_C_array(pTHX_ UV* list)
7186 /* Return a pointer to a newly constructed inversion list, initialized to
7187 * point to <list>, which has to be in the exact correct inversion list
7188 * form, including internal fields. Thus this is a dangerous routine that
7189 * should not be used in the wrong hands */
7191 SV* invlist = newSV_type(SVt_PV);
7193 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7195 SvPV_set(invlist, (char *) list);
7196 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7197 shouldn't touch it */
7198 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7200 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7201 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7208 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7210 /* Grow the maximum size of an inversion list */
7212 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7214 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7217 PERL_STATIC_INLINE void
7218 S_invlist_trim(pTHX_ SV* const invlist)
7220 PERL_ARGS_ASSERT_INVLIST_TRIM;
7222 /* Change the length of the inversion list to how many entries it currently
7225 SvPV_shrink_to_cur((SV *) invlist);
7228 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7230 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7231 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7233 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7236 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7238 /* Subject to change or removal. Append the range from 'start' to 'end' at
7239 * the end of the inversion list. The range must be above any existing
7243 UV max = invlist_max(invlist);
7244 UV len = invlist_len(invlist);
7246 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7248 if (len == 0) { /* Empty lists must be initialized */
7249 array = _invlist_array_init(invlist, start == 0);
7252 /* Here, the existing list is non-empty. The current max entry in the
7253 * list is generally the first value not in the set, except when the
7254 * set extends to the end of permissible values, in which case it is
7255 * the first entry in that final set, and so this call is an attempt to
7256 * append out-of-order */
7258 UV final_element = len - 1;
7259 array = invlist_array(invlist);
7260 if (array[final_element] > start
7261 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7263 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7264 array[final_element], start,
7265 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7268 /* Here, it is a legal append. If the new range begins with the first
7269 * value not in the set, it is extending the set, so the new first
7270 * value not in the set is one greater than the newly extended range.
7272 if (array[final_element] == start) {
7273 if (end != UV_MAX) {
7274 array[final_element] = end + 1;
7277 /* But if the end is the maximum representable on the machine,
7278 * just let the range that this would extend to have no end */
7279 invlist_set_len(invlist, len - 1);
7285 /* Here the new range doesn't extend any existing set. Add it */
7287 len += 2; /* Includes an element each for the start and end of range */
7289 /* If overflows the existing space, extend, which may cause the array to be
7292 invlist_extend(invlist, len);
7293 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7294 failure in invlist_array() */
7295 array = invlist_array(invlist);
7298 invlist_set_len(invlist, len);
7301 /* The next item on the list starts the range, the one after that is
7302 * one past the new range. */
7303 array[len - 2] = start;
7304 if (end != UV_MAX) {
7305 array[len - 1] = end + 1;
7308 /* But if the end is the maximum representable on the machine, just let
7309 * the range have no end */
7310 invlist_set_len(invlist, len - 1);
7314 #ifndef PERL_IN_XSUB_RE
7317 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7319 /* Searches the inversion list for the entry that contains the input code
7320 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7321 * return value is the index into the list's array of the range that
7325 IV high = invlist_len(invlist);
7326 const UV * const array = invlist_array(invlist);
7328 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7330 /* If list is empty or the code point is before the first element, return
7332 if (high == 0 || cp < array[0]) {
7336 /* Binary search. What we are looking for is <i> such that
7337 * array[i] <= cp < array[i+1]
7338 * The loop below converges on the i+1. */
7339 while (low < high) {
7340 IV mid = (low + high) / 2;
7341 if (array[mid] <= cp) {
7344 /* We could do this extra test to exit the loop early.
7345 if (cp < array[low]) {
7350 else { /* cp < array[mid] */
7359 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7361 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7362 * but is used when the swash has an inversion list. This makes this much
7363 * faster, as it uses a binary search instead of a linear one. This is
7364 * intimately tied to that function, and perhaps should be in utf8.c,
7365 * except it is intimately tied to inversion lists as well. It assumes
7366 * that <swatch> is all 0's on input */
7369 const IV len = invlist_len(invlist);
7373 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7375 if (len == 0) { /* Empty inversion list */
7379 array = invlist_array(invlist);
7381 /* Find which element it is */
7382 i = _invlist_search(invlist, start);
7384 /* We populate from <start> to <end> */
7385 while (current < end) {
7388 /* The inversion list gives the results for every possible code point
7389 * after the first one in the list. Only those ranges whose index is
7390 * even are ones that the inversion list matches. For the odd ones,
7391 * and if the initial code point is not in the list, we have to skip
7392 * forward to the next element */
7393 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7395 if (i >= len) { /* Finished if beyond the end of the array */
7399 if (current >= end) { /* Finished if beyond the end of what we
7404 assert(current >= start);
7406 /* The current range ends one below the next one, except don't go past
7409 upper = (i < len && array[i] < end) ? array[i] : end;
7411 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7412 * for each code point in it */
7413 for (; current < upper; current++) {
7414 const STRLEN offset = (STRLEN)(current - start);
7415 swatch[offset >> 3] |= 1 << (offset & 7);
7418 /* Quit if at the end of the list */
7421 /* But first, have to deal with the highest possible code point on
7422 * the platform. The previous code assumes that <end> is one
7423 * beyond where we want to populate, but that is impossible at the
7424 * platform's infinity, so have to handle it specially */
7425 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7427 const STRLEN offset = (STRLEN)(end - start);
7428 swatch[offset >> 3] |= 1 << (offset & 7);
7433 /* Advance to the next range, which will be for code points not in the
7442 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7444 /* Take the union of two inversion lists and point <output> to it. *output
7445 * should be defined upon input, and if it points to one of the two lists,
7446 * the reference count to that list will be decremented. The first list,
7447 * <a>, may be NULL, in which case a copy of the second list is returned.
7448 * If <complement_b> is TRUE, the union is taken of the complement
7449 * (inversion) of <b> instead of b itself.
7451 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7452 * Richard Gillam, published by Addison-Wesley, and explained at some
7453 * length there. The preface says to incorporate its examples into your
7454 * code at your own risk.
7456 * The algorithm is like a merge sort.
7458 * XXX A potential performance improvement is to keep track as we go along
7459 * if only one of the inputs contributes to the result, meaning the other
7460 * is a subset of that one. In that case, we can skip the final copy and
7461 * return the larger of the input lists, but then outside code might need
7462 * to keep track of whether to free the input list or not */
7464 UV* array_a; /* a's array */
7466 UV len_a; /* length of a's array */
7469 SV* u; /* the resulting union */
7473 UV i_a = 0; /* current index into a's array */
7477 /* running count, as explained in the algorithm source book; items are
7478 * stopped accumulating and are output when the count changes to/from 0.
7479 * The count is incremented when we start a range that's in the set, and
7480 * decremented when we start a range that's not in the set. So its range
7481 * is 0 to 2. Only when the count is zero is something not in the set.
7485 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7488 /* If either one is empty, the union is the other one */
7489 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7496 *output = invlist_clone(b);
7498 _invlist_invert(*output);
7500 } /* else *output already = b; */
7503 else if ((len_b = invlist_len(b)) == 0) {
7508 /* The complement of an empty list is a list that has everything in it,
7509 * so the union with <a> includes everything too */
7514 *output = _new_invlist(1);
7515 _append_range_to_invlist(*output, 0, UV_MAX);
7517 else if (*output != a) {
7518 *output = invlist_clone(a);
7520 /* else *output already = a; */
7524 /* Here both lists exist and are non-empty */
7525 array_a = invlist_array(a);
7526 array_b = invlist_array(b);
7528 /* If are to take the union of 'a' with the complement of b, set it
7529 * up so are looking at b's complement. */
7532 /* To complement, we invert: if the first element is 0, remove it. To
7533 * do this, we just pretend the array starts one later, and clear the
7534 * flag as we don't have to do anything else later */
7535 if (array_b[0] == 0) {
7538 complement_b = FALSE;
7542 /* But if the first element is not zero, we unshift a 0 before the
7543 * array. The data structure reserves a space for that 0 (which
7544 * should be a '1' right now), so physical shifting is unneeded,
7545 * but temporarily change that element to 0. Before exiting the
7546 * routine, we must restore the element to '1' */
7553 /* Size the union for the worst case: that the sets are completely
7555 u = _new_invlist(len_a + len_b);
7557 /* Will contain U+0000 if either component does */
7558 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7559 || (len_b > 0 && array_b[0] == 0));
7561 /* Go through each list item by item, stopping when exhausted one of
7563 while (i_a < len_a && i_b < len_b) {
7564 UV cp; /* The element to potentially add to the union's array */
7565 bool cp_in_set; /* is it in the the input list's set or not */
7567 /* We need to take one or the other of the two inputs for the union.
7568 * Since we are merging two sorted lists, we take the smaller of the
7569 * next items. In case of a tie, we take the one that is in its set
7570 * first. If we took one not in the set first, it would decrement the
7571 * count, possibly to 0 which would cause it to be output as ending the
7572 * range, and the next time through we would take the same number, and
7573 * output it again as beginning the next range. By doing it the
7574 * opposite way, there is no possibility that the count will be
7575 * momentarily decremented to 0, and thus the two adjoining ranges will
7576 * be seamlessly merged. (In a tie and both are in the set or both not
7577 * in the set, it doesn't matter which we take first.) */
7578 if (array_a[i_a] < array_b[i_b]
7579 || (array_a[i_a] == array_b[i_b]
7580 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7582 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7586 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7590 /* Here, have chosen which of the two inputs to look at. Only output
7591 * if the running count changes to/from 0, which marks the
7592 * beginning/end of a range in that's in the set */
7595 array_u[i_u++] = cp;
7602 array_u[i_u++] = cp;
7607 /* Here, we are finished going through at least one of the lists, which
7608 * means there is something remaining in at most one. We check if the list
7609 * that hasn't been exhausted is positioned such that we are in the middle
7610 * of a range in its set or not. (i_a and i_b point to the element beyond
7611 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7612 * is potentially more to output.
7613 * There are four cases:
7614 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7615 * in the union is entirely from the non-exhausted set.
7616 * 2) Both were in their sets, count is 2. Nothing further should
7617 * be output, as everything that remains will be in the exhausted
7618 * list's set, hence in the union; decrementing to 1 but not 0 insures
7620 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7621 * Nothing further should be output because the union includes
7622 * everything from the exhausted set. Not decrementing ensures that.
7623 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7624 * decrementing to 0 insures that we look at the remainder of the
7625 * non-exhausted set */
7626 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7627 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7632 /* The final length is what we've output so far, plus what else is about to
7633 * be output. (If 'count' is non-zero, then the input list we exhausted
7634 * has everything remaining up to the machine's limit in its set, and hence
7635 * in the union, so there will be no further output. */
7638 /* At most one of the subexpressions will be non-zero */
7639 len_u += (len_a - i_a) + (len_b - i_b);
7642 /* Set result to final length, which can change the pointer to array_u, so
7644 if (len_u != invlist_len(u)) {
7645 invlist_set_len(u, len_u);
7647 array_u = invlist_array(u);
7650 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7651 * the other) ended with everything above it not in its set. That means
7652 * that the remaining part of the union is precisely the same as the
7653 * non-exhausted list, so can just copy it unchanged. (If both list were
7654 * exhausted at the same time, then the operations below will be both 0.)
7657 IV copy_count; /* At most one will have a non-zero copy count */
7658 if ((copy_count = len_a - i_a) > 0) {
7659 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7661 else if ((copy_count = len_b - i_b) > 0) {
7662 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7666 /* We may be removing a reference to one of the inputs */
7667 if (a == *output || b == *output) {
7668 SvREFCNT_dec(*output);
7671 /* If we've changed b, restore it */
7681 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7683 /* Take the intersection of two inversion lists and point <i> to it. *i
7684 * should be defined upon input, and if it points to one of the two lists,
7685 * the reference count to that list will be decremented.
7686 * If <complement_b> is TRUE, the result will be the intersection of <a>
7687 * and the complement (or inversion) of <b> instead of <b> directly.
7689 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7690 * Richard Gillam, published by Addison-Wesley, and explained at some
7691 * length there. The preface says to incorporate its examples into your
7692 * code at your own risk. In fact, it had bugs
7694 * The algorithm is like a merge sort, and is essentially the same as the
7698 UV* array_a; /* a's array */
7700 UV len_a; /* length of a's array */
7703 SV* r; /* the resulting intersection */
7707 UV i_a = 0; /* current index into a's array */
7711 /* running count, as explained in the algorithm source book; items are
7712 * stopped accumulating and are output when the count changes to/from 2.
7713 * The count is incremented when we start a range that's in the set, and
7714 * decremented when we start a range that's not in the set. So its range
7715 * is 0 to 2. Only when the count is 2 is something in the intersection.
7719 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7722 /* Special case if either one is empty */
7723 len_a = invlist_len(a);
7724 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7726 if (len_a != 0 && complement_b) {
7728 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7729 * be empty. Here, also we are using 'b's complement, which hence
7730 * must be every possible code point. Thus the intersection is
7733 *i = invlist_clone(a);
7739 /* else *i is already 'a' */
7743 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7744 * intersection must be empty */
7751 *i = _new_invlist(0);
7755 /* Here both lists exist and are non-empty */
7756 array_a = invlist_array(a);
7757 array_b = invlist_array(b);
7759 /* If are to take the intersection of 'a' with the complement of b, set it
7760 * up so are looking at b's complement. */
7763 /* To complement, we invert: if the first element is 0, remove it. To
7764 * do this, we just pretend the array starts one later, and clear the
7765 * flag as we don't have to do anything else later */
7766 if (array_b[0] == 0) {
7769 complement_b = FALSE;
7773 /* But if the first element is not zero, we unshift a 0 before the
7774 * array. The data structure reserves a space for that 0 (which
7775 * should be a '1' right now), so physical shifting is unneeded,
7776 * but temporarily change that element to 0. Before exiting the
7777 * routine, we must restore the element to '1' */
7784 /* Size the intersection for the worst case: that the intersection ends up
7785 * fragmenting everything to be completely disjoint */
7786 r= _new_invlist(len_a + len_b);
7788 /* Will contain U+0000 iff both components do */
7789 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7790 && len_b > 0 && array_b[0] == 0);
7792 /* Go through each list item by item, stopping when exhausted one of
7794 while (i_a < len_a && i_b < len_b) {
7795 UV cp; /* The element to potentially add to the intersection's
7797 bool cp_in_set; /* Is it in the input list's set or not */
7799 /* We need to take one or the other of the two inputs for the
7800 * intersection. Since we are merging two sorted lists, we take the
7801 * smaller of the next items. In case of a tie, we take the one that
7802 * is not in its set first (a difference from the union algorithm). If
7803 * we took one in the set first, it would increment the count, possibly
7804 * to 2 which would cause it to be output as starting a range in the
7805 * intersection, and the next time through we would take that same
7806 * number, and output it again as ending the set. By doing it the
7807 * opposite of this, there is no possibility that the count will be
7808 * momentarily incremented to 2. (In a tie and both are in the set or
7809 * both not in the set, it doesn't matter which we take first.) */
7810 if (array_a[i_a] < array_b[i_b]
7811 || (array_a[i_a] == array_b[i_b]
7812 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7814 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7818 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7822 /* Here, have chosen which of the two inputs to look at. Only output
7823 * if the running count changes to/from 2, which marks the
7824 * beginning/end of a range that's in the intersection */
7828 array_r[i_r++] = cp;
7833 array_r[i_r++] = cp;
7839 /* Here, we are finished going through at least one of the lists, which
7840 * means there is something remaining in at most one. We check if the list
7841 * that has been exhausted is positioned such that we are in the middle
7842 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7843 * the ones we care about.) There are four cases:
7844 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7845 * nothing left in the intersection.
7846 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7847 * above 2. What should be output is exactly that which is in the
7848 * non-exhausted set, as everything it has is also in the intersection
7849 * set, and everything it doesn't have can't be in the intersection
7850 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7851 * gets incremented to 2. Like the previous case, the intersection is
7852 * everything that remains in the non-exhausted set.
7853 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7854 * remains 1. And the intersection has nothing more. */
7855 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7856 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7861 /* The final length is what we've output so far plus what else is in the
7862 * intersection. At most one of the subexpressions below will be non-zero */
7865 len_r += (len_a - i_a) + (len_b - i_b);
7868 /* Set result to final length, which can change the pointer to array_r, so
7870 if (len_r != invlist_len(r)) {
7871 invlist_set_len(r, len_r);
7873 array_r = invlist_array(r);
7876 /* Finish outputting any remaining */
7877 if (count >= 2) { /* At most one will have a non-zero copy count */
7879 if ((copy_count = len_a - i_a) > 0) {
7880 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7882 else if ((copy_count = len_b - i_b) > 0) {
7883 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7887 /* We may be removing a reference to one of the inputs */
7888 if (a == *i || b == *i) {
7892 /* If we've changed b, restore it */
7902 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7904 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7905 * set. A pointer to the inversion list is returned. This may actually be
7906 * a new list, in which case the passed in one has been destroyed. The
7907 * passed in inversion list can be NULL, in which case a new one is created
7908 * with just the one range in it */
7913 if (invlist == NULL) {
7914 invlist = _new_invlist(2);
7918 len = invlist_len(invlist);
7921 /* If comes after the final entry, can just append it to the end */
7923 || start >= invlist_array(invlist)
7924 [invlist_len(invlist) - 1])
7926 _append_range_to_invlist(invlist, start, end);
7930 /* Here, can't just append things, create and return a new inversion list
7931 * which is the union of this range and the existing inversion list */
7932 range_invlist = _new_invlist(2);
7933 _append_range_to_invlist(range_invlist, start, end);
7935 _invlist_union(invlist, range_invlist, &invlist);
7937 /* The temporary can be freed */
7938 SvREFCNT_dec(range_invlist);
7945 PERL_STATIC_INLINE bool
7946 S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
7948 /* Does <invlist> contain code point <cp> as part of the set? */
7950 IV index = _invlist_search(invlist, cp);
7952 PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
7954 return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index);
7957 PERL_STATIC_INLINE SV*
7958 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7959 return _add_range_to_invlist(invlist, cp, cp);
7962 #ifndef PERL_IN_XSUB_RE
7964 Perl__invlist_invert(pTHX_ SV* const invlist)
7966 /* Complement the input inversion list. This adds a 0 if the list didn't
7967 * have a zero; removes it otherwise. As described above, the data
7968 * structure is set up so that this is very efficient */
7970 UV* len_pos = get_invlist_len_addr(invlist);
7972 PERL_ARGS_ASSERT__INVLIST_INVERT;
7974 /* The inverse of matching nothing is matching everything */
7975 if (*len_pos == 0) {
7976 _append_range_to_invlist(invlist, 0, UV_MAX);
7980 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7981 * zero element was a 0, so it is being removed, so the length decrements
7982 * by 1; and vice-versa. SvCUR is unaffected */
7983 if (*get_invlist_zero_addr(invlist) ^= 1) {
7992 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7994 /* Complement the input inversion list (which must be a Unicode property,
7995 * all of which don't match above the Unicode maximum code point.) And
7996 * Perl has chosen to not have the inversion match above that either. This
7997 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8003 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8005 _invlist_invert(invlist);
8007 len = invlist_len(invlist);
8009 if (len != 0) { /* If empty do nothing */
8010 array = invlist_array(invlist);
8011 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8012 /* Add 0x110000. First, grow if necessary */
8014 if (invlist_max(invlist) < len) {
8015 invlist_extend(invlist, len);
8016 array = invlist_array(invlist);
8018 invlist_set_len(invlist, len);
8019 array[len - 1] = PERL_UNICODE_MAX + 1;
8021 else { /* Remove the 0x110000 */
8022 invlist_set_len(invlist, len - 1);
8030 PERL_STATIC_INLINE SV*
8031 S_invlist_clone(pTHX_ SV* const invlist)
8034 /* Return a new inversion list that is a copy of the input one, which is
8037 /* Need to allocate extra space to accommodate Perl's addition of a
8038 * trailing NUL to SvPV's, since it thinks they are always strings */
8039 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8040 STRLEN length = SvCUR(invlist);
8042 PERL_ARGS_ASSERT_INVLIST_CLONE;
8044 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8045 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8050 PERL_STATIC_INLINE UV*
8051 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8053 /* Return the address of the UV that contains the current iteration
8056 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8058 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8061 PERL_STATIC_INLINE UV*
8062 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8064 /* Return the address of the UV that contains the version id. */
8066 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8068 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8071 PERL_STATIC_INLINE void
8072 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8074 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8076 *get_invlist_iter_addr(invlist) = 0;
8080 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8082 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8083 * This call sets in <*start> and <*end>, the next range in <invlist>.
8084 * Returns <TRUE> if successful and the next call will return the next
8085 * range; <FALSE> if was already at the end of the list. If the latter,
8086 * <*start> and <*end> are unchanged, and the next call to this function
8087 * will start over at the beginning of the list */
8089 UV* pos = get_invlist_iter_addr(invlist);
8090 UV len = invlist_len(invlist);
8093 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8096 *pos = UV_MAX; /* Force iternit() to be required next time */
8100 array = invlist_array(invlist);
8102 *start = array[(*pos)++];
8108 *end = array[(*pos)++] - 1;
8114 PERL_STATIC_INLINE UV
8115 S_invlist_highest(pTHX_ SV* const invlist)
8117 /* Returns the highest code point that matches an inversion list. This API
8118 * has an ambiguity, as it returns 0 under either the highest is actually
8119 * 0, or if the list is empty. If this distinction matters to you, check
8120 * for emptiness before calling this function */
8122 UV len = invlist_len(invlist);
8125 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8131 array = invlist_array(invlist);
8133 /* The last element in the array in the inversion list always starts a
8134 * range that goes to infinity. That range may be for code points that are
8135 * matched in the inversion list, or it may be for ones that aren't
8136 * matched. In the latter case, the highest code point in the set is one
8137 * less than the beginning of this range; otherwise it is the final element
8138 * of this range: infinity */
8139 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8141 : array[len - 1] - 1;
8144 #ifndef PERL_IN_XSUB_RE
8146 Perl__invlist_contents(pTHX_ SV* const invlist)
8148 /* Get the contents of an inversion list into a string SV so that they can
8149 * be printed out. It uses the format traditionally done for debug tracing
8153 SV* output = newSVpvs("\n");
8155 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8157 invlist_iterinit(invlist);
8158 while (invlist_iternext(invlist, &start, &end)) {
8159 if (end == UV_MAX) {
8160 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8162 else if (end != start) {
8163 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8167 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8177 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8179 /* Dumps out the ranges in an inversion list. The string 'header'
8180 * if present is output on a line before the first range */
8184 if (header && strlen(header)) {
8185 PerlIO_printf(Perl_debug_log, "%s\n", header);
8187 invlist_iterinit(invlist);
8188 while (invlist_iternext(invlist, &start, &end)) {
8189 if (end == UV_MAX) {
8190 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8193 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8201 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8203 /* Return a boolean as to if the two passed in inversion lists are
8204 * identical. The final argument, if TRUE, says to take the complement of
8205 * the second inversion list before doing the comparison */
8207 UV* array_a = invlist_array(a);
8208 UV* array_b = invlist_array(b);
8209 UV len_a = invlist_len(a);
8210 UV len_b = invlist_len(b);
8212 UV i = 0; /* current index into the arrays */
8213 bool retval = TRUE; /* Assume are identical until proven otherwise */
8215 PERL_ARGS_ASSERT__INVLISTEQ;
8217 /* If are to compare 'a' with the complement of b, set it
8218 * up so are looking at b's complement. */
8221 /* The complement of nothing is everything, so <a> would have to have
8222 * just one element, starting at zero (ending at infinity) */
8224 return (len_a == 1 && array_a[0] == 0);
8226 else if (array_b[0] == 0) {
8228 /* Otherwise, to complement, we invert. Here, the first element is
8229 * 0, just remove it. To do this, we just pretend the array starts
8230 * one later, and clear the flag as we don't have to do anything
8235 complement_b = FALSE;
8239 /* But if the first element is not zero, we unshift a 0 before the
8240 * array. The data structure reserves a space for that 0 (which
8241 * should be a '1' right now), so physical shifting is unneeded,
8242 * but temporarily change that element to 0. Before exiting the
8243 * routine, we must restore the element to '1' */
8250 /* Make sure that the lengths are the same, as well as the final element
8251 * before looping through the remainder. (Thus we test the length, final,
8252 * and first elements right off the bat) */
8253 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8256 else for (i = 0; i < len_a - 1; i++) {
8257 if (array_a[i] != array_b[i]) {
8270 #undef HEADER_LENGTH
8271 #undef INVLIST_INITIAL_LENGTH
8272 #undef TO_INTERNAL_SIZE
8273 #undef FROM_INTERNAL_SIZE
8274 #undef INVLIST_LEN_OFFSET
8275 #undef INVLIST_ZERO_OFFSET
8276 #undef INVLIST_ITER_OFFSET
8277 #undef INVLIST_VERSION_ID
8279 /* End of inversion list object */
8282 - reg - regular expression, i.e. main body or parenthesized thing
8284 * Caller must absorb opening parenthesis.
8286 * Combining parenthesis handling with the base level of regular expression
8287 * is a trifle forced, but the need to tie the tails of the branches to what
8288 * follows makes it hard to avoid.
8290 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8292 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8294 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8298 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8299 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8302 regnode *ret; /* Will be the head of the group. */
8305 regnode *ender = NULL;
8308 U32 oregflags = RExC_flags;
8309 bool have_branch = 0;
8311 I32 freeze_paren = 0;
8312 I32 after_freeze = 0;
8314 /* for (?g), (?gc), and (?o) warnings; warning
8315 about (?c) will warn about (?g) -- japhy */
8317 #define WASTED_O 0x01
8318 #define WASTED_G 0x02
8319 #define WASTED_C 0x04
8320 #define WASTED_GC (0x02|0x04)
8321 I32 wastedflags = 0x00;
8323 char * parse_start = RExC_parse; /* MJD */
8324 char * const oregcomp_parse = RExC_parse;
8326 GET_RE_DEBUG_FLAGS_DECL;
8328 PERL_ARGS_ASSERT_REG;
8329 DEBUG_PARSE("reg ");
8331 *flagp = 0; /* Tentatively. */
8334 /* Make an OPEN node, if parenthesized. */
8336 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8337 char *start_verb = RExC_parse;
8338 STRLEN verb_len = 0;
8339 char *start_arg = NULL;
8340 unsigned char op = 0;
8342 int internal_argval = 0; /* internal_argval is only useful if !argok */
8343 while ( *RExC_parse && *RExC_parse != ')' ) {
8344 if ( *RExC_parse == ':' ) {
8345 start_arg = RExC_parse + 1;
8351 verb_len = RExC_parse - start_verb;
8354 while ( *RExC_parse && *RExC_parse != ')' )
8356 if ( *RExC_parse != ')' )
8357 vFAIL("Unterminated verb pattern argument");
8358 if ( RExC_parse == start_arg )
8361 if ( *RExC_parse != ')' )
8362 vFAIL("Unterminated verb pattern");
8365 switch ( *start_verb ) {
8366 case 'A': /* (*ACCEPT) */
8367 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8369 internal_argval = RExC_nestroot;
8372 case 'C': /* (*COMMIT) */
8373 if ( memEQs(start_verb,verb_len,"COMMIT") )
8376 case 'F': /* (*FAIL) */
8377 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8382 case ':': /* (*:NAME) */
8383 case 'M': /* (*MARK:NAME) */
8384 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8389 case 'P': /* (*PRUNE) */
8390 if ( memEQs(start_verb,verb_len,"PRUNE") )
8393 case 'S': /* (*SKIP) */
8394 if ( memEQs(start_verb,verb_len,"SKIP") )
8397 case 'T': /* (*THEN) */
8398 /* [19:06] <TimToady> :: is then */
8399 if ( memEQs(start_verb,verb_len,"THEN") ) {
8401 RExC_seen |= REG_SEEN_CUTGROUP;
8407 vFAIL3("Unknown verb pattern '%.*s'",
8408 verb_len, start_verb);
8411 if ( start_arg && internal_argval ) {
8412 vFAIL3("Verb pattern '%.*s' may not have an argument",
8413 verb_len, start_verb);
8414 } else if ( argok < 0 && !start_arg ) {
8415 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8416 verb_len, start_verb);
8418 ret = reganode(pRExC_state, op, internal_argval);
8419 if ( ! internal_argval && ! SIZE_ONLY ) {
8421 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8422 ARG(ret) = add_data( pRExC_state, 1, "S" );
8423 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8430 if (!internal_argval)
8431 RExC_seen |= REG_SEEN_VERBARG;
8432 } else if ( start_arg ) {
8433 vFAIL3("Verb pattern '%.*s' may not have an argument",
8434 verb_len, start_verb);
8436 ret = reg_node(pRExC_state, op);
8438 nextchar(pRExC_state);
8441 if (*RExC_parse == '?') { /* (?...) */
8442 bool is_logical = 0;
8443 const char * const seqstart = RExC_parse;
8444 bool has_use_defaults = FALSE;
8447 paren = *RExC_parse++;
8448 ret = NULL; /* For look-ahead/behind. */
8451 case 'P': /* (?P...) variants for those used to PCRE/Python */
8452 paren = *RExC_parse++;
8453 if ( paren == '<') /* (?P<...>) named capture */
8455 else if (paren == '>') { /* (?P>name) named recursion */
8456 goto named_recursion;
8458 else if (paren == '=') { /* (?P=...) named backref */
8459 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8460 you change this make sure you change that */
8461 char* name_start = RExC_parse;
8463 SV *sv_dat = reg_scan_name(pRExC_state,
8464 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8465 if (RExC_parse == name_start || *RExC_parse != ')')
8466 vFAIL2("Sequence %.3s... not terminated",parse_start);
8469 num = add_data( pRExC_state, 1, "S" );
8470 RExC_rxi->data->data[num]=(void*)sv_dat;
8471 SvREFCNT_inc_simple_void(sv_dat);
8474 ret = reganode(pRExC_state,
8477 : (ASCII_FOLD_RESTRICTED)
8479 : (AT_LEAST_UNI_SEMANTICS)
8487 Set_Node_Offset(ret, parse_start+1);
8488 Set_Node_Cur_Length(ret); /* MJD */
8490 nextchar(pRExC_state);
8494 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8496 case '<': /* (?<...) */
8497 if (*RExC_parse == '!')
8499 else if (*RExC_parse != '=')
8505 case '\'': /* (?'...') */
8506 name_start= RExC_parse;
8507 svname = reg_scan_name(pRExC_state,
8508 SIZE_ONLY ? /* reverse test from the others */
8509 REG_RSN_RETURN_NAME :
8510 REG_RSN_RETURN_NULL);
8511 if (RExC_parse == name_start) {
8513 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8516 if (*RExC_parse != paren)
8517 vFAIL2("Sequence (?%c... not terminated",
8518 paren=='>' ? '<' : paren);
8522 if (!svname) /* shouldn't happen */
8524 "panic: reg_scan_name returned NULL");
8525 if (!RExC_paren_names) {
8526 RExC_paren_names= newHV();
8527 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8529 RExC_paren_name_list= newAV();
8530 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8533 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8535 sv_dat = HeVAL(he_str);
8537 /* croak baby croak */
8539 "panic: paren_name hash element allocation failed");
8540 } else if ( SvPOK(sv_dat) ) {
8541 /* (?|...) can mean we have dupes so scan to check
8542 its already been stored. Maybe a flag indicating
8543 we are inside such a construct would be useful,
8544 but the arrays are likely to be quite small, so
8545 for now we punt -- dmq */
8546 IV count = SvIV(sv_dat);
8547 I32 *pv = (I32*)SvPVX(sv_dat);
8549 for ( i = 0 ; i < count ; i++ ) {
8550 if ( pv[i] == RExC_npar ) {
8556 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8557 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8558 pv[count] = RExC_npar;
8559 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8562 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8563 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8565 SvIV_set(sv_dat, 1);
8568 /* Yes this does cause a memory leak in debugging Perls */
8569 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8570 SvREFCNT_dec(svname);
8573 /*sv_dump(sv_dat);*/
8575 nextchar(pRExC_state);
8577 goto capturing_parens;
8579 RExC_seen |= REG_SEEN_LOOKBEHIND;
8580 RExC_in_lookbehind++;
8582 case '=': /* (?=...) */
8583 RExC_seen_zerolen++;
8585 case '!': /* (?!...) */
8586 RExC_seen_zerolen++;
8587 if (*RExC_parse == ')') {
8588 ret=reg_node(pRExC_state, OPFAIL);
8589 nextchar(pRExC_state);
8593 case '|': /* (?|...) */
8594 /* branch reset, behave like a (?:...) except that
8595 buffers in alternations share the same numbers */
8597 after_freeze = freeze_paren = RExC_npar;
8599 case ':': /* (?:...) */
8600 case '>': /* (?>...) */
8602 case '$': /* (?$...) */
8603 case '@': /* (?@...) */
8604 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8606 case '#': /* (?#...) */
8607 while (*RExC_parse && *RExC_parse != ')')
8609 if (*RExC_parse != ')')
8610 FAIL("Sequence (?#... not terminated");
8611 nextchar(pRExC_state);
8614 case '0' : /* (?0) */
8615 case 'R' : /* (?R) */
8616 if (*RExC_parse != ')')
8617 FAIL("Sequence (?R) not terminated");
8618 ret = reg_node(pRExC_state, GOSTART);
8619 *flagp |= POSTPONED;
8620 nextchar(pRExC_state);
8623 { /* named and numeric backreferences */
8625 case '&': /* (?&NAME) */
8626 parse_start = RExC_parse - 1;
8629 SV *sv_dat = reg_scan_name(pRExC_state,
8630 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8631 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8633 goto gen_recurse_regop;
8634 assert(0); /* NOT REACHED */
8636 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8638 vFAIL("Illegal pattern");
8640 goto parse_recursion;
8642 case '-': /* (?-1) */
8643 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8644 RExC_parse--; /* rewind to let it be handled later */
8648 case '1': case '2': case '3': case '4': /* (?1) */
8649 case '5': case '6': case '7': case '8': case '9':
8652 num = atoi(RExC_parse);
8653 parse_start = RExC_parse - 1; /* MJD */
8654 if (*RExC_parse == '-')
8656 while (isDIGIT(*RExC_parse))
8658 if (*RExC_parse!=')')
8659 vFAIL("Expecting close bracket");
8662 if ( paren == '-' ) {
8664 Diagram of capture buffer numbering.
8665 Top line is the normal capture buffer numbers
8666 Bottom line is the negative indexing as from
8670 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8674 num = RExC_npar + num;
8677 vFAIL("Reference to nonexistent group");
8679 } else if ( paren == '+' ) {
8680 num = RExC_npar + num - 1;
8683 ret = reganode(pRExC_state, GOSUB, num);
8685 if (num > (I32)RExC_rx->nparens) {
8687 vFAIL("Reference to nonexistent group");
8689 ARG2L_SET( ret, RExC_recurse_count++);
8691 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8692 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8696 RExC_seen |= REG_SEEN_RECURSE;
8697 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8698 Set_Node_Offset(ret, parse_start); /* MJD */
8700 *flagp |= POSTPONED;
8701 nextchar(pRExC_state);
8703 } /* named and numeric backreferences */
8704 assert(0); /* NOT REACHED */
8706 case '?': /* (??...) */
8708 if (*RExC_parse != '{') {
8710 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8713 *flagp |= POSTPONED;
8714 paren = *RExC_parse++;
8716 case '{': /* (?{...}) */
8719 struct reg_code_block *cb;
8721 RExC_seen_zerolen++;
8723 if ( !pRExC_state->num_code_blocks
8724 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8725 || pRExC_state->code_blocks[pRExC_state->code_index].start
8726 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8729 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8730 FAIL("panic: Sequence (?{...}): no code block found\n");
8731 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8733 /* this is a pre-compiled code block (?{...}) */
8734 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8735 RExC_parse = RExC_start + cb->end;
8738 if (cb->src_regex) {
8739 n = add_data(pRExC_state, 2, "rl");
8740 RExC_rxi->data->data[n] =
8741 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8742 RExC_rxi->data->data[n+1] = (void*)o;
8745 n = add_data(pRExC_state, 1,
8746 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8747 RExC_rxi->data->data[n] = (void*)o;
8750 pRExC_state->code_index++;
8751 nextchar(pRExC_state);
8755 ret = reg_node(pRExC_state, LOGICAL);
8756 eval = reganode(pRExC_state, EVAL, n);
8759 /* for later propagation into (??{}) return value */
8760 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8762 REGTAIL(pRExC_state, ret, eval);
8763 /* deal with the length of this later - MJD */
8766 ret = reganode(pRExC_state, EVAL, n);
8767 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8768 Set_Node_Offset(ret, parse_start);
8771 case '(': /* (?(?{...})...) and (?(?=...)...) */
8774 if (RExC_parse[0] == '?') { /* (?(?...)) */
8775 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8776 || RExC_parse[1] == '<'
8777 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8780 ret = reg_node(pRExC_state, LOGICAL);
8783 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8787 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8788 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8790 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8791 char *name_start= RExC_parse++;
8793 SV *sv_dat=reg_scan_name(pRExC_state,
8794 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8795 if (RExC_parse == name_start || *RExC_parse != ch)
8796 vFAIL2("Sequence (?(%c... not terminated",
8797 (ch == '>' ? '<' : ch));
8800 num = add_data( pRExC_state, 1, "S" );
8801 RExC_rxi->data->data[num]=(void*)sv_dat;
8802 SvREFCNT_inc_simple_void(sv_dat);
8804 ret = reganode(pRExC_state,NGROUPP,num);
8805 goto insert_if_check_paren;
8807 else if (RExC_parse[0] == 'D' &&
8808 RExC_parse[1] == 'E' &&
8809 RExC_parse[2] == 'F' &&
8810 RExC_parse[3] == 'I' &&
8811 RExC_parse[4] == 'N' &&
8812 RExC_parse[5] == 'E')
8814 ret = reganode(pRExC_state,DEFINEP,0);
8817 goto insert_if_check_paren;
8819 else if (RExC_parse[0] == 'R') {
8822 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8823 parno = atoi(RExC_parse++);
8824 while (isDIGIT(*RExC_parse))
8826 } else if (RExC_parse[0] == '&') {
8829 sv_dat = reg_scan_name(pRExC_state,
8830 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8831 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8833 ret = reganode(pRExC_state,INSUBP,parno);
8834 goto insert_if_check_paren;
8836 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8839 parno = atoi(RExC_parse++);
8841 while (isDIGIT(*RExC_parse))
8843 ret = reganode(pRExC_state, GROUPP, parno);
8845 insert_if_check_paren:
8846 if ((c = *nextchar(pRExC_state)) != ')')
8847 vFAIL("Switch condition not recognized");
8849 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8850 br = regbranch(pRExC_state, &flags, 1,depth+1);
8852 br = reganode(pRExC_state, LONGJMP, 0);
8854 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8855 c = *nextchar(pRExC_state);
8860 vFAIL("(?(DEFINE)....) does not allow branches");
8861 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8862 regbranch(pRExC_state, &flags, 1,depth+1);
8863 REGTAIL(pRExC_state, ret, lastbr);
8866 c = *nextchar(pRExC_state);
8871 vFAIL("Switch (?(condition)... contains too many branches");
8872 ender = reg_node(pRExC_state, TAIL);
8873 REGTAIL(pRExC_state, br, ender);
8875 REGTAIL(pRExC_state, lastbr, ender);
8876 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8879 REGTAIL(pRExC_state, ret, ender);
8880 RExC_size++; /* XXX WHY do we need this?!!
8881 For large programs it seems to be required
8882 but I can't figure out why. -- dmq*/
8886 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8890 RExC_parse--; /* for vFAIL to print correctly */
8891 vFAIL("Sequence (? incomplete");
8893 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8895 has_use_defaults = TRUE;
8896 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8897 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8898 ? REGEX_UNICODE_CHARSET
8899 : REGEX_DEPENDS_CHARSET);
8903 parse_flags: /* (?i) */
8905 U32 posflags = 0, negflags = 0;
8906 U32 *flagsp = &posflags;
8907 char has_charset_modifier = '\0';
8908 regex_charset cs = get_regex_charset(RExC_flags);
8909 if (cs == REGEX_DEPENDS_CHARSET
8910 && (RExC_utf8 || RExC_uni_semantics))
8912 cs = REGEX_UNICODE_CHARSET;
8915 while (*RExC_parse) {
8916 /* && strchr("iogcmsx", *RExC_parse) */
8917 /* (?g), (?gc) and (?o) are useless here
8918 and must be globally applied -- japhy */
8919 switch (*RExC_parse) {
8920 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8921 case LOCALE_PAT_MOD:
8922 if (has_charset_modifier) {
8923 goto excess_modifier;
8925 else if (flagsp == &negflags) {
8928 cs = REGEX_LOCALE_CHARSET;
8929 has_charset_modifier = LOCALE_PAT_MOD;
8930 RExC_contains_locale = 1;
8932 case UNICODE_PAT_MOD:
8933 if (has_charset_modifier) {
8934 goto excess_modifier;
8936 else if (flagsp == &negflags) {
8939 cs = REGEX_UNICODE_CHARSET;
8940 has_charset_modifier = UNICODE_PAT_MOD;
8942 case ASCII_RESTRICT_PAT_MOD:
8943 if (flagsp == &negflags) {
8946 if (has_charset_modifier) {
8947 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8948 goto excess_modifier;
8950 /* Doubled modifier implies more restricted */
8951 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8954 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8956 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8958 case DEPENDS_PAT_MOD:
8959 if (has_use_defaults) {
8960 goto fail_modifiers;
8962 else if (flagsp == &negflags) {
8965 else if (has_charset_modifier) {
8966 goto excess_modifier;
8969 /* The dual charset means unicode semantics if the
8970 * pattern (or target, not known until runtime) are
8971 * utf8, or something in the pattern indicates unicode
8973 cs = (RExC_utf8 || RExC_uni_semantics)
8974 ? REGEX_UNICODE_CHARSET
8975 : REGEX_DEPENDS_CHARSET;
8976 has_charset_modifier = DEPENDS_PAT_MOD;
8980 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8981 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8983 else if (has_charset_modifier == *(RExC_parse - 1)) {
8984 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8987 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8992 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8994 case ONCE_PAT_MOD: /* 'o' */
8995 case GLOBAL_PAT_MOD: /* 'g' */
8996 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8997 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8998 if (! (wastedflags & wflagbit) ) {
8999 wastedflags |= wflagbit;
9002 "Useless (%s%c) - %suse /%c modifier",
9003 flagsp == &negflags ? "?-" : "?",
9005 flagsp == &negflags ? "don't " : "",
9012 case CONTINUE_PAT_MOD: /* 'c' */
9013 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9014 if (! (wastedflags & WASTED_C) ) {
9015 wastedflags |= WASTED_GC;
9018 "Useless (%sc) - %suse /gc modifier",
9019 flagsp == &negflags ? "?-" : "?",
9020 flagsp == &negflags ? "don't " : ""
9025 case KEEPCOPY_PAT_MOD: /* 'p' */
9026 if (flagsp == &negflags) {
9028 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9030 *flagsp |= RXf_PMf_KEEPCOPY;
9034 /* A flag is a default iff it is following a minus, so
9035 * if there is a minus, it means will be trying to
9036 * re-specify a default which is an error */
9037 if (has_use_defaults || flagsp == &negflags) {
9040 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9044 wastedflags = 0; /* reset so (?g-c) warns twice */
9050 RExC_flags |= posflags;
9051 RExC_flags &= ~negflags;
9052 set_regex_charset(&RExC_flags, cs);
9054 oregflags |= posflags;
9055 oregflags &= ~negflags;
9056 set_regex_charset(&oregflags, cs);
9058 nextchar(pRExC_state);
9069 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9074 }} /* one for the default block, one for the switch */
9081 ret = reganode(pRExC_state, OPEN, parno);
9084 RExC_nestroot = parno;
9085 if (RExC_seen & REG_SEEN_RECURSE
9086 && !RExC_open_parens[parno-1])
9088 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9089 "Setting open paren #%"IVdf" to %d\n",
9090 (IV)parno, REG_NODE_NUM(ret)));
9091 RExC_open_parens[parno-1]= ret;
9094 Set_Node_Length(ret, 1); /* MJD */
9095 Set_Node_Offset(ret, RExC_parse); /* MJD */
9103 /* Pick up the branches, linking them together. */
9104 parse_start = RExC_parse; /* MJD */
9105 br = regbranch(pRExC_state, &flags, 1,depth+1);
9107 /* branch_len = (paren != 0); */
9111 if (*RExC_parse == '|') {
9112 if (!SIZE_ONLY && RExC_extralen) {
9113 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9116 reginsert(pRExC_state, BRANCH, br, depth+1);
9117 Set_Node_Length(br, paren != 0);
9118 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9122 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9124 else if (paren == ':') {
9125 *flagp |= flags&SIMPLE;
9127 if (is_open) { /* Starts with OPEN. */
9128 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9130 else if (paren != '?') /* Not Conditional */
9132 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9134 while (*RExC_parse == '|') {
9135 if (!SIZE_ONLY && RExC_extralen) {
9136 ender = reganode(pRExC_state, LONGJMP,0);
9137 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9140 RExC_extralen += 2; /* Account for LONGJMP. */
9141 nextchar(pRExC_state);
9143 if (RExC_npar > after_freeze)
9144 after_freeze = RExC_npar;
9145 RExC_npar = freeze_paren;
9147 br = regbranch(pRExC_state, &flags, 0, depth+1);
9151 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9153 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9156 if (have_branch || paren != ':') {
9157 /* Make a closing node, and hook it on the end. */
9160 ender = reg_node(pRExC_state, TAIL);
9163 ender = reganode(pRExC_state, CLOSE, parno);
9164 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9165 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9166 "Setting close paren #%"IVdf" to %d\n",
9167 (IV)parno, REG_NODE_NUM(ender)));
9168 RExC_close_parens[parno-1]= ender;
9169 if (RExC_nestroot == parno)
9172 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9173 Set_Node_Length(ender,1); /* MJD */
9179 *flagp &= ~HASWIDTH;
9182 ender = reg_node(pRExC_state, SUCCEED);
9185 ender = reg_node(pRExC_state, END);
9187 assert(!RExC_opend); /* there can only be one! */
9192 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9193 SV * const mysv_val1=sv_newmortal();
9194 SV * const mysv_val2=sv_newmortal();
9195 DEBUG_PARSE_MSG("lsbr");
9196 regprop(RExC_rx, mysv_val1, lastbr);
9197 regprop(RExC_rx, mysv_val2, ender);
9198 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9199 SvPV_nolen_const(mysv_val1),
9200 (IV)REG_NODE_NUM(lastbr),
9201 SvPV_nolen_const(mysv_val2),
9202 (IV)REG_NODE_NUM(ender),
9203 (IV)(ender - lastbr)
9206 REGTAIL(pRExC_state, lastbr, ender);
9208 if (have_branch && !SIZE_ONLY) {
9211 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9213 /* Hook the tails of the branches to the closing node. */
9214 for (br = ret; br; br = regnext(br)) {
9215 const U8 op = PL_regkind[OP(br)];
9217 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9218 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9221 else if (op == BRANCHJ) {
9222 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9223 /* for now we always disable this optimisation * /
9224 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9230 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9231 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9232 SV * const mysv_val1=sv_newmortal();
9233 SV * const mysv_val2=sv_newmortal();
9234 DEBUG_PARSE_MSG("NADA");
9235 regprop(RExC_rx, mysv_val1, ret);
9236 regprop(RExC_rx, mysv_val2, ender);
9237 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9238 SvPV_nolen_const(mysv_val1),
9239 (IV)REG_NODE_NUM(ret),
9240 SvPV_nolen_const(mysv_val2),
9241 (IV)REG_NODE_NUM(ender),
9246 if (OP(ender) == TAIL) {
9251 for ( opt= br + 1; opt < ender ; opt++ )
9253 NEXT_OFF(br)= ender - br;
9261 static const char parens[] = "=!<,>";
9263 if (paren && (p = strchr(parens, paren))) {
9264 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9265 int flag = (p - parens) > 1;
9268 node = SUSPEND, flag = 0;
9269 reginsert(pRExC_state, node,ret, depth+1);
9270 Set_Node_Cur_Length(ret);
9271 Set_Node_Offset(ret, parse_start + 1);
9273 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9277 /* Check for proper termination. */
9279 RExC_flags = oregflags;
9280 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9281 RExC_parse = oregcomp_parse;
9282 vFAIL("Unmatched (");
9285 else if (!paren && RExC_parse < RExC_end) {
9286 if (*RExC_parse == ')') {
9288 vFAIL("Unmatched )");
9291 FAIL("Junk on end of regexp"); /* "Can't happen". */
9292 assert(0); /* NOTREACHED */
9295 if (RExC_in_lookbehind) {
9296 RExC_in_lookbehind--;
9298 if (after_freeze > RExC_npar)
9299 RExC_npar = after_freeze;
9304 - regbranch - one alternative of an | operator
9306 * Implements the concatenation operator.
9309 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9313 regnode *chain = NULL;
9315 I32 flags = 0, c = 0;
9316 GET_RE_DEBUG_FLAGS_DECL;
9318 PERL_ARGS_ASSERT_REGBRANCH;
9320 DEBUG_PARSE("brnc");
9325 if (!SIZE_ONLY && RExC_extralen)
9326 ret = reganode(pRExC_state, BRANCHJ,0);
9328 ret = reg_node(pRExC_state, BRANCH);
9329 Set_Node_Length(ret, 1);
9333 if (!first && SIZE_ONLY)
9334 RExC_extralen += 1; /* BRANCHJ */
9336 *flagp = WORST; /* Tentatively. */
9339 nextchar(pRExC_state);
9340 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9342 latest = regpiece(pRExC_state, &flags,depth+1);
9343 if (latest == NULL) {
9344 if (flags & TRYAGAIN)
9348 else if (ret == NULL)
9350 *flagp |= flags&(HASWIDTH|POSTPONED);
9351 if (chain == NULL) /* First piece. */
9352 *flagp |= flags&SPSTART;
9355 REGTAIL(pRExC_state, chain, latest);
9360 if (chain == NULL) { /* Loop ran zero times. */
9361 chain = reg_node(pRExC_state, NOTHING);
9366 *flagp |= flags&SIMPLE;
9373 - regpiece - something followed by possible [*+?]
9375 * Note that the branching code sequences used for ? and the general cases
9376 * of * and + are somewhat optimized: they use the same NOTHING node as
9377 * both the endmarker for their branch list and the body of the last branch.
9378 * It might seem that this node could be dispensed with entirely, but the
9379 * endmarker role is not redundant.
9382 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9389 const char * const origparse = RExC_parse;
9391 I32 max = REG_INFTY;
9392 #ifdef RE_TRACK_PATTERN_OFFSETS
9395 const char *maxpos = NULL;
9396 GET_RE_DEBUG_FLAGS_DECL;
9398 PERL_ARGS_ASSERT_REGPIECE;
9400 DEBUG_PARSE("piec");
9402 ret = regatom(pRExC_state, &flags,depth+1);
9404 if (flags & TRYAGAIN)
9411 if (op == '{' && regcurly(RExC_parse)) {
9413 #ifdef RE_TRACK_PATTERN_OFFSETS
9414 parse_start = RExC_parse; /* MJD */
9416 next = RExC_parse + 1;
9417 while (isDIGIT(*next) || *next == ',') {
9426 if (*next == '}') { /* got one */
9430 min = atoi(RExC_parse);
9434 maxpos = RExC_parse;
9436 if (!max && *maxpos != '0')
9437 max = REG_INFTY; /* meaning "infinity" */
9438 else if (max >= REG_INFTY)
9439 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9441 nextchar(pRExC_state);
9444 if ((flags&SIMPLE)) {
9445 RExC_naughty += 2 + RExC_naughty / 2;
9446 reginsert(pRExC_state, CURLY, ret, depth+1);
9447 Set_Node_Offset(ret, parse_start+1); /* MJD */
9448 Set_Node_Cur_Length(ret);
9451 regnode * const w = reg_node(pRExC_state, WHILEM);
9454 REGTAIL(pRExC_state, ret, w);
9455 if (!SIZE_ONLY && RExC_extralen) {
9456 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9457 reginsert(pRExC_state, NOTHING,ret, depth+1);
9458 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9460 reginsert(pRExC_state, CURLYX,ret, depth+1);
9462 Set_Node_Offset(ret, parse_start+1);
9463 Set_Node_Length(ret,
9464 op == '{' ? (RExC_parse - parse_start) : 1);
9466 if (!SIZE_ONLY && RExC_extralen)
9467 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9468 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9470 RExC_whilem_seen++, RExC_extralen += 3;
9471 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9480 vFAIL("Can't do {n,m} with n > m");
9482 ARG1_SET(ret, (U16)min);
9483 ARG2_SET(ret, (U16)max);
9495 #if 0 /* Now runtime fix should be reliable. */
9497 /* if this is reinstated, don't forget to put this back into perldiag:
9499 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9501 (F) The part of the regexp subject to either the * or + quantifier
9502 could match an empty string. The {#} shows in the regular
9503 expression about where the problem was discovered.
9507 if (!(flags&HASWIDTH) && op != '?')
9508 vFAIL("Regexp *+ operand could be empty");
9511 #ifdef RE_TRACK_PATTERN_OFFSETS
9512 parse_start = RExC_parse;
9514 nextchar(pRExC_state);
9516 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9518 if (op == '*' && (flags&SIMPLE)) {
9519 reginsert(pRExC_state, STAR, ret, depth+1);
9523 else if (op == '*') {
9527 else if (op == '+' && (flags&SIMPLE)) {
9528 reginsert(pRExC_state, PLUS, ret, depth+1);
9532 else if (op == '+') {
9536 else if (op == '?') {
9541 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9542 ckWARN3reg(RExC_parse,
9543 "%.*s matches null string many times",
9544 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9548 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9549 nextchar(pRExC_state);
9550 reginsert(pRExC_state, MINMOD, ret, depth+1);
9551 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9553 #ifndef REG_ALLOW_MINMOD_SUSPEND
9556 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9558 nextchar(pRExC_state);
9559 ender = reg_node(pRExC_state, SUCCEED);
9560 REGTAIL(pRExC_state, ret, ender);
9561 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9563 ender = reg_node(pRExC_state, TAIL);
9564 REGTAIL(pRExC_state, ret, ender);
9568 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9570 vFAIL("Nested quantifiers");
9577 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9580 /* This is expected to be called by a parser routine that has recognized '\N'
9581 and needs to handle the rest. RExC_parse is expected to point at the first
9582 char following the N at the time of the call. On successful return,
9583 RExC_parse has been updated to point to just after the sequence identified
9584 by this routine, and <*flagp> has been updated.
9586 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9589 \N may begin either a named sequence, or if outside a character class, mean
9590 to match a non-newline. For non single-quoted regexes, the tokenizer has
9591 attempted to decide which, and in the case of a named sequence, converted it
9592 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9593 where c1... are the characters in the sequence. For single-quoted regexes,
9594 the tokenizer passes the \N sequence through unchanged; this code will not
9595 attempt to determine this nor expand those, instead raising a syntax error.
9596 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9597 or there is no '}', it signals that this \N occurrence means to match a
9600 Only the \N{U+...} form should occur in a character class, for the same
9601 reason that '.' inside a character class means to just match a period: it
9602 just doesn't make sense.
9604 The function raises an error (via vFAIL), and doesn't return for various
9605 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9606 success; it returns FALSE otherwise.
9608 If <valuep> is non-null, it means the caller can accept an input sequence
9609 consisting of a just a single code point; <*valuep> is set to that value
9610 if the input is such.
9612 If <node_p> is non-null it signifies that the caller can accept any other
9613 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9615 1) \N means not-a-NL: points to a newly created REG_ANY node;
9616 2) \N{}: points to a new NOTHING node;
9617 3) otherwise: points to a new EXACT node containing the resolved
9619 Note that FALSE is returned for single code point sequences if <valuep> is
9623 char * endbrace; /* '}' following the name */
9625 char *endchar; /* Points to '.' or '}' ending cur char in the input
9627 bool has_multiple_chars; /* true if the input stream contains a sequence of
9628 more than one character */
9630 GET_RE_DEBUG_FLAGS_DECL;
9632 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9636 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9638 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9639 * modifier. The other meaning does not */
9640 p = (RExC_flags & RXf_PMf_EXTENDED)
9641 ? regwhite( pRExC_state, RExC_parse )
9644 /* Disambiguate between \N meaning a named character versus \N meaning
9645 * [^\n]. The former is assumed when it can't be the latter. */
9646 if (*p != '{' || regcurly(p)) {
9649 /* no bare \N in a charclass */
9650 if (in_char_class) {
9651 vFAIL("\\N in a character class must be a named character: \\N{...}");
9655 nextchar(pRExC_state);
9656 *node_p = reg_node(pRExC_state, REG_ANY);
9657 *flagp |= HASWIDTH|SIMPLE;
9660 Set_Node_Length(*node_p, 1); /* MJD */
9664 /* Here, we have decided it should be a named character or sequence */
9666 /* The test above made sure that the next real character is a '{', but
9667 * under the /x modifier, it could be separated by space (or a comment and
9668 * \n) and this is not allowed (for consistency with \x{...} and the
9669 * tokenizer handling of \N{NAME}). */
9670 if (*RExC_parse != '{') {
9671 vFAIL("Missing braces on \\N{}");
9674 RExC_parse++; /* Skip past the '{' */
9676 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9677 || ! (endbrace == RExC_parse /* nothing between the {} */
9678 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9679 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9681 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9682 vFAIL("\\N{NAME} must be resolved by the lexer");
9685 if (endbrace == RExC_parse) { /* empty: \N{} */
9688 *node_p = reg_node(pRExC_state,NOTHING);
9690 else if (in_char_class) {
9691 if (SIZE_ONLY && in_char_class) {
9692 ckWARNreg(RExC_parse,
9693 "Ignoring zero length \\N{} in character class"
9701 nextchar(pRExC_state);
9705 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9706 RExC_parse += 2; /* Skip past the 'U+' */
9708 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9710 /* Code points are separated by dots. If none, there is only one code
9711 * point, and is terminated by the brace */
9712 has_multiple_chars = (endchar < endbrace);
9714 if (valuep && (! has_multiple_chars || in_char_class)) {
9715 /* We only pay attention to the first char of
9716 multichar strings being returned in char classes. I kinda wonder
9717 if this makes sense as it does change the behaviour
9718 from earlier versions, OTOH that behaviour was broken
9719 as well. XXX Solution is to recharacterize as
9720 [rest-of-class]|multi1|multi2... */
9722 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9723 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9724 | PERL_SCAN_DISALLOW_PREFIX
9725 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9727 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9729 /* The tokenizer should have guaranteed validity, but it's possible to
9730 * bypass it by using single quoting, so check */
9731 if (length_of_hex == 0
9732 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9734 RExC_parse += length_of_hex; /* Includes all the valid */
9735 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9736 ? UTF8SKIP(RExC_parse)
9738 /* Guard against malformed utf8 */
9739 if (RExC_parse >= endchar) {
9740 RExC_parse = endchar;
9742 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9745 if (in_char_class && has_multiple_chars) {
9746 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9748 RExC_parse = endbrace + 1;
9750 else if (! node_p || ! has_multiple_chars) {
9752 /* Here, the input is legal, but not according to the caller's
9753 * options. We fail without advancing the parse, so that the
9754 * caller can try again */
9760 /* What is done here is to convert this to a sub-pattern of the form
9761 * (?:\x{char1}\x{char2}...)
9762 * and then call reg recursively. That way, it retains its atomicness,
9763 * while not having to worry about special handling that some code
9764 * points may have. toke.c has converted the original Unicode values
9765 * to native, so that we can just pass on the hex values unchanged. We
9766 * do have to set a flag to keep recoding from happening in the
9769 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9771 char *orig_end = RExC_end;
9774 while (RExC_parse < endbrace) {
9776 /* Convert to notation the rest of the code understands */
9777 sv_catpv(substitute_parse, "\\x{");
9778 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9779 sv_catpv(substitute_parse, "}");
9781 /* Point to the beginning of the next character in the sequence. */
9782 RExC_parse = endchar + 1;
9783 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9785 sv_catpv(substitute_parse, ")");
9787 RExC_parse = SvPV(substitute_parse, len);
9789 /* Don't allow empty number */
9791 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9793 RExC_end = RExC_parse + len;
9795 /* The values are Unicode, and therefore not subject to recoding */
9796 RExC_override_recoding = 1;
9798 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9799 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9801 RExC_parse = endbrace;
9802 RExC_end = orig_end;
9803 RExC_override_recoding = 0;
9805 nextchar(pRExC_state);
9815 * It returns the code point in utf8 for the value in *encp.
9816 * value: a code value in the source encoding
9817 * encp: a pointer to an Encode object
9819 * If the result from Encode is not a single character,
9820 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9823 S_reg_recode(pTHX_ const char value, SV **encp)
9826 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9827 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9828 const STRLEN newlen = SvCUR(sv);
9829 UV uv = UNICODE_REPLACEMENT;
9831 PERL_ARGS_ASSERT_REG_RECODE;
9835 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9838 if (!newlen || numlen != newlen) {
9839 uv = UNICODE_REPLACEMENT;
9845 PERL_STATIC_INLINE U8
9846 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9850 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9856 op = get_regex_charset(RExC_flags);
9857 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9858 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9859 been, so there is no hole */
9865 PERL_STATIC_INLINE void
9866 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9868 /* This knows the details about sizing an EXACTish node, setting flags for
9869 * it (by setting <*flagp>, and potentially populating it with a single
9872 * If <len> is non-zero, this function assumes that the node has already
9873 * been populated, and just does the sizing. In this case <code_point>
9874 * should be the final code point that has already been placed into the
9875 * node. This value will be ignored except that under some circumstances
9876 * <*flagp> is set based on it.
9878 * If <len is zero, the function assumes that the node is to contain only
9879 * the single character given by <code_point> and calculates what <len>
9880 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9881 * additionally will populate the node's STRING with <code_point>, if <len>
9882 * is 0. In both cases <*flagp> is appropriately set
9884 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9885 * folded (the latter only when the rules indicate it can match 'ss') */
9887 bool len_passed_in = cBOOL(len != 0);
9888 U8 character[UTF8_MAXBYTES_CASE+1];
9890 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9892 if (! len_passed_in) {
9895 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9898 uvchr_to_utf8( character, code_point);
9899 len = UTF8SKIP(character);
9903 || code_point != LATIN_SMALL_LETTER_SHARP_S
9904 || ASCII_FOLD_RESTRICTED
9905 || ! AT_LEAST_UNI_SEMANTICS)
9907 *character = (U8) code_point;
9912 *(character + 1) = 's';
9918 RExC_size += STR_SZ(len);
9921 RExC_emit += STR_SZ(len);
9922 STR_LEN(node) = len;
9923 if (! len_passed_in) {
9924 Copy((char *) character, STRING(node), len, char);
9929 if (len == 1 && UNI_IS_INVARIANT(code_point))
9934 - regatom - the lowest level
9936 Try to identify anything special at the start of the pattern. If there
9937 is, then handle it as required. This may involve generating a single regop,
9938 such as for an assertion; or it may involve recursing, such as to
9939 handle a () structure.
9941 If the string doesn't start with something special then we gobble up
9942 as much literal text as we can.
9944 Once we have been able to handle whatever type of thing started the
9945 sequence, we return.
9947 Note: we have to be careful with escapes, as they can be both literal
9948 and special, and in the case of \10 and friends, context determines which.
9950 A summary of the code structure is:
9952 switch (first_byte) {
9953 cases for each special:
9954 handle this special;
9958 cases for each unambiguous special:
9959 handle this special;
9961 cases for each ambigous special/literal:
9963 if (special) handle here
9965 default: // unambiguously literal:
9968 default: // is a literal char
9971 create EXACTish node for literal;
9972 while (more input and node isn't full) {
9973 switch (input_byte) {
9974 cases for each special;
9975 make sure parse pointer is set so that the next call to
9976 regatom will see this special first
9977 goto loopdone; // EXACTish node terminated by prev. char
9979 append char to EXACTISH node;
9981 get next input byte;
9985 return the generated node;
9987 Specifically there are two separate switches for handling
9988 escape sequences, with the one for handling literal escapes requiring
9989 a dummy entry for all of the special escapes that are actually handled
9994 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9997 regnode *ret = NULL;
9999 char *parse_start = RExC_parse;
10001 GET_RE_DEBUG_FLAGS_DECL;
10002 DEBUG_PARSE("atom");
10003 *flagp = WORST; /* Tentatively. */
10005 PERL_ARGS_ASSERT_REGATOM;
10008 switch ((U8)*RExC_parse) {
10010 RExC_seen_zerolen++;
10011 nextchar(pRExC_state);
10012 if (RExC_flags & RXf_PMf_MULTILINE)
10013 ret = reg_node(pRExC_state, MBOL);
10014 else if (RExC_flags & RXf_PMf_SINGLELINE)
10015 ret = reg_node(pRExC_state, SBOL);
10017 ret = reg_node(pRExC_state, BOL);
10018 Set_Node_Length(ret, 1); /* MJD */
10021 nextchar(pRExC_state);
10023 RExC_seen_zerolen++;
10024 if (RExC_flags & RXf_PMf_MULTILINE)
10025 ret = reg_node(pRExC_state, MEOL);
10026 else if (RExC_flags & RXf_PMf_SINGLELINE)
10027 ret = reg_node(pRExC_state, SEOL);
10029 ret = reg_node(pRExC_state, EOL);
10030 Set_Node_Length(ret, 1); /* MJD */
10033 nextchar(pRExC_state);
10034 if (RExC_flags & RXf_PMf_SINGLELINE)
10035 ret = reg_node(pRExC_state, SANY);
10037 ret = reg_node(pRExC_state, REG_ANY);
10038 *flagp |= HASWIDTH|SIMPLE;
10040 Set_Node_Length(ret, 1); /* MJD */
10044 char * const oregcomp_parse = ++RExC_parse;
10045 ret = regclass(pRExC_state, flagp,depth+1);
10046 if (*RExC_parse != ']') {
10047 RExC_parse = oregcomp_parse;
10048 vFAIL("Unmatched [");
10050 nextchar(pRExC_state);
10051 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10055 nextchar(pRExC_state);
10056 ret = reg(pRExC_state, 1, &flags,depth+1);
10058 if (flags & TRYAGAIN) {
10059 if (RExC_parse == RExC_end) {
10060 /* Make parent create an empty node if needed. */
10061 *flagp |= TRYAGAIN;
10068 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10072 if (flags & TRYAGAIN) {
10073 *flagp |= TRYAGAIN;
10076 vFAIL("Internal urp");
10077 /* Supposed to be caught earlier. */
10083 vFAIL("Quantifier follows nothing");
10088 This switch handles escape sequences that resolve to some kind
10089 of special regop and not to literal text. Escape sequnces that
10090 resolve to literal text are handled below in the switch marked
10093 Every entry in this switch *must* have a corresponding entry
10094 in the literal escape switch. However, the opposite is not
10095 required, as the default for this switch is to jump to the
10096 literal text handling code.
10098 switch ((U8)*++RExC_parse) {
10099 /* Special Escapes */
10101 RExC_seen_zerolen++;
10102 ret = reg_node(pRExC_state, SBOL);
10104 goto finish_meta_pat;
10106 ret = reg_node(pRExC_state, GPOS);
10107 RExC_seen |= REG_SEEN_GPOS;
10109 goto finish_meta_pat;
10111 RExC_seen_zerolen++;
10112 ret = reg_node(pRExC_state, KEEPS);
10114 /* XXX:dmq : disabling in-place substitution seems to
10115 * be necessary here to avoid cases of memory corruption, as
10116 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10118 RExC_seen |= REG_SEEN_LOOKBEHIND;
10119 goto finish_meta_pat;
10121 ret = reg_node(pRExC_state, SEOL);
10123 RExC_seen_zerolen++; /* Do not optimize RE away */
10124 goto finish_meta_pat;
10126 ret = reg_node(pRExC_state, EOS);
10128 RExC_seen_zerolen++; /* Do not optimize RE away */
10129 goto finish_meta_pat;
10131 ret = reg_node(pRExC_state, CANY);
10132 RExC_seen |= REG_SEEN_CANY;
10133 *flagp |= HASWIDTH|SIMPLE;
10134 goto finish_meta_pat;
10136 ret = reg_node(pRExC_state, CLUMP);
10137 *flagp |= HASWIDTH;
10138 goto finish_meta_pat;
10140 op = ALNUM + get_regex_charset(RExC_flags);
10141 if (op > ALNUMA) { /* /aa is same as /a */
10144 ret = reg_node(pRExC_state, op);
10145 *flagp |= HASWIDTH|SIMPLE;
10146 goto finish_meta_pat;
10148 op = NALNUM + get_regex_charset(RExC_flags);
10149 if (op > NALNUMA) { /* /aa is same as /a */
10152 ret = reg_node(pRExC_state, op);
10153 *flagp |= HASWIDTH|SIMPLE;
10154 goto finish_meta_pat;
10156 RExC_seen_zerolen++;
10157 RExC_seen |= REG_SEEN_LOOKBEHIND;
10158 op = BOUND + get_regex_charset(RExC_flags);
10159 if (op > BOUNDA) { /* /aa is same as /a */
10162 ret = reg_node(pRExC_state, op);
10163 FLAGS(ret) = get_regex_charset(RExC_flags);
10165 goto finish_meta_pat;
10167 RExC_seen_zerolen++;
10168 RExC_seen |= REG_SEEN_LOOKBEHIND;
10169 op = NBOUND + get_regex_charset(RExC_flags);
10170 if (op > NBOUNDA) { /* /aa is same as /a */
10173 ret = reg_node(pRExC_state, op);
10174 FLAGS(ret) = get_regex_charset(RExC_flags);
10176 goto finish_meta_pat;
10178 op = SPACE + get_regex_charset(RExC_flags);
10179 if (op > SPACEA) { /* /aa is same as /a */
10182 ret = reg_node(pRExC_state, op);
10183 *flagp |= HASWIDTH|SIMPLE;
10184 goto finish_meta_pat;
10186 op = NSPACE + get_regex_charset(RExC_flags);
10187 if (op > NSPACEA) { /* /aa is same as /a */
10190 ret = reg_node(pRExC_state, op);
10191 *flagp |= HASWIDTH|SIMPLE;
10192 goto finish_meta_pat;
10200 U8 offset = get_regex_charset(RExC_flags);
10201 if (offset == REGEX_UNICODE_CHARSET) {
10202 offset = REGEX_DEPENDS_CHARSET;
10204 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10205 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10209 ret = reg_node(pRExC_state, op);
10210 *flagp |= HASWIDTH|SIMPLE;
10211 goto finish_meta_pat;
10213 ret = reg_node(pRExC_state, LNBREAK);
10214 *flagp |= HASWIDTH|SIMPLE;
10215 goto finish_meta_pat;
10217 ret = reg_node(pRExC_state, HORIZWS);
10218 *flagp |= HASWIDTH|SIMPLE;
10219 goto finish_meta_pat;
10221 ret = reg_node(pRExC_state, NHORIZWS);
10222 *flagp |= HASWIDTH|SIMPLE;
10223 goto finish_meta_pat;
10225 ret = reg_node(pRExC_state, VERTWS);
10226 *flagp |= HASWIDTH|SIMPLE;
10227 goto finish_meta_pat;
10229 ret = reg_node(pRExC_state, NVERTWS);
10230 *flagp |= HASWIDTH|SIMPLE;
10232 nextchar(pRExC_state);
10233 Set_Node_Length(ret, 2); /* MJD */
10238 char* const oldregxend = RExC_end;
10240 char* parse_start = RExC_parse - 2;
10243 if (RExC_parse[1] == '{') {
10244 /* a lovely hack--pretend we saw [\pX] instead */
10245 RExC_end = strchr(RExC_parse, '}');
10247 const U8 c = (U8)*RExC_parse;
10249 RExC_end = oldregxend;
10250 vFAIL2("Missing right brace on \\%c{}", c);
10255 RExC_end = RExC_parse + 2;
10256 if (RExC_end > oldregxend)
10257 RExC_end = oldregxend;
10261 ret = regclass(pRExC_state, flagp,depth+1);
10263 RExC_end = oldregxend;
10266 Set_Node_Offset(ret, parse_start + 2);
10267 Set_Node_Cur_Length(ret);
10268 nextchar(pRExC_state);
10272 /* Handle \N and \N{NAME} with multiple code points here and not
10273 * below because it can be multicharacter. join_exact() will join
10274 * them up later on. Also this makes sure that things like
10275 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10276 * The options to the grok function call causes it to fail if the
10277 * sequence is just a single code point. We then go treat it as
10278 * just another character in the current EXACT node, and hence it
10279 * gets uniform treatment with all the other characters. The
10280 * special treatment for quantifiers is not needed for such single
10281 * character sequences */
10283 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10288 case 'k': /* Handle \k<NAME> and \k'NAME' */
10291 char ch= RExC_parse[1];
10292 if (ch != '<' && ch != '\'' && ch != '{') {
10294 vFAIL2("Sequence %.2s... not terminated",parse_start);
10296 /* this pretty much dupes the code for (?P=...) in reg(), if
10297 you change this make sure you change that */
10298 char* name_start = (RExC_parse += 2);
10300 SV *sv_dat = reg_scan_name(pRExC_state,
10301 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10302 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10303 if (RExC_parse == name_start || *RExC_parse != ch)
10304 vFAIL2("Sequence %.3s... not terminated",parse_start);
10307 num = add_data( pRExC_state, 1, "S" );
10308 RExC_rxi->data->data[num]=(void*)sv_dat;
10309 SvREFCNT_inc_simple_void(sv_dat);
10313 ret = reganode(pRExC_state,
10316 : (ASCII_FOLD_RESTRICTED)
10318 : (AT_LEAST_UNI_SEMANTICS)
10324 *flagp |= HASWIDTH;
10326 /* override incorrect value set in reganode MJD */
10327 Set_Node_Offset(ret, parse_start+1);
10328 Set_Node_Cur_Length(ret); /* MJD */
10329 nextchar(pRExC_state);
10335 case '1': case '2': case '3': case '4':
10336 case '5': case '6': case '7': case '8': case '9':
10339 bool isg = *RExC_parse == 'g';
10344 if (*RExC_parse == '{') {
10348 if (*RExC_parse == '-') {
10352 if (hasbrace && !isDIGIT(*RExC_parse)) {
10353 if (isrel) RExC_parse--;
10355 goto parse_named_seq;
10357 num = atoi(RExC_parse);
10358 if (isg && num == 0)
10359 vFAIL("Reference to invalid group 0");
10361 num = RExC_npar - num;
10363 vFAIL("Reference to nonexistent or unclosed group");
10365 if (!isg && num > 9 && num >= RExC_npar)
10366 /* Probably a character specified in octal, e.g. \35 */
10369 char * const parse_start = RExC_parse - 1; /* MJD */
10370 while (isDIGIT(*RExC_parse))
10372 if (parse_start == RExC_parse - 1)
10373 vFAIL("Unterminated \\g... pattern");
10375 if (*RExC_parse != '}')
10376 vFAIL("Unterminated \\g{...} pattern");
10380 if (num > (I32)RExC_rx->nparens)
10381 vFAIL("Reference to nonexistent group");
10384 ret = reganode(pRExC_state,
10387 : (ASCII_FOLD_RESTRICTED)
10389 : (AT_LEAST_UNI_SEMANTICS)
10395 *flagp |= HASWIDTH;
10397 /* override incorrect value set in reganode MJD */
10398 Set_Node_Offset(ret, parse_start+1);
10399 Set_Node_Cur_Length(ret); /* MJD */
10401 nextchar(pRExC_state);
10406 if (RExC_parse >= RExC_end)
10407 FAIL("Trailing \\");
10410 /* Do not generate "unrecognized" warnings here, we fall
10411 back into the quick-grab loop below */
10418 if (RExC_flags & RXf_PMf_EXTENDED) {
10419 if ( reg_skipcomment( pRExC_state ) )
10426 parse_start = RExC_parse - 1;
10435 #define MAX_NODE_STRING_SIZE 127
10436 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10438 U8 upper_parse = MAX_NODE_STRING_SIZE;
10441 bool next_is_quantifier;
10445 node_type = compute_EXACTish(pRExC_state);
10446 ret = reg_node(pRExC_state, node_type);
10448 /* In pass1, folded, we use a temporary buffer instead of the
10449 * actual node, as the node doesn't exist yet */
10450 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10456 /* XXX The node can hold up to 255 bytes, yet this only goes to
10457 * 127. I (khw) do not know why. Keeping it somewhat less than
10458 * 255 allows us to not have to worry about overflow due to
10459 * converting to utf8 and fold expansion, but that value is
10460 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10461 * split up by this limit into a single one using the real max of
10462 * 255. Even at 127, this breaks under rare circumstances. If
10463 * folding, we do not want to split a node at a character that is a
10464 * non-final in a multi-char fold, as an input string could just
10465 * happen to want to match across the node boundary. The join
10466 * would solve that problem if the join actually happens. But a
10467 * series of more than two nodes in a row each of 127 would cause
10468 * the first join to succeed to get to 254, but then there wouldn't
10469 * be room for the next one, which could at be one of those split
10470 * multi-char folds. I don't know of any fool-proof solution. One
10471 * could back off to end with only a code point that isn't such a
10472 * non-final, but it is possible for there not to be any in the
10474 for (p = RExC_parse - 1;
10475 len < upper_parse && p < RExC_end;
10480 if (RExC_flags & RXf_PMf_EXTENDED)
10481 p = regwhite( pRExC_state, p );
10492 /* Literal Escapes Switch
10494 This switch is meant to handle escape sequences that
10495 resolve to a literal character.
10497 Every escape sequence that represents something
10498 else, like an assertion or a char class, is handled
10499 in the switch marked 'Special Escapes' above in this
10500 routine, but also has an entry here as anything that
10501 isn't explicitly mentioned here will be treated as
10502 an unescaped equivalent literal.
10505 switch ((U8)*++p) {
10506 /* These are all the special escapes. */
10507 case 'A': /* Start assertion */
10508 case 'b': case 'B': /* Word-boundary assertion*/
10509 case 'C': /* Single char !DANGEROUS! */
10510 case 'd': case 'D': /* digit class */
10511 case 'g': case 'G': /* generic-backref, pos assertion */
10512 case 'h': case 'H': /* HORIZWS */
10513 case 'k': case 'K': /* named backref, keep marker */
10514 case 'p': case 'P': /* Unicode property */
10515 case 'R': /* LNBREAK */
10516 case 's': case 'S': /* space class */
10517 case 'v': case 'V': /* VERTWS */
10518 case 'w': case 'W': /* word class */
10519 case 'X': /* eXtended Unicode "combining character sequence" */
10520 case 'z': case 'Z': /* End of line/string assertion */
10524 /* Anything after here is an escape that resolves to a
10525 literal. (Except digits, which may or may not)
10531 case 'N': /* Handle a single-code point named character. */
10532 /* The options cause it to fail if a multiple code
10533 * point sequence. Handle those in the switch() above
10535 RExC_parse = p + 1;
10536 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10537 flagp, depth, FALSE))
10539 RExC_parse = p = oldp;
10543 if (ender > 0xff) {
10560 ender = ASCII_TO_NATIVE('\033');
10564 ender = ASCII_TO_NATIVE('\007');
10569 STRLEN brace_len = len;
10571 const char* error_msg;
10573 bool valid = grok_bslash_o(p,
10580 RExC_parse = p; /* going to die anyway; point
10581 to exact spot of failure */
10588 if (PL_encoding && ender < 0x100) {
10589 goto recode_encoding;
10591 if (ender > 0xff) {
10598 STRLEN brace_len = len;
10600 const char* error_msg;
10602 bool valid = grok_bslash_x(p,
10609 RExC_parse = p; /* going to die anyway; point
10610 to exact spot of failure */
10616 if (PL_encoding && ender < 0x100) {
10617 goto recode_encoding;
10619 if (ender > 0xff) {
10626 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10628 case '0': case '1': case '2': case '3':case '4':
10629 case '5': case '6': case '7':
10631 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10633 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10635 ender = grok_oct(p, &numlen, &flags, NULL);
10636 if (ender > 0xff) {
10645 if (PL_encoding && ender < 0x100)
10646 goto recode_encoding;
10649 if (! RExC_override_recoding) {
10650 SV* enc = PL_encoding;
10651 ender = reg_recode((const char)(U8)ender, &enc);
10652 if (!enc && SIZE_ONLY)
10653 ckWARNreg(p, "Invalid escape in the specified encoding");
10659 FAIL("Trailing \\");
10662 if (!SIZE_ONLY&& isALNUMC(*p)) {
10663 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10665 goto normal_default;
10669 /* Currently we don't warn when the lbrace is at the start
10670 * of a construct. This catches it in the middle of a
10671 * literal string, or when its the first thing after
10672 * something like "\b" */
10674 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10676 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10681 if (UTF8_IS_START(*p) && UTF) {
10683 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10684 &numlen, UTF8_ALLOW_DEFAULT);
10690 } /* End of switch on the literal */
10692 /* Here, have looked at the literal character and <ender>
10693 * contains its ordinal, <p> points to the character after it
10696 if ( RExC_flags & RXf_PMf_EXTENDED)
10697 p = regwhite( pRExC_state, p );
10699 /* If the next thing is a quantifier, it applies to this
10700 * character only, which means that this character has to be in
10701 * its own node and can't just be appended to the string in an
10702 * existing node, so if there are already other characters in
10703 * the node, close the node with just them, and set up to do
10704 * this character again next time through, when it will be the
10705 * only thing in its new node */
10706 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10714 /* See comments for join_exact() as to why we fold
10715 * this non-UTF at compile time */
10716 || (node_type == EXACTFU
10717 && ender == LATIN_SMALL_LETTER_SHARP_S))
10721 /* Prime the casefolded buffer. Locale rules, which
10722 * apply only to code points < 256, aren't known until
10723 * execution, so for them, just output the original
10724 * character using utf8. If we start to fold non-UTF
10725 * patterns, be sure to update join_exact() */
10726 if (LOC && ender < 256) {
10727 if (UNI_IS_INVARIANT(ender)) {
10731 *s = UTF8_TWO_BYTE_HI(ender);
10732 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10737 ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
10739 | ((LOC) ? FOLD_FLAGS_LOCALE
10740 : (ASCII_FOLD_RESTRICTED)
10741 ? FOLD_FLAGS_NOMIX_ASCII
10747 /* The loop increments <len> each time, as all but this
10748 * path (and the one just below for UTF) through it add
10749 * a single byte to the EXACTish node. But this one
10750 * has changed len to be the correct final value, so
10751 * subtract one to cancel out the increment that
10753 len += foldlen - 1;
10760 const STRLEN unilen = reguni(pRExC_state, ender, s);
10766 /* See comment just above for - 1 */
10770 REGC((char)ender, s++);
10773 if (next_is_quantifier) {
10775 /* Here, the next input is a quantifier, and to get here,
10776 * the current character is the only one in the node.
10777 * Also, here <len> doesn't include the final byte for this
10783 } /* End of loop through literal characters */
10785 /* Here we have either exhausted the input or ran out of room in
10786 * the node. (If we encountered a character that can't be in the
10787 * node, transfer is made directly to <loopdone>, and so we
10788 * wouldn't have fallen off the end of the loop.) In the latter
10789 * case, we artificially have to split the node into two, because
10790 * we just don't have enough space to hold everything. This
10791 * creates a problem if the final character participates in a
10792 * multi-character fold in the non-final position, as a match that
10793 * should have occurred won't, due to the way nodes are matched,
10794 * and our artificial boundary. So back off until we find a non-
10795 * problematic character -- one that isn't at the beginning or
10796 * middle of such a fold. (Either it doesn't participate in any
10797 * folds, or appears only in the final position of all the folds it
10798 * does participate in.) A better solution with far fewer false
10799 * positives, and that would fill the nodes more completely, would
10800 * be to actually have available all the multi-character folds to
10801 * test against, and to back-off only far enough to be sure that
10802 * this node isn't ending with a partial one. <upper_parse> is set
10803 * further below (if we need to reparse the node) to include just
10804 * up through that final non-problematic character that this code
10805 * identifies, so when it is set to less than the full node, we can
10806 * skip the rest of this */
10807 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10809 const STRLEN full_len = len;
10811 assert(len >= MAX_NODE_STRING_SIZE);
10813 /* Here, <s> points to the final byte of the final character.
10814 * Look backwards through the string until find a non-
10815 * problematic character */
10819 /* These two have no multi-char folds to non-UTF characters
10821 if (ASCII_FOLD_RESTRICTED || LOC) {
10825 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10829 if (! PL_NonL1NonFinalFold) {
10830 PL_NonL1NonFinalFold = _new_invlist_C_array(
10831 NonL1_Perl_Non_Final_Folds_invlist);
10834 /* Point to the first byte of the final character */
10835 s = (char *) utf8_hop((U8 *) s, -1);
10837 while (s >= s0) { /* Search backwards until find
10838 non-problematic char */
10839 if (UTF8_IS_INVARIANT(*s)) {
10841 /* There are no ascii characters that participate
10842 * in multi-char folds under /aa. In EBCDIC, the
10843 * non-ascii invariants are all control characters,
10844 * so don't ever participate in any folds. */
10845 if (ASCII_FOLD_RESTRICTED
10846 || ! IS_NON_FINAL_FOLD(*s))
10851 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10853 /* No Latin1 characters participate in multi-char
10854 * folds under /l */
10856 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10862 else if (! _invlist_contains_cp(
10863 PL_NonL1NonFinalFold,
10864 valid_utf8_to_uvchr((U8 *) s, NULL)))
10869 /* Here, the current character is problematic in that
10870 * it does occur in the non-final position of some
10871 * fold, so try the character before it, but have to
10872 * special case the very first byte in the string, so
10873 * we don't read outside the string */
10874 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10875 } /* End of loop backwards through the string */
10877 /* If there were only problematic characters in the string,
10878 * <s> will point to before s0, in which case the length
10879 * should be 0, otherwise include the length of the
10880 * non-problematic character just found */
10881 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10884 /* Here, have found the final character, if any, that is
10885 * non-problematic as far as ending the node without splitting
10886 * it across a potential multi-char fold. <len> contains the
10887 * number of bytes in the node up-to and including that
10888 * character, or is 0 if there is no such character, meaning
10889 * the whole node contains only problematic characters. In
10890 * this case, give up and just take the node as-is. We can't
10896 /* Here, the node does contain some characters that aren't
10897 * problematic. If one such is the final character in the
10898 * node, we are done */
10899 if (len == full_len) {
10902 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
10904 /* If the final character is problematic, but the
10905 * penultimate is not, back-off that last character to
10906 * later start a new node with it */
10911 /* Here, the final non-problematic character is earlier
10912 * in the input than the penultimate character. What we do
10913 * is reparse from the beginning, going up only as far as
10914 * this final ok one, thus guaranteeing that the node ends
10915 * in an acceptable character. The reason we reparse is
10916 * that we know how far in the character is, but we don't
10917 * know how to correlate its position with the input parse.
10918 * An alternate implementation would be to build that
10919 * correlation as we go along during the original parse,
10920 * but that would entail extra work for every node, whereas
10921 * this code gets executed only when the string is too
10922 * large for the node, and the final two characters are
10923 * problematic, an infrequent occurrence. Yet another
10924 * possible strategy would be to save the tail of the
10925 * string, and the next time regatom is called, initialize
10926 * with that. The problem with this is that unless you
10927 * back off one more character, you won't be guaranteed
10928 * regatom will get called again, unless regbranch,
10929 * regpiece ... are also changed. If you do back off that
10930 * extra character, so that there is input guaranteed to
10931 * force calling regatom, you can't handle the case where
10932 * just the first character in the node is acceptable. I
10933 * (khw) decided to try this method which doesn't have that
10934 * pitfall; if performance issues are found, we can do a
10935 * combination of the current approach plus that one */
10941 } /* End of verifying node ends with an appropriate char */
10943 loopdone: /* Jumped to when encounters something that shouldn't be in
10946 /* I (khw) don't know if you can get here with zero length, but the
10947 * old code handled this situation by creating a zero-length EXACT
10948 * node. Might as well be NOTHING instead */
10953 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
10956 RExC_parse = p - 1;
10957 Set_Node_Cur_Length(ret); /* MJD */
10958 nextchar(pRExC_state);
10960 /* len is STRLEN which is unsigned, need to copy to signed */
10963 vFAIL("Internal disaster");
10966 } /* End of label 'defchar:' */
10968 } /* End of giant switch on input character */
10974 S_regwhite( RExC_state_t *pRExC_state, char *p )
10976 const char *e = RExC_end;
10978 PERL_ARGS_ASSERT_REGWHITE;
10983 else if (*p == '#') {
10986 if (*p++ == '\n') {
10992 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11000 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11001 Character classes ([:foo:]) can also be negated ([:^foo:]).
11002 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11003 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11004 but trigger failures because they are currently unimplemented. */
11006 #define POSIXCC_DONE(c) ((c) == ':')
11007 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11008 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11011 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11014 I32 namedclass = OOB_NAMEDCLASS;
11016 PERL_ARGS_ASSERT_REGPPOSIXCC;
11018 if (value == '[' && RExC_parse + 1 < RExC_end &&
11019 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11020 POSIXCC(UCHARAT(RExC_parse))) {
11021 const char c = UCHARAT(RExC_parse);
11022 char* const s = RExC_parse++;
11024 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11026 if (RExC_parse == RExC_end)
11027 /* Grandfather lone [:, [=, [. */
11030 const char* const t = RExC_parse++; /* skip over the c */
11033 if (UCHARAT(RExC_parse) == ']') {
11034 const char *posixcc = s + 1;
11035 RExC_parse++; /* skip over the ending ] */
11038 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11039 const I32 skip = t - posixcc;
11041 /* Initially switch on the length of the name. */
11044 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11045 namedclass = ANYOF_ALNUM;
11048 /* Names all of length 5. */
11049 /* alnum alpha ascii blank cntrl digit graph lower
11050 print punct space upper */
11051 /* Offset 4 gives the best switch position. */
11052 switch (posixcc[4]) {
11054 if (memEQ(posixcc, "alph", 4)) /* alpha */
11055 namedclass = ANYOF_ALPHA;
11058 if (memEQ(posixcc, "spac", 4)) /* space */
11059 namedclass = ANYOF_PSXSPC;
11062 if (memEQ(posixcc, "grap", 4)) /* graph */
11063 namedclass = ANYOF_GRAPH;
11066 if (memEQ(posixcc, "asci", 4)) /* ascii */
11067 namedclass = ANYOF_ASCII;
11070 if (memEQ(posixcc, "blan", 4)) /* blank */
11071 namedclass = ANYOF_BLANK;
11074 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11075 namedclass = ANYOF_CNTRL;
11078 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11079 namedclass = ANYOF_ALNUMC;
11082 if (memEQ(posixcc, "lowe", 4)) /* lower */
11083 namedclass = ANYOF_LOWER;
11084 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11085 namedclass = ANYOF_UPPER;
11088 if (memEQ(posixcc, "digi", 4)) /* digit */
11089 namedclass = ANYOF_DIGIT;
11090 else if (memEQ(posixcc, "prin", 4)) /* print */
11091 namedclass = ANYOF_PRINT;
11092 else if (memEQ(posixcc, "punc", 4)) /* punct */
11093 namedclass = ANYOF_PUNCT;
11098 if (memEQ(posixcc, "xdigit", 6))
11099 namedclass = ANYOF_XDIGIT;
11103 if (namedclass == OOB_NAMEDCLASS)
11104 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11107 /* The #defines are structured so each complement is +1 to
11108 * the normal one */
11112 assert (posixcc[skip] == ':');
11113 assert (posixcc[skip+1] == ']');
11114 } else if (!SIZE_ONLY) {
11115 /* [[=foo=]] and [[.foo.]] are still future. */
11117 /* adjust RExC_parse so the warning shows after
11118 the class closes */
11119 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11121 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11124 /* Maternal grandfather:
11125 * "[:" ending in ":" but not in ":]" */
11135 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11139 PERL_ARGS_ASSERT_CHECKPOSIXCC;
11141 if (POSIXCC(UCHARAT(RExC_parse))) {
11142 const char *s = RExC_parse;
11143 const char c = *s++;
11145 while (isALNUM(*s))
11147 if (*s && c == *s && s[1] == ']') {
11149 "POSIX syntax [%c %c] belongs inside character classes",
11152 /* [[=foo=]] and [[.foo.]] are still future. */
11153 if (POSIXCC_NOTYET(c)) {
11154 /* adjust RExC_parse so the error shows after
11155 the class closes */
11156 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11158 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11164 /* Generate the code to add a full posix character <class> to the bracketed
11165 * character class given by <node>. (<node> is needed only under locale rules)
11166 * destlist is the inversion list for non-locale rules that this class is
11168 * sourcelist is the ASCII-range inversion list to add under /a rules
11169 * Xsourcelist is the full Unicode range list to use otherwise. */
11170 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11172 SV* scratch_list = NULL; \
11174 /* Set this class in the node for runtime matching */ \
11175 ANYOF_CLASS_SET(node, class); \
11177 /* For above Latin1 code points, we use the full Unicode range */ \
11178 _invlist_intersection(PL_AboveLatin1, \
11181 /* And set the output to it, adding instead if there already is an \
11182 * output. Checking if <destlist> is NULL first saves an extra \
11183 * clone. Its reference count will be decremented at the next \
11184 * union, etc, or if this is the only instance, at the end of the \
11186 if (! destlist) { \
11187 destlist = scratch_list; \
11190 _invlist_union(destlist, scratch_list, &destlist); \
11191 SvREFCNT_dec(scratch_list); \
11195 /* For non-locale, just add it to any existing list */ \
11196 _invlist_union(destlist, \
11197 (AT_LEAST_ASCII_RESTRICTED) \
11203 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11205 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11207 SV* scratch_list = NULL; \
11208 ANYOF_CLASS_SET(node, class); \
11209 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11210 if (! destlist) { \
11211 destlist = scratch_list; \
11214 _invlist_union(destlist, scratch_list, &destlist); \
11215 SvREFCNT_dec(scratch_list); \
11219 _invlist_union_complement_2nd(destlist, \
11220 (AT_LEAST_ASCII_RESTRICTED) \
11224 /* Under /d, everything in the upper half of the Latin1 range \
11225 * matches this complement */ \
11226 if (DEPENDS_SEMANTICS) { \
11227 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11231 /* Generate the code to add a posix character <class> to the bracketed
11232 * character class given by <node>. (<node> is needed only under locale rules)
11233 * destlist is the inversion list for non-locale rules that this class is
11235 * sourcelist is the ASCII-range inversion list to add under /a rules
11236 * l1_sourcelist is the Latin1 range list to use otherwise.
11237 * Xpropertyname is the name to add to <run_time_list> of the property to
11238 * specify the code points above Latin1 that will have to be
11239 * determined at run-time
11240 * run_time_list is a SV* that contains text names of properties that are to
11241 * be computed at run time. This concatenates <Xpropertyname>
11242 * to it, appropriately
11243 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11245 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11246 l1_sourcelist, Xpropertyname, run_time_list) \
11247 /* First, resolve whether to use the ASCII-only list or the L1 \
11249 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11250 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11251 Xpropertyname, run_time_list)
11253 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11254 Xpropertyname, run_time_list) \
11255 /* If not /a matching, there are going to be code points we will have \
11256 * to defer to runtime to look-up */ \
11257 if (! AT_LEAST_ASCII_RESTRICTED) { \
11258 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11261 ANYOF_CLASS_SET(node, class); \
11264 _invlist_union(destlist, sourcelist, &destlist); \
11267 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11268 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11270 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11271 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11272 if (AT_LEAST_ASCII_RESTRICTED) { \
11273 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11276 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11277 matches_above_unicode = TRUE; \
11279 ANYOF_CLASS_SET(node, namedclass); \
11282 SV* scratch_list = NULL; \
11283 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11284 if (! destlist) { \
11285 destlist = scratch_list; \
11288 _invlist_union(destlist, scratch_list, &destlist); \
11289 SvREFCNT_dec(scratch_list); \
11291 if (DEPENDS_SEMANTICS) { \
11292 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11298 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
11300 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
11301 * alternate list, pointed to by 'alternate_ptr'. This is an array of
11302 * the multi-character folds of characters in the node */
11305 PERL_ARGS_ASSERT_ADD_ALTERNATE;
11307 if (! *alternate_ptr) {
11308 *alternate_ptr = newAV();
11310 sv = newSVpvn_utf8((char*)string, len, TRUE);
11311 av_push(*alternate_ptr, sv);
11315 /* The names of properties whose definitions are not known at compile time are
11316 * stored in this SV, after a constant heading. So if the length has been
11317 * changed since initialization, then there is a run-time definition. */
11318 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11320 /* This converts the named class defined in regcomp.h to its equivalent class
11321 * number defined in handy.h. */
11322 #define namedclass_to_classnum(class) ((class) / 2)
11325 parse a class specification and produce either an ANYOF node that
11326 matches the pattern or perhaps will be optimized into an EXACTish node
11327 instead. The node contains a bit map for the first 256 characters, with the
11328 corresponding bit set if that character is in the list. For characters
11329 above 255, a range list is used */
11332 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11336 UV prevvalue = OOB_UNICODE;
11341 IV namedclass = OOB_NAMEDCLASS;
11342 char *rangebegin = NULL;
11343 bool need_class = 0;
11344 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
11346 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11347 than just initialized. */
11348 SV* properties = NULL; /* Code points that match \p{} \P{} */
11349 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11350 extended beyond the Latin1 range */
11351 UV element_count = 0; /* Number of distinct elements in the class.
11352 Optimizations may be possible if this is tiny */
11355 /* Unicode properties are stored in a swash; this holds the current one
11356 * being parsed. If this swash is the only above-latin1 component of the
11357 * character class, an optimization is to pass it directly on to the
11358 * execution engine. Otherwise, it is set to NULL to indicate that there
11359 * are other things in the class that have to be dealt with at execution
11361 SV* swash = NULL; /* Code points that match \p{} \P{} */
11363 /* Set if a component of this character class is user-defined; just passed
11364 * on to the engine */
11365 bool has_user_defined_property = FALSE;
11367 /* inversion list of code points this node matches only when the target
11368 * string is in UTF-8. (Because is under /d) */
11369 SV* depends_list = NULL;
11371 /* inversion list of code points this node matches. For much of the
11372 * function, it includes only those that match regardless of the utf8ness
11373 * of the target string */
11374 SV* cp_list = NULL;
11376 /* List of multi-character folds that are matched by this node */
11377 AV* unicode_alternate = NULL;
11379 /* In a range, counts how many 0-2 of the ends of it came from literals,
11380 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11381 UV literal_endpoint = 0;
11383 bool invert = FALSE; /* Is this class to be complemented */
11385 /* Is there any thing like \W or [:^digit:] that matches above the legal
11386 * Unicode range? */
11387 bool runtime_posix_matches_above_Unicode = FALSE;
11389 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11390 case we need to change the emitted regop to an EXACT. */
11391 const char * orig_parse = RExC_parse;
11392 const I32 orig_size = RExC_size;
11393 GET_RE_DEBUG_FLAGS_DECL;
11395 PERL_ARGS_ASSERT_REGCLASS;
11397 PERL_UNUSED_ARG(depth);
11400 DEBUG_PARSE("clas");
11402 /* Assume we are going to generate an ANYOF node. */
11403 ret = reganode(pRExC_state, ANYOF, 0);
11407 ANYOF_FLAGS(ret) = 0;
11410 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11415 /* We have decided to not allow multi-char folds in inverted character
11416 * classes, due to the confusion that can happen, especially with
11417 * classes that are designed for a non-Unicode world: You have the
11418 * peculiar case that:
11419 "s s" =~ /^[^\xDF]+$/i => Y
11420 "ss" =~ /^[^\xDF]+$/i => N
11422 * See [perl #89750] */
11423 allow_full_fold = FALSE;
11427 RExC_size += ANYOF_SKIP;
11428 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11431 RExC_emit += ANYOF_SKIP;
11433 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11435 listsv = newSVpvs("# comment\n");
11436 initial_listsv_len = SvCUR(listsv);
11439 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11441 if (!SIZE_ONLY && POSIXCC(nextvalue))
11442 checkposixcc(pRExC_state);
11444 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11445 if (UCHARAT(RExC_parse) == ']')
11446 goto charclassloop;
11449 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11453 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11456 rangebegin = RExC_parse;
11460 value = utf8n_to_uvchr((U8*)RExC_parse,
11461 RExC_end - RExC_parse,
11462 &numlen, UTF8_ALLOW_DEFAULT);
11463 RExC_parse += numlen;
11466 value = UCHARAT(RExC_parse++);
11468 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11469 if (value == '[' && POSIXCC(nextvalue))
11470 namedclass = regpposixcc(pRExC_state, value);
11471 else if (value == '\\') {
11473 value = utf8n_to_uvchr((U8*)RExC_parse,
11474 RExC_end - RExC_parse,
11475 &numlen, UTF8_ALLOW_DEFAULT);
11476 RExC_parse += numlen;
11479 value = UCHARAT(RExC_parse++);
11480 /* Some compilers cannot handle switching on 64-bit integer
11481 * values, therefore value cannot be an UV. Yes, this will
11482 * be a problem later if we want switch on Unicode.
11483 * A similar issue a little bit later when switching on
11484 * namedclass. --jhi */
11485 switch ((I32)value) {
11486 case 'w': namedclass = ANYOF_ALNUM; break;
11487 case 'W': namedclass = ANYOF_NALNUM; break;
11488 case 's': namedclass = ANYOF_SPACE; break;
11489 case 'S': namedclass = ANYOF_NSPACE; break;
11490 case 'd': namedclass = ANYOF_DIGIT; break;
11491 case 'D': namedclass = ANYOF_NDIGIT; break;
11492 case 'v': namedclass = ANYOF_VERTWS; break;
11493 case 'V': namedclass = ANYOF_NVERTWS; break;
11494 case 'h': namedclass = ANYOF_HORIZWS; break;
11495 case 'H': namedclass = ANYOF_NHORIZWS; break;
11496 case 'N': /* Handle \N{NAME} in class */
11498 /* We only pay attention to the first char of
11499 multichar strings being returned. I kinda wonder
11500 if this makes sense as it does change the behaviour
11501 from earlier versions, OTOH that behaviour was broken
11503 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11504 TRUE /* => charclass */))
11514 if (RExC_parse >= RExC_end)
11515 vFAIL2("Empty \\%c{}", (U8)value);
11516 if (*RExC_parse == '{') {
11517 const U8 c = (U8)value;
11518 e = strchr(RExC_parse++, '}');
11520 vFAIL2("Missing right brace on \\%c{}", c);
11521 while (isSPACE(UCHARAT(RExC_parse)))
11523 if (e == RExC_parse)
11524 vFAIL2("Empty \\%c{}", c);
11525 n = e - RExC_parse;
11526 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11537 if (UCHARAT(RExC_parse) == '^') {
11540 value = value == 'p' ? 'P' : 'p'; /* toggle */
11541 while (isSPACE(UCHARAT(RExC_parse))) {
11546 /* Try to get the definition of the property into
11547 * <invlist>. If /i is in effect, the effective property
11548 * will have its name be <__NAME_i>. The design is
11549 * discussed in commit
11550 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11551 Newx(name, n + sizeof("_i__\n"), char);
11553 sprintf(name, "%s%.*s%s\n",
11554 (FOLD) ? "__" : "",
11560 /* Look up the property name, and get its swash and
11561 * inversion list, if the property is found */
11563 SvREFCNT_dec(swash);
11565 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11568 TRUE, /* this routine will handle
11569 undefined properties */
11570 NULL, FALSE /* No inversion list */
11574 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11575 || ! (invlist = _get_swash_invlist(swash)))
11578 SvREFCNT_dec(swash);
11582 /* Here didn't find it. It could be a user-defined
11583 * property that will be available at run-time. Add it
11584 * to the list to look up then */
11585 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11586 (value == 'p' ? '+' : '!'),
11588 has_user_defined_property = TRUE;
11590 /* We don't know yet, so have to assume that the
11591 * property could match something in the Latin1 range,
11592 * hence something that isn't utf8. Note that this
11593 * would cause things in <depends_list> to match
11594 * inappropriately, except that any \p{}, including
11595 * this one forces Unicode semantics, which means there
11596 * is <no depends_list> */
11597 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11601 /* Here, did get the swash and its inversion list. If
11602 * the swash is from a user-defined property, then this
11603 * whole character class should be regarded as such */
11604 has_user_defined_property =
11605 _is_swash_user_defined(swash);
11607 /* Invert if asking for the complement */
11608 if (value == 'P') {
11609 _invlist_union_complement_2nd(properties,
11613 /* The swash can't be used as-is, because we've
11614 * inverted things; delay removing it to here after
11615 * have copied its invlist above */
11616 SvREFCNT_dec(swash);
11620 _invlist_union(properties, invlist, &properties);
11625 RExC_parse = e + 1;
11626 namedclass = ANYOF_MAX; /* no official name, but it's named */
11628 /* \p means they want Unicode semantics */
11629 RExC_uni_semantics = 1;
11632 case 'n': value = '\n'; break;
11633 case 'r': value = '\r'; break;
11634 case 't': value = '\t'; break;
11635 case 'f': value = '\f'; break;
11636 case 'b': value = '\b'; break;
11637 case 'e': value = ASCII_TO_NATIVE('\033');break;
11638 case 'a': value = ASCII_TO_NATIVE('\007');break;
11640 RExC_parse--; /* function expects to be pointed at the 'o' */
11642 const char* error_msg;
11643 bool valid = grok_bslash_o(RExC_parse,
11648 RExC_parse += numlen;
11653 if (PL_encoding && value < 0x100) {
11654 goto recode_encoding;
11658 RExC_parse--; /* function expects to be pointed at the 'x' */
11660 const char* error_msg;
11661 bool valid = grok_bslash_x(RExC_parse,
11666 RExC_parse += numlen;
11671 if (PL_encoding && value < 0x100)
11672 goto recode_encoding;
11675 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11677 case '0': case '1': case '2': case '3': case '4':
11678 case '5': case '6': case '7':
11680 /* Take 1-3 octal digits */
11681 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11683 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11684 RExC_parse += numlen;
11685 if (PL_encoding && value < 0x100)
11686 goto recode_encoding;
11690 if (! RExC_override_recoding) {
11691 SV* enc = PL_encoding;
11692 value = reg_recode((const char)(U8)value, &enc);
11693 if (!enc && SIZE_ONLY)
11694 ckWARNreg(RExC_parse,
11695 "Invalid escape in the specified encoding");
11699 /* Allow \_ to not give an error */
11700 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11701 ckWARN2reg(RExC_parse,
11702 "Unrecognized escape \\%c in character class passed through",
11707 } /* end of \blah */
11710 literal_endpoint++;
11713 /* What matches in a locale is not known until runtime. This
11714 * includes what the Posix classes (like \w, [:space:]) match.
11715 * Room must be reserved (one time per class) to store such
11716 * classes, either if Perl is compiled so that locale nodes always
11717 * should have this space, or if there is such class info to be
11718 * stored. The space will contain a bit for each named class that
11719 * is to be matched against. This isn't needed for \p{} and
11720 * pseudo-classes, as they are not affected by locale, and hence
11721 * are dealt with separately */
11724 && (ANYOF_LOCALE == ANYOF_CLASS
11725 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11729 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11732 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11733 ANYOF_CLASS_ZERO(ret);
11735 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11738 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11740 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11741 * literal, as is the character that began the false range, i.e.
11742 * the 'a' in the examples */
11746 RExC_parse >= rangebegin ?
11747 RExC_parse - rangebegin : 0;
11748 ckWARN4reg(RExC_parse,
11749 "False [] range \"%*.*s\"",
11751 cp_list = add_cp_to_invlist(cp_list, '-');
11752 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11755 range = 0; /* this was not a true range */
11756 element_count += 2; /* So counts for three values */
11760 switch ((I32)namedclass) {
11762 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11763 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11764 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11766 case ANYOF_NALNUMC:
11767 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11768 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11769 runtime_posix_matches_above_Unicode);
11772 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11773 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11776 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11777 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11778 runtime_posix_matches_above_Unicode);
11782 ANYOF_CLASS_SET(ret, namedclass);
11785 _invlist_union(posixes, PL_ASCII, &posixes);
11790 ANYOF_CLASS_SET(ret, namedclass);
11793 _invlist_union_complement_2nd(posixes,
11794 PL_ASCII, &posixes);
11795 if (DEPENDS_SEMANTICS) {
11796 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11801 DO_POSIX(ret, namedclass, posixes,
11802 PL_PosixBlank, PL_XPosixBlank);
11805 DO_N_POSIX(ret, namedclass, posixes,
11806 PL_PosixBlank, PL_XPosixBlank);
11809 DO_POSIX(ret, namedclass, posixes,
11810 PL_PosixCntrl, PL_XPosixCntrl);
11813 DO_N_POSIX(ret, namedclass, posixes,
11814 PL_PosixCntrl, PL_XPosixCntrl);
11817 /* There are no digits in the Latin1 range outside of
11818 * ASCII, so call the macro that doesn't have to resolve
11820 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11821 PL_PosixDigit, "XPosixDigit", listsv);
11824 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11825 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11826 runtime_posix_matches_above_Unicode);
11829 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11830 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11833 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11834 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11835 runtime_posix_matches_above_Unicode);
11837 case ANYOF_HORIZWS:
11838 /* For these, we use the cp_list, as /d doesn't make a
11839 * difference in what these match. There would be problems
11840 * if these characters had folds other than themselves, as
11841 * cp_list is subject to folding. It turns out that \h
11842 * is just a synonym for XPosixBlank */
11843 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11845 case ANYOF_NHORIZWS:
11846 _invlist_union_complement_2nd(cp_list,
11847 PL_XPosixBlank, &cp_list);
11851 { /* These require special handling, as they differ under
11852 folding, matching Cased there (which in the ASCII range
11853 is the same as Alpha */
11859 if (FOLD && ! LOC) {
11860 ascii_source = PL_PosixAlpha;
11861 l1_source = PL_L1Cased;
11865 ascii_source = PL_PosixLower;
11866 l1_source = PL_L1PosixLower;
11867 Xname = "XPosixLower";
11869 if (namedclass == ANYOF_LOWER) {
11870 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11871 ascii_source, l1_source, Xname, listsv);
11874 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11875 posixes, ascii_source, l1_source, Xname, listsv,
11876 runtime_posix_matches_above_Unicode);
11881 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11882 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11885 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11886 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
11887 runtime_posix_matches_above_Unicode);
11890 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11891 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11894 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11895 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
11896 runtime_posix_matches_above_Unicode);
11899 DO_POSIX(ret, namedclass, posixes,
11900 PL_PosixSpace, PL_XPosixSpace);
11902 case ANYOF_NPSXSPC:
11903 DO_N_POSIX(ret, namedclass, posixes,
11904 PL_PosixSpace, PL_XPosixSpace);
11907 DO_POSIX(ret, namedclass, posixes,
11908 PL_PerlSpace, PL_XPerlSpace);
11911 DO_N_POSIX(ret, namedclass, posixes,
11912 PL_PerlSpace, PL_XPerlSpace);
11914 case ANYOF_UPPER: /* Same as LOWER, above */
11921 if (FOLD && ! LOC) {
11922 ascii_source = PL_PosixAlpha;
11923 l1_source = PL_L1Cased;
11927 ascii_source = PL_PosixUpper;
11928 l1_source = PL_L1PosixUpper;
11929 Xname = "XPosixUpper";
11931 if (namedclass == ANYOF_UPPER) {
11932 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11933 ascii_source, l1_source, Xname, listsv);
11936 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11937 posixes, ascii_source, l1_source, Xname, listsv,
11938 runtime_posix_matches_above_Unicode);
11942 case ANYOF_ALNUM: /* Really is 'Word' */
11943 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11944 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11947 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11948 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
11949 runtime_posix_matches_above_Unicode);
11952 /* For these, we use the cp_list, as /d doesn't make a
11953 * difference in what these match. There would be problems
11954 * if these characters had folds other than themselves, as
11955 * cp_list is subject to folding */
11956 _invlist_union(cp_list, PL_VertSpace, &cp_list);
11958 case ANYOF_NVERTWS:
11959 _invlist_union_complement_2nd(cp_list,
11960 PL_VertSpace, &cp_list);
11963 DO_POSIX(ret, namedclass, posixes,
11964 PL_PosixXDigit, PL_XPosixXDigit);
11966 case ANYOF_NXDIGIT:
11967 DO_N_POSIX(ret, namedclass, posixes,
11968 PL_PosixXDigit, PL_XPosixXDigit);
11971 /* this is to handle \p and \P */
11974 vFAIL("Invalid [::] class");
11978 continue; /* Go get next character */
11980 } /* end of namedclass \blah */
11983 if (prevvalue > value) /* b-a */ {
11984 const int w = RExC_parse - rangebegin;
11985 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11986 range = 0; /* not a valid range */
11990 prevvalue = value; /* save the beginning of the potential range */
11991 if (RExC_parse+1 < RExC_end
11992 && *RExC_parse == '-'
11993 && RExC_parse[1] != ']')
11997 /* a bad range like \w-, [:word:]- ? */
11998 if (namedclass > OOB_NAMEDCLASS) {
11999 if (ckWARN(WARN_REGEXP)) {
12001 RExC_parse >= rangebegin ?
12002 RExC_parse - rangebegin : 0;
12004 "False [] range \"%*.*s\"",
12008 cp_list = add_cp_to_invlist(cp_list, '-');
12012 range = 1; /* yeah, it's a range! */
12013 continue; /* but do it the next time */
12017 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12020 /* non-Latin1 code point implies unicode semantics. Must be set in
12021 * pass1 so is there for the whole of pass 2 */
12023 RExC_uni_semantics = 1;
12026 /* Ready to process either the single value, or the completed range */
12029 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12031 UV* this_range = _new_invlist(1);
12032 _append_range_to_invlist(this_range, prevvalue, value);
12034 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12035 * If this range was specified using something like 'i-j', we want
12036 * to include only the 'i' and the 'j', and not anything in
12037 * between, so exclude non-ASCII, non-alphabetics from it.
12038 * However, if the range was specified with something like
12039 * [\x89-\x91] or [\x89-j], all code points within it should be
12040 * included. literal_endpoint==2 means both ends of the range used
12041 * a literal character, not \x{foo} */
12042 if (literal_endpoint == 2
12043 && (prevvalue >= 'a' && value <= 'z')
12044 || (prevvalue >= 'A' && value <= 'Z'))
12046 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12047 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12049 _invlist_union(cp_list, this_range, &cp_list);
12050 literal_endpoint = 0;
12054 range = 0; /* this range (if it was one) is done now */
12055 } /* End of loop through all the text within the brackets */
12057 /* If the character class contains only a single element, it may be
12058 * optimizable into another node type which is smaller and runs faster.
12059 * Check if this is the case for this class */
12060 if (element_count == 1) {
12064 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12065 [:digit:] or \p{foo} */
12067 /* Certain named classes have equivalents that can appear outside a
12068 * character class, e.g. \w, \H. We use these instead of a
12069 * character class. */
12070 switch ((I32)namedclass) {
12073 /* The first group is for node types that depend on the charset
12074 * modifier to the regex. We first calculate the base node
12075 * type, and if it should be inverted */
12082 goto join_charset_classes;
12089 goto join_charset_classes;
12097 join_charset_classes:
12099 /* Now that we have the base node type, we take advantage
12100 * of the enum ordering of the charset modifiers to get the
12101 * exact node type, For example the base SPACE also has
12102 * SPACEL, SPACEU, and SPACEA */
12104 offset = get_regex_charset(RExC_flags);
12106 /* /aa is the same as /a for these */
12107 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12108 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12110 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12111 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12116 /* The number of varieties of each of these is the same,
12117 * hence, so is the delta between the normal and
12118 * complemented nodes */
12120 op += NALNUM - ALNUM;
12122 *flagp |= HASWIDTH|SIMPLE;
12125 /* The second group doesn't depend of the charset modifiers.
12126 * We just have normal and complemented */
12127 case ANYOF_NHORIZWS:
12130 case ANYOF_HORIZWS:
12132 op = (invert) ? NHORIZWS : HORIZWS;
12133 *flagp |= HASWIDTH|SIMPLE;
12136 case ANYOF_NVERTWS:
12140 op = (invert) ? NVERTWS : VERTWS;
12141 *flagp |= HASWIDTH|SIMPLE;
12151 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12156 /* A generic posix class. All the /a ones can be handled
12157 * by the POSIXA opcode. And all are closed under folding
12158 * in the ASCII range, so FOLD doesn't matter */
12159 if (AT_LEAST_ASCII_RESTRICTED
12160 || (! LOC && namedclass == ANYOF_ASCII))
12162 /* The odd numbered ones are the complements of the
12163 * next-lower even number one */
12164 if (namedclass % 2 == 1) {
12168 arg = namedclass_to_classnum(namedclass);
12169 op = (invert) ? NPOSIXA : POSIXA;
12174 else if (value == prevvalue) {
12176 /* Here, the class consists of just a single code point */
12179 if (! LOC && value == '\n') {
12180 op = REG_ANY; /* Optimize [^\n] */
12181 *flagp |= HASWIDTH|SIMPLE;
12185 else if (value < 256 || UTF) {
12187 /* Optimize a single value into an EXACTish node, but not if it
12188 * would require converting the pattern to UTF-8. */
12189 op = compute_EXACTish(pRExC_state);
12191 } /* Otherwise is a range */
12192 else if (! LOC) { /* locale could vary these */
12193 if (prevvalue == '0') {
12194 if (value == '9') {
12195 op = (invert) ? NDIGITA : DIGITA;
12196 *flagp |= HASWIDTH|SIMPLE;
12201 /* Here, we have changed <op> away from its initial value iff we found
12202 * an optimization */
12205 /* Throw away this ANYOF regnode, and emit the calculated one,
12206 * which should correspond to the beginning, not current, state of
12208 const char * cur_parse = RExC_parse;
12209 RExC_parse = (char *)orig_parse;
12213 /* To get locale nodes to not use the full ANYOF size would
12214 * require moving the code above that writes the portions
12215 * of it that aren't in other nodes to after this point.
12216 * e.g. ANYOF_CLASS_SET */
12217 RExC_size = orig_size;
12221 RExC_emit = (regnode *)orig_emit;
12224 ret = reg_node(pRExC_state, op);
12226 if (PL_regkind[op] == POSIXD) {
12230 *flagp |= HASWIDTH|SIMPLE;
12232 else if (PL_regkind[op] == EXACT) {
12233 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12236 RExC_parse = (char *) cur_parse;
12238 SvREFCNT_dec(listsv);
12245 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12247 /* If folding, we calculate all characters that could fold to or from the
12248 * ones already on the list */
12249 if (FOLD && cp_list) {
12250 UV start, end; /* End points of code point ranges */
12252 SV* fold_intersection = NULL;
12254 /* In the Latin1 range, the characters that can be folded-to or -from
12255 * are precisely the alphabetic characters. If the highest code point
12256 * is within Latin1, we can use the compiled-in list, and not have to
12257 * go out to disk. */
12258 if (invlist_highest(cp_list) < 256) {
12259 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12263 /* Here, there are non-Latin1 code points, so we will have to go
12264 * fetch the list of all the characters that participate in folds
12266 if (! PL_utf8_foldable) {
12267 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12268 &PL_sv_undef, 1, 0);
12269 PL_utf8_foldable = _get_swash_invlist(swash);
12270 SvREFCNT_dec(swash);
12273 /* This is a hash that for a particular fold gives all characters
12274 * that are involved in it */
12275 if (! PL_utf8_foldclosures) {
12277 /* If we were unable to find any folds, then we likely won't be
12278 * able to find the closures. So just create an empty list.
12279 * Folding will effectively be restricted to the non-Unicode
12280 * rules hard-coded into Perl. (This case happens legitimately
12281 * during compilation of Perl itself before the Unicode tables
12282 * are generated) */
12283 if (invlist_len(PL_utf8_foldable) == 0) {
12284 PL_utf8_foldclosures = newHV();
12287 /* If the folds haven't been read in, call a fold function
12289 if (! PL_utf8_tofold) {
12290 U8 dummy[UTF8_MAXBYTES+1];
12293 /* This particular string is above \xff in both UTF-8
12295 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
12296 assert(PL_utf8_tofold); /* Verify that worked */
12298 PL_utf8_foldclosures =
12299 _swash_inversion_hash(PL_utf8_tofold);
12303 /* Only the characters in this class that participate in folds need
12304 * be checked. Get the intersection of this class and all the
12305 * possible characters that are foldable. This can quickly narrow
12306 * down a large class */
12307 _invlist_intersection(PL_utf8_foldable, cp_list,
12308 &fold_intersection);
12311 /* Now look at the foldable characters in this class individually */
12312 invlist_iterinit(fold_intersection);
12313 while (invlist_iternext(fold_intersection, &start, &end)) {
12316 /* Locale folding for Latin1 characters is deferred until runtime */
12317 if (LOC && start < 256) {
12321 /* Look at every character in the range */
12322 for (j = start; j <= end; j++) {
12324 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12330 /* We have the latin1 folding rules hard-coded here so that
12331 * an innocent-looking character class, like /[ks]/i won't
12332 * have to go out to disk to find the possible matches.
12333 * XXX It would be better to generate these via regen, in
12334 * case a new version of the Unicode standard adds new
12335 * mappings, though that is not really likely, and may be
12336 * caught by the default: case of the switch below. */
12338 if (PL_fold_latin1[j] != j) {
12340 /* ASCII is always matched; non-ASCII is matched only
12341 * under Unicode rules */
12342 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12344 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12348 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12352 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12353 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12355 /* Certain Latin1 characters have matches outside
12356 * Latin1, or are multi-character. To get here, 'j' is
12357 * one of those characters. None of these matches is
12358 * valid for ASCII characters under /aa, which is why
12359 * the 'if' just above excludes those. The matches
12360 * fall into three categories:
12361 * 1) They are singly folded-to or -from an above 255
12362 * character, e.g., LATIN SMALL LETTER Y WITH
12363 * DIAERESIS and LATIN CAPITAL LETTER Y WITH
12365 * 2) They are part of a multi-char fold with another
12366 * latin1 character; only LATIN SMALL LETTER
12367 * SHARP S => "ss" fits this;
12368 * 3) They are part of a multi-char fold with a
12369 * character outside of Latin1, such as various
12371 * We aren't dealing fully with multi-char folds, except
12372 * we do deal with the pattern containing a character
12373 * that has a multi-char fold (not so much the inverse).
12374 * For types 1) and 3), the matches only happen when the
12375 * target string is utf8; that's not true for 2), and we
12376 * set a flag for it.
12378 * The code below adds the single fold closures for 'j'
12379 * to the inversion list. */
12384 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12388 cp_list = add_cp_to_invlist(cp_list,
12389 LATIN_SMALL_LETTER_LONG_S);
12392 cp_list = add_cp_to_invlist(cp_list,
12393 GREEK_CAPITAL_LETTER_MU);
12394 cp_list = add_cp_to_invlist(cp_list,
12395 GREEK_SMALL_LETTER_MU);
12397 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12398 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12400 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12402 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12403 cp_list = add_cp_to_invlist(cp_list,
12404 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12406 case LATIN_SMALL_LETTER_SHARP_S:
12407 cp_list = add_cp_to_invlist(cp_list,
12408 LATIN_CAPITAL_LETTER_SHARP_S);
12410 /* Under /a, /d, and /u, this can match the two
12412 if (! ASCII_FOLD_RESTRICTED) {
12413 add_alternate(&unicode_alternate,
12416 /* And under /u or /a, it can match even if
12417 * the target is not utf8 */
12418 if (AT_LEAST_UNI_SEMANTICS) {
12419 ANYOF_FLAGS(ret) |=
12420 ANYOF_NONBITMAP_NON_UTF8;
12424 case 'F': case 'f':
12425 case 'I': case 'i':
12426 case 'L': case 'l':
12427 case 'T': case 't':
12428 case 'A': case 'a':
12429 case 'H': case 'h':
12430 case 'J': case 'j':
12431 case 'N': case 'n':
12432 case 'W': case 'w':
12433 case 'Y': case 'y':
12434 /* These all are targets of multi-character
12435 * folds from code points that require UTF8 to
12436 * express, so they can't match unless the
12437 * target string is in UTF-8, so no action here
12438 * is necessary, as regexec.c properly handles
12439 * the general case for UTF-8 matching */
12442 /* Use deprecated warning to increase the
12443 * chances of this being output */
12444 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12451 /* Here is an above Latin1 character. We don't have the rules
12452 * hard-coded for it. First, get its fold */
12453 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12454 ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12456 ? FOLD_FLAGS_LOCALE
12457 : (ASCII_FOLD_RESTRICTED)
12458 ? FOLD_FLAGS_NOMIX_ASCII
12461 if (foldlen > (STRLEN)UNISKIP(f)) {
12463 /* Any multicharacter foldings (disallowed in lookbehind
12464 * patterns) require the following transform: [ABCDEF] ->
12465 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12466 * folds into "rst", all other characters fold to single
12467 * characters. We save away these multicharacter foldings,
12468 * to be later saved as part of the additional "s" data. */
12469 if (! RExC_in_lookbehind) {
12471 U8* e = foldbuf + foldlen;
12473 /* If any of the folded characters of this are in the
12474 * Latin1 range, tell the regex engine that this can
12475 * match a non-utf8 target string. */
12477 if (UTF8_IS_INVARIANT(*loc)
12478 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12481 |= ANYOF_NONBITMAP_NON_UTF8;
12484 loc += UTF8SKIP(loc);
12487 add_alternate(&unicode_alternate, foldbuf, foldlen);
12491 /* Single character fold of above Latin1. Add everything
12492 * in its fold closure to the list that this node should
12496 /* The fold closures data structure is a hash with the keys
12497 * being every character that is folded to, like 'k', and
12498 * the values each an array of everything that folds to its
12499 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
12500 if ((listp = hv_fetch(PL_utf8_foldclosures,
12501 (char *) foldbuf, foldlen, FALSE)))
12503 AV* list = (AV*) *listp;
12505 for (k = 0; k <= av_len(list); k++) {
12506 SV** c_p = av_fetch(list, k, FALSE);
12509 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12513 /* /aa doesn't allow folds between ASCII and non-;
12514 * /l doesn't allow them between above and below
12516 if ((ASCII_FOLD_RESTRICTED
12517 && (isASCII(c) != isASCII(j)))
12518 || (LOC && ((c < 256) != (j < 256))))
12523 /* Folds involving non-ascii Latin1 characters
12524 * under /d are added to a separate list */
12525 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12527 cp_list = add_cp_to_invlist(cp_list, c);
12530 depends_list = add_cp_to_invlist(depends_list, c);
12537 SvREFCNT_dec(fold_intersection);
12540 /* And combine the result (if any) with any inversion list from posix
12541 * classes. The lists are kept separate up to now because we don't want to
12542 * fold the classes (folding of those is automatically handled by the swash
12543 * fetching code) */
12545 if (! DEPENDS_SEMANTICS) {
12547 _invlist_union(cp_list, posixes, &cp_list);
12548 SvREFCNT_dec(posixes);
12555 /* Under /d, we put into a separate list the Latin1 things that
12556 * match only when the target string is utf8 */
12557 SV* nonascii_but_latin1_properties = NULL;
12558 _invlist_intersection(posixes, PL_Latin1,
12559 &nonascii_but_latin1_properties);
12560 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12561 &nonascii_but_latin1_properties);
12562 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12565 _invlist_union(cp_list, posixes, &cp_list);
12566 SvREFCNT_dec(posixes);
12572 if (depends_list) {
12573 _invlist_union(depends_list, nonascii_but_latin1_properties,
12575 SvREFCNT_dec(nonascii_but_latin1_properties);
12578 depends_list = nonascii_but_latin1_properties;
12583 /* And combine the result (if any) with any inversion list from properties.
12584 * The lists are kept separate up to now so that we can distinguish the two
12585 * in regards to matching above-Unicode. A run-time warning is generated
12586 * if a Unicode property is matched against a non-Unicode code point. But,
12587 * we allow user-defined properties to match anything, without any warning,
12588 * and we also suppress the warning if there is a portion of the character
12589 * class that isn't a Unicode property, and which matches above Unicode, \W
12590 * or [\x{110000}] for example.
12591 * (Note that in this case, unlike the Posix one above, there is no
12592 * <depends_list>, because having a Unicode property forces Unicode
12595 bool warn_super = ! has_user_defined_property;
12598 /* If it matters to the final outcome, see if a non-property
12599 * component of the class matches above Unicode. If so, the
12600 * warning gets suppressed. This is true even if just a single
12601 * such code point is specified, as though not strictly correct if
12602 * another such code point is matched against, the fact that they
12603 * are using above-Unicode code points indicates they should know
12604 * the issues involved */
12606 bool non_prop_matches_above_Unicode =
12607 runtime_posix_matches_above_Unicode
12608 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12610 non_prop_matches_above_Unicode =
12611 ! non_prop_matches_above_Unicode;
12613 warn_super = ! non_prop_matches_above_Unicode;
12616 _invlist_union(properties, cp_list, &cp_list);
12617 SvREFCNT_dec(properties);
12620 cp_list = properties;
12624 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12628 /* Here, we have calculated what code points should be in the character
12631 * Now we can see about various optimizations. Fold calculation (which we
12632 * did above) needs to take place before inversion. Otherwise /[^k]/i
12633 * would invert to include K, which under /i would match k, which it
12634 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12635 * folded until runtime */
12637 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12638 * at compile time. Besides not inverting folded locale now, we can't invert
12639 * if there are things such as \w, which aren't known until runtime */
12641 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12643 && ! unicode_alternate
12644 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12646 _invlist_invert(cp_list);
12648 /* Any swash can't be used as-is, because we've inverted things */
12650 SvREFCNT_dec(swash);
12654 /* Clear the invert flag since have just done it here */
12658 /* If we didn't do folding, it's because some information isn't available
12659 * until runtime; set the run-time fold flag for these. (We don't have to
12660 * worry about properties folding, as that is taken care of by the swash
12662 if (FOLD && (LOC || unicode_alternate))
12664 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12667 /* Some character classes are equivalent to other nodes. Such nodes take
12668 * up less room and generally fewer operations to execute than ANYOF nodes.
12669 * Above, we checked for and optimized into some such equivalents for
12670 * certain common classes that are easy to test. Getting to this point in
12671 * the code means that the class didn't get optimized there. Since this
12672 * code is only executed in Pass 2, it is too late to save space--it has
12673 * been allocated in Pass 1, and currently isn't given back. But turning
12674 * things into an EXACTish node can allow the optimizer to join it to any
12675 * adjacent such nodes. And if the class is equivalent to things like /./,
12676 * expensive run-time swashes can be avoided. Now that we have more
12677 * complete information, we can find things necessarily missed by the
12678 * earlier code. I (khw) am not sure how much to look for here. It would
12679 * be easy, but perhaps too slow, to check any candidates against all the
12680 * node types they could possibly match using _invlistEQ(). */
12683 && ! unicode_alternate
12686 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12687 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12690 U8 op = END; /* The optimzation node-type */
12691 const char * cur_parse= RExC_parse;
12693 invlist_iterinit(cp_list);
12694 if (! invlist_iternext(cp_list, &start, &end)) {
12696 /* Here, the list is empty. This happens, for example, when a
12697 * Unicode property is the only thing in the character class, and
12698 * it doesn't match anything. (perluniprops.pod notes such
12701 *flagp |= HASWIDTH|SIMPLE;
12703 else if (start == end) { /* The range is a single code point */
12704 if (! invlist_iternext(cp_list, &start, &end)
12706 /* Don't do this optimization if it would require changing
12707 * the pattern to UTF-8 */
12708 && (start < 256 || UTF))
12710 /* Here, the list contains a single code point. Can optimize
12711 * into an EXACT node */
12720 /* A locale node under folding with one code point can be
12721 * an EXACTFL, as its fold won't be calculated until
12727 /* Here, we are generally folding, but there is only one
12728 * code point to match. If we have to, we use an EXACT
12729 * node, but it would be better for joining with adjacent
12730 * nodes in the optimization pass if we used the same
12731 * EXACTFish node that any such are likely to be. We can
12732 * do this iff the code point doesn't participate in any
12733 * folds. For example, an EXACTF of a colon is the same as
12734 * an EXACT one, since nothing folds to or from a colon.
12735 * In the Latin1 range, being an alpha means that the
12736 * character participates in a fold (except for the
12737 * feminine and masculine ordinals, which I (khw) don't
12738 * think are worrying about optimizing for). */
12740 if (isALPHA_L1(value)) {
12745 if (! PL_utf8_foldable) {
12746 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12747 &PL_sv_undef, 1, 0);
12748 PL_utf8_foldable = _get_swash_invlist(swash);
12749 SvREFCNT_dec(swash);
12751 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
12756 /* If we haven't found the node type, above, it means we
12757 * can use the prevailing one */
12759 op = compute_EXACTish(pRExC_state);
12764 else if (start == 0) {
12765 if (end == UV_MAX) {
12767 *flagp |= HASWIDTH|SIMPLE;
12770 else if (end == '\n' - 1
12771 && invlist_iternext(cp_list, &start, &end)
12772 && start == '\n' + 1 && end == UV_MAX)
12775 *flagp |= HASWIDTH|SIMPLE;
12781 RExC_parse = (char *)orig_parse;
12782 RExC_emit = (regnode *)orig_emit;
12784 ret = reg_node(pRExC_state, op);
12786 RExC_parse = (char *)cur_parse;
12788 if (PL_regkind[op] == EXACT) {
12789 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12792 SvREFCNT_dec(listsv);
12797 /* Here, <cp_list> contains all the code points we can determine at
12798 * compile time that match under all conditions. Go through it, and
12799 * for things that belong in the bitmap, put them there, and delete from
12800 * <cp_list>. While we are at it, see if everything above 255 is in the
12801 * list, and if so, set a flag to speed up execution */
12802 ANYOF_BITMAP_ZERO(ret);
12805 /* This gets set if we actually need to modify things */
12806 bool change_invlist = FALSE;
12810 /* Start looking through <cp_list> */
12811 invlist_iterinit(cp_list);
12812 while (invlist_iternext(cp_list, &start, &end)) {
12816 if (end == UV_MAX && start <= 256) {
12817 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12820 /* Quit if are above what we should change */
12825 change_invlist = TRUE;
12827 /* Set all the bits in the range, up to the max that we are doing */
12828 high = (end < 255) ? end : 255;
12829 for (i = start; i <= (int) high; i++) {
12830 if (! ANYOF_BITMAP_TEST(ret, i)) {
12831 ANYOF_BITMAP_SET(ret, i);
12838 /* Done with loop; remove any code points that are in the bitmap from
12840 if (change_invlist) {
12841 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12844 /* If have completely emptied it, remove it completely */
12845 if (invlist_len(cp_list) == 0) {
12846 SvREFCNT_dec(cp_list);
12852 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12855 /* Here, the bitmap has been populated with all the Latin1 code points that
12856 * always match. Can now add to the overall list those that match only
12857 * when the target string is UTF-8 (<depends_list>). */
12858 if (depends_list) {
12860 _invlist_union(cp_list, depends_list, &cp_list);
12861 SvREFCNT_dec(depends_list);
12864 cp_list = depends_list;
12868 /* If there is a swash and more than one element, we can't use the swash in
12869 * the optimization below. */
12870 if (swash && element_count > 1) {
12871 SvREFCNT_dec(swash);
12876 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12877 && ! unicode_alternate)
12879 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12880 SvREFCNT_dec(listsv);
12881 SvREFCNT_dec(unicode_alternate);
12884 /* av[0] stores the character class description in its textual form:
12885 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12886 * appropriate swash, and is also useful for dumping the regnode.
12887 * av[1] if NULL, is a placeholder to later contain the swash computed
12888 * from av[0]. But if no further computation need be done, the
12889 * swash is stored there now.
12890 * av[2] stores the multicharacter foldings, used later in
12891 * regexec.c:S_reginclass().
12892 * av[3] stores the cp_list inversion list for use in addition or
12893 * instead of av[0]; used only if av[1] is NULL
12894 * av[4] is set if any component of the class is from a user-defined
12895 * property; used only if av[1] is NULL */
12896 AV * const av = newAV();
12899 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12903 av_store(av, 1, swash);
12904 SvREFCNT_dec(cp_list);
12907 av_store(av, 1, NULL);
12909 av_store(av, 3, cp_list);
12910 av_store(av, 4, newSVuv(has_user_defined_property));
12914 /* Store any computed multi-char folds only if we are allowing
12916 if (allow_full_fold) {
12917 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12918 if (unicode_alternate) { /* This node is variable length */
12923 av_store(av, 2, NULL);
12925 rv = newRV_noinc(MUTABLE_SV(av));
12926 n = add_data(pRExC_state, 1, "s");
12927 RExC_rxi->data->data[n] = (void*)rv;
12931 *flagp |= HASWIDTH|SIMPLE;
12934 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12937 /* reg_skipcomment()
12939 Absorbs an /x style # comments from the input stream.
12940 Returns true if there is more text remaining in the stream.
12941 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12942 terminates the pattern without including a newline.
12944 Note its the callers responsibility to ensure that we are
12945 actually in /x mode
12950 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12954 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12956 while (RExC_parse < RExC_end)
12957 if (*RExC_parse++ == '\n') {
12962 /* we ran off the end of the pattern without ending
12963 the comment, so we have to add an \n when wrapping */
12964 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12972 Advances the parse position, and optionally absorbs
12973 "whitespace" from the inputstream.
12975 Without /x "whitespace" means (?#...) style comments only,
12976 with /x this means (?#...) and # comments and whitespace proper.
12978 Returns the RExC_parse point from BEFORE the scan occurs.
12980 This is the /x friendly way of saying RExC_parse++.
12984 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12986 char* const retval = RExC_parse++;
12988 PERL_ARGS_ASSERT_NEXTCHAR;
12991 if (RExC_end - RExC_parse >= 3
12992 && *RExC_parse == '('
12993 && RExC_parse[1] == '?'
12994 && RExC_parse[2] == '#')
12996 while (*RExC_parse != ')') {
12997 if (RExC_parse == RExC_end)
12998 FAIL("Sequence (?#... not terminated");
13004 if (RExC_flags & RXf_PMf_EXTENDED) {
13005 if (isSPACE(*RExC_parse)) {
13009 else if (*RExC_parse == '#') {
13010 if ( reg_skipcomment( pRExC_state ) )
13019 - reg_node - emit a node
13021 STATIC regnode * /* Location. */
13022 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13026 regnode * const ret = RExC_emit;
13027 GET_RE_DEBUG_FLAGS_DECL;
13029 PERL_ARGS_ASSERT_REG_NODE;
13032 SIZE_ALIGN(RExC_size);
13036 if (RExC_emit >= RExC_emit_bound)
13037 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13038 op, RExC_emit, RExC_emit_bound);
13040 NODE_ALIGN_FILL(ret);
13042 FILL_ADVANCE_NODE(ptr, op);
13043 #ifdef RE_TRACK_PATTERN_OFFSETS
13044 if (RExC_offsets) { /* MJD */
13045 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13046 "reg_node", __LINE__,
13048 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13049 ? "Overwriting end of array!\n" : "OK",
13050 (UV)(RExC_emit - RExC_emit_start),
13051 (UV)(RExC_parse - RExC_start),
13052 (UV)RExC_offsets[0]));
13053 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13061 - reganode - emit a node with an argument
13063 STATIC regnode * /* Location. */
13064 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13068 regnode * const ret = RExC_emit;
13069 GET_RE_DEBUG_FLAGS_DECL;
13071 PERL_ARGS_ASSERT_REGANODE;
13074 SIZE_ALIGN(RExC_size);
13079 assert(2==regarglen[op]+1);
13081 Anything larger than this has to allocate the extra amount.
13082 If we changed this to be:
13084 RExC_size += (1 + regarglen[op]);
13086 then it wouldn't matter. Its not clear what side effect
13087 might come from that so its not done so far.
13092 if (RExC_emit >= RExC_emit_bound)
13093 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13094 op, RExC_emit, RExC_emit_bound);
13096 NODE_ALIGN_FILL(ret);
13098 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13099 #ifdef RE_TRACK_PATTERN_OFFSETS
13100 if (RExC_offsets) { /* MJD */
13101 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13105 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13106 "Overwriting end of array!\n" : "OK",
13107 (UV)(RExC_emit - RExC_emit_start),
13108 (UV)(RExC_parse - RExC_start),
13109 (UV)RExC_offsets[0]));
13110 Set_Cur_Node_Offset;
13118 - reguni - emit (if appropriate) a Unicode character
13121 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13125 PERL_ARGS_ASSERT_REGUNI;
13127 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13131 - reginsert - insert an operator in front of already-emitted operand
13133 * Means relocating the operand.
13136 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13142 const int offset = regarglen[(U8)op];
13143 const int size = NODE_STEP_REGNODE + offset;
13144 GET_RE_DEBUG_FLAGS_DECL;
13146 PERL_ARGS_ASSERT_REGINSERT;
13147 PERL_UNUSED_ARG(depth);
13148 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13149 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13158 if (RExC_open_parens) {
13160 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13161 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13162 if ( RExC_open_parens[paren] >= opnd ) {
13163 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13164 RExC_open_parens[paren] += size;
13166 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13168 if ( RExC_close_parens[paren] >= opnd ) {
13169 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13170 RExC_close_parens[paren] += size;
13172 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13177 while (src > opnd) {
13178 StructCopy(--src, --dst, regnode);
13179 #ifdef RE_TRACK_PATTERN_OFFSETS
13180 if (RExC_offsets) { /* MJD 20010112 */
13181 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13185 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13186 ? "Overwriting end of array!\n" : "OK",
13187 (UV)(src - RExC_emit_start),
13188 (UV)(dst - RExC_emit_start),
13189 (UV)RExC_offsets[0]));
13190 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13191 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13197 place = opnd; /* Op node, where operand used to be. */
13198 #ifdef RE_TRACK_PATTERN_OFFSETS
13199 if (RExC_offsets) { /* MJD */
13200 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13204 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13205 ? "Overwriting end of array!\n" : "OK",
13206 (UV)(place - RExC_emit_start),
13207 (UV)(RExC_parse - RExC_start),
13208 (UV)RExC_offsets[0]));
13209 Set_Node_Offset(place, RExC_parse);
13210 Set_Node_Length(place, 1);
13213 src = NEXTOPER(place);
13214 FILL_ADVANCE_NODE(place, op);
13215 Zero(src, offset, regnode);
13219 - regtail - set the next-pointer at the end of a node chain of p to val.
13220 - SEE ALSO: regtail_study
13222 /* TODO: All three parms should be const */
13224 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13228 GET_RE_DEBUG_FLAGS_DECL;
13230 PERL_ARGS_ASSERT_REGTAIL;
13232 PERL_UNUSED_ARG(depth);
13238 /* Find last node. */
13241 regnode * const temp = regnext(scan);
13243 SV * const mysv=sv_newmortal();
13244 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13245 regprop(RExC_rx, mysv, scan);
13246 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13247 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13248 (temp == NULL ? "->" : ""),
13249 (temp == NULL ? PL_reg_name[OP(val)] : "")
13257 if (reg_off_by_arg[OP(scan)]) {
13258 ARG_SET(scan, val - scan);
13261 NEXT_OFF(scan) = val - scan;
13267 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13268 - Look for optimizable sequences at the same time.
13269 - currently only looks for EXACT chains.
13271 This is experimental code. The idea is to use this routine to perform
13272 in place optimizations on branches and groups as they are constructed,
13273 with the long term intention of removing optimization from study_chunk so
13274 that it is purely analytical.
13276 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13277 to control which is which.
13280 /* TODO: All four parms should be const */
13283 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13288 #ifdef EXPERIMENTAL_INPLACESCAN
13291 GET_RE_DEBUG_FLAGS_DECL;
13293 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13299 /* Find last node. */
13303 regnode * const temp = regnext(scan);
13304 #ifdef EXPERIMENTAL_INPLACESCAN
13305 if (PL_regkind[OP(scan)] == EXACT) {
13306 bool has_exactf_sharp_s; /* Unexamined in this routine */
13307 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13312 switch (OP(scan)) {
13318 case EXACTFU_TRICKYFOLD:
13320 if( exact == PSEUDO )
13322 else if ( exact != OP(scan) )
13331 SV * const mysv=sv_newmortal();
13332 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13333 regprop(RExC_rx, mysv, scan);
13334 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13335 SvPV_nolen_const(mysv),
13336 REG_NODE_NUM(scan),
13337 PL_reg_name[exact]);
13344 SV * const mysv_val=sv_newmortal();
13345 DEBUG_PARSE_MSG("");
13346 regprop(RExC_rx, mysv_val, val);
13347 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13348 SvPV_nolen_const(mysv_val),
13349 (IV)REG_NODE_NUM(val),
13353 if (reg_off_by_arg[OP(scan)]) {
13354 ARG_SET(scan, val - scan);
13357 NEXT_OFF(scan) = val - scan;
13365 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13369 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13375 for (bit=0; bit<32; bit++) {
13376 if (flags & (1<<bit)) {
13377 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13380 if (!set++ && lead)
13381 PerlIO_printf(Perl_debug_log, "%s",lead);
13382 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13385 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13386 if (!set++ && lead) {
13387 PerlIO_printf(Perl_debug_log, "%s",lead);
13390 case REGEX_UNICODE_CHARSET:
13391 PerlIO_printf(Perl_debug_log, "UNICODE");
13393 case REGEX_LOCALE_CHARSET:
13394 PerlIO_printf(Perl_debug_log, "LOCALE");
13396 case REGEX_ASCII_RESTRICTED_CHARSET:
13397 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13399 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13400 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13403 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13409 PerlIO_printf(Perl_debug_log, "\n");
13411 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13417 Perl_regdump(pTHX_ const regexp *r)
13421 SV * const sv = sv_newmortal();
13422 SV *dsv= sv_newmortal();
13423 RXi_GET_DECL(r,ri);
13424 GET_RE_DEBUG_FLAGS_DECL;
13426 PERL_ARGS_ASSERT_REGDUMP;
13428 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13430 /* Header fields of interest. */
13431 if (r->anchored_substr) {
13432 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13433 RE_SV_DUMPLEN(r->anchored_substr), 30);
13434 PerlIO_printf(Perl_debug_log,
13435 "anchored %s%s at %"IVdf" ",
13436 s, RE_SV_TAIL(r->anchored_substr),
13437 (IV)r->anchored_offset);
13438 } else if (r->anchored_utf8) {
13439 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13440 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13441 PerlIO_printf(Perl_debug_log,
13442 "anchored utf8 %s%s at %"IVdf" ",
13443 s, RE_SV_TAIL(r->anchored_utf8),
13444 (IV)r->anchored_offset);
13446 if (r->float_substr) {
13447 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13448 RE_SV_DUMPLEN(r->float_substr), 30);
13449 PerlIO_printf(Perl_debug_log,
13450 "floating %s%s at %"IVdf"..%"UVuf" ",
13451 s, RE_SV_TAIL(r->float_substr),
13452 (IV)r->float_min_offset, (UV)r->float_max_offset);
13453 } else if (r->float_utf8) {
13454 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13455 RE_SV_DUMPLEN(r->float_utf8), 30);
13456 PerlIO_printf(Perl_debug_log,
13457 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13458 s, RE_SV_TAIL(r->float_utf8),
13459 (IV)r->float_min_offset, (UV)r->float_max_offset);
13461 if (r->check_substr || r->check_utf8)
13462 PerlIO_printf(Perl_debug_log,
13464 (r->check_substr == r->float_substr
13465 && r->check_utf8 == r->float_utf8
13466 ? "(checking floating" : "(checking anchored"));
13467 if (r->extflags & RXf_NOSCAN)
13468 PerlIO_printf(Perl_debug_log, " noscan");
13469 if (r->extflags & RXf_CHECK_ALL)
13470 PerlIO_printf(Perl_debug_log, " isall");
13471 if (r->check_substr || r->check_utf8)
13472 PerlIO_printf(Perl_debug_log, ") ");
13474 if (ri->regstclass) {
13475 regprop(r, sv, ri->regstclass);
13476 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13478 if (r->extflags & RXf_ANCH) {
13479 PerlIO_printf(Perl_debug_log, "anchored");
13480 if (r->extflags & RXf_ANCH_BOL)
13481 PerlIO_printf(Perl_debug_log, "(BOL)");
13482 if (r->extflags & RXf_ANCH_MBOL)
13483 PerlIO_printf(Perl_debug_log, "(MBOL)");
13484 if (r->extflags & RXf_ANCH_SBOL)
13485 PerlIO_printf(Perl_debug_log, "(SBOL)");
13486 if (r->extflags & RXf_ANCH_GPOS)
13487 PerlIO_printf(Perl_debug_log, "(GPOS)");
13488 PerlIO_putc(Perl_debug_log, ' ');
13490 if (r->extflags & RXf_GPOS_SEEN)
13491 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13492 if (r->intflags & PREGf_SKIP)
13493 PerlIO_printf(Perl_debug_log, "plus ");
13494 if (r->intflags & PREGf_IMPLICIT)
13495 PerlIO_printf(Perl_debug_log, "implicit ");
13496 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13497 if (r->extflags & RXf_EVAL_SEEN)
13498 PerlIO_printf(Perl_debug_log, "with eval ");
13499 PerlIO_printf(Perl_debug_log, "\n");
13500 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13502 PERL_ARGS_ASSERT_REGDUMP;
13503 PERL_UNUSED_CONTEXT;
13504 PERL_UNUSED_ARG(r);
13505 #endif /* DEBUGGING */
13509 - regprop - printable representation of opcode
13511 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13514 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13515 if (flags & ANYOF_INVERT) \
13516 /*make sure the invert info is in each */ \
13517 sv_catpvs(sv, "^"); \
13523 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13529 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13530 static const char * const anyofs[] = {
13562 RXi_GET_DECL(prog,progi);
13563 GET_RE_DEBUG_FLAGS_DECL;
13565 PERL_ARGS_ASSERT_REGPROP;
13569 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13570 /* It would be nice to FAIL() here, but this may be called from
13571 regexec.c, and it would be hard to supply pRExC_state. */
13572 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13573 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13575 k = PL_regkind[OP(o)];
13578 sv_catpvs(sv, " ");
13579 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13580 * is a crude hack but it may be the best for now since
13581 * we have no flag "this EXACTish node was UTF-8"
13583 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13584 PERL_PV_ESCAPE_UNI_DETECT |
13585 PERL_PV_ESCAPE_NONASCII |
13586 PERL_PV_PRETTY_ELLIPSES |
13587 PERL_PV_PRETTY_LTGT |
13588 PERL_PV_PRETTY_NOCLEAR
13590 } else if (k == TRIE) {
13591 /* print the details of the trie in dumpuntil instead, as
13592 * progi->data isn't available here */
13593 const char op = OP(o);
13594 const U32 n = ARG(o);
13595 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13596 (reg_ac_data *)progi->data->data[n] :
13598 const reg_trie_data * const trie
13599 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13601 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13602 DEBUG_TRIE_COMPILE_r(
13603 Perl_sv_catpvf(aTHX_ sv,
13604 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13605 (UV)trie->startstate,
13606 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13607 (UV)trie->wordcount,
13610 (UV)TRIE_CHARCOUNT(trie),
13611 (UV)trie->uniquecharcount
13614 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13616 int rangestart = -1;
13617 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13618 sv_catpvs(sv, "[");
13619 for (i = 0; i <= 256; i++) {
13620 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13621 if (rangestart == -1)
13623 } else if (rangestart != -1) {
13624 if (i <= rangestart + 3)
13625 for (; rangestart < i; rangestart++)
13626 put_byte(sv, rangestart);
13628 put_byte(sv, rangestart);
13629 sv_catpvs(sv, "-");
13630 put_byte(sv, i - 1);
13635 sv_catpvs(sv, "]");
13638 } else if (k == CURLY) {
13639 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13640 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13641 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13643 else if (k == WHILEM && o->flags) /* Ordinal/of */
13644 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13645 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13646 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13647 if ( RXp_PAREN_NAMES(prog) ) {
13648 if ( k != REF || (OP(o) < NREF)) {
13649 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13650 SV **name= av_fetch(list, ARG(o), 0 );
13652 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13655 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13656 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13657 I32 *nums=(I32*)SvPVX(sv_dat);
13658 SV **name= av_fetch(list, nums[0], 0 );
13661 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13662 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13663 (n ? "," : ""), (IV)nums[n]);
13665 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13669 } else if (k == GOSUB)
13670 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13671 else if (k == VERB) {
13673 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13674 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13675 } else if (k == LOGICAL)
13676 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13677 else if (k == ANYOF) {
13678 int i, rangestart = -1;
13679 const U8 flags = ANYOF_FLAGS(o);
13683 if (flags & ANYOF_LOCALE)
13684 sv_catpvs(sv, "{loc}");
13685 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13686 sv_catpvs(sv, "{i}");
13687 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13688 if (flags & ANYOF_INVERT)
13689 sv_catpvs(sv, "^");
13691 /* output what the standard cp 0-255 bitmap matches */
13692 for (i = 0; i <= 256; i++) {
13693 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13694 if (rangestart == -1)
13696 } else if (rangestart != -1) {
13697 if (i <= rangestart + 3)
13698 for (; rangestart < i; rangestart++)
13699 put_byte(sv, rangestart);
13701 put_byte(sv, rangestart);
13702 sv_catpvs(sv, "-");
13703 put_byte(sv, i - 1);
13710 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13711 /* output any special charclass tests (used entirely under use locale) */
13712 if (ANYOF_CLASS_TEST_ANY_SET(o))
13713 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13714 if (ANYOF_CLASS_TEST(o,i)) {
13715 sv_catpv(sv, anyofs[i]);
13719 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13721 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13722 sv_catpvs(sv, "{non-utf8-latin1-all}");
13725 /* output information about the unicode matching */
13726 if (flags & ANYOF_UNICODE_ALL)
13727 sv_catpvs(sv, "{unicode_all}");
13728 else if (ANYOF_NONBITMAP(o))
13729 sv_catpvs(sv, "{unicode}");
13730 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13731 sv_catpvs(sv, "{outside bitmap}");
13733 if (ANYOF_NONBITMAP(o)) {
13734 SV *lv; /* Set if there is something outside the bit map */
13735 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13736 bool byte_output = FALSE; /* If something in the bitmap has been
13739 if (lv && lv != &PL_sv_undef) {
13741 U8 s[UTF8_MAXBYTES_CASE+1];
13743 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13744 uvchr_to_utf8(s, i);
13747 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13751 && swash_fetch(sw, s, TRUE))
13753 if (rangestart == -1)
13755 } else if (rangestart != -1) {
13756 byte_output = TRUE;
13757 if (i <= rangestart + 3)
13758 for (; rangestart < i; rangestart++) {
13759 put_byte(sv, rangestart);
13762 put_byte(sv, rangestart);
13763 sv_catpvs(sv, "-");
13772 char *s = savesvpv(lv);
13773 char * const origs = s;
13775 while (*s && *s != '\n')
13779 const char * const t = ++s;
13782 sv_catpvs(sv, " ");
13788 /* Truncate very long output */
13789 if (s - origs > 256) {
13790 Perl_sv_catpvf(aTHX_ sv,
13792 (int) (s - origs - 1),
13798 else if (*s == '\t') {
13817 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13819 else if (k == POSIXD) {
13820 U8 index = FLAGS(o) * 2;
13821 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
13822 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
13825 sv_catpv(sv, anyofs[index]);
13828 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13829 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13831 PERL_UNUSED_CONTEXT;
13832 PERL_UNUSED_ARG(sv);
13833 PERL_UNUSED_ARG(o);
13834 PERL_UNUSED_ARG(prog);
13835 #endif /* DEBUGGING */
13839 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13840 { /* Assume that RE_INTUIT is set */
13842 struct regexp *const prog = (struct regexp *)SvANY(r);
13843 GET_RE_DEBUG_FLAGS_DECL;
13845 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13846 PERL_UNUSED_CONTEXT;
13850 const char * const s = SvPV_nolen_const(prog->check_substr
13851 ? prog->check_substr : prog->check_utf8);
13853 if (!PL_colorset) reginitcolors();
13854 PerlIO_printf(Perl_debug_log,
13855 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13857 prog->check_substr ? "" : "utf8 ",
13858 PL_colors[5],PL_colors[0],
13861 (strlen(s) > 60 ? "..." : ""));
13864 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13870 handles refcounting and freeing the perl core regexp structure. When
13871 it is necessary to actually free the structure the first thing it
13872 does is call the 'free' method of the regexp_engine associated to
13873 the regexp, allowing the handling of the void *pprivate; member
13874 first. (This routine is not overridable by extensions, which is why
13875 the extensions free is called first.)
13877 See regdupe and regdupe_internal if you change anything here.
13879 #ifndef PERL_IN_XSUB_RE
13881 Perl_pregfree(pTHX_ REGEXP *r)
13887 Perl_pregfree2(pTHX_ REGEXP *rx)
13890 struct regexp *const r = (struct regexp *)SvANY(rx);
13891 GET_RE_DEBUG_FLAGS_DECL;
13893 PERL_ARGS_ASSERT_PREGFREE2;
13895 if (r->mother_re) {
13896 ReREFCNT_dec(r->mother_re);
13898 CALLREGFREE_PVT(rx); /* free the private data */
13899 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13902 SvREFCNT_dec(r->anchored_substr);
13903 SvREFCNT_dec(r->anchored_utf8);
13904 SvREFCNT_dec(r->float_substr);
13905 SvREFCNT_dec(r->float_utf8);
13906 Safefree(r->substrs);
13908 RX_MATCH_COPY_FREE(rx);
13909 #ifdef PERL_OLD_COPY_ON_WRITE
13910 SvREFCNT_dec(r->saved_copy);
13913 SvREFCNT_dec(r->qr_anoncv);
13918 This is a hacky workaround to the structural issue of match results
13919 being stored in the regexp structure which is in turn stored in
13920 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13921 could be PL_curpm in multiple contexts, and could require multiple
13922 result sets being associated with the pattern simultaneously, such
13923 as when doing a recursive match with (??{$qr})
13925 The solution is to make a lightweight copy of the regexp structure
13926 when a qr// is returned from the code executed by (??{$qr}) this
13927 lightweight copy doesn't actually own any of its data except for
13928 the starp/end and the actual regexp structure itself.
13934 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13936 struct regexp *ret;
13937 struct regexp *const r = (struct regexp *)SvANY(rx);
13939 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13942 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13943 ret = (struct regexp *)SvANY(ret_x);
13945 (void)ReREFCNT_inc(rx);
13946 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13947 by pointing directly at the buffer, but flagging that the allocated
13948 space in the copy is zero. As we've just done a struct copy, it's now
13949 a case of zero-ing that, rather than copying the current length. */
13950 SvPV_set(ret_x, RX_WRAPPED(rx));
13951 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13952 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13953 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13954 SvLEN_set(ret_x, 0);
13955 SvSTASH_set(ret_x, NULL);
13956 SvMAGIC_set(ret_x, NULL);
13958 const I32 npar = r->nparens+1;
13959 Newx(ret->offs, npar, regexp_paren_pair);
13960 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13963 Newx(ret->substrs, 1, struct reg_substr_data);
13964 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13966 SvREFCNT_inc_void(ret->anchored_substr);
13967 SvREFCNT_inc_void(ret->anchored_utf8);
13968 SvREFCNT_inc_void(ret->float_substr);
13969 SvREFCNT_inc_void(ret->float_utf8);
13971 /* check_substr and check_utf8, if non-NULL, point to either their
13972 anchored or float namesakes, and don't hold a second reference. */
13974 RX_MATCH_COPIED_off(ret_x);
13975 #ifdef PERL_OLD_COPY_ON_WRITE
13976 ret->saved_copy = NULL;
13978 ret->mother_re = rx;
13979 SvREFCNT_inc_void(ret->qr_anoncv);
13985 /* regfree_internal()
13987 Free the private data in a regexp. This is overloadable by
13988 extensions. Perl takes care of the regexp structure in pregfree(),
13989 this covers the *pprivate pointer which technically perl doesn't
13990 know about, however of course we have to handle the
13991 regexp_internal structure when no extension is in use.
13993 Note this is called before freeing anything in the regexp
13998 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14001 struct regexp *const r = (struct regexp *)SvANY(rx);
14002 RXi_GET_DECL(r,ri);
14003 GET_RE_DEBUG_FLAGS_DECL;
14005 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14011 SV *dsv= sv_newmortal();
14012 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14013 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14014 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14015 PL_colors[4],PL_colors[5],s);
14018 #ifdef RE_TRACK_PATTERN_OFFSETS
14020 Safefree(ri->u.offsets); /* 20010421 MJD */
14022 if (ri->code_blocks) {
14024 for (n = 0; n < ri->num_code_blocks; n++)
14025 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14026 Safefree(ri->code_blocks);
14030 int n = ri->data->count;
14033 /* If you add a ->what type here, update the comment in regcomp.h */
14034 switch (ri->data->what[n]) {
14040 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14043 Safefree(ri->data->data[n]);
14049 { /* Aho Corasick add-on structure for a trie node.
14050 Used in stclass optimization only */
14052 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14054 refcount = --aho->refcount;
14057 PerlMemShared_free(aho->states);
14058 PerlMemShared_free(aho->fail);
14059 /* do this last!!!! */
14060 PerlMemShared_free(ri->data->data[n]);
14061 PerlMemShared_free(ri->regstclass);
14067 /* trie structure. */
14069 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14071 refcount = --trie->refcount;
14074 PerlMemShared_free(trie->charmap);
14075 PerlMemShared_free(trie->states);
14076 PerlMemShared_free(trie->trans);
14078 PerlMemShared_free(trie->bitmap);
14080 PerlMemShared_free(trie->jump);
14081 PerlMemShared_free(trie->wordinfo);
14082 /* do this last!!!! */
14083 PerlMemShared_free(ri->data->data[n]);
14088 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14091 Safefree(ri->data->what);
14092 Safefree(ri->data);
14098 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14099 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14100 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14103 re_dup - duplicate a regexp.
14105 This routine is expected to clone a given regexp structure. It is only
14106 compiled under USE_ITHREADS.
14108 After all of the core data stored in struct regexp is duplicated
14109 the regexp_engine.dupe method is used to copy any private data
14110 stored in the *pprivate pointer. This allows extensions to handle
14111 any duplication it needs to do.
14113 See pregfree() and regfree_internal() if you change anything here.
14115 #if defined(USE_ITHREADS)
14116 #ifndef PERL_IN_XSUB_RE
14118 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14122 const struct regexp *r = (const struct regexp *)SvANY(sstr);
14123 struct regexp *ret = (struct regexp *)SvANY(dstr);
14125 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14127 npar = r->nparens+1;
14128 Newx(ret->offs, npar, regexp_paren_pair);
14129 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14131 /* no need to copy these */
14132 Newx(ret->swap, npar, regexp_paren_pair);
14135 if (ret->substrs) {
14136 /* Do it this way to avoid reading from *r after the StructCopy().
14137 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14138 cache, it doesn't matter. */
14139 const bool anchored = r->check_substr
14140 ? r->check_substr == r->anchored_substr
14141 : r->check_utf8 == r->anchored_utf8;
14142 Newx(ret->substrs, 1, struct reg_substr_data);
14143 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14145 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14146 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14147 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14148 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14150 /* check_substr and check_utf8, if non-NULL, point to either their
14151 anchored or float namesakes, and don't hold a second reference. */
14153 if (ret->check_substr) {
14155 assert(r->check_utf8 == r->anchored_utf8);
14156 ret->check_substr = ret->anchored_substr;
14157 ret->check_utf8 = ret->anchored_utf8;
14159 assert(r->check_substr == r->float_substr);
14160 assert(r->check_utf8 == r->float_utf8);
14161 ret->check_substr = ret->float_substr;
14162 ret->check_utf8 = ret->float_utf8;
14164 } else if (ret->check_utf8) {
14166 ret->check_utf8 = ret->anchored_utf8;
14168 ret->check_utf8 = ret->float_utf8;
14173 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14174 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14177 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14179 if (RX_MATCH_COPIED(dstr))
14180 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14182 ret->subbeg = NULL;
14183 #ifdef PERL_OLD_COPY_ON_WRITE
14184 ret->saved_copy = NULL;
14187 if (ret->mother_re) {
14188 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14189 /* Our storage points directly to our mother regexp, but that's
14190 1: a buffer in a different thread
14191 2: something we no longer hold a reference on
14192 so we need to copy it locally. */
14193 /* Note we need to use SvCUR(), rather than
14194 SvLEN(), on our mother_re, because it, in
14195 turn, may well be pointing to its own mother_re. */
14196 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14197 SvCUR(ret->mother_re)+1));
14198 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14200 ret->mother_re = NULL;
14204 #endif /* PERL_IN_XSUB_RE */
14209 This is the internal complement to regdupe() which is used to copy
14210 the structure pointed to by the *pprivate pointer in the regexp.
14211 This is the core version of the extension overridable cloning hook.
14212 The regexp structure being duplicated will be copied by perl prior
14213 to this and will be provided as the regexp *r argument, however
14214 with the /old/ structures pprivate pointer value. Thus this routine
14215 may override any copying normally done by perl.
14217 It returns a pointer to the new regexp_internal structure.
14221 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14224 struct regexp *const r = (struct regexp *)SvANY(rx);
14225 regexp_internal *reti;
14227 RXi_GET_DECL(r,ri);
14229 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14233 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14234 Copy(ri->program, reti->program, len+1, regnode);
14236 reti->num_code_blocks = ri->num_code_blocks;
14237 if (ri->code_blocks) {
14239 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14240 struct reg_code_block);
14241 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14242 struct reg_code_block);
14243 for (n = 0; n < ri->num_code_blocks; n++)
14244 reti->code_blocks[n].src_regex = (REGEXP*)
14245 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14248 reti->code_blocks = NULL;
14250 reti->regstclass = NULL;
14253 struct reg_data *d;
14254 const int count = ri->data->count;
14257 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14258 char, struct reg_data);
14259 Newx(d->what, count, U8);
14262 for (i = 0; i < count; i++) {
14263 d->what[i] = ri->data->what[i];
14264 switch (d->what[i]) {
14265 /* see also regcomp.h and regfree_internal() */
14266 case 'a': /* actually an AV, but the dup function is identical. */
14270 case 'u': /* actually an HV, but the dup function is identical. */
14271 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14274 /* This is cheating. */
14275 Newx(d->data[i], 1, struct regnode_charclass_class);
14276 StructCopy(ri->data->data[i], d->data[i],
14277 struct regnode_charclass_class);
14278 reti->regstclass = (regnode*)d->data[i];
14281 /* Trie stclasses are readonly and can thus be shared
14282 * without duplication. We free the stclass in pregfree
14283 * when the corresponding reg_ac_data struct is freed.
14285 reti->regstclass= ri->regstclass;
14289 ((reg_trie_data*)ri->data->data[i])->refcount++;
14294 d->data[i] = ri->data->data[i];
14297 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14306 reti->name_list_idx = ri->name_list_idx;
14308 #ifdef RE_TRACK_PATTERN_OFFSETS
14309 if (ri->u.offsets) {
14310 Newx(reti->u.offsets, 2*len+1, U32);
14311 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14314 SetProgLen(reti,len);
14317 return (void*)reti;
14320 #endif /* USE_ITHREADS */
14322 #ifndef PERL_IN_XSUB_RE
14325 - regnext - dig the "next" pointer out of a node
14328 Perl_regnext(pTHX_ register regnode *p)
14336 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14337 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14340 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14349 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14352 STRLEN l1 = strlen(pat1);
14353 STRLEN l2 = strlen(pat2);
14356 const char *message;
14358 PERL_ARGS_ASSERT_RE_CROAK2;
14364 Copy(pat1, buf, l1 , char);
14365 Copy(pat2, buf + l1, l2 , char);
14366 buf[l1 + l2] = '\n';
14367 buf[l1 + l2 + 1] = '\0';
14369 /* ANSI variant takes additional second argument */
14370 va_start(args, pat2);
14374 msv = vmess(buf, &args);
14376 message = SvPV_const(msv,l1);
14379 Copy(message, buf, l1 , char);
14380 buf[l1-1] = '\0'; /* Overwrite \n */
14381 Perl_croak(aTHX_ "%s", buf);
14384 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14386 #ifndef PERL_IN_XSUB_RE
14388 Perl_save_re_context(pTHX)
14392 struct re_save_state *state;
14394 SAVEVPTR(PL_curcop);
14395 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14397 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14398 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14399 SSPUSHUV(SAVEt_RE_STATE);
14401 Copy(&PL_reg_state, state, 1, struct re_save_state);
14403 PL_reg_oldsaved = NULL;
14404 PL_reg_oldsavedlen = 0;
14405 PL_reg_maxiter = 0;
14406 PL_reg_leftiter = 0;
14407 PL_reg_poscache = NULL;
14408 PL_reg_poscache_size = 0;
14409 #ifdef PERL_OLD_COPY_ON_WRITE
14413 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14415 const REGEXP * const rx = PM_GETRE(PL_curpm);
14418 for (i = 1; i <= RX_NPARENS(rx); i++) {
14419 char digits[TYPE_CHARS(long)];
14420 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14421 GV *const *const gvp
14422 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14425 GV * const gv = *gvp;
14426 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14436 clear_re(pTHX_ void *r)
14439 ReREFCNT_dec((REGEXP *)r);
14445 S_put_byte(pTHX_ SV *sv, int c)
14447 PERL_ARGS_ASSERT_PUT_BYTE;
14449 /* Our definition of isPRINT() ignores locales, so only bytes that are
14450 not part of UTF-8 are considered printable. I assume that the same
14451 holds for UTF-EBCDIC.
14452 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14453 which Wikipedia says:
14455 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14456 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14457 identical, to the ASCII delete (DEL) or rubout control character.
14458 ) So the old condition can be simplified to !isPRINT(c) */
14461 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14464 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14468 const char string = c;
14469 if (c == '-' || c == ']' || c == '\\' || c == '^')
14470 sv_catpvs(sv, "\\");
14471 sv_catpvn(sv, &string, 1);
14476 #define CLEAR_OPTSTART \
14477 if (optstart) STMT_START { \
14478 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14482 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14484 STATIC const regnode *
14485 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14486 const regnode *last, const regnode *plast,
14487 SV* sv, I32 indent, U32 depth)
14490 U8 op = PSEUDO; /* Arbitrary non-END op. */
14491 const regnode *next;
14492 const regnode *optstart= NULL;
14494 RXi_GET_DECL(r,ri);
14495 GET_RE_DEBUG_FLAGS_DECL;
14497 PERL_ARGS_ASSERT_DUMPUNTIL;
14499 #ifdef DEBUG_DUMPUNTIL
14500 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14501 last ? last-start : 0,plast ? plast-start : 0);
14504 if (plast && plast < last)
14507 while (PL_regkind[op] != END && (!last || node < last)) {
14508 /* While that wasn't END last time... */
14511 if (op == CLOSE || op == WHILEM)
14513 next = regnext((regnode *)node);
14516 if (OP(node) == OPTIMIZED) {
14517 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14524 regprop(r, sv, node);
14525 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14526 (int)(2*indent + 1), "", SvPVX_const(sv));
14528 if (OP(node) != OPTIMIZED) {
14529 if (next == NULL) /* Next ptr. */
14530 PerlIO_printf(Perl_debug_log, " (0)");
14531 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14532 PerlIO_printf(Perl_debug_log, " (FAIL)");
14534 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14535 (void)PerlIO_putc(Perl_debug_log, '\n');
14539 if (PL_regkind[(U8)op] == BRANCHJ) {
14542 const regnode *nnode = (OP(next) == LONGJMP
14543 ? regnext((regnode *)next)
14545 if (last && nnode > last)
14547 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14550 else if (PL_regkind[(U8)op] == BRANCH) {
14552 DUMPUNTIL(NEXTOPER(node), next);
14554 else if ( PL_regkind[(U8)op] == TRIE ) {
14555 const regnode *this_trie = node;
14556 const char op = OP(node);
14557 const U32 n = ARG(node);
14558 const reg_ac_data * const ac = op>=AHOCORASICK ?
14559 (reg_ac_data *)ri->data->data[n] :
14561 const reg_trie_data * const trie =
14562 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14564 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14566 const regnode *nextbranch= NULL;
14569 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14570 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14572 PerlIO_printf(Perl_debug_log, "%*s%s ",
14573 (int)(2*(indent+3)), "",
14574 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14575 PL_colors[0], PL_colors[1],
14576 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14577 PERL_PV_PRETTY_ELLIPSES |
14578 PERL_PV_PRETTY_LTGT
14583 U16 dist= trie->jump[word_idx+1];
14584 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14585 (UV)((dist ? this_trie + dist : next) - start));
14588 nextbranch= this_trie + trie->jump[0];
14589 DUMPUNTIL(this_trie + dist, nextbranch);
14591 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14592 nextbranch= regnext((regnode *)nextbranch);
14594 PerlIO_printf(Perl_debug_log, "\n");
14597 if (last && next > last)
14602 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14603 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14604 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14606 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14608 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14610 else if ( op == PLUS || op == STAR) {
14611 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14613 else if (PL_regkind[(U8)op] == ANYOF) {
14614 /* arglen 1 + class block */
14615 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14616 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14617 node = NEXTOPER(node);
14619 else if (PL_regkind[(U8)op] == EXACT) {
14620 /* Literal string, where present. */
14621 node += NODE_SZ_STR(node) - 1;
14622 node = NEXTOPER(node);
14625 node = NEXTOPER(node);
14626 node += regarglen[(U8)op];
14628 if (op == CURLYX || op == OPEN)
14632 #ifdef DEBUG_DUMPUNTIL
14633 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14638 #endif /* DEBUGGING */
14642 * c-indentation-style: bsd
14643 * c-basic-offset: 4
14644 * indent-tabs-mode: nil
14647 * ex: set ts=8 sts=4 sw=4 et: