5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
92 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
100 # if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102 # pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104 # pragma optimize("w",on )
105 # endif /* BUGGY_MSC6 */
109 #define STATIC static
113 typedef struct RExC_state_t {
114 U32 flags; /* RXf_* are we folding, multilining? */
115 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
116 char *precomp; /* uncompiled string. */
117 REGEXP *rx_sv; /* The SV that is the regexp. */
118 regexp *rx; /* perl core regexp structure */
119 regexp_internal *rxi; /* internal data for regexp object pprivate field */
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit_bound; /* First regnode outside of the allocated space */
126 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 I32 size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN). */
132 I32 cpar; /* Capture buffer count, (CLOSE). */
133 I32 nestroot; /* root parens we are in - used by accept */
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
152 I32 override_recoding;
153 struct reg_code_block *code_blocks; /* positions of literal (?{})
155 int num_code_blocks; /* size of code_blocks[] */
156 int code_index; /* next code_blocks[] slot */
158 char *starttry; /* -Dr: where regtry was called. */
159 #define RExC_starttry (pRExC_state->starttry)
161 SV *runtime_code_qr; /* qr with the runtime code blocks */
163 const char *lastparse;
165 AV *paren_name_list; /* idx -> name */
166 #define RExC_lastparse (pRExC_state->lastparse)
167 #define RExC_lastnum (pRExC_state->lastnum)
168 #define RExC_paren_name_list (pRExC_state->paren_name_list)
172 #define RExC_flags (pRExC_state->flags)
173 #define RExC_pm_flags (pRExC_state->pm_flags)
174 #define RExC_precomp (pRExC_state->precomp)
175 #define RExC_rx_sv (pRExC_state->rx_sv)
176 #define RExC_rx (pRExC_state->rx)
177 #define RExC_rxi (pRExC_state->rxi)
178 #define RExC_start (pRExC_state->start)
179 #define RExC_end (pRExC_state->end)
180 #define RExC_parse (pRExC_state->parse)
181 #define RExC_whilem_seen (pRExC_state->whilem_seen)
182 #ifdef RE_TRACK_PATTERN_OFFSETS
183 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
185 #define RExC_emit (pRExC_state->emit)
186 #define RExC_emit_start (pRExC_state->emit_start)
187 #define RExC_emit_bound (pRExC_state->emit_bound)
188 #define RExC_naughty (pRExC_state->naughty)
189 #define RExC_sawback (pRExC_state->sawback)
190 #define RExC_seen (pRExC_state->seen)
191 #define RExC_size (pRExC_state->size)
192 #define RExC_npar (pRExC_state->npar)
193 #define RExC_nestroot (pRExC_state->nestroot)
194 #define RExC_extralen (pRExC_state->extralen)
195 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
196 #define RExC_utf8 (pRExC_state->utf8)
197 #define RExC_uni_semantics (pRExC_state->uni_semantics)
198 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
199 #define RExC_open_parens (pRExC_state->open_parens)
200 #define RExC_close_parens (pRExC_state->close_parens)
201 #define RExC_opend (pRExC_state->opend)
202 #define RExC_paren_names (pRExC_state->paren_names)
203 #define RExC_recurse (pRExC_state->recurse)
204 #define RExC_recurse_count (pRExC_state->recurse_count)
205 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
206 #define RExC_contains_locale (pRExC_state->contains_locale)
207 #define RExC_override_recoding (pRExC_state->override_recoding)
210 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
212 ((*s) == '{' && regcurly(s)))
215 #undef SPSTART /* dratted cpp namespace... */
218 * Flags to be passed up and down.
220 #define WORST 0 /* Worst case. */
221 #define HASWIDTH 0x01 /* Known to match non-null strings. */
223 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
224 * character, and if utf8, must be invariant. Note that this is not the same
225 * thing as REGNODE_SIMPLE */
227 #define SPSTART 0x04 /* Starts with * or +. */
228 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
229 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
231 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
233 /* whether trie related optimizations are enabled */
234 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
235 #define TRIE_STUDY_OPT
236 #define FULL_TRIE_STUDY
242 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
243 #define PBITVAL(paren) (1 << ((paren) & 7))
244 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
245 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
246 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
248 /* If not already in utf8, do a longjmp back to the beginning */
249 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
250 #define REQUIRE_UTF8 STMT_START { \
251 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
254 /* About scan_data_t.
256 During optimisation we recurse through the regexp program performing
257 various inplace (keyhole style) optimisations. In addition study_chunk
258 and scan_commit populate this data structure with information about
259 what strings MUST appear in the pattern. We look for the longest
260 string that must appear at a fixed location, and we look for the
261 longest string that may appear at a floating location. So for instance
266 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
267 strings (because they follow a .* construct). study_chunk will identify
268 both FOO and BAR as being the longest fixed and floating strings respectively.
270 The strings can be composites, for instance
274 will result in a composite fixed substring 'foo'.
276 For each string some basic information is maintained:
278 - offset or min_offset
279 This is the position the string must appear at, or not before.
280 It also implicitly (when combined with minlenp) tells us how many
281 characters must match before the string we are searching for.
282 Likewise when combined with minlenp and the length of the string it
283 tells us how many characters must appear after the string we have
287 Only used for floating strings. This is the rightmost point that
288 the string can appear at. If set to I32 max it indicates that the
289 string can occur infinitely far to the right.
292 A pointer to the minimum length of the pattern that the string
293 was found inside. This is important as in the case of positive
294 lookahead or positive lookbehind we can have multiple patterns
299 The minimum length of the pattern overall is 3, the minimum length
300 of the lookahead part is 3, but the minimum length of the part that
301 will actually match is 1. So 'FOO's minimum length is 3, but the
302 minimum length for the F is 1. This is important as the minimum length
303 is used to determine offsets in front of and behind the string being
304 looked for. Since strings can be composites this is the length of the
305 pattern at the time it was committed with a scan_commit. Note that
306 the length is calculated by study_chunk, so that the minimum lengths
307 are not known until the full pattern has been compiled, thus the
308 pointer to the value.
312 In the case of lookbehind the string being searched for can be
313 offset past the start point of the final matching string.
314 If this value was just blithely removed from the min_offset it would
315 invalidate some of the calculations for how many chars must match
316 before or after (as they are derived from min_offset and minlen and
317 the length of the string being searched for).
318 When the final pattern is compiled and the data is moved from the
319 scan_data_t structure into the regexp structure the information
320 about lookbehind is factored in, with the information that would
321 have been lost precalculated in the end_shift field for the
324 The fields pos_min and pos_delta are used to store the minimum offset
325 and the delta to the maximum offset at the current point in the pattern.
329 typedef struct scan_data_t {
330 /*I32 len_min; unused */
331 /*I32 len_delta; unused */
335 I32 last_end; /* min value, <0 unless valid. */
338 SV **longest; /* Either &l_fixed, or &l_float. */
339 SV *longest_fixed; /* longest fixed string found in pattern */
340 I32 offset_fixed; /* offset where it starts */
341 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
342 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
343 SV *longest_float; /* longest floating string found in pattern */
344 I32 offset_float_min; /* earliest point in string it can appear */
345 I32 offset_float_max; /* latest point in string it can appear */
346 I32 *minlen_float; /* pointer to the minlen relevant to the string */
347 I32 lookbehind_float; /* is the position of the string modified by LB */
351 struct regnode_charclass_class *start_class;
355 * Forward declarations for pregcomp()'s friends.
358 static const scan_data_t zero_scan_data =
359 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
361 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
362 #define SF_BEFORE_SEOL 0x0001
363 #define SF_BEFORE_MEOL 0x0002
364 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
365 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
368 # define SF_FIX_SHIFT_EOL (0+2)
369 # define SF_FL_SHIFT_EOL (0+4)
371 # define SF_FIX_SHIFT_EOL (+2)
372 # define SF_FL_SHIFT_EOL (+4)
375 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
376 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
378 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
379 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
380 #define SF_IS_INF 0x0040
381 #define SF_HAS_PAR 0x0080
382 #define SF_IN_PAR 0x0100
383 #define SF_HAS_EVAL 0x0200
384 #define SCF_DO_SUBSTR 0x0400
385 #define SCF_DO_STCLASS_AND 0x0800
386 #define SCF_DO_STCLASS_OR 0x1000
387 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
388 #define SCF_WHILEM_VISITED_POS 0x2000
390 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
391 #define SCF_SEEN_ACCEPT 0x8000
393 #define UTF cBOOL(RExC_utf8)
395 /* The enums for all these are ordered so things work out correctly */
396 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
397 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
398 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
399 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
400 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
401 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
402 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
404 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
406 #define OOB_NAMEDCLASS -1
408 /* There is no code point that is out-of-bounds, so this is problematic. But
409 * its only current use is to initialize a variable that is always set before
411 #define OOB_UNICODE 0xDEADBEEF
413 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
414 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
417 /* length of regex to show in messages that don't mark a position within */
418 #define RegexLengthToShowInErrorMessages 127
421 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
422 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
423 * op/pragma/warn/regcomp.
425 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
426 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
428 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
431 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
432 * arg. Show regex, up to a maximum length. If it's too long, chop and add
435 #define _FAIL(code) STMT_START { \
436 const char *ellipses = ""; \
437 IV len = RExC_end - RExC_precomp; \
440 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
441 if (len > RegexLengthToShowInErrorMessages) { \
442 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
443 len = RegexLengthToShowInErrorMessages - 10; \
449 #define FAIL(msg) _FAIL( \
450 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
451 msg, (int)len, RExC_precomp, ellipses))
453 #define FAIL2(msg,arg) _FAIL( \
454 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
455 arg, (int)len, RExC_precomp, ellipses))
458 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
460 #define Simple_vFAIL(m) STMT_START { \
461 const IV offset = RExC_parse - RExC_precomp; \
462 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
463 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
467 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
469 #define vFAIL(m) STMT_START { \
471 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
476 * Like Simple_vFAIL(), but accepts two arguments.
478 #define Simple_vFAIL2(m,a1) STMT_START { \
479 const IV offset = RExC_parse - RExC_precomp; \
480 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
485 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
487 #define vFAIL2(m,a1) STMT_START { \
489 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
490 Simple_vFAIL2(m, a1); \
495 * Like Simple_vFAIL(), but accepts three arguments.
497 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
498 const IV offset = RExC_parse - RExC_precomp; \
499 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
500 (int)offset, RExC_precomp, RExC_precomp + offset); \
504 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
506 #define vFAIL3(m,a1,a2) STMT_START { \
508 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
509 Simple_vFAIL3(m, a1, a2); \
513 * Like Simple_vFAIL(), but accepts four arguments.
515 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
516 const IV offset = RExC_parse - RExC_precomp; \
517 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
518 (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define ckWARNreg(loc,m) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 (int)offset, RExC_precomp, RExC_precomp + offset); \
527 #define ckWARNregdep(loc,m) STMT_START { \
528 const IV offset = loc - RExC_precomp; \
529 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
531 (int)offset, RExC_precomp, RExC_precomp + offset); \
534 #define ckWARN2regdep(loc,m, a1) STMT_START { \
535 const IV offset = loc - RExC_precomp; \
536 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
538 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
541 #define ckWARN2reg(loc, m, a1) STMT_START { \
542 const IV offset = loc - RExC_precomp; \
543 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
544 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
547 #define vWARN3(loc, m, a1, a2) STMT_START { \
548 const IV offset = loc - RExC_precomp; \
549 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
550 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
553 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
554 const IV offset = loc - RExC_precomp; \
555 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
559 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
560 const IV offset = loc - RExC_precomp; \
561 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
562 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
566 const IV offset = loc - RExC_precomp; \
567 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
568 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
574 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
578 /* Allow for side effects in s */
579 #define REGC(c,s) STMT_START { \
580 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
583 /* Macros for recording node offsets. 20001227 mjd@plover.com
584 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
585 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
586 * Element 0 holds the number n.
587 * Position is 1 indexed.
589 #ifndef RE_TRACK_PATTERN_OFFSETS
590 #define Set_Node_Offset_To_R(node,byte)
591 #define Set_Node_Offset(node,byte)
592 #define Set_Cur_Node_Offset
593 #define Set_Node_Length_To_R(node,len)
594 #define Set_Node_Length(node,len)
595 #define Set_Node_Cur_Length(node)
596 #define Node_Offset(n)
597 #define Node_Length(n)
598 #define Set_Node_Offset_Length(node,offset,len)
599 #define ProgLen(ri) ri->u.proglen
600 #define SetProgLen(ri,x) ri->u.proglen = x
602 #define ProgLen(ri) ri->u.offsets[0]
603 #define SetProgLen(ri,x) ri->u.offsets[0] = x
604 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
606 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
607 __LINE__, (int)(node), (int)(byte))); \
609 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
611 RExC_offsets[2*(node)-1] = (byte); \
616 #define Set_Node_Offset(node,byte) \
617 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
618 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
620 #define Set_Node_Length_To_R(node,len) STMT_START { \
622 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
623 __LINE__, (int)(node), (int)(len))); \
625 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
627 RExC_offsets[2*(node)] = (len); \
632 #define Set_Node_Length(node,len) \
633 Set_Node_Length_To_R((node)-RExC_emit_start, len)
634 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
635 #define Set_Node_Cur_Length(node) \
636 Set_Node_Length(node, RExC_parse - parse_start)
638 /* Get offsets and lengths */
639 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
640 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
642 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
643 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
644 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
648 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
649 #define EXPERIMENTAL_INPLACESCAN
650 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
652 #define DEBUG_STUDYDATA(str,data,depth) \
653 DEBUG_OPTIMISE_MORE_r(if(data){ \
654 PerlIO_printf(Perl_debug_log, \
655 "%*s" str "Pos:%"IVdf"/%"IVdf \
656 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
657 (int)(depth)*2, "", \
658 (IV)((data)->pos_min), \
659 (IV)((data)->pos_delta), \
660 (UV)((data)->flags), \
661 (IV)((data)->whilem_c), \
662 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
663 is_inf ? "INF " : "" \
665 if ((data)->last_found) \
666 PerlIO_printf(Perl_debug_log, \
667 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
668 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
669 SvPVX_const((data)->last_found), \
670 (IV)((data)->last_end), \
671 (IV)((data)->last_start_min), \
672 (IV)((data)->last_start_max), \
673 ((data)->longest && \
674 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
675 SvPVX_const((data)->longest_fixed), \
676 (IV)((data)->offset_fixed), \
677 ((data)->longest && \
678 (data)->longest==&((data)->longest_float)) ? "*" : "", \
679 SvPVX_const((data)->longest_float), \
680 (IV)((data)->offset_float_min), \
681 (IV)((data)->offset_float_max) \
683 PerlIO_printf(Perl_debug_log,"\n"); \
686 static void clear_re(pTHX_ void *r);
688 /* Mark that we cannot extend a found fixed substring at this point.
689 Update the longest found anchored substring and the longest found
690 floating substrings if needed. */
693 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
695 const STRLEN l = CHR_SVLEN(data->last_found);
696 const STRLEN old_l = CHR_SVLEN(*data->longest);
697 GET_RE_DEBUG_FLAGS_DECL;
699 PERL_ARGS_ASSERT_SCAN_COMMIT;
701 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
702 SvSetMagicSV(*data->longest, data->last_found);
703 if (*data->longest == data->longest_fixed) {
704 data->offset_fixed = l ? data->last_start_min : data->pos_min;
705 if (data->flags & SF_BEFORE_EOL)
707 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
709 data->flags &= ~SF_FIX_BEFORE_EOL;
710 data->minlen_fixed=minlenp;
711 data->lookbehind_fixed=0;
713 else { /* *data->longest == data->longest_float */
714 data->offset_float_min = l ? data->last_start_min : data->pos_min;
715 data->offset_float_max = (l
716 ? data->last_start_max
717 : data->pos_min + data->pos_delta);
718 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
719 data->offset_float_max = I32_MAX;
720 if (data->flags & SF_BEFORE_EOL)
722 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
724 data->flags &= ~SF_FL_BEFORE_EOL;
725 data->minlen_float=minlenp;
726 data->lookbehind_float=0;
729 SvCUR_set(data->last_found, 0);
731 SV * const sv = data->last_found;
732 if (SvUTF8(sv) && SvMAGICAL(sv)) {
733 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
739 data->flags &= ~SF_BEFORE_EOL;
740 DEBUG_STUDYDATA("commit: ",data,0);
743 /* Can match anything (initialization) */
745 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
747 PERL_ARGS_ASSERT_CL_ANYTHING;
749 ANYOF_BITMAP_SETALL(cl);
750 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
751 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
753 /* If any portion of the regex is to operate under locale rules,
754 * initialization includes it. The reason this isn't done for all regexes
755 * is that the optimizer was written under the assumption that locale was
756 * all-or-nothing. Given the complexity and lack of documentation in the
757 * optimizer, and that there are inadequate test cases for locale, so many
758 * parts of it may not work properly, it is safest to avoid locale unless
760 if (RExC_contains_locale) {
761 ANYOF_CLASS_SETALL(cl); /* /l uses class */
762 cl->flags |= ANYOF_LOCALE;
765 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
769 /* Can match anything (initialization) */
771 S_cl_is_anything(const struct regnode_charclass_class *cl)
775 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
777 for (value = 0; value <= ANYOF_MAX; value += 2)
778 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
780 if (!(cl->flags & ANYOF_UNICODE_ALL))
782 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
787 /* Can match anything (initialization) */
789 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
791 PERL_ARGS_ASSERT_CL_INIT;
793 Zero(cl, 1, struct regnode_charclass_class);
795 cl_anything(pRExC_state, cl);
796 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
799 /* These two functions currently do the exact same thing */
800 #define cl_init_zero S_cl_init
802 /* 'AND' a given class with another one. Can create false positives. 'cl'
803 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
804 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
806 S_cl_and(struct regnode_charclass_class *cl,
807 const struct regnode_charclass_class *and_with)
809 PERL_ARGS_ASSERT_CL_AND;
811 assert(and_with->type == ANYOF);
813 /* I (khw) am not sure all these restrictions are necessary XXX */
814 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
815 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
816 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
817 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
818 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
821 if (and_with->flags & ANYOF_INVERT)
822 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
823 cl->bitmap[i] &= ~and_with->bitmap[i];
825 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
826 cl->bitmap[i] &= and_with->bitmap[i];
827 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
829 if (and_with->flags & ANYOF_INVERT) {
831 /* Here, the and'ed node is inverted. Get the AND of the flags that
832 * aren't affected by the inversion. Those that are affected are
833 * handled individually below */
834 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
835 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
836 cl->flags |= affected_flags;
838 /* We currently don't know how to deal with things that aren't in the
839 * bitmap, but we know that the intersection is no greater than what
840 * is already in cl, so let there be false positives that get sorted
841 * out after the synthetic start class succeeds, and the node is
842 * matched for real. */
844 /* The inversion of these two flags indicate that the resulting
845 * intersection doesn't have them */
846 if (and_with->flags & ANYOF_UNICODE_ALL) {
847 cl->flags &= ~ANYOF_UNICODE_ALL;
849 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
850 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
853 else { /* and'd node is not inverted */
854 U8 outside_bitmap_but_not_utf8; /* Temp variable */
856 if (! ANYOF_NONBITMAP(and_with)) {
858 /* Here 'and_with' doesn't match anything outside the bitmap
859 * (except possibly ANYOF_UNICODE_ALL), which means the
860 * intersection can't either, except for ANYOF_UNICODE_ALL, in
861 * which case we don't know what the intersection is, but it's no
862 * greater than what cl already has, so can just leave it alone,
863 * with possible false positives */
864 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
865 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
866 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
869 else if (! ANYOF_NONBITMAP(cl)) {
871 /* Here, 'and_with' does match something outside the bitmap, and cl
872 * doesn't have a list of things to match outside the bitmap. If
873 * cl can match all code points above 255, the intersection will
874 * be those above-255 code points that 'and_with' matches. If cl
875 * can't match all Unicode code points, it means that it can't
876 * match anything outside the bitmap (since the 'if' that got us
877 * into this block tested for that), so we leave the bitmap empty.
879 if (cl->flags & ANYOF_UNICODE_ALL) {
880 ARG_SET(cl, ARG(and_with));
882 /* and_with's ARG may match things that don't require UTF8.
883 * And now cl's will too, in spite of this being an 'and'. See
884 * the comments below about the kludge */
885 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
889 /* Here, both 'and_with' and cl match something outside the
890 * bitmap. Currently we do not do the intersection, so just match
891 * whatever cl had at the beginning. */
895 /* Take the intersection of the two sets of flags. However, the
896 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
897 * kludge around the fact that this flag is not treated like the others
898 * which are initialized in cl_anything(). The way the optimizer works
899 * is that the synthetic start class (SSC) is initialized to match
900 * anything, and then the first time a real node is encountered, its
901 * values are AND'd with the SSC's with the result being the values of
902 * the real node. However, there are paths through the optimizer where
903 * the AND never gets called, so those initialized bits are set
904 * inappropriately, which is not usually a big deal, as they just cause
905 * false positives in the SSC, which will just mean a probably
906 * imperceptible slow down in execution. However this bit has a
907 * higher false positive consequence in that it can cause utf8.pm,
908 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
909 * bigger slowdown and also causes significant extra memory to be used.
910 * In order to prevent this, the code now takes a different tack. The
911 * bit isn't set unless some part of the regular expression needs it,
912 * but once set it won't get cleared. This means that these extra
913 * modules won't get loaded unless there was some path through the
914 * pattern that would have required them anyway, and so any false
915 * positives that occur by not ANDing them out when they could be
916 * aren't as severe as they would be if we treated this bit like all
918 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
919 & ANYOF_NONBITMAP_NON_UTF8;
920 cl->flags &= and_with->flags;
921 cl->flags |= outside_bitmap_but_not_utf8;
925 /* 'OR' a given class with another one. Can create false positives. 'cl'
926 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
927 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
929 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
931 PERL_ARGS_ASSERT_CL_OR;
933 if (or_with->flags & ANYOF_INVERT) {
935 /* Here, the or'd node is to be inverted. This means we take the
936 * complement of everything not in the bitmap, but currently we don't
937 * know what that is, so give up and match anything */
938 if (ANYOF_NONBITMAP(or_with)) {
939 cl_anything(pRExC_state, cl);
942 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
943 * <= (B1 | !B2) | (CL1 | !CL2)
944 * which is wasteful if CL2 is small, but we ignore CL2:
945 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
946 * XXXX Can we handle case-fold? Unclear:
947 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
948 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
950 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
951 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
952 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
955 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
956 cl->bitmap[i] |= ~or_with->bitmap[i];
957 } /* XXXX: logic is complicated otherwise */
959 cl_anything(pRExC_state, cl);
962 /* And, we can just take the union of the flags that aren't affected
963 * by the inversion */
964 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
966 /* For the remaining flags:
967 ANYOF_UNICODE_ALL and inverted means to not match anything above
968 255, which means that the union with cl should just be
969 what cl has in it, so can ignore this flag
970 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
971 is 127-255 to match them, but then invert that, so the
972 union with cl should just be what cl has in it, so can
975 } else { /* 'or_with' is not inverted */
976 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
977 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
978 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
979 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
982 /* OR char bitmap and class bitmap separately */
983 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
984 cl->bitmap[i] |= or_with->bitmap[i];
985 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
986 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
987 cl->classflags[i] |= or_with->classflags[i];
988 cl->flags |= ANYOF_CLASS;
991 else { /* XXXX: logic is complicated, leave it along for a moment. */
992 cl_anything(pRExC_state, cl);
995 if (ANYOF_NONBITMAP(or_with)) {
997 /* Use the added node's outside-the-bit-map match if there isn't a
998 * conflict. If there is a conflict (both nodes match something
999 * outside the bitmap, but what they match outside is not the same
1000 * pointer, and hence not easily compared until XXX we extend
1001 * inversion lists this far), give up and allow the start class to
1002 * match everything outside the bitmap. If that stuff is all above
1003 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1004 if (! ANYOF_NONBITMAP(cl)) {
1005 ARG_SET(cl, ARG(or_with));
1007 else if (ARG(cl) != ARG(or_with)) {
1009 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1010 cl_anything(pRExC_state, cl);
1013 cl->flags |= ANYOF_UNICODE_ALL;
1018 /* Take the union */
1019 cl->flags |= or_with->flags;
1023 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1024 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1025 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1026 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1031 dump_trie(trie,widecharmap,revcharmap)
1032 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1033 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1035 These routines dump out a trie in a somewhat readable format.
1036 The _interim_ variants are used for debugging the interim
1037 tables that are used to generate the final compressed
1038 representation which is what dump_trie expects.
1040 Part of the reason for their existence is to provide a form
1041 of documentation as to how the different representations function.
1046 Dumps the final compressed table form of the trie to Perl_debug_log.
1047 Used for debugging make_trie().
1051 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1052 AV *revcharmap, U32 depth)
1055 SV *sv=sv_newmortal();
1056 int colwidth= widecharmap ? 6 : 4;
1058 GET_RE_DEBUG_FLAGS_DECL;
1060 PERL_ARGS_ASSERT_DUMP_TRIE;
1062 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1063 (int)depth * 2 + 2,"",
1064 "Match","Base","Ofs" );
1066 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1067 SV ** const tmp = av_fetch( revcharmap, state, 0);
1069 PerlIO_printf( Perl_debug_log, "%*s",
1071 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1072 PL_colors[0], PL_colors[1],
1073 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1074 PERL_PV_ESCAPE_FIRSTCHAR
1079 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1080 (int)depth * 2 + 2,"");
1082 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1083 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1084 PerlIO_printf( Perl_debug_log, "\n");
1086 for( state = 1 ; state < trie->statecount ; state++ ) {
1087 const U32 base = trie->states[ state ].trans.base;
1089 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1091 if ( trie->states[ state ].wordnum ) {
1092 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1094 PerlIO_printf( Perl_debug_log, "%6s", "" );
1097 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1102 while( ( base + ofs < trie->uniquecharcount ) ||
1103 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1104 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1107 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1109 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1110 if ( ( base + ofs >= trie->uniquecharcount ) &&
1111 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1112 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1114 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1116 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1118 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1122 PerlIO_printf( Perl_debug_log, "]");
1125 PerlIO_printf( Perl_debug_log, "\n" );
1127 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1128 for (word=1; word <= trie->wordcount; word++) {
1129 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1130 (int)word, (int)(trie->wordinfo[word].prev),
1131 (int)(trie->wordinfo[word].len));
1133 PerlIO_printf(Perl_debug_log, "\n" );
1136 Dumps a fully constructed but uncompressed trie in list form.
1137 List tries normally only are used for construction when the number of
1138 possible chars (trie->uniquecharcount) is very high.
1139 Used for debugging make_trie().
1142 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1143 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1147 SV *sv=sv_newmortal();
1148 int colwidth= widecharmap ? 6 : 4;
1149 GET_RE_DEBUG_FLAGS_DECL;
1151 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1153 /* print out the table precompression. */
1154 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1155 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1156 "------:-----+-----------------\n" );
1158 for( state=1 ; state < next_alloc ; state ++ ) {
1161 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1162 (int)depth * 2 + 2,"", (UV)state );
1163 if ( ! trie->states[ state ].wordnum ) {
1164 PerlIO_printf( Perl_debug_log, "%5s| ","");
1166 PerlIO_printf( Perl_debug_log, "W%4x| ",
1167 trie->states[ state ].wordnum
1170 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1171 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1173 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1175 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1176 PL_colors[0], PL_colors[1],
1177 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1178 PERL_PV_ESCAPE_FIRSTCHAR
1180 TRIE_LIST_ITEM(state,charid).forid,
1181 (UV)TRIE_LIST_ITEM(state,charid).newstate
1184 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1185 (int)((depth * 2) + 14), "");
1188 PerlIO_printf( Perl_debug_log, "\n");
1193 Dumps a fully constructed but uncompressed trie in table form.
1194 This is the normal DFA style state transition table, with a few
1195 twists to facilitate compression later.
1196 Used for debugging make_trie().
1199 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1200 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1205 SV *sv=sv_newmortal();
1206 int colwidth= widecharmap ? 6 : 4;
1207 GET_RE_DEBUG_FLAGS_DECL;
1209 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1212 print out the table precompression so that we can do a visual check
1213 that they are identical.
1216 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1218 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1219 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1221 PerlIO_printf( Perl_debug_log, "%*s",
1223 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1224 PL_colors[0], PL_colors[1],
1225 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1226 PERL_PV_ESCAPE_FIRSTCHAR
1232 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1234 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1235 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1238 PerlIO_printf( Perl_debug_log, "\n" );
1240 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1242 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1243 (int)depth * 2 + 2,"",
1244 (UV)TRIE_NODENUM( state ) );
1246 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1247 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1249 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1251 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1253 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1254 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1256 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1257 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1265 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1266 startbranch: the first branch in the whole branch sequence
1267 first : start branch of sequence of branch-exact nodes.
1268 May be the same as startbranch
1269 last : Thing following the last branch.
1270 May be the same as tail.
1271 tail : item following the branch sequence
1272 count : words in the sequence
1273 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1274 depth : indent depth
1276 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1278 A trie is an N'ary tree where the branches are determined by digital
1279 decomposition of the key. IE, at the root node you look up the 1st character and
1280 follow that branch repeat until you find the end of the branches. Nodes can be
1281 marked as "accepting" meaning they represent a complete word. Eg:
1285 would convert into the following structure. Numbers represent states, letters
1286 following numbers represent valid transitions on the letter from that state, if
1287 the number is in square brackets it represents an accepting state, otherwise it
1288 will be in parenthesis.
1290 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1294 (1) +-i->(6)-+-s->[7]
1296 +-s->(3)-+-h->(4)-+-e->[5]
1298 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1300 This shows that when matching against the string 'hers' we will begin at state 1
1301 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1302 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1303 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1304 single traverse. We store a mapping from accepting to state to which word was
1305 matched, and then when we have multiple possibilities we try to complete the
1306 rest of the regex in the order in which they occured in the alternation.
1308 The only prior NFA like behaviour that would be changed by the TRIE support is
1309 the silent ignoring of duplicate alternations which are of the form:
1311 / (DUPE|DUPE) X? (?{ ... }) Y /x
1313 Thus EVAL blocks following a trie may be called a different number of times with
1314 and without the optimisation. With the optimisations dupes will be silently
1315 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1316 the following demonstrates:
1318 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1320 which prints out 'word' three times, but
1322 'words'=~/(word|word|word)(?{ print $1 })S/
1324 which doesnt print it out at all. This is due to other optimisations kicking in.
1326 Example of what happens on a structural level:
1328 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1330 1: CURLYM[1] {1,32767}(18)
1341 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1342 and should turn into:
1344 1: CURLYM[1] {1,32767}(18)
1346 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1354 Cases where tail != last would be like /(?foo|bar)baz/:
1364 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1365 and would end up looking like:
1368 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1375 d = uvuni_to_utf8_flags(d, uv, 0);
1377 is the recommended Unicode-aware way of saying
1382 #define TRIE_STORE_REVCHAR(val) \
1385 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1386 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1387 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1388 SvCUR_set(zlopp, kapow - flrbbbbb); \
1391 av_push(revcharmap, zlopp); \
1393 char ooooff = (char)val; \
1394 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1398 #define TRIE_READ_CHAR STMT_START { \
1401 /* if it is UTF then it is either already folded, or does not need folding */ \
1402 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1404 else if (folder == PL_fold_latin1) { \
1405 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1406 if ( foldlen > 0 ) { \
1407 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1413 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1414 skiplen = UNISKIP(uvc); \
1415 foldlen -= skiplen; \
1416 scan = foldbuf + skiplen; \
1419 /* raw data, will be folded later if needed */ \
1427 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1428 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1429 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1430 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1432 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1433 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1434 TRIE_LIST_CUR( state )++; \
1437 #define TRIE_LIST_NEW(state) STMT_START { \
1438 Newxz( trie->states[ state ].trans.list, \
1439 4, reg_trie_trans_le ); \
1440 TRIE_LIST_CUR( state ) = 1; \
1441 TRIE_LIST_LEN( state ) = 4; \
1444 #define TRIE_HANDLE_WORD(state) STMT_START { \
1445 U16 dupe= trie->states[ state ].wordnum; \
1446 regnode * const noper_next = regnext( noper ); \
1449 /* store the word for dumping */ \
1451 if (OP(noper) != NOTHING) \
1452 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1454 tmp = newSVpvn_utf8( "", 0, UTF ); \
1455 av_push( trie_words, tmp ); \
1459 trie->wordinfo[curword].prev = 0; \
1460 trie->wordinfo[curword].len = wordlen; \
1461 trie->wordinfo[curword].accept = state; \
1463 if ( noper_next < tail ) { \
1465 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1466 trie->jump[curword] = (U16)(noper_next - convert); \
1468 jumper = noper_next; \
1470 nextbranch= regnext(cur); \
1474 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1475 /* chain, so that when the bits of chain are later */\
1476 /* linked together, the dups appear in the chain */\
1477 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1478 trie->wordinfo[dupe].prev = curword; \
1480 /* we haven't inserted this word yet. */ \
1481 trie->states[ state ].wordnum = curword; \
1486 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1487 ( ( base + charid >= ucharcount \
1488 && base + charid < ubound \
1489 && state == trie->trans[ base - ucharcount + charid ].check \
1490 && trie->trans[ base - ucharcount + charid ].next ) \
1491 ? trie->trans[ base - ucharcount + charid ].next \
1492 : ( state==1 ? special : 0 ) \
1496 #define MADE_JUMP_TRIE 2
1497 #define MADE_EXACT_TRIE 4
1500 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1503 /* first pass, loop through and scan words */
1504 reg_trie_data *trie;
1505 HV *widecharmap = NULL;
1506 AV *revcharmap = newAV();
1508 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1513 regnode *jumper = NULL;
1514 regnode *nextbranch = NULL;
1515 regnode *convert = NULL;
1516 U32 *prev_states; /* temp array mapping each state to previous one */
1517 /* we just use folder as a flag in utf8 */
1518 const U8 * folder = NULL;
1521 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1522 AV *trie_words = NULL;
1523 /* along with revcharmap, this only used during construction but both are
1524 * useful during debugging so we store them in the struct when debugging.
1527 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1528 STRLEN trie_charcount=0;
1530 SV *re_trie_maxbuff;
1531 GET_RE_DEBUG_FLAGS_DECL;
1533 PERL_ARGS_ASSERT_MAKE_TRIE;
1535 PERL_UNUSED_ARG(depth);
1542 case EXACTFU_TRICKYFOLD:
1543 case EXACTFU: folder = PL_fold_latin1; break;
1544 case EXACTF: folder = PL_fold; break;
1545 case EXACTFL: folder = PL_fold_locale; break;
1546 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1549 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1551 trie->startstate = 1;
1552 trie->wordcount = word_count;
1553 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1554 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1556 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1557 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1558 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1561 trie_words = newAV();
1564 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1565 if (!SvIOK(re_trie_maxbuff)) {
1566 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1568 DEBUG_TRIE_COMPILE_r({
1569 PerlIO_printf( Perl_debug_log,
1570 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1571 (int)depth * 2 + 2, "",
1572 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1573 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1577 /* Find the node we are going to overwrite */
1578 if ( first == startbranch && OP( last ) != BRANCH ) {
1579 /* whole branch chain */
1582 /* branch sub-chain */
1583 convert = NEXTOPER( first );
1586 /* -- First loop and Setup --
1588 We first traverse the branches and scan each word to determine if it
1589 contains widechars, and how many unique chars there are, this is
1590 important as we have to build a table with at least as many columns as we
1593 We use an array of integers to represent the character codes 0..255
1594 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1595 native representation of the character value as the key and IV's for the
1598 *TODO* If we keep track of how many times each character is used we can
1599 remap the columns so that the table compression later on is more
1600 efficient in terms of memory by ensuring the most common value is in the
1601 middle and the least common are on the outside. IMO this would be better
1602 than a most to least common mapping as theres a decent chance the most
1603 common letter will share a node with the least common, meaning the node
1604 will not be compressible. With a middle is most common approach the worst
1605 case is when we have the least common nodes twice.
1609 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1610 regnode *noper = NEXTOPER( cur );
1611 const U8 *uc = (U8*)STRING( noper );
1612 const U8 *e = uc + STR_LEN( noper );
1614 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1616 const U8 *scan = (U8*)NULL;
1617 U32 wordlen = 0; /* required init */
1619 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1621 if (OP(noper) == NOTHING) {
1622 regnode *noper_next= regnext(noper);
1623 if (noper_next != tail && OP(noper_next) == flags) {
1625 uc= (U8*)STRING(noper);
1626 e= uc + STR_LEN(noper);
1627 trie->minlen= STR_LEN(noper);
1634 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1635 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1636 regardless of encoding */
1637 if (OP( noper ) == EXACTFU_SS) {
1638 /* false positives are ok, so just set this */
1639 TRIE_BITMAP_SET(trie,0xDF);
1642 for ( ; uc < e ; uc += len ) {
1643 TRIE_CHARCOUNT(trie)++;
1648 U8 folded= folder[ (U8) uvc ];
1649 if ( !trie->charmap[ folded ] ) {
1650 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1651 TRIE_STORE_REVCHAR( folded );
1654 if ( !trie->charmap[ uvc ] ) {
1655 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1656 TRIE_STORE_REVCHAR( uvc );
1659 /* store the codepoint in the bitmap, and its folded
1661 TRIE_BITMAP_SET(trie, uvc);
1663 /* store the folded codepoint */
1664 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1667 /* store first byte of utf8 representation of
1668 variant codepoints */
1669 if (! UNI_IS_INVARIANT(uvc)) {
1670 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1673 set_bit = 0; /* We've done our bit :-) */
1678 widecharmap = newHV();
1680 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1683 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1685 if ( !SvTRUE( *svpp ) ) {
1686 sv_setiv( *svpp, ++trie->uniquecharcount );
1687 TRIE_STORE_REVCHAR(uvc);
1691 if( cur == first ) {
1692 trie->minlen = chars;
1693 trie->maxlen = chars;
1694 } else if (chars < trie->minlen) {
1695 trie->minlen = chars;
1696 } else if (chars > trie->maxlen) {
1697 trie->maxlen = chars;
1699 if (OP( noper ) == EXACTFU_SS) {
1700 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1701 if (trie->minlen > 1)
1704 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1705 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1706 * - We assume that any such sequence might match a 2 byte string */
1707 if (trie->minlen > 2 )
1711 } /* end first pass */
1712 DEBUG_TRIE_COMPILE_r(
1713 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1714 (int)depth * 2 + 2,"",
1715 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1716 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1717 (int)trie->minlen, (int)trie->maxlen )
1721 We now know what we are dealing with in terms of unique chars and
1722 string sizes so we can calculate how much memory a naive
1723 representation using a flat table will take. If it's over a reasonable
1724 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1725 conservative but potentially much slower representation using an array
1728 At the end we convert both representations into the same compressed
1729 form that will be used in regexec.c for matching with. The latter
1730 is a form that cannot be used to construct with but has memory
1731 properties similar to the list form and access properties similar
1732 to the table form making it both suitable for fast searches and
1733 small enough that its feasable to store for the duration of a program.
1735 See the comment in the code where the compressed table is produced
1736 inplace from the flat tabe representation for an explanation of how
1737 the compression works.
1742 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1745 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1747 Second Pass -- Array Of Lists Representation
1749 Each state will be represented by a list of charid:state records
1750 (reg_trie_trans_le) the first such element holds the CUR and LEN
1751 points of the allocated array. (See defines above).
1753 We build the initial structure using the lists, and then convert
1754 it into the compressed table form which allows faster lookups
1755 (but cant be modified once converted).
1758 STRLEN transcount = 1;
1760 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1761 "%*sCompiling trie using list compiler\n",
1762 (int)depth * 2 + 2, ""));
1764 trie->states = (reg_trie_state *)
1765 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1766 sizeof(reg_trie_state) );
1770 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1772 regnode *noper = NEXTOPER( cur );
1773 U8 *uc = (U8*)STRING( noper );
1774 const U8 *e = uc + STR_LEN( noper );
1775 U32 state = 1; /* required init */
1776 U16 charid = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1778 STRLEN foldlen = 0; /* required init */
1779 U32 wordlen = 0; /* required init */
1780 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1783 if (OP(noper) == NOTHING) {
1784 regnode *noper_next= regnext(noper);
1785 if (noper_next != tail && OP(noper_next) == flags) {
1787 uc= (U8*)STRING(noper);
1788 e= uc + STR_LEN(noper);
1792 if (OP(noper) != NOTHING) {
1793 for ( ; uc < e ; uc += len ) {
1798 charid = trie->charmap[ uvc ];
1800 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1804 charid=(U16)SvIV( *svpp );
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1814 if ( !trie->states[ state ].trans.list ) {
1815 TRIE_LIST_NEW( state );
1817 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1818 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1819 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1824 newstate = next_alloc++;
1825 prev_states[newstate] = state;
1826 TRIE_LIST_PUSH( state, charid, newstate );
1831 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1835 TRIE_HANDLE_WORD(state);
1837 } /* end second pass */
1839 /* next alloc is the NEXT state to be allocated */
1840 trie->statecount = next_alloc;
1841 trie->states = (reg_trie_state *)
1842 PerlMemShared_realloc( trie->states,
1844 * sizeof(reg_trie_state) );
1846 /* and now dump it out before we compress it */
1847 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1848 revcharmap, next_alloc,
1852 trie->trans = (reg_trie_trans *)
1853 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1860 for( state=1 ; state < next_alloc ; state ++ ) {
1864 DEBUG_TRIE_COMPILE_MORE_r(
1865 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1869 if (trie->states[state].trans.list) {
1870 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1874 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1875 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1876 if ( forid < minid ) {
1878 } else if ( forid > maxid ) {
1882 if ( transcount < tp + maxid - minid + 1) {
1884 trie->trans = (reg_trie_trans *)
1885 PerlMemShared_realloc( trie->trans,
1887 * sizeof(reg_trie_trans) );
1888 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1890 base = trie->uniquecharcount + tp - minid;
1891 if ( maxid == minid ) {
1893 for ( ; zp < tp ; zp++ ) {
1894 if ( ! trie->trans[ zp ].next ) {
1895 base = trie->uniquecharcount + zp - minid;
1896 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1897 trie->trans[ zp ].check = state;
1903 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1904 trie->trans[ tp ].check = state;
1909 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1910 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1911 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1912 trie->trans[ tid ].check = state;
1914 tp += ( maxid - minid + 1 );
1916 Safefree(trie->states[ state ].trans.list);
1919 DEBUG_TRIE_COMPILE_MORE_r(
1920 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1923 trie->states[ state ].trans.base=base;
1925 trie->lasttrans = tp + 1;
1929 Second Pass -- Flat Table Representation.
1931 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1932 We know that we will need Charcount+1 trans at most to store the data
1933 (one row per char at worst case) So we preallocate both structures
1934 assuming worst case.
1936 We then construct the trie using only the .next slots of the entry
1939 We use the .check field of the first entry of the node temporarily to
1940 make compression both faster and easier by keeping track of how many non
1941 zero fields are in the node.
1943 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1946 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1947 number representing the first entry of the node, and state as a
1948 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1949 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1950 are 2 entrys per node. eg:
1958 The table is internally in the right hand, idx form. However as we also
1959 have to deal with the states array which is indexed by nodenum we have to
1960 use TRIE_NODENUM() to convert.
1963 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1964 "%*sCompiling trie using table compiler\n",
1965 (int)depth * 2 + 2, ""));
1967 trie->trans = (reg_trie_trans *)
1968 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1969 * trie->uniquecharcount + 1,
1970 sizeof(reg_trie_trans) );
1971 trie->states = (reg_trie_state *)
1972 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1973 sizeof(reg_trie_state) );
1974 next_alloc = trie->uniquecharcount + 1;
1977 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1979 regnode *noper = NEXTOPER( cur );
1980 const U8 *uc = (U8*)STRING( noper );
1981 const U8 *e = uc + STR_LEN( noper );
1983 U32 state = 1; /* required init */
1985 U16 charid = 0; /* sanity init */
1986 U32 accept_state = 0; /* sanity init */
1987 U8 *scan = (U8*)NULL; /* sanity init */
1989 STRLEN foldlen = 0; /* required init */
1990 U32 wordlen = 0; /* required init */
1992 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1994 if (OP(noper) == NOTHING) {
1995 regnode *noper_next= regnext(noper);
1996 if (noper_next != tail && OP(noper_next) == flags) {
1998 uc= (U8*)STRING(noper);
1999 e= uc + STR_LEN(noper);
2003 if ( OP(noper) != NOTHING ) {
2004 for ( ; uc < e ; uc += len ) {
2009 charid = trie->charmap[ uvc ];
2011 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2012 charid = svpp ? (U16)SvIV(*svpp) : 0;
2016 if ( !trie->trans[ state + charid ].next ) {
2017 trie->trans[ state + charid ].next = next_alloc;
2018 trie->trans[ state ].check++;
2019 prev_states[TRIE_NODENUM(next_alloc)]
2020 = TRIE_NODENUM(state);
2021 next_alloc += trie->uniquecharcount;
2023 state = trie->trans[ state + charid ].next;
2025 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2027 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2030 accept_state = TRIE_NODENUM( state );
2031 TRIE_HANDLE_WORD(accept_state);
2033 } /* end second pass */
2035 /* and now dump it out before we compress it */
2036 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2038 next_alloc, depth+1));
2042 * Inplace compress the table.*
2044 For sparse data sets the table constructed by the trie algorithm will
2045 be mostly 0/FAIL transitions or to put it another way mostly empty.
2046 (Note that leaf nodes will not contain any transitions.)
2048 This algorithm compresses the tables by eliminating most such
2049 transitions, at the cost of a modest bit of extra work during lookup:
2051 - Each states[] entry contains a .base field which indicates the
2052 index in the state[] array wheres its transition data is stored.
2054 - If .base is 0 there are no valid transitions from that node.
2056 - If .base is nonzero then charid is added to it to find an entry in
2059 -If trans[states[state].base+charid].check!=state then the
2060 transition is taken to be a 0/Fail transition. Thus if there are fail
2061 transitions at the front of the node then the .base offset will point
2062 somewhere inside the previous nodes data (or maybe even into a node
2063 even earlier), but the .check field determines if the transition is
2067 The following process inplace converts the table to the compressed
2068 table: We first do not compress the root node 1,and mark all its
2069 .check pointers as 1 and set its .base pointer as 1 as well. This
2070 allows us to do a DFA construction from the compressed table later,
2071 and ensures that any .base pointers we calculate later are greater
2074 - We set 'pos' to indicate the first entry of the second node.
2076 - We then iterate over the columns of the node, finding the first and
2077 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2078 and set the .check pointers accordingly, and advance pos
2079 appropriately and repreat for the next node. Note that when we copy
2080 the next pointers we have to convert them from the original
2081 NODEIDX form to NODENUM form as the former is not valid post
2084 - If a node has no transitions used we mark its base as 0 and do not
2085 advance the pos pointer.
2087 - If a node only has one transition we use a second pointer into the
2088 structure to fill in allocated fail transitions from other states.
2089 This pointer is independent of the main pointer and scans forward
2090 looking for null transitions that are allocated to a state. When it
2091 finds one it writes the single transition into the "hole". If the
2092 pointer doesnt find one the single transition is appended as normal.
2094 - Once compressed we can Renew/realloc the structures to release the
2097 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2098 specifically Fig 3.47 and the associated pseudocode.
2102 const U32 laststate = TRIE_NODENUM( next_alloc );
2105 trie->statecount = laststate;
2107 for ( state = 1 ; state < laststate ; state++ ) {
2109 const U32 stateidx = TRIE_NODEIDX( state );
2110 const U32 o_used = trie->trans[ stateidx ].check;
2111 U32 used = trie->trans[ stateidx ].check;
2112 trie->trans[ stateidx ].check = 0;
2114 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2115 if ( flag || trie->trans[ stateidx + charid ].next ) {
2116 if ( trie->trans[ stateidx + charid ].next ) {
2118 for ( ; zp < pos ; zp++ ) {
2119 if ( ! trie->trans[ zp ].next ) {
2123 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2124 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2125 trie->trans[ zp ].check = state;
2126 if ( ++zp > pos ) pos = zp;
2133 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2135 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2136 trie->trans[ pos ].check = state;
2141 trie->lasttrans = pos + 1;
2142 trie->states = (reg_trie_state *)
2143 PerlMemShared_realloc( trie->states, laststate
2144 * sizeof(reg_trie_state) );
2145 DEBUG_TRIE_COMPILE_MORE_r(
2146 PerlIO_printf( Perl_debug_log,
2147 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2148 (int)depth * 2 + 2,"",
2149 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2152 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2155 } /* end table compress */
2157 DEBUG_TRIE_COMPILE_MORE_r(
2158 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2159 (int)depth * 2 + 2, "",
2160 (UV)trie->statecount,
2161 (UV)trie->lasttrans)
2163 /* resize the trans array to remove unused space */
2164 trie->trans = (reg_trie_trans *)
2165 PerlMemShared_realloc( trie->trans, trie->lasttrans
2166 * sizeof(reg_trie_trans) );
2168 { /* Modify the program and insert the new TRIE node */
2169 U8 nodetype =(U8)(flags & 0xFF);
2173 regnode *optimize = NULL;
2174 #ifdef RE_TRACK_PATTERN_OFFSETS
2177 U32 mjd_nodelen = 0;
2178 #endif /* RE_TRACK_PATTERN_OFFSETS */
2179 #endif /* DEBUGGING */
2181 This means we convert either the first branch or the first Exact,
2182 depending on whether the thing following (in 'last') is a branch
2183 or not and whther first is the startbranch (ie is it a sub part of
2184 the alternation or is it the whole thing.)
2185 Assuming its a sub part we convert the EXACT otherwise we convert
2186 the whole branch sequence, including the first.
2188 /* Find the node we are going to overwrite */
2189 if ( first != startbranch || OP( last ) == BRANCH ) {
2190 /* branch sub-chain */
2191 NEXT_OFF( first ) = (U16)(last - first);
2192 #ifdef RE_TRACK_PATTERN_OFFSETS
2194 mjd_offset= Node_Offset((convert));
2195 mjd_nodelen= Node_Length((convert));
2198 /* whole branch chain */
2200 #ifdef RE_TRACK_PATTERN_OFFSETS
2203 const regnode *nop = NEXTOPER( convert );
2204 mjd_offset= Node_Offset((nop));
2205 mjd_nodelen= Node_Length((nop));
2209 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2210 (int)depth * 2 + 2, "",
2211 (UV)mjd_offset, (UV)mjd_nodelen)
2214 /* But first we check to see if there is a common prefix we can
2215 split out as an EXACT and put in front of the TRIE node. */
2216 trie->startstate= 1;
2217 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2219 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2223 const U32 base = trie->states[ state ].trans.base;
2225 if ( trie->states[state].wordnum )
2228 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2229 if ( ( base + ofs >= trie->uniquecharcount ) &&
2230 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2231 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2233 if ( ++count > 1 ) {
2234 SV **tmp = av_fetch( revcharmap, ofs, 0);
2235 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2236 if ( state == 1 ) break;
2238 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2240 PerlIO_printf(Perl_debug_log,
2241 "%*sNew Start State=%"UVuf" Class: [",
2242 (int)depth * 2 + 2, "",
2245 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2246 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2248 TRIE_BITMAP_SET(trie,*ch);
2250 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2252 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2256 TRIE_BITMAP_SET(trie,*ch);
2258 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2259 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2265 SV **tmp = av_fetch( revcharmap, idx, 0);
2267 char *ch = SvPV( *tmp, len );
2269 SV *sv=sv_newmortal();
2270 PerlIO_printf( Perl_debug_log,
2271 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2272 (int)depth * 2 + 2, "",
2274 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2275 PL_colors[0], PL_colors[1],
2276 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2277 PERL_PV_ESCAPE_FIRSTCHAR
2282 OP( convert ) = nodetype;
2283 str=STRING(convert);
2286 STR_LEN(convert) += len;
2292 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2297 trie->prefixlen = (state-1);
2299 regnode *n = convert+NODE_SZ_STR(convert);
2300 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2301 trie->startstate = state;
2302 trie->minlen -= (state - 1);
2303 trie->maxlen -= (state - 1);
2305 /* At least the UNICOS C compiler choked on this
2306 * being argument to DEBUG_r(), so let's just have
2309 #ifdef PERL_EXT_RE_BUILD
2315 regnode *fix = convert;
2316 U32 word = trie->wordcount;
2318 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2319 while( ++fix < n ) {
2320 Set_Node_Offset_Length(fix, 0, 0);
2323 SV ** const tmp = av_fetch( trie_words, word, 0 );
2325 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2326 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2328 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2336 NEXT_OFF(convert) = (U16)(tail - convert);
2337 DEBUG_r(optimize= n);
2343 if ( trie->maxlen ) {
2344 NEXT_OFF( convert ) = (U16)(tail - convert);
2345 ARG_SET( convert, data_slot );
2346 /* Store the offset to the first unabsorbed branch in
2347 jump[0], which is otherwise unused by the jump logic.
2348 We use this when dumping a trie and during optimisation. */
2350 trie->jump[0] = (U16)(nextbranch - convert);
2352 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2353 * and there is a bitmap
2354 * and the first "jump target" node we found leaves enough room
2355 * then convert the TRIE node into a TRIEC node, with the bitmap
2356 * embedded inline in the opcode - this is hypothetically faster.
2358 if ( !trie->states[trie->startstate].wordnum
2360 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2362 OP( convert ) = TRIEC;
2363 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2364 PerlMemShared_free(trie->bitmap);
2367 OP( convert ) = TRIE;
2369 /* store the type in the flags */
2370 convert->flags = nodetype;
2374 + regarglen[ OP( convert ) ];
2376 /* XXX We really should free up the resource in trie now,
2377 as we won't use them - (which resources?) dmq */
2379 /* needed for dumping*/
2380 DEBUG_r(if (optimize) {
2381 regnode *opt = convert;
2383 while ( ++opt < optimize) {
2384 Set_Node_Offset_Length(opt,0,0);
2387 Try to clean up some of the debris left after the
2390 while( optimize < jumper ) {
2391 mjd_nodelen += Node_Length((optimize));
2392 OP( optimize ) = OPTIMIZED;
2393 Set_Node_Offset_Length(optimize,0,0);
2396 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2398 } /* end node insert */
2399 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2401 /* Finish populating the prev field of the wordinfo array. Walk back
2402 * from each accept state until we find another accept state, and if
2403 * so, point the first word's .prev field at the second word. If the
2404 * second already has a .prev field set, stop now. This will be the
2405 * case either if we've already processed that word's accept state,
2406 * or that state had multiple words, and the overspill words were
2407 * already linked up earlier.
2414 for (word=1; word <= trie->wordcount; word++) {
2416 if (trie->wordinfo[word].prev)
2418 state = trie->wordinfo[word].accept;
2420 state = prev_states[state];
2423 prev = trie->states[state].wordnum;
2427 trie->wordinfo[word].prev = prev;
2429 Safefree(prev_states);
2433 /* and now dump out the compressed format */
2434 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2436 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2438 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2439 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2441 SvREFCNT_dec(revcharmap);
2445 : trie->startstate>1
2451 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2453 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2455 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2456 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2459 We find the fail state for each state in the trie, this state is the longest proper
2460 suffix of the current state's 'word' that is also a proper prefix of another word in our
2461 trie. State 1 represents the word '' and is thus the default fail state. This allows
2462 the DFA not to have to restart after its tried and failed a word at a given point, it
2463 simply continues as though it had been matching the other word in the first place.
2465 'abcdgu'=~/abcdefg|cdgu/
2466 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2467 fail, which would bring us to the state representing 'd' in the second word where we would
2468 try 'g' and succeed, proceeding to match 'cdgu'.
2470 /* add a fail transition */
2471 const U32 trie_offset = ARG(source);
2472 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2474 const U32 ucharcount = trie->uniquecharcount;
2475 const U32 numstates = trie->statecount;
2476 const U32 ubound = trie->lasttrans + ucharcount;
2480 U32 base = trie->states[ 1 ].trans.base;
2483 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2484 GET_RE_DEBUG_FLAGS_DECL;
2486 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2488 PERL_UNUSED_ARG(depth);
2492 ARG_SET( stclass, data_slot );
2493 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2494 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2495 aho->trie=trie_offset;
2496 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2497 Copy( trie->states, aho->states, numstates, reg_trie_state );
2498 Newxz( q, numstates, U32);
2499 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2502 /* initialize fail[0..1] to be 1 so that we always have
2503 a valid final fail state */
2504 fail[ 0 ] = fail[ 1 ] = 1;
2506 for ( charid = 0; charid < ucharcount ; charid++ ) {
2507 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2509 q[ q_write ] = newstate;
2510 /* set to point at the root */
2511 fail[ q[ q_write++ ] ]=1;
2514 while ( q_read < q_write) {
2515 const U32 cur = q[ q_read++ % numstates ];
2516 base = trie->states[ cur ].trans.base;
2518 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2519 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2521 U32 fail_state = cur;
2524 fail_state = fail[ fail_state ];
2525 fail_base = aho->states[ fail_state ].trans.base;
2526 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2528 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2529 fail[ ch_state ] = fail_state;
2530 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2532 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2534 q[ q_write++ % numstates] = ch_state;
2538 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2539 when we fail in state 1, this allows us to use the
2540 charclass scan to find a valid start char. This is based on the principle
2541 that theres a good chance the string being searched contains lots of stuff
2542 that cant be a start char.
2544 fail[ 0 ] = fail[ 1 ] = 0;
2545 DEBUG_TRIE_COMPILE_r({
2546 PerlIO_printf(Perl_debug_log,
2547 "%*sStclass Failtable (%"UVuf" states): 0",
2548 (int)(depth * 2), "", (UV)numstates
2550 for( q_read=1; q_read<numstates; q_read++ ) {
2551 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2553 PerlIO_printf(Perl_debug_log, "\n");
2556 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2561 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2562 * These need to be revisited when a newer toolchain becomes available.
2564 #if defined(__sparc64__) && defined(__GNUC__)
2565 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2566 # undef SPARC64_GCC_WORKAROUND
2567 # define SPARC64_GCC_WORKAROUND 1
2571 #define DEBUG_PEEP(str,scan,depth) \
2572 DEBUG_OPTIMISE_r({if (scan){ \
2573 SV * const mysv=sv_newmortal(); \
2574 regnode *Next = regnext(scan); \
2575 regprop(RExC_rx, mysv, scan); \
2576 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2577 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2578 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2582 /* The below joins as many adjacent EXACTish nodes as possible into a single
2583 * one, and looks for problematic sequences of characters whose folds vs.
2584 * non-folds have sufficiently different lengths, that the optimizer would be
2585 * fooled into rejecting legitimate matches of them, and the trie construction
2586 * code needs to handle specially. The joining is only done if:
2587 * 1) there is room in the current conglomerated node to entirely contain the
2589 * 2) they are the exact same node type
2591 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2592 * these get optimized out
2594 * If there are problematic code sequences, *min_subtract is set to the delta
2595 * that the minimum size of the node can be less than its actual size. And,
2596 * the node type of the result is changed to reflect that it contains these
2599 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2600 * and contains LATIN SMALL LETTER SHARP S
2602 * This is as good a place as any to discuss the design of handling these
2603 * problematic sequences. It's been wrong in Perl for a very long time. There
2604 * are three code points currently in Unicode whose folded lengths differ so
2605 * much from the un-folded lengths that it causes problems for the optimizer
2606 * and trie construction. Why only these are problematic, and not others where
2607 * lengths also differ is something I (khw) do not understand. New versions of
2608 * Unicode might add more such code points. Hopefully the logic in
2609 * fold_grind.t that figures out what to test (in part by verifying that each
2610 * size-combination gets tested) will catch any that do come along, so they can
2611 * be added to the special handling below. The chances of new ones are
2612 * actually rather small, as most, if not all, of the world's scripts that have
2613 * casefolding have already been encoded by Unicode. Also, a number of
2614 * Unicode's decisions were made to allow compatibility with pre-existing
2615 * standards, and almost all of those have already been dealt with. These
2616 * would otherwise be the most likely candidates for generating further tricky
2617 * sequences. In other words, Unicode by itself is unlikely to add new ones
2618 * unless it is for compatibility with pre-existing standards, and there aren't
2619 * many of those left.
2621 * The previous designs for dealing with these involved assigning a special
2622 * node for them. This approach doesn't work, as evidenced by this example:
2623 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2624 * Both these fold to "sss", but if the pattern is parsed to create a node
2625 * that would match just the \xDF, it won't be able to handle the case where a
2626 * successful match would have to cross the node's boundary. The new approach
2627 * that hopefully generally solves the problem generates an EXACTFU_SS node
2630 * There are a number of components to the approach (a lot of work for just
2631 * three code points!):
2632 * 1) This routine examines each EXACTFish node that could contain the
2633 * problematic sequences. It returns in *min_subtract how much to
2634 * subtract from the the actual length of the string to get a real minimum
2635 * for one that could match it. This number is usually 0 except for the
2636 * problematic sequences. This delta is used by the caller to adjust the
2637 * min length of the match, and the delta between min and max, so that the
2638 * optimizer doesn't reject these possibilities based on size constraints.
2639 * 2) These sequences require special handling by the trie code, so this code
2640 * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
2642 * 3) This is sufficient for the two Greek sequences (described below), but
2643 * the one involving the Sharp s (\xDF) needs more. The node type
2644 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2645 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2646 * case where there is a possible fold length change. That means that a
2647 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2648 * itself with length changes, and so can be processed faster. regexec.c
2649 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2650 * is pre-folded by regcomp.c. This saves effort in regex matching.
2651 * However, the pre-folding isn't done for non-UTF8 patterns because the
2652 * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL
2653 * nodes fold to isn't known until runtime. The fold possibilities for
2654 * the non-UTF8 patterns are quite simple, except for the sharp s. All
2655 * the ones that don't involve a UTF-8 target string are members of a
2656 * fold-pair, and arrays are set up for all of them so that the other
2657 * member of the pair can be found quickly. Code elsewhere in this file
2658 * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
2659 * if the pattern isn't UTF-8. This avoids the issues described in the
2661 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2662 * 'ss' or not is not knowable at compile time. It will match iff the
2663 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2664 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2665 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2666 * described in item 3). An assumption that the optimizer part of
2667 * regexec.c (probably unwittingly) makes is that a character in the
2668 * pattern corresponds to at most a single character in the target string.
2669 * (And I do mean character, and not byte here, unlike other parts of the
2670 * documentation that have never been updated to account for multibyte
2671 * Unicode.) This assumption is wrong only in this case, as all other
2672 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2673 * virtue of having this file pre-fold UTF-8 patterns. I'm
2674 * reluctant to try to change this assumption, so instead the code punts.
2675 * This routine examines EXACTF nodes for the sharp s, and returns a
2676 * boolean indicating whether or not the node is an EXACTF node that
2677 * contains a sharp s. When it is true, the caller sets a flag that later
2678 * causes the optimizer in this file to not set values for the floating
2679 * and fixed string lengths, and thus avoids the optimizer code in
2680 * regexec.c that makes the invalid assumption. Thus, there is no
2681 * optimization based on string lengths for EXACTF nodes that contain the
2682 * sharp s. This only happens for /id rules (which means the pattern
2686 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2687 if (PL_regkind[OP(scan)] == EXACT) \
2688 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2691 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2692 /* Merge several consecutive EXACTish nodes into one. */
2693 regnode *n = regnext(scan);
2695 regnode *next = scan + NODE_SZ_STR(scan);
2699 regnode *stop = scan;
2700 GET_RE_DEBUG_FLAGS_DECL;
2702 PERL_UNUSED_ARG(depth);
2705 PERL_ARGS_ASSERT_JOIN_EXACT;
2706 #ifndef EXPERIMENTAL_INPLACESCAN
2707 PERL_UNUSED_ARG(flags);
2708 PERL_UNUSED_ARG(val);
2710 DEBUG_PEEP("join",scan,depth);
2712 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2713 * EXACT ones that are mergeable to the current one. */
2715 && (PL_regkind[OP(n)] == NOTHING
2716 || (stringok && OP(n) == OP(scan)))
2718 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2721 if (OP(n) == TAIL || n > next)
2723 if (PL_regkind[OP(n)] == NOTHING) {
2724 DEBUG_PEEP("skip:",n,depth);
2725 NEXT_OFF(scan) += NEXT_OFF(n);
2726 next = n + NODE_STEP_REGNODE;
2733 else if (stringok) {
2734 const unsigned int oldl = STR_LEN(scan);
2735 regnode * const nnext = regnext(n);
2737 /* XXX I (khw) kind of doubt that this works on platforms where
2738 * U8_MAX is above 255 because of lots of other assumptions */
2739 if (oldl + STR_LEN(n) > U8_MAX)
2742 DEBUG_PEEP("merg",n,depth);
2745 NEXT_OFF(scan) += NEXT_OFF(n);
2746 STR_LEN(scan) += STR_LEN(n);
2747 next = n + NODE_SZ_STR(n);
2748 /* Now we can overwrite *n : */
2749 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2757 #ifdef EXPERIMENTAL_INPLACESCAN
2758 if (flags && !NEXT_OFF(n)) {
2759 DEBUG_PEEP("atch", val, depth);
2760 if (reg_off_by_arg[OP(n)]) {
2761 ARG_SET(n, val - n);
2764 NEXT_OFF(n) = val - n;
2772 *has_exactf_sharp_s = FALSE;
2774 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2775 * can now analyze for sequences of problematic code points. (Prior to
2776 * this final joining, sequences could have been split over boundaries, and
2777 * hence missed). The sequences only happen in folding, hence for any
2778 * non-EXACT EXACTish node */
2779 if (OP(scan) != EXACT) {
2781 U8 * s0 = (U8*) STRING(scan);
2782 U8 * const s_end = s0 + STR_LEN(scan);
2784 /* The below is perhaps overboard, but this allows us to save a test
2785 * each time through the loop at the expense of a mask. This is
2786 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2787 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2788 * This uses an exclusive 'or' to find that bit and then inverts it to
2789 * form a mask, with just a single 0, in the bit position where 'S' and
2791 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2792 const U8 s_masked = 's' & S_or_s_mask;
2794 /* One pass is made over the node's string looking for all the
2795 * possibilities. to avoid some tests in the loop, there are two main
2796 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2800 /* There are two problematic Greek code points in Unicode
2803 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2804 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2810 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2811 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2813 * This means that in case-insensitive matching (or "loose
2814 * matching", as Unicode calls it), an EXACTF of length six (the
2815 * UTF-8 encoded byte length of the above casefolded versions) can
2816 * match a target string of length two (the byte length of UTF-8
2817 * encoded U+0390 or U+03B0). This would rather mess up the
2818 * minimum length computation. (there are other code points that
2819 * also fold to these two sequences, but the delta is smaller)
2821 * If these sequences are found, the minimum length is decreased by
2822 * four (six minus two).
2824 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2825 * LETTER SHARP S. We decrease the min length by 1 for each
2826 * occurrence of 'ss' found */
2828 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2829 # define U390_first_byte 0xb4
2830 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2831 # define U3B0_first_byte 0xb5
2832 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2834 # define U390_first_byte 0xce
2835 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2836 # define U3B0_first_byte 0xcf
2837 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2839 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2840 yields a net of 0 */
2841 /* Examine the string for one of the problematic sequences */
2843 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2844 * sequence we are looking for is 2 */
2848 /* Look for the first byte in each problematic sequence */
2850 /* We don't have to worry about other things that fold to
2851 * 's' (such as the long s, U+017F), as all above-latin1
2852 * code points have been pre-folded */
2856 /* Current character is an 's' or 'S'. If next one is
2857 * as well, we have the dreaded sequence */
2858 if (((*(s+1) & S_or_s_mask) == s_masked)
2859 /* These two node types don't have special handling
2861 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2864 OP(scan) = EXACTFU_SS;
2865 s++; /* No need to look at this character again */
2869 case U390_first_byte:
2870 if (s_end - s >= len
2872 /* The 1's are because are skipping comparing the
2874 && memEQ(s + 1, U390_tail, len - 1))
2876 goto greek_sequence;
2880 case U3B0_first_byte:
2881 if (! (s_end - s >= len
2882 && memEQ(s + 1, U3B0_tail, len - 1)))
2889 /* This requires special handling by trie's, so change
2890 * the node type to indicate this. If EXACTFA and
2891 * EXACTFL were ever to be handled by trie's, this
2892 * would have to be changed. If this node has already
2893 * been changed to EXACTFU_SS in this loop, leave it as
2894 * is. (I (khw) think it doesn't matter in regexec.c
2895 * for UTF patterns, but no need to change it */
2896 if (OP(scan) == EXACTFU) {
2897 OP(scan) = EXACTFU_TRICKYFOLD;
2899 s += 6; /* We already know what this sequence is. Skip
2905 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2907 /* Here, the pattern is not UTF-8. We need to look only for the
2908 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2909 * in the final position. Otherwise we can stop looking 1 byte
2910 * earlier because have to find both the first and second 's' */
2911 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2913 for (s = s0; s < upper; s++) {
2918 && ((*(s+1) & S_or_s_mask) == s_masked))
2922 /* EXACTF nodes need to know that the minimum
2923 * length changed so that a sharp s in the string
2924 * can match this ss in the pattern, but they
2925 * remain EXACTF nodes, as they won't match this
2926 * unless the target string is is UTF-8, which we
2927 * don't know until runtime */
2928 if (OP(scan) != EXACTF) {
2929 OP(scan) = EXACTFU_SS;
2934 case LATIN_SMALL_LETTER_SHARP_S:
2935 if (OP(scan) == EXACTF) {
2936 *has_exactf_sharp_s = TRUE;
2945 /* Allow dumping but overwriting the collection of skipped
2946 * ops and/or strings with fake optimized ops */
2947 n = scan + NODE_SZ_STR(scan);
2955 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2959 /* REx optimizer. Converts nodes into quicker variants "in place".
2960 Finds fixed substrings. */
2962 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2963 to the position after last scanned or to NULL. */
2965 #define INIT_AND_WITHP \
2966 assert(!and_withp); \
2967 Newx(and_withp,1,struct regnode_charclass_class); \
2968 SAVEFREEPV(and_withp)
2970 /* this is a chain of data about sub patterns we are processing that
2971 need to be handled separately/specially in study_chunk. Its so
2972 we can simulate recursion without losing state. */
2974 typedef struct scan_frame {
2975 regnode *last; /* last node to process in this frame */
2976 regnode *next; /* next node to process when last is reached */
2977 struct scan_frame *prev; /*previous frame*/
2978 I32 stop; /* what stopparen do we use */
2982 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2984 #define CASE_SYNST_FNC(nAmE) \
2986 if (flags & SCF_DO_STCLASS_AND) { \
2987 for (value = 0; value < 256; value++) \
2988 if (!is_ ## nAmE ## _cp(value)) \
2989 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2992 for (value = 0; value < 256; value++) \
2993 if (is_ ## nAmE ## _cp(value)) \
2994 ANYOF_BITMAP_SET(data->start_class, value); \
2998 if (flags & SCF_DO_STCLASS_AND) { \
2999 for (value = 0; value < 256; value++) \
3000 if (is_ ## nAmE ## _cp(value)) \
3001 ANYOF_BITMAP_CLEAR(data->start_class, value); \
3004 for (value = 0; value < 256; value++) \
3005 if (!is_ ## nAmE ## _cp(value)) \
3006 ANYOF_BITMAP_SET(data->start_class, value); \
3013 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3014 I32 *minlenp, I32 *deltap,
3019 struct regnode_charclass_class *and_withp,
3020 U32 flags, U32 depth)
3021 /* scanp: Start here (read-write). */
3022 /* deltap: Write maxlen-minlen here. */
3023 /* last: Stop before this one. */
3024 /* data: string data about the pattern */
3025 /* stopparen: treat close N as END */
3026 /* recursed: which subroutines have we recursed into */
3027 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3030 I32 min = 0, pars = 0, code;
3031 regnode *scan = *scanp, *next;
3033 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3034 int is_inf_internal = 0; /* The studied chunk is infinite */
3035 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3036 scan_data_t data_fake;
3037 SV *re_trie_maxbuff = NULL;
3038 regnode *first_non_open = scan;
3039 I32 stopmin = I32_MAX;
3040 scan_frame *frame = NULL;
3041 GET_RE_DEBUG_FLAGS_DECL;
3043 PERL_ARGS_ASSERT_STUDY_CHUNK;
3046 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3050 while (first_non_open && OP(first_non_open) == OPEN)
3051 first_non_open=regnext(first_non_open);
3056 while ( scan && OP(scan) != END && scan < last ){
3057 UV min_subtract = 0; /* How much to subtract from the minimum node
3058 length to get a real minimum (because the
3059 folded version may be shorter) */
3060 bool has_exactf_sharp_s = FALSE;
3061 /* Peephole optimizer: */
3062 DEBUG_STUDYDATA("Peep:", data,depth);
3063 DEBUG_PEEP("Peep",scan,depth);
3065 /* Its not clear to khw or hv why this is done here, and not in the
3066 * clauses that deal with EXACT nodes. khw's guess is that it's
3067 * because of a previous design */
3068 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3070 /* Follow the next-chain of the current node and optimize
3071 away all the NOTHINGs from it. */
3072 if (OP(scan) != CURLYX) {
3073 const int max = (reg_off_by_arg[OP(scan)]
3075 /* I32 may be smaller than U16 on CRAYs! */
3076 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3077 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3081 /* Skip NOTHING and LONGJMP. */
3082 while ((n = regnext(n))
3083 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3084 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3085 && off + noff < max)
3087 if (reg_off_by_arg[OP(scan)])
3090 NEXT_OFF(scan) = off;
3095 /* The principal pseudo-switch. Cannot be a switch, since we
3096 look into several different things. */
3097 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3098 || OP(scan) == IFTHEN) {
3099 next = regnext(scan);
3101 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3103 if (OP(next) == code || code == IFTHEN) {
3104 /* NOTE - There is similar code to this block below for handling
3105 TRIE nodes on a re-study. If you change stuff here check there
3107 I32 max1 = 0, min1 = I32_MAX, num = 0;
3108 struct regnode_charclass_class accum;
3109 regnode * const startbranch=scan;
3111 if (flags & SCF_DO_SUBSTR)
3112 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3113 if (flags & SCF_DO_STCLASS)
3114 cl_init_zero(pRExC_state, &accum);
3116 while (OP(scan) == code) {
3117 I32 deltanext, minnext, f = 0, fake;
3118 struct regnode_charclass_class this_class;
3121 data_fake.flags = 0;
3123 data_fake.whilem_c = data->whilem_c;
3124 data_fake.last_closep = data->last_closep;
3127 data_fake.last_closep = &fake;
3129 data_fake.pos_delta = delta;
3130 next = regnext(scan);
3131 scan = NEXTOPER(scan);
3133 scan = NEXTOPER(scan);
3134 if (flags & SCF_DO_STCLASS) {
3135 cl_init(pRExC_state, &this_class);
3136 data_fake.start_class = &this_class;
3137 f = SCF_DO_STCLASS_AND;
3139 if (flags & SCF_WHILEM_VISITED_POS)
3140 f |= SCF_WHILEM_VISITED_POS;
3142 /* we suppose the run is continuous, last=next...*/
3143 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3145 stopparen, recursed, NULL, f,depth+1);
3148 if (max1 < minnext + deltanext)
3149 max1 = minnext + deltanext;
3150 if (deltanext == I32_MAX)
3151 is_inf = is_inf_internal = 1;
3153 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3155 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3156 if ( stopmin > minnext)
3157 stopmin = min + min1;
3158 flags &= ~SCF_DO_SUBSTR;
3160 data->flags |= SCF_SEEN_ACCEPT;
3163 if (data_fake.flags & SF_HAS_EVAL)
3164 data->flags |= SF_HAS_EVAL;
3165 data->whilem_c = data_fake.whilem_c;
3167 if (flags & SCF_DO_STCLASS)
3168 cl_or(pRExC_state, &accum, &this_class);
3170 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3172 if (flags & SCF_DO_SUBSTR) {
3173 data->pos_min += min1;
3174 data->pos_delta += max1 - min1;
3175 if (max1 != min1 || is_inf)
3176 data->longest = &(data->longest_float);
3179 delta += max1 - min1;
3180 if (flags & SCF_DO_STCLASS_OR) {
3181 cl_or(pRExC_state, data->start_class, &accum);
3183 cl_and(data->start_class, and_withp);
3184 flags &= ~SCF_DO_STCLASS;
3187 else if (flags & SCF_DO_STCLASS_AND) {
3189 cl_and(data->start_class, &accum);
3190 flags &= ~SCF_DO_STCLASS;
3193 /* Switch to OR mode: cache the old value of
3194 * data->start_class */
3196 StructCopy(data->start_class, and_withp,
3197 struct regnode_charclass_class);
3198 flags &= ~SCF_DO_STCLASS_AND;
3199 StructCopy(&accum, data->start_class,
3200 struct regnode_charclass_class);
3201 flags |= SCF_DO_STCLASS_OR;
3202 data->start_class->flags |= ANYOF_EOS;
3206 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3209 Assuming this was/is a branch we are dealing with: 'scan' now
3210 points at the item that follows the branch sequence, whatever
3211 it is. We now start at the beginning of the sequence and look
3218 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3220 If we can find such a subsequence we need to turn the first
3221 element into a trie and then add the subsequent branch exact
3222 strings to the trie.
3226 1. patterns where the whole set of branches can be converted.
3228 2. patterns where only a subset can be converted.
3230 In case 1 we can replace the whole set with a single regop
3231 for the trie. In case 2 we need to keep the start and end
3234 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3235 becomes BRANCH TRIE; BRANCH X;
3237 There is an additional case, that being where there is a
3238 common prefix, which gets split out into an EXACT like node
3239 preceding the TRIE node.
3241 If x(1..n)==tail then we can do a simple trie, if not we make
3242 a "jump" trie, such that when we match the appropriate word
3243 we "jump" to the appropriate tail node. Essentially we turn
3244 a nested if into a case structure of sorts.
3249 if (!re_trie_maxbuff) {
3250 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3251 if (!SvIOK(re_trie_maxbuff))
3252 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3254 if ( SvIV(re_trie_maxbuff)>=0 ) {
3256 regnode *first = (regnode *)NULL;
3257 regnode *last = (regnode *)NULL;
3258 regnode *tail = scan;
3263 SV * const mysv = sv_newmortal(); /* for dumping */
3265 /* var tail is used because there may be a TAIL
3266 regop in the way. Ie, the exacts will point to the
3267 thing following the TAIL, but the last branch will
3268 point at the TAIL. So we advance tail. If we
3269 have nested (?:) we may have to move through several
3273 while ( OP( tail ) == TAIL ) {
3274 /* this is the TAIL generated by (?:) */
3275 tail = regnext( tail );
3279 DEBUG_TRIE_COMPILE_r({
3280 regprop(RExC_rx, mysv, tail );
3281 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3282 (int)depth * 2 + 2, "",
3283 "Looking for TRIE'able sequences. Tail node is: ",
3284 SvPV_nolen_const( mysv )
3290 Step through the branches
3291 cur represents each branch,
3292 noper is the first thing to be matched as part of that branch
3293 noper_next is the regnext() of that node.
3295 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3296 via a "jump trie" but we also support building with NOJUMPTRIE,
3297 which restricts the trie logic to structures like /FOO|BAR/.
3299 If noper is a trieable nodetype then the branch is a possible optimization
3300 target. If we are building under NOJUMPTRIE then we require that noper_next
3301 is the same as scan (our current position in the regex program).
3303 Once we have two or more consecutive such branches we can create a
3304 trie of the EXACT's contents and stitch it in place into the program.
3306 If the sequence represents all of the branches in the alternation we
3307 replace the entire thing with a single TRIE node.
3309 Otherwise when it is a subsequence we need to stitch it in place and
3310 replace only the relevant branches. This means the first branch has
3311 to remain as it is used by the alternation logic, and its next pointer,
3312 and needs to be repointed at the item on the branch chain following
3313 the last branch we have optimized away.
3315 This could be either a BRANCH, in which case the subsequence is internal,
3316 or it could be the item following the branch sequence in which case the
3317 subsequence is at the end (which does not necessarily mean the first node
3318 is the start of the alternation).
3320 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3323 ----------------+-----------
3327 EXACTFU_SS | EXACTFU
3328 EXACTFU_TRICKYFOLD | EXACTFU
3333 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3334 ( EXACT == (X) ) ? EXACT : \
3335 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3338 /* dont use tail as the end marker for this traverse */
3339 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3340 regnode * const noper = NEXTOPER( cur );
3341 U8 noper_type = OP( noper );
3342 U8 noper_trietype = TRIE_TYPE( noper_type );
3343 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3344 regnode * const noper_next = regnext( noper );
3345 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3346 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3349 DEBUG_TRIE_COMPILE_r({
3350 regprop(RExC_rx, mysv, cur);
3351 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3352 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3354 regprop(RExC_rx, mysv, noper);
3355 PerlIO_printf( Perl_debug_log, " -> %s",
3356 SvPV_nolen_const(mysv));
3359 regprop(RExC_rx, mysv, noper_next );
3360 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3361 SvPV_nolen_const(mysv));
3363 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3364 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3365 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3369 /* Is noper a trieable nodetype that can be merged with the
3370 * current trie (if there is one)? */
3374 ( noper_trietype == NOTHING)
3375 || ( trietype == NOTHING )
3376 || ( trietype == noper_trietype )
3379 && noper_next == tail
3383 /* Handle mergable triable node
3384 * Either we are the first node in a new trieable sequence,
3385 * in which case we do some bookkeeping, otherwise we update
3386 * the end pointer. */
3389 if ( noper_trietype == NOTHING ) {
3390 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3391 regnode * const noper_next = regnext( noper );
3392 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3393 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3396 if ( noper_next_trietype ) {
3397 trietype = noper_next_trietype;
3398 } else if (noper_next_type) {
3399 /* a NOTHING regop is 1 regop wide. We need at least two
3400 * for a trie so we can't merge this in */
3404 trietype = noper_trietype;
3407 if ( trietype == NOTHING )
3408 trietype = noper_trietype;
3413 } /* end handle mergable triable node */
3415 /* handle unmergable node -
3416 * noper may either be a triable node which can not be tried
3417 * together with the current trie, or a non triable node */
3419 /* If last is set and trietype is not NOTHING then we have found
3420 * at least two triable branch sequences in a row of a similar
3421 * trietype so we can turn them into a trie. If/when we
3422 * allow NOTHING to start a trie sequence this condition will be
3423 * required, and it isn't expensive so we leave it in for now. */
3424 if ( trietype != NOTHING )
3425 make_trie( pRExC_state,
3426 startbranch, first, cur, tail, count,
3427 trietype, depth+1 );
3428 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3432 && noper_next == tail
3435 /* noper is triable, so we can start a new trie sequence */
3438 trietype = noper_trietype;
3440 /* if we already saw a first but the current node is not triable then we have
3441 * to reset the first information. */
3446 } /* end handle unmergable node */
3447 } /* loop over branches */
3448 DEBUG_TRIE_COMPILE_r({
3449 regprop(RExC_rx, mysv, cur);
3450 PerlIO_printf( Perl_debug_log,
3451 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3452 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3456 if ( trietype != NOTHING ) {
3457 /* the last branch of the sequence was part of a trie,
3458 * so we have to construct it here outside of the loop
3460 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3461 #ifdef TRIE_STUDY_OPT
3462 if ( ((made == MADE_EXACT_TRIE &&
3463 startbranch == first)
3464 || ( first_non_open == first )) &&
3466 flags |= SCF_TRIE_RESTUDY;
3467 if ( startbranch == first
3470 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3475 /* at this point we know whatever we have is a NOTHING sequence/branch
3476 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3478 if ( startbranch == first ) {
3480 /* the entire thing is a NOTHING sequence, something like this:
3481 * (?:|) So we can turn it into a plain NOTHING op. */
3482 DEBUG_TRIE_COMPILE_r({
3483 regprop(RExC_rx, mysv, cur);
3484 PerlIO_printf( Perl_debug_log,
3485 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3486 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3489 OP(startbranch)= NOTHING;
3490 NEXT_OFF(startbranch)= tail - startbranch;
3491 for ( opt= startbranch + 1; opt < tail ; opt++ )
3495 } /* end if ( last) */
3496 } /* TRIE_MAXBUF is non zero */
3501 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3502 scan = NEXTOPER(NEXTOPER(scan));
3503 } else /* single branch is optimized. */
3504 scan = NEXTOPER(scan);
3506 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3507 scan_frame *newframe = NULL;
3512 if (OP(scan) != SUSPEND) {
3513 /* set the pointer */
3514 if (OP(scan) == GOSUB) {
3516 RExC_recurse[ARG2L(scan)] = scan;
3517 start = RExC_open_parens[paren-1];
3518 end = RExC_close_parens[paren-1];
3521 start = RExC_rxi->program + 1;
3525 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3526 SAVEFREEPV(recursed);
3528 if (!PAREN_TEST(recursed,paren+1)) {
3529 PAREN_SET(recursed,paren+1);
3530 Newx(newframe,1,scan_frame);
3532 if (flags & SCF_DO_SUBSTR) {
3533 SCAN_COMMIT(pRExC_state,data,minlenp);
3534 data->longest = &(data->longest_float);
3536 is_inf = is_inf_internal = 1;
3537 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3538 cl_anything(pRExC_state, data->start_class);
3539 flags &= ~SCF_DO_STCLASS;
3542 Newx(newframe,1,scan_frame);
3545 end = regnext(scan);
3550 SAVEFREEPV(newframe);
3551 newframe->next = regnext(scan);
3552 newframe->last = last;
3553 newframe->stop = stopparen;
3554 newframe->prev = frame;
3564 else if (OP(scan) == EXACT) {
3565 I32 l = STR_LEN(scan);
3568 const U8 * const s = (U8*)STRING(scan);
3569 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3570 l = utf8_length(s, s + l);
3572 uc = *((U8*)STRING(scan));
3575 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3576 /* The code below prefers earlier match for fixed
3577 offset, later match for variable offset. */
3578 if (data->last_end == -1) { /* Update the start info. */
3579 data->last_start_min = data->pos_min;
3580 data->last_start_max = is_inf
3581 ? I32_MAX : data->pos_min + data->pos_delta;
3583 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3585 SvUTF8_on(data->last_found);
3587 SV * const sv = data->last_found;
3588 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3589 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3590 if (mg && mg->mg_len >= 0)
3591 mg->mg_len += utf8_length((U8*)STRING(scan),
3592 (U8*)STRING(scan)+STR_LEN(scan));
3594 data->last_end = data->pos_min + l;
3595 data->pos_min += l; /* As in the first entry. */
3596 data->flags &= ~SF_BEFORE_EOL;
3598 if (flags & SCF_DO_STCLASS_AND) {
3599 /* Check whether it is compatible with what we know already! */
3603 /* If compatible, we or it in below. It is compatible if is
3604 * in the bitmp and either 1) its bit or its fold is set, or 2)
3605 * it's for a locale. Even if there isn't unicode semantics
3606 * here, at runtime there may be because of matching against a
3607 * utf8 string, so accept a possible false positive for
3608 * latin1-range folds */
3610 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3611 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3612 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3613 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3618 ANYOF_CLASS_ZERO(data->start_class);
3619 ANYOF_BITMAP_ZERO(data->start_class);
3621 ANYOF_BITMAP_SET(data->start_class, uc);
3622 else if (uc >= 0x100) {
3625 /* Some Unicode code points fold to the Latin1 range; as
3626 * XXX temporary code, instead of figuring out if this is
3627 * one, just assume it is and set all the start class bits
3628 * that could be some such above 255 code point's fold
3629 * which will generate fals positives. As the code
3630 * elsewhere that does compute the fold settles down, it
3631 * can be extracted out and re-used here */
3632 for (i = 0; i < 256; i++){
3633 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3634 ANYOF_BITMAP_SET(data->start_class, i);
3638 data->start_class->flags &= ~ANYOF_EOS;
3640 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3642 else if (flags & SCF_DO_STCLASS_OR) {
3643 /* false positive possible if the class is case-folded */
3645 ANYOF_BITMAP_SET(data->start_class, uc);
3647 data->start_class->flags |= ANYOF_UNICODE_ALL;
3648 data->start_class->flags &= ~ANYOF_EOS;
3649 cl_and(data->start_class, and_withp);
3651 flags &= ~SCF_DO_STCLASS;
3653 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3654 I32 l = STR_LEN(scan);
3655 UV uc = *((U8*)STRING(scan));
3657 /* Search for fixed substrings supports EXACT only. */
3658 if (flags & SCF_DO_SUBSTR) {
3660 SCAN_COMMIT(pRExC_state, data, minlenp);
3663 const U8 * const s = (U8 *)STRING(scan);
3664 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3665 l = utf8_length(s, s + l);
3667 if (has_exactf_sharp_s) {
3668 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3670 min += l - min_subtract;
3674 delta += min_subtract;
3675 if (flags & SCF_DO_SUBSTR) {
3676 data->pos_min += l - min_subtract;
3677 if (data->pos_min < 0) {
3680 data->pos_delta += min_subtract;
3682 data->longest = &(data->longest_float);
3685 if (flags & SCF_DO_STCLASS_AND) {
3686 /* Check whether it is compatible with what we know already! */
3689 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3690 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3691 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3695 ANYOF_CLASS_ZERO(data->start_class);
3696 ANYOF_BITMAP_ZERO(data->start_class);
3698 ANYOF_BITMAP_SET(data->start_class, uc);
3699 data->start_class->flags &= ~ANYOF_EOS;
3700 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3701 if (OP(scan) == EXACTFL) {
3702 /* XXX This set is probably no longer necessary, and
3703 * probably wrong as LOCALE now is on in the initial
3705 data->start_class->flags |= ANYOF_LOCALE;
3709 /* Also set the other member of the fold pair. In case
3710 * that unicode semantics is called for at runtime, use
3711 * the full latin1 fold. (Can't do this for locale,
3712 * because not known until runtime) */
3713 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3715 /* All other (EXACTFL handled above) folds except under
3716 * /iaa that include s, S, and sharp_s also may include
3718 if (OP(scan) != EXACTFA) {
3719 if (uc == 's' || uc == 'S') {
3720 ANYOF_BITMAP_SET(data->start_class,
3721 LATIN_SMALL_LETTER_SHARP_S);
3723 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3724 ANYOF_BITMAP_SET(data->start_class, 's');
3725 ANYOF_BITMAP_SET(data->start_class, 'S');
3730 else if (uc >= 0x100) {
3732 for (i = 0; i < 256; i++){
3733 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3734 ANYOF_BITMAP_SET(data->start_class, i);
3739 else if (flags & SCF_DO_STCLASS_OR) {
3740 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3741 /* false positive possible if the class is case-folded.
3742 Assume that the locale settings are the same... */
3744 ANYOF_BITMAP_SET(data->start_class, uc);
3745 if (OP(scan) != EXACTFL) {
3747 /* And set the other member of the fold pair, but
3748 * can't do that in locale because not known until
3750 ANYOF_BITMAP_SET(data->start_class,
3751 PL_fold_latin1[uc]);
3753 /* All folds except under /iaa that include s, S,
3754 * and sharp_s also may include the others */
3755 if (OP(scan) != EXACTFA) {
3756 if (uc == 's' || uc == 'S') {
3757 ANYOF_BITMAP_SET(data->start_class,
3758 LATIN_SMALL_LETTER_SHARP_S);
3760 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3761 ANYOF_BITMAP_SET(data->start_class, 's');
3762 ANYOF_BITMAP_SET(data->start_class, 'S');
3767 data->start_class->flags &= ~ANYOF_EOS;
3769 cl_and(data->start_class, and_withp);
3771 flags &= ~SCF_DO_STCLASS;
3773 else if (REGNODE_VARIES(OP(scan))) {
3774 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3775 I32 f = flags, pos_before = 0;
3776 regnode * const oscan = scan;
3777 struct regnode_charclass_class this_class;
3778 struct regnode_charclass_class *oclass = NULL;
3779 I32 next_is_eval = 0;
3781 switch (PL_regkind[OP(scan)]) {
3782 case WHILEM: /* End of (?:...)* . */
3783 scan = NEXTOPER(scan);
3786 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3787 next = NEXTOPER(scan);
3788 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3790 maxcount = REG_INFTY;
3791 next = regnext(scan);
3792 scan = NEXTOPER(scan);
3796 if (flags & SCF_DO_SUBSTR)
3801 if (flags & SCF_DO_STCLASS) {
3803 maxcount = REG_INFTY;
3804 next = regnext(scan);
3805 scan = NEXTOPER(scan);
3808 is_inf = is_inf_internal = 1;
3809 scan = regnext(scan);
3810 if (flags & SCF_DO_SUBSTR) {
3811 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3812 data->longest = &(data->longest_float);
3814 goto optimize_curly_tail;
3816 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3817 && (scan->flags == stopparen))
3822 mincount = ARG1(scan);
3823 maxcount = ARG2(scan);
3825 next = regnext(scan);
3826 if (OP(scan) == CURLYX) {
3827 I32 lp = (data ? *(data->last_closep) : 0);
3828 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3830 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3831 next_is_eval = (OP(scan) == EVAL);
3833 if (flags & SCF_DO_SUBSTR) {
3834 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3835 pos_before = data->pos_min;
3839 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3841 data->flags |= SF_IS_INF;
3843 if (flags & SCF_DO_STCLASS) {
3844 cl_init(pRExC_state, &this_class);
3845 oclass = data->start_class;
3846 data->start_class = &this_class;
3847 f |= SCF_DO_STCLASS_AND;
3848 f &= ~SCF_DO_STCLASS_OR;
3850 /* Exclude from super-linear cache processing any {n,m}
3851 regops for which the combination of input pos and regex
3852 pos is not enough information to determine if a match
3855 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3856 regex pos at the \s*, the prospects for a match depend not
3857 only on the input position but also on how many (bar\s*)
3858 repeats into the {4,8} we are. */
3859 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3860 f &= ~SCF_WHILEM_VISITED_POS;
3862 /* This will finish on WHILEM, setting scan, or on NULL: */
3863 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3864 last, data, stopparen, recursed, NULL,
3866 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3868 if (flags & SCF_DO_STCLASS)
3869 data->start_class = oclass;
3870 if (mincount == 0 || minnext == 0) {
3871 if (flags & SCF_DO_STCLASS_OR) {
3872 cl_or(pRExC_state, data->start_class, &this_class);
3874 else if (flags & SCF_DO_STCLASS_AND) {
3875 /* Switch to OR mode: cache the old value of
3876 * data->start_class */
3878 StructCopy(data->start_class, and_withp,
3879 struct regnode_charclass_class);
3880 flags &= ~SCF_DO_STCLASS_AND;
3881 StructCopy(&this_class, data->start_class,
3882 struct regnode_charclass_class);
3883 flags |= SCF_DO_STCLASS_OR;
3884 data->start_class->flags |= ANYOF_EOS;
3886 } else { /* Non-zero len */
3887 if (flags & SCF_DO_STCLASS_OR) {
3888 cl_or(pRExC_state, data->start_class, &this_class);
3889 cl_and(data->start_class, and_withp);
3891 else if (flags & SCF_DO_STCLASS_AND)
3892 cl_and(data->start_class, &this_class);
3893 flags &= ~SCF_DO_STCLASS;
3895 if (!scan) /* It was not CURLYX, but CURLY. */
3897 if ( /* ? quantifier ok, except for (?{ ... }) */
3898 (next_is_eval || !(mincount == 0 && maxcount == 1))
3899 && (minnext == 0) && (deltanext == 0)
3900 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3901 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3903 ckWARNreg(RExC_parse,
3904 "Quantifier unexpected on zero-length expression");
3907 min += minnext * mincount;
3908 is_inf_internal |= ((maxcount == REG_INFTY
3909 && (minnext + deltanext) > 0)
3910 || deltanext == I32_MAX);
3911 is_inf |= is_inf_internal;
3912 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3914 /* Try powerful optimization CURLYX => CURLYN. */
3915 if ( OP(oscan) == CURLYX && data
3916 && data->flags & SF_IN_PAR
3917 && !(data->flags & SF_HAS_EVAL)
3918 && !deltanext && minnext == 1 ) {
3919 /* Try to optimize to CURLYN. */
3920 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3921 regnode * const nxt1 = nxt;
3928 if (!REGNODE_SIMPLE(OP(nxt))
3929 && !(PL_regkind[OP(nxt)] == EXACT
3930 && STR_LEN(nxt) == 1))
3936 if (OP(nxt) != CLOSE)
3938 if (RExC_open_parens) {
3939 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3940 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3942 /* Now we know that nxt2 is the only contents: */
3943 oscan->flags = (U8)ARG(nxt);
3945 OP(nxt1) = NOTHING; /* was OPEN. */
3948 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3949 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3950 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3951 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3952 OP(nxt + 1) = OPTIMIZED; /* was count. */
3953 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3958 /* Try optimization CURLYX => CURLYM. */
3959 if ( OP(oscan) == CURLYX && data
3960 && !(data->flags & SF_HAS_PAR)
3961 && !(data->flags & SF_HAS_EVAL)
3962 && !deltanext /* atom is fixed width */
3963 && minnext != 0 /* CURLYM can't handle zero width */
3964 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3966 /* XXXX How to optimize if data == 0? */
3967 /* Optimize to a simpler form. */
3968 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3972 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3973 && (OP(nxt2) != WHILEM))
3975 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3976 /* Need to optimize away parenths. */
3977 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3978 /* Set the parenth number. */
3979 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3981 oscan->flags = (U8)ARG(nxt);
3982 if (RExC_open_parens) {
3983 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3984 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3986 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3987 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3990 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3991 OP(nxt + 1) = OPTIMIZED; /* was count. */
3992 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3993 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3996 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3997 regnode *nnxt = regnext(nxt1);
3999 if (reg_off_by_arg[OP(nxt1)])
4000 ARG_SET(nxt1, nxt2 - nxt1);
4001 else if (nxt2 - nxt1 < U16_MAX)
4002 NEXT_OFF(nxt1) = nxt2 - nxt1;
4004 OP(nxt) = NOTHING; /* Cannot beautify */
4009 /* Optimize again: */
4010 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4011 NULL, stopparen, recursed, NULL, 0,depth+1);
4016 else if ((OP(oscan) == CURLYX)
4017 && (flags & SCF_WHILEM_VISITED_POS)
4018 /* See the comment on a similar expression above.
4019 However, this time it's not a subexpression
4020 we care about, but the expression itself. */
4021 && (maxcount == REG_INFTY)
4022 && data && ++data->whilem_c < 16) {
4023 /* This stays as CURLYX, we can put the count/of pair. */
4024 /* Find WHILEM (as in regexec.c) */
4025 regnode *nxt = oscan + NEXT_OFF(oscan);
4027 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4029 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4030 | (RExC_whilem_seen << 4)); /* On WHILEM */
4032 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4034 if (flags & SCF_DO_SUBSTR) {
4035 SV *last_str = NULL;
4036 int counted = mincount != 0;
4038 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4039 #if defined(SPARC64_GCC_WORKAROUND)
4042 const char *s = NULL;
4045 if (pos_before >= data->last_start_min)
4048 b = data->last_start_min;
4051 s = SvPV_const(data->last_found, l);
4052 old = b - data->last_start_min;
4055 I32 b = pos_before >= data->last_start_min
4056 ? pos_before : data->last_start_min;
4058 const char * const s = SvPV_const(data->last_found, l);
4059 I32 old = b - data->last_start_min;
4063 old = utf8_hop((U8*)s, old) - (U8*)s;
4065 /* Get the added string: */
4066 last_str = newSVpvn_utf8(s + old, l, UTF);
4067 if (deltanext == 0 && pos_before == b) {
4068 /* What was added is a constant string */
4070 SvGROW(last_str, (mincount * l) + 1);
4071 repeatcpy(SvPVX(last_str) + l,
4072 SvPVX_const(last_str), l, mincount - 1);
4073 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4074 /* Add additional parts. */
4075 SvCUR_set(data->last_found,
4076 SvCUR(data->last_found) - l);
4077 sv_catsv(data->last_found, last_str);
4079 SV * sv = data->last_found;
4081 SvUTF8(sv) && SvMAGICAL(sv) ?
4082 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4083 if (mg && mg->mg_len >= 0)
4084 mg->mg_len += CHR_SVLEN(last_str) - l;
4086 data->last_end += l * (mincount - 1);
4089 /* start offset must point into the last copy */
4090 data->last_start_min += minnext * (mincount - 1);
4091 data->last_start_max += is_inf ? I32_MAX
4092 : (maxcount - 1) * (minnext + data->pos_delta);
4095 /* It is counted once already... */
4096 data->pos_min += minnext * (mincount - counted);
4097 data->pos_delta += - counted * deltanext +
4098 (minnext + deltanext) * maxcount - minnext * mincount;
4099 if (mincount != maxcount) {
4100 /* Cannot extend fixed substrings found inside
4102 SCAN_COMMIT(pRExC_state,data,minlenp);
4103 if (mincount && last_str) {
4104 SV * const sv = data->last_found;
4105 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4106 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4110 sv_setsv(sv, last_str);
4111 data->last_end = data->pos_min;
4112 data->last_start_min =
4113 data->pos_min - CHR_SVLEN(last_str);
4114 data->last_start_max = is_inf
4116 : data->pos_min + data->pos_delta
4117 - CHR_SVLEN(last_str);
4119 data->longest = &(data->longest_float);
4121 SvREFCNT_dec(last_str);
4123 if (data && (fl & SF_HAS_EVAL))
4124 data->flags |= SF_HAS_EVAL;
4125 optimize_curly_tail:
4126 if (OP(oscan) != CURLYX) {
4127 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4129 NEXT_OFF(oscan) += NEXT_OFF(next);
4132 default: /* REF, ANYOFV, and CLUMP only? */
4133 if (flags & SCF_DO_SUBSTR) {
4134 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4135 data->longest = &(data->longest_float);
4137 is_inf = is_inf_internal = 1;
4138 if (flags & SCF_DO_STCLASS_OR)
4139 cl_anything(pRExC_state, data->start_class);
4140 flags &= ~SCF_DO_STCLASS;
4144 else if (OP(scan) == LNBREAK) {
4145 if (flags & SCF_DO_STCLASS) {
4147 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4148 if (flags & SCF_DO_STCLASS_AND) {
4149 for (value = 0; value < 256; value++)
4150 if (!is_VERTWS_cp(value))
4151 ANYOF_BITMAP_CLEAR(data->start_class, value);
4154 for (value = 0; value < 256; value++)
4155 if (is_VERTWS_cp(value))
4156 ANYOF_BITMAP_SET(data->start_class, value);
4158 if (flags & SCF_DO_STCLASS_OR)
4159 cl_and(data->start_class, and_withp);
4160 flags &= ~SCF_DO_STCLASS;
4164 if (flags & SCF_DO_SUBSTR) {
4165 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4167 data->pos_delta += 1;
4168 data->longest = &(data->longest_float);
4171 else if (REGNODE_SIMPLE(OP(scan))) {
4174 if (flags & SCF_DO_SUBSTR) {
4175 SCAN_COMMIT(pRExC_state,data,minlenp);
4179 if (flags & SCF_DO_STCLASS) {
4180 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4182 /* Some of the logic below assumes that switching
4183 locale on will only add false positives. */
4184 switch (PL_regkind[OP(scan)]) {
4188 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4189 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4190 cl_anything(pRExC_state, data->start_class);
4193 if (OP(scan) == SANY)
4195 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4196 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4197 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4198 cl_anything(pRExC_state, data->start_class);
4200 if (flags & SCF_DO_STCLASS_AND || !value)
4201 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4204 if (flags & SCF_DO_STCLASS_AND)
4205 cl_and(data->start_class,
4206 (struct regnode_charclass_class*)scan);
4208 cl_or(pRExC_state, data->start_class,
4209 (struct regnode_charclass_class*)scan);
4212 if (flags & SCF_DO_STCLASS_AND) {
4213 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4215 if (OP(scan) == ALNUMU) {
4216 for (value = 0; value < 256; value++) {
4217 if (!isWORDCHAR_L1(value)) {
4218 ANYOF_BITMAP_CLEAR(data->start_class, value);
4222 for (value = 0; value < 256; value++) {
4223 if (!isALNUM(value)) {
4224 ANYOF_BITMAP_CLEAR(data->start_class, value);
4231 if (data->start_class->flags & ANYOF_LOCALE)
4232 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4234 /* Even if under locale, set the bits for non-locale
4235 * in case it isn't a true locale-node. This will
4236 * create false positives if it truly is locale */
4237 if (OP(scan) == ALNUMU) {
4238 for (value = 0; value < 256; value++) {
4239 if (isWORDCHAR_L1(value)) {
4240 ANYOF_BITMAP_SET(data->start_class, value);
4244 for (value = 0; value < 256; value++) {
4245 if (isALNUM(value)) {
4246 ANYOF_BITMAP_SET(data->start_class, value);
4253 if (flags & SCF_DO_STCLASS_AND) {
4254 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4255 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4256 if (OP(scan) == NALNUMU) {
4257 for (value = 0; value < 256; value++) {
4258 if (isWORDCHAR_L1(value)) {
4259 ANYOF_BITMAP_CLEAR(data->start_class, value);
4263 for (value = 0; value < 256; value++) {
4264 if (isALNUM(value)) {
4265 ANYOF_BITMAP_CLEAR(data->start_class, value);
4272 if (data->start_class->flags & ANYOF_LOCALE)
4273 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4275 /* Even if under locale, set the bits for non-locale in
4276 * case it isn't a true locale-node. This will create
4277 * false positives if it truly is locale */
4278 if (OP(scan) == NALNUMU) {
4279 for (value = 0; value < 256; value++) {
4280 if (! isWORDCHAR_L1(value)) {
4281 ANYOF_BITMAP_SET(data->start_class, value);
4285 for (value = 0; value < 256; value++) {
4286 if (! isALNUM(value)) {
4287 ANYOF_BITMAP_SET(data->start_class, value);
4294 if (flags & SCF_DO_STCLASS_AND) {
4295 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4296 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4297 if (OP(scan) == SPACEU) {
4298 for (value = 0; value < 256; value++) {
4299 if (!isSPACE_L1(value)) {
4300 ANYOF_BITMAP_CLEAR(data->start_class, value);
4304 for (value = 0; value < 256; value++) {
4305 if (!isSPACE(value)) {
4306 ANYOF_BITMAP_CLEAR(data->start_class, value);
4313 if (data->start_class->flags & ANYOF_LOCALE) {
4314 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4316 if (OP(scan) == SPACEU) {
4317 for (value = 0; value < 256; value++) {
4318 if (isSPACE_L1(value)) {
4319 ANYOF_BITMAP_SET(data->start_class, value);
4323 for (value = 0; value < 256; value++) {
4324 if (isSPACE(value)) {
4325 ANYOF_BITMAP_SET(data->start_class, value);
4332 if (flags & SCF_DO_STCLASS_AND) {
4333 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4334 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4335 if (OP(scan) == NSPACEU) {
4336 for (value = 0; value < 256; value++) {
4337 if (isSPACE_L1(value)) {
4338 ANYOF_BITMAP_CLEAR(data->start_class, value);
4342 for (value = 0; value < 256; value++) {
4343 if (isSPACE(value)) {
4344 ANYOF_BITMAP_CLEAR(data->start_class, value);
4351 if (data->start_class->flags & ANYOF_LOCALE)
4352 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4353 if (OP(scan) == NSPACEU) {
4354 for (value = 0; value < 256; value++) {
4355 if (!isSPACE_L1(value)) {
4356 ANYOF_BITMAP_SET(data->start_class, value);
4361 for (value = 0; value < 256; value++) {
4362 if (!isSPACE(value)) {
4363 ANYOF_BITMAP_SET(data->start_class, value);
4370 if (flags & SCF_DO_STCLASS_AND) {
4371 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4372 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4373 for (value = 0; value < 256; value++)
4374 if (!isDIGIT(value))
4375 ANYOF_BITMAP_CLEAR(data->start_class, value);
4379 if (data->start_class->flags & ANYOF_LOCALE)
4380 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4381 for (value = 0; value < 256; value++)
4383 ANYOF_BITMAP_SET(data->start_class, value);
4387 if (flags & SCF_DO_STCLASS_AND) {
4388 if (!(data->start_class->flags & ANYOF_LOCALE))
4389 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4390 for (value = 0; value < 256; value++)
4392 ANYOF_BITMAP_CLEAR(data->start_class, value);
4395 if (data->start_class->flags & ANYOF_LOCALE)
4396 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4397 for (value = 0; value < 256; value++)
4398 if (!isDIGIT(value))
4399 ANYOF_BITMAP_SET(data->start_class, value);
4402 CASE_SYNST_FNC(VERTWS);
4403 CASE_SYNST_FNC(HORIZWS);
4406 if (flags & SCF_DO_STCLASS_OR)
4407 cl_and(data->start_class, and_withp);
4408 flags &= ~SCF_DO_STCLASS;
4411 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4412 data->flags |= (OP(scan) == MEOL
4415 SCAN_COMMIT(pRExC_state, data, minlenp);
4418 else if ( PL_regkind[OP(scan)] == BRANCHJ
4419 /* Lookbehind, or need to calculate parens/evals/stclass: */
4420 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4421 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4422 if ( OP(scan) == UNLESSM &&
4424 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4425 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4428 regnode *upto= regnext(scan);
4430 SV * const mysv_val=sv_newmortal();
4431 DEBUG_STUDYDATA("OPFAIL",data,depth);
4433 /*DEBUG_PARSE_MSG("opfail");*/
4434 regprop(RExC_rx, mysv_val, upto);
4435 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4436 SvPV_nolen_const(mysv_val),
4437 (IV)REG_NODE_NUM(upto),
4442 NEXT_OFF(scan) = upto - scan;
4443 for (opt= scan + 1; opt < upto ; opt++)
4444 OP(opt) = OPTIMIZED;
4448 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4449 || OP(scan) == UNLESSM )
4451 /* Negative Lookahead/lookbehind
4452 In this case we can't do fixed string optimisation.
4455 I32 deltanext, minnext, fake = 0;
4457 struct regnode_charclass_class intrnl;
4460 data_fake.flags = 0;
4462 data_fake.whilem_c = data->whilem_c;
4463 data_fake.last_closep = data->last_closep;
4466 data_fake.last_closep = &fake;
4467 data_fake.pos_delta = delta;
4468 if ( flags & SCF_DO_STCLASS && !scan->flags
4469 && OP(scan) == IFMATCH ) { /* Lookahead */
4470 cl_init(pRExC_state, &intrnl);
4471 data_fake.start_class = &intrnl;
4472 f |= SCF_DO_STCLASS_AND;
4474 if (flags & SCF_WHILEM_VISITED_POS)
4475 f |= SCF_WHILEM_VISITED_POS;
4476 next = regnext(scan);
4477 nscan = NEXTOPER(NEXTOPER(scan));
4478 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4479 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4482 FAIL("Variable length lookbehind not implemented");
4484 else if (minnext > (I32)U8_MAX) {
4485 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4487 scan->flags = (U8)minnext;
4490 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4492 if (data_fake.flags & SF_HAS_EVAL)
4493 data->flags |= SF_HAS_EVAL;
4494 data->whilem_c = data_fake.whilem_c;
4496 if (f & SCF_DO_STCLASS_AND) {
4497 if (flags & SCF_DO_STCLASS_OR) {
4498 /* OR before, AND after: ideally we would recurse with
4499 * data_fake to get the AND applied by study of the
4500 * remainder of the pattern, and then derecurse;
4501 * *** HACK *** for now just treat as "no information".
4502 * See [perl #56690].
4504 cl_init(pRExC_state, data->start_class);
4506 /* AND before and after: combine and continue */
4507 const int was = (data->start_class->flags & ANYOF_EOS);
4509 cl_and(data->start_class, &intrnl);
4511 data->start_class->flags |= ANYOF_EOS;
4515 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4517 /* Positive Lookahead/lookbehind
4518 In this case we can do fixed string optimisation,
4519 but we must be careful about it. Note in the case of
4520 lookbehind the positions will be offset by the minimum
4521 length of the pattern, something we won't know about
4522 until after the recurse.
4524 I32 deltanext, fake = 0;
4526 struct regnode_charclass_class intrnl;
4528 /* We use SAVEFREEPV so that when the full compile
4529 is finished perl will clean up the allocated
4530 minlens when it's all done. This way we don't
4531 have to worry about freeing them when we know
4532 they wont be used, which would be a pain.
4535 Newx( minnextp, 1, I32 );
4536 SAVEFREEPV(minnextp);
4539 StructCopy(data, &data_fake, scan_data_t);
4540 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4543 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4544 data_fake.last_found=newSVsv(data->last_found);
4548 data_fake.last_closep = &fake;
4549 data_fake.flags = 0;
4550 data_fake.pos_delta = delta;
4552 data_fake.flags |= SF_IS_INF;
4553 if ( flags & SCF_DO_STCLASS && !scan->flags
4554 && OP(scan) == IFMATCH ) { /* Lookahead */
4555 cl_init(pRExC_state, &intrnl);
4556 data_fake.start_class = &intrnl;
4557 f |= SCF_DO_STCLASS_AND;
4559 if (flags & SCF_WHILEM_VISITED_POS)
4560 f |= SCF_WHILEM_VISITED_POS;
4561 next = regnext(scan);
4562 nscan = NEXTOPER(NEXTOPER(scan));
4564 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4565 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4568 FAIL("Variable length lookbehind not implemented");
4570 else if (*minnextp > (I32)U8_MAX) {
4571 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4573 scan->flags = (U8)*minnextp;
4578 if (f & SCF_DO_STCLASS_AND) {
4579 const int was = (data->start_class->flags & ANYOF_EOS);
4581 cl_and(data->start_class, &intrnl);
4583 data->start_class->flags |= ANYOF_EOS;
4586 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4588 if (data_fake.flags & SF_HAS_EVAL)
4589 data->flags |= SF_HAS_EVAL;
4590 data->whilem_c = data_fake.whilem_c;
4591 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4592 if (RExC_rx->minlen<*minnextp)
4593 RExC_rx->minlen=*minnextp;
4594 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4595 SvREFCNT_dec(data_fake.last_found);
4597 if ( data_fake.minlen_fixed != minlenp )
4599 data->offset_fixed= data_fake.offset_fixed;
4600 data->minlen_fixed= data_fake.minlen_fixed;
4601 data->lookbehind_fixed+= scan->flags;
4603 if ( data_fake.minlen_float != minlenp )
4605 data->minlen_float= data_fake.minlen_float;
4606 data->offset_float_min=data_fake.offset_float_min;
4607 data->offset_float_max=data_fake.offset_float_max;
4608 data->lookbehind_float+= scan->flags;
4615 else if (OP(scan) == OPEN) {
4616 if (stopparen != (I32)ARG(scan))
4619 else if (OP(scan) == CLOSE) {
4620 if (stopparen == (I32)ARG(scan)) {
4623 if ((I32)ARG(scan) == is_par) {
4624 next = regnext(scan);
4626 if ( next && (OP(next) != WHILEM) && next < last)
4627 is_par = 0; /* Disable optimization */
4630 *(data->last_closep) = ARG(scan);
4632 else if (OP(scan) == EVAL) {
4634 data->flags |= SF_HAS_EVAL;
4636 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4637 if (flags & SCF_DO_SUBSTR) {
4638 SCAN_COMMIT(pRExC_state,data,minlenp);
4639 flags &= ~SCF_DO_SUBSTR;
4641 if (data && OP(scan)==ACCEPT) {
4642 data->flags |= SCF_SEEN_ACCEPT;
4647 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4649 if (flags & SCF_DO_SUBSTR) {
4650 SCAN_COMMIT(pRExC_state,data,minlenp);
4651 data->longest = &(data->longest_float);
4653 is_inf = is_inf_internal = 1;
4654 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4655 cl_anything(pRExC_state, data->start_class);
4656 flags &= ~SCF_DO_STCLASS;
4658 else if (OP(scan) == GPOS) {
4659 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4660 !(delta || is_inf || (data && data->pos_delta)))
4662 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4663 RExC_rx->extflags |= RXf_ANCH_GPOS;
4664 if (RExC_rx->gofs < (U32)min)
4665 RExC_rx->gofs = min;
4667 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4671 #ifdef TRIE_STUDY_OPT
4672 #ifdef FULL_TRIE_STUDY
4673 else if (PL_regkind[OP(scan)] == TRIE) {
4674 /* NOTE - There is similar code to this block above for handling
4675 BRANCH nodes on the initial study. If you change stuff here
4677 regnode *trie_node= scan;
4678 regnode *tail= regnext(scan);
4679 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4680 I32 max1 = 0, min1 = I32_MAX;
4681 struct regnode_charclass_class accum;
4683 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4684 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4685 if (flags & SCF_DO_STCLASS)
4686 cl_init_zero(pRExC_state, &accum);
4692 const regnode *nextbranch= NULL;
4695 for ( word=1 ; word <= trie->wordcount ; word++)
4697 I32 deltanext=0, minnext=0, f = 0, fake;
4698 struct regnode_charclass_class this_class;
4700 data_fake.flags = 0;
4702 data_fake.whilem_c = data->whilem_c;
4703 data_fake.last_closep = data->last_closep;
4706 data_fake.last_closep = &fake;
4707 data_fake.pos_delta = delta;
4708 if (flags & SCF_DO_STCLASS) {
4709 cl_init(pRExC_state, &this_class);
4710 data_fake.start_class = &this_class;
4711 f = SCF_DO_STCLASS_AND;
4713 if (flags & SCF_WHILEM_VISITED_POS)
4714 f |= SCF_WHILEM_VISITED_POS;
4716 if (trie->jump[word]) {
4718 nextbranch = trie_node + trie->jump[0];
4719 scan= trie_node + trie->jump[word];
4720 /* We go from the jump point to the branch that follows
4721 it. Note this means we need the vestigal unused branches
4722 even though they arent otherwise used.
4724 minnext = study_chunk(pRExC_state, &scan, minlenp,
4725 &deltanext, (regnode *)nextbranch, &data_fake,
4726 stopparen, recursed, NULL, f,depth+1);
4728 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4729 nextbranch= regnext((regnode*)nextbranch);
4731 if (min1 > (I32)(minnext + trie->minlen))
4732 min1 = minnext + trie->minlen;
4733 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4734 max1 = minnext + deltanext + trie->maxlen;
4735 if (deltanext == I32_MAX)
4736 is_inf = is_inf_internal = 1;
4738 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4740 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4741 if ( stopmin > min + min1)
4742 stopmin = min + min1;
4743 flags &= ~SCF_DO_SUBSTR;
4745 data->flags |= SCF_SEEN_ACCEPT;
4748 if (data_fake.flags & SF_HAS_EVAL)
4749 data->flags |= SF_HAS_EVAL;
4750 data->whilem_c = data_fake.whilem_c;
4752 if (flags & SCF_DO_STCLASS)
4753 cl_or(pRExC_state, &accum, &this_class);
4756 if (flags & SCF_DO_SUBSTR) {
4757 data->pos_min += min1;
4758 data->pos_delta += max1 - min1;
4759 if (max1 != min1 || is_inf)
4760 data->longest = &(data->longest_float);
4763 delta += max1 - min1;
4764 if (flags & SCF_DO_STCLASS_OR) {
4765 cl_or(pRExC_state, data->start_class, &accum);
4767 cl_and(data->start_class, and_withp);
4768 flags &= ~SCF_DO_STCLASS;
4771 else if (flags & SCF_DO_STCLASS_AND) {
4773 cl_and(data->start_class, &accum);
4774 flags &= ~SCF_DO_STCLASS;
4777 /* Switch to OR mode: cache the old value of
4778 * data->start_class */
4780 StructCopy(data->start_class, and_withp,
4781 struct regnode_charclass_class);
4782 flags &= ~SCF_DO_STCLASS_AND;
4783 StructCopy(&accum, data->start_class,
4784 struct regnode_charclass_class);
4785 flags |= SCF_DO_STCLASS_OR;
4786 data->start_class->flags |= ANYOF_EOS;
4793 else if (PL_regkind[OP(scan)] == TRIE) {
4794 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4797 min += trie->minlen;
4798 delta += (trie->maxlen - trie->minlen);
4799 flags &= ~SCF_DO_STCLASS; /* xxx */
4800 if (flags & SCF_DO_SUBSTR) {
4801 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4802 data->pos_min += trie->minlen;
4803 data->pos_delta += (trie->maxlen - trie->minlen);
4804 if (trie->maxlen != trie->minlen)
4805 data->longest = &(data->longest_float);
4807 if (trie->jump) /* no more substrings -- for now /grr*/
4808 flags &= ~SCF_DO_SUBSTR;
4810 #endif /* old or new */
4811 #endif /* TRIE_STUDY_OPT */
4813 /* Else: zero-length, ignore. */
4814 scan = regnext(scan);
4819 stopparen = frame->stop;
4820 frame = frame->prev;
4821 goto fake_study_recurse;
4826 DEBUG_STUDYDATA("pre-fin:",data,depth);
4829 *deltap = is_inf_internal ? I32_MAX : delta;
4830 if (flags & SCF_DO_SUBSTR && is_inf)
4831 data->pos_delta = I32_MAX - data->pos_min;
4832 if (is_par > (I32)U8_MAX)
4834 if (is_par && pars==1 && data) {
4835 data->flags |= SF_IN_PAR;
4836 data->flags &= ~SF_HAS_PAR;
4838 else if (pars && data) {
4839 data->flags |= SF_HAS_PAR;
4840 data->flags &= ~SF_IN_PAR;
4842 if (flags & SCF_DO_STCLASS_OR)
4843 cl_and(data->start_class, and_withp);
4844 if (flags & SCF_TRIE_RESTUDY)
4845 data->flags |= SCF_TRIE_RESTUDY;
4847 DEBUG_STUDYDATA("post-fin:",data,depth);
4849 return min < stopmin ? min : stopmin;
4853 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4855 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4857 PERL_ARGS_ASSERT_ADD_DATA;
4859 Renewc(RExC_rxi->data,
4860 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4861 char, struct reg_data);
4863 Renew(RExC_rxi->data->what, count + n, U8);
4865 Newx(RExC_rxi->data->what, n, U8);
4866 RExC_rxi->data->count = count + n;
4867 Copy(s, RExC_rxi->data->what + count, n, U8);
4871 /*XXX: todo make this not included in a non debugging perl */
4872 #ifndef PERL_IN_XSUB_RE
4874 Perl_reginitcolors(pTHX)
4877 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4879 char *t = savepv(s);
4883 t = strchr(t, '\t');
4889 PL_colors[i] = t = (char *)"";
4894 PL_colors[i++] = (char *)"";
4901 #ifdef TRIE_STUDY_OPT
4902 #define CHECK_RESTUDY_GOTO \
4904 (data.flags & SCF_TRIE_RESTUDY) \
4908 #define CHECK_RESTUDY_GOTO
4912 * pregcomp - compile a regular expression into internal code
4914 * Decides which engine's compiler to call based on the hint currently in
4918 #ifndef PERL_IN_XSUB_RE
4920 /* return the currently in-scope regex engine (or the default if none) */
4922 regexp_engine const *
4923 Perl_current_re_engine(pTHX)
4927 if (IN_PERL_COMPILETIME) {
4928 HV * const table = GvHV(PL_hintgv);
4932 return &PL_core_reg_engine;
4933 ptr = hv_fetchs(table, "regcomp", FALSE);
4934 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4935 return &PL_core_reg_engine;
4936 return INT2PTR(regexp_engine*,SvIV(*ptr));
4940 if (!PL_curcop->cop_hints_hash)
4941 return &PL_core_reg_engine;
4942 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4943 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4944 return &PL_core_reg_engine;
4945 return INT2PTR(regexp_engine*,SvIV(ptr));
4951 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4954 regexp_engine const *eng = current_re_engine();
4955 GET_RE_DEBUG_FLAGS_DECL;
4957 PERL_ARGS_ASSERT_PREGCOMP;
4959 /* Dispatch a request to compile a regexp to correct regexp engine. */
4961 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4964 return CALLREGCOMP_ENG(eng, pattern, flags);
4968 /* public(ish) entry point for the perl core's own regex compiling code.
4969 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4970 * pattern rather than a list of OPs, and uses the internal engine rather
4971 * than the current one */
4974 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4976 SV *pat = pattern; /* defeat constness! */
4977 PERL_ARGS_ASSERT_RE_COMPILE;
4978 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4979 #ifdef PERL_IN_XSUB_RE
4982 &PL_core_reg_engine,
4984 NULL, NULL, rx_flags, 0);
4987 /* see if there are any run-time code blocks in the pattern.
4988 * False positives are allowed */
4991 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4992 U32 pm_flags, char *pat, STRLEN plen)
4997 /* avoid infinitely recursing when we recompile the pattern parcelled up
4998 * as qr'...'. A single constant qr// string can't have have any
4999 * run-time component in it, and thus, no runtime code. (A non-qr
5000 * string, however, can, e.g. $x =~ '(?{})') */
5001 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
5004 for (s = 0; s < plen; s++) {
5005 if (n < pRExC_state->num_code_blocks
5006 && s == pRExC_state->code_blocks[n].start)
5008 s = pRExC_state->code_blocks[n].end;
5012 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5014 if (pat[s] == '(' && pat[s+1] == '?' &&
5015 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
5022 /* Handle run-time code blocks. We will already have compiled any direct
5023 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5024 * copy of it, but with any literal code blocks blanked out and
5025 * appropriate chars escaped; then feed it into
5027 * eval "qr'modified_pattern'"
5031 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5035 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5037 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5038 * and merge them with any code blocks of the original regexp.
5040 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5041 * instead, just save the qr and return FALSE; this tells our caller that
5042 * the original pattern needs upgrading to utf8.
5046 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5047 char *pat, STRLEN plen)
5051 GET_RE_DEBUG_FLAGS_DECL;
5053 if (pRExC_state->runtime_code_qr) {
5054 /* this is the second time we've been called; this should
5055 * only happen if the main pattern got upgraded to utf8
5056 * during compilation; re-use the qr we compiled first time
5057 * round (which should be utf8 too)
5059 qr = pRExC_state->runtime_code_qr;
5060 pRExC_state->runtime_code_qr = NULL;
5061 assert(RExC_utf8 && SvUTF8(qr));
5067 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5071 /* determine how many extra chars we need for ' and \ escaping */
5072 for (s = 0; s < plen; s++) {
5073 if (pat[s] == '\'' || pat[s] == '\\')
5077 Newx(newpat, newlen, char);
5079 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5081 for (s = 0; s < plen; s++) {
5082 if (n < pRExC_state->num_code_blocks
5083 && s == pRExC_state->code_blocks[n].start)
5085 /* blank out literal code block */
5086 assert(pat[s] == '(');
5087 while (s <= pRExC_state->code_blocks[n].end) {
5095 if (pat[s] == '\'' || pat[s] == '\\')
5100 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5104 PerlIO_printf(Perl_debug_log,
5105 "%sre-parsing pattern for runtime code:%s %s\n",
5106 PL_colors[4],PL_colors[5],newpat);
5109 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5115 PUSHSTACKi(PERLSI_REQUIRE);
5116 /* this causes the toker to collapse \\ into \ when parsing
5117 * qr''; normally only q'' does this. It also alters hints
5119 PL_reg_state.re_reparsing = TRUE;
5120 eval_sv(sv, G_SCALAR);
5126 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5127 assert(SvROK(qr_ref));
5129 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5130 /* the leaving below frees the tmp qr_ref.
5131 * Give qr a life of its own */
5139 if (!RExC_utf8 && SvUTF8(qr)) {
5140 /* first time through; the pattern got upgraded; save the
5141 * qr for the next time through */
5142 assert(!pRExC_state->runtime_code_qr);
5143 pRExC_state->runtime_code_qr = qr;
5148 /* extract any code blocks within the returned qr// */
5151 /* merge the main (r1) and run-time (r2) code blocks into one */
5153 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5154 struct reg_code_block *new_block, *dst;
5155 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5158 if (!r2->num_code_blocks) /* we guessed wrong */
5162 r1->num_code_blocks + r2->num_code_blocks,
5163 struct reg_code_block);
5166 while ( i1 < r1->num_code_blocks
5167 || i2 < r2->num_code_blocks)
5169 struct reg_code_block *src;
5172 if (i1 == r1->num_code_blocks) {
5173 src = &r2->code_blocks[i2++];
5176 else if (i2 == r2->num_code_blocks)
5177 src = &r1->code_blocks[i1++];
5178 else if ( r1->code_blocks[i1].start
5179 < r2->code_blocks[i2].start)
5181 src = &r1->code_blocks[i1++];
5182 assert(src->end < r2->code_blocks[i2].start);
5185 assert( r1->code_blocks[i1].start
5186 > r2->code_blocks[i2].start);
5187 src = &r2->code_blocks[i2++];
5189 assert(src->end < r1->code_blocks[i1].start);
5192 assert(pat[src->start] == '(');
5193 assert(pat[src->end] == ')');
5194 dst->start = src->start;
5195 dst->end = src->end;
5196 dst->block = src->block;
5197 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5201 r1->num_code_blocks += r2->num_code_blocks;
5202 Safefree(r1->code_blocks);
5203 r1->code_blocks = new_block;
5212 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5214 /* This is the common code for setting up the floating and fixed length
5215 * string data extracted from Perlre_op_compile() below. Returns a boolean
5216 * as to whether succeeded or not */
5220 if (! (longest_length
5221 || (eol /* Can't have SEOL and MULTI */
5222 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5224 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5225 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5230 /* copy the information about the longest from the reg_scan_data
5231 over to the program. */
5232 if (SvUTF8(sv_longest)) {
5233 *rx_utf8 = sv_longest;
5236 *rx_substr = sv_longest;
5239 /* end_shift is how many chars that must be matched that
5240 follow this item. We calculate it ahead of time as once the
5241 lookbehind offset is added in we lose the ability to correctly
5243 ml = minlen ? *(minlen) : (I32)longest_length;
5244 *rx_end_shift = ml - offset
5245 - longest_length + (SvTAIL(sv_longest) != 0)
5248 t = (eol/* Can't have SEOL and MULTI */
5249 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5250 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5256 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5257 * regular expression into internal code.
5258 * The pattern may be passed either as:
5259 * a list of SVs (patternp plus pat_count)
5260 * a list of OPs (expr)
5261 * If both are passed, the SV list is used, but the OP list indicates
5262 * which SVs are actually pre-compiled code blocks
5264 * The SVs in the list have magic and qr overloading applied to them (and
5265 * the list may be modified in-place with replacement SVs in the latter
5268 * If the pattern hasn't changed from old_re, then old_re will be
5271 * eng is the current engine. If that engine has an op_comp method, then
5272 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5273 * do the initial concatenation of arguments and pass on to the external
5276 * If is_bare_re is not null, set it to a boolean indicating whether the
5277 * arg list reduced (after overloading) to a single bare regex which has
5278 * been returned (i.e. /$qr/).
5280 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5282 * pm_flags contains the PMf_* flags, typically based on those from the
5283 * pm_flags field of the related PMOP. Currently we're only interested in
5284 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5286 * We can't allocate space until we know how big the compiled form will be,
5287 * but we can't compile it (and thus know how big it is) until we've got a
5288 * place to put the code. So we cheat: we compile it twice, once with code
5289 * generation turned off and size counting turned on, and once "for real".
5290 * This also means that we don't allocate space until we are sure that the
5291 * thing really will compile successfully, and we never have to move the
5292 * code and thus invalidate pointers into it. (Note that it has to be in
5293 * one piece because free() must be able to free it all.) [NB: not true in perl]
5295 * Beware that the optimization-preparation code in here knows about some
5296 * of the structure of the compiled regexp. [I'll say.]
5300 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5301 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5302 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5307 regexp_internal *ri;
5317 /* these are all flags - maybe they should be turned
5318 * into a single int with different bit masks */
5319 I32 sawlookahead = 0;
5322 bool used_setjump = FALSE;
5323 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5324 bool code_is_utf8 = 0;
5325 bool VOL recompile = 0;
5326 bool runtime_code = 0;
5330 RExC_state_t RExC_state;
5331 RExC_state_t * const pRExC_state = &RExC_state;
5332 #ifdef TRIE_STUDY_OPT
5334 RExC_state_t copyRExC_state;
5336 GET_RE_DEBUG_FLAGS_DECL;
5338 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5340 DEBUG_r(if (!PL_colorset) reginitcolors());
5342 #ifndef PERL_IN_XSUB_RE
5343 /* Initialize these here instead of as-needed, as is quick and avoids
5344 * having to test them each time otherwise */
5345 if (! PL_AboveLatin1) {
5346 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5347 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5348 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5350 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5351 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5353 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5354 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5356 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5357 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5359 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5361 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5362 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5364 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5366 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5367 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5369 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5370 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5372 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5373 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5375 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5376 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5378 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5379 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5381 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5382 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5384 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5385 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5387 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5389 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5390 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5392 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5393 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5397 pRExC_state->code_blocks = NULL;
5398 pRExC_state->num_code_blocks = 0;
5401 *is_bare_re = FALSE;
5403 if (expr && (expr->op_type == OP_LIST ||
5404 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5406 /* is the source UTF8, and how many code blocks are there? */
5410 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5411 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5413 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5414 /* count of DO blocks */
5418 pRExC_state->num_code_blocks = ncode;
5419 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5424 /* handle a list of SVs */
5428 /* apply magic and RE overloading to each arg */
5429 for (svp = patternp; svp < patternp + pat_count; svp++) {
5432 if (SvROK(rx) && SvAMAGIC(rx)) {
5433 SV *sv = AMG_CALLunary(rx, regexp_amg);
5437 if (SvTYPE(sv) != SVt_REGEXP)
5438 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5444 if (pat_count > 1) {
5445 /* concat multiple args and find any code block indexes */
5450 STRLEN orig_patlen = 0;
5452 if (pRExC_state->num_code_blocks) {
5453 o = cLISTOPx(expr)->op_first;
5454 assert(o->op_type == OP_PUSHMARK);
5458 pat = newSVpvn("", 0);
5461 /* determine if the pattern is going to be utf8 (needed
5462 * in advance to align code block indices correctly).
5463 * XXX This could fail to be detected for an arg with
5464 * overloading but not concat overloading; but the main effect
5465 * in this obscure case is to need a 'use re eval' for a
5466 * literal code block */
5467 for (svp = patternp; svp < patternp + pat_count; svp++) {
5474 for (svp = patternp; svp < patternp + pat_count; svp++) {
5475 SV *sv, *msv = *svp;
5479 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5480 assert(n < pRExC_state->num_code_blocks);
5481 pRExC_state->code_blocks[n].start = SvCUR(pat);
5482 pRExC_state->code_blocks[n].block = o;
5483 pRExC_state->code_blocks[n].src_regex = NULL;
5486 o = o->op_sibling; /* skip CONST */
5492 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5493 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5496 /* overloading involved: all bets are off over literal
5497 * code. Pretend we haven't seen it */
5498 pRExC_state->num_code_blocks -= n;
5504 while (SvAMAGIC(msv)
5505 && (sv = AMG_CALLunary(msv, string_amg))
5511 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5513 orig_patlen = SvCUR(pat);
5514 sv_catsv_nomg(pat, msv);
5517 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5520 /* extract any code blocks within any embedded qr//'s */
5521 if (rx && SvTYPE(rx) == SVt_REGEXP
5522 && RX_ENGINE((REGEXP*)rx)->op_comp)
5525 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5526 if (ri->num_code_blocks) {
5528 /* the presence of an embedded qr// with code means
5529 * we should always recompile: the text of the
5530 * qr// may not have changed, but it may be a
5531 * different closure than last time */
5533 Renew(pRExC_state->code_blocks,
5534 pRExC_state->num_code_blocks + ri->num_code_blocks,
5535 struct reg_code_block);
5536 pRExC_state->num_code_blocks += ri->num_code_blocks;
5537 for (i=0; i < ri->num_code_blocks; i++) {
5538 struct reg_code_block *src, *dst;
5539 STRLEN offset = orig_patlen
5540 + ((struct regexp *)SvANY(rx))->pre_prefix;
5541 assert(n < pRExC_state->num_code_blocks);
5542 src = &ri->code_blocks[i];
5543 dst = &pRExC_state->code_blocks[n];
5544 dst->start = src->start + offset;
5545 dst->end = src->end + offset;
5546 dst->block = src->block;
5547 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5561 while (SvAMAGIC(pat)
5562 && (sv = AMG_CALLunary(pat, string_amg))
5570 /* handle bare regex: foo =~ $re */
5575 if (SvTYPE(re) == SVt_REGEXP) {
5579 Safefree(pRExC_state->code_blocks);
5585 /* not a list of SVs, so must be a list of OPs */
5587 if (expr->op_type == OP_LIST) {
5592 pat = newSVpvn("", 0);
5597 /* given a list of CONSTs and DO blocks in expr, append all
5598 * the CONSTs to pat, and record the start and end of each
5599 * code block in code_blocks[] (each DO{} op is followed by an
5600 * OP_CONST containing the corresponding literal '(?{...})
5603 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5604 if (o->op_type == OP_CONST) {
5605 sv_catsv(pat, cSVOPo_sv);
5607 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5611 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5612 assert(i+1 < pRExC_state->num_code_blocks);
5613 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5614 pRExC_state->code_blocks[i].block = o;
5615 pRExC_state->code_blocks[i].src_regex = NULL;
5621 assert(expr->op_type == OP_CONST);
5622 pat = cSVOPx_sv(expr);
5626 exp = SvPV_nomg(pat, plen);
5628 if (!eng->op_comp) {
5629 if ((SvUTF8(pat) && IN_BYTES)
5630 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5632 /* make a temporary copy; either to convert to bytes,
5633 * or to avoid repeating get-magic / overloaded stringify */
5634 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5635 (IN_BYTES ? 0 : SvUTF8(pat)));
5637 Safefree(pRExC_state->code_blocks);
5638 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5641 /* ignore the utf8ness if the pattern is 0 length */
5642 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5643 RExC_uni_semantics = 0;
5644 RExC_contains_locale = 0;
5645 pRExC_state->runtime_code_qr = NULL;
5647 /****************** LONG JUMP TARGET HERE***********************/
5648 /* Longjmp back to here if have to switch in midstream to utf8 */
5649 if (! RExC_orig_utf8) {
5650 JMPENV_PUSH(jump_ret);
5651 used_setjump = TRUE;
5654 if (jump_ret == 0) { /* First time through */
5658 SV *dsv= sv_newmortal();
5659 RE_PV_QUOTED_DECL(s, RExC_utf8,
5660 dsv, exp, plen, 60);
5661 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5662 PL_colors[4],PL_colors[5],s);
5665 else { /* longjumped back */
5668 STRLEN s = 0, d = 0;
5671 /* If the cause for the longjmp was other than changing to utf8, pop
5672 * our own setjmp, and longjmp to the correct handler */
5673 if (jump_ret != UTF8_LONGJMP) {
5675 JMPENV_JUMP(jump_ret);
5680 /* It's possible to write a regexp in ascii that represents Unicode
5681 codepoints outside of the byte range, such as via \x{100}. If we
5682 detect such a sequence we have to convert the entire pattern to utf8
5683 and then recompile, as our sizing calculation will have been based
5684 on 1 byte == 1 character, but we will need to use utf8 to encode
5685 at least some part of the pattern, and therefore must convert the whole
5688 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5689 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5691 /* upgrade pattern to UTF8, and if there are code blocks,
5692 * recalculate the indices.
5693 * This is essentially an unrolled Perl_bytes_to_utf8() */
5695 src = (U8*)SvPV_nomg(pat, plen);
5696 Newx(dst, plen * 2 + 1, U8);
5699 const UV uv = NATIVE_TO_ASCII(src[s]);
5700 if (UNI_IS_INVARIANT(uv))
5701 dst[d] = (U8)UTF_TO_NATIVE(uv);
5703 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5704 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5706 if (n < pRExC_state->num_code_blocks) {
5707 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5708 pRExC_state->code_blocks[n].start = d;
5709 assert(dst[d] == '(');
5712 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5713 pRExC_state->code_blocks[n].end = d;
5714 assert(dst[d] == ')');
5727 RExC_orig_utf8 = RExC_utf8 = 1;
5730 /* return old regex if pattern hasn't changed */
5734 && !!RX_UTF8(old_re) == !!RExC_utf8
5735 && RX_PRECOMP(old_re)
5736 && RX_PRELEN(old_re) == plen
5737 && memEQ(RX_PRECOMP(old_re), exp, plen))
5739 /* with runtime code, always recompile */
5740 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5742 if (!runtime_code) {
5746 Safefree(pRExC_state->code_blocks);
5750 else if ((pm_flags & PMf_USE_RE_EVAL)
5751 /* this second condition covers the non-regex literal case,
5752 * i.e. $foo =~ '(?{})'. */
5753 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5754 && (PL_hints & HINT_RE_EVAL))
5756 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5759 #ifdef TRIE_STUDY_OPT
5763 rx_flags = orig_rx_flags;
5765 if (initial_charset == REGEX_LOCALE_CHARSET) {
5766 RExC_contains_locale = 1;
5768 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5770 /* Set to use unicode semantics if the pattern is in utf8 and has the
5771 * 'depends' charset specified, as it means unicode when utf8 */
5772 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5776 RExC_flags = rx_flags;
5777 RExC_pm_flags = pm_flags;
5780 if (PL_tainting && PL_tainted)
5781 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5783 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5784 /* whoops, we have a non-utf8 pattern, whilst run-time code
5785 * got compiled as utf8. Try again with a utf8 pattern */
5786 JMPENV_JUMP(UTF8_LONGJMP);
5789 assert(!pRExC_state->runtime_code_qr);
5794 RExC_in_lookbehind = 0;
5795 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5797 RExC_override_recoding = 0;
5799 /* First pass: determine size, legality. */
5807 RExC_emit = &PL_regdummy;
5808 RExC_whilem_seen = 0;
5809 RExC_open_parens = NULL;
5810 RExC_close_parens = NULL;
5812 RExC_paren_names = NULL;
5814 RExC_paren_name_list = NULL;
5816 RExC_recurse = NULL;
5817 RExC_recurse_count = 0;
5818 pRExC_state->code_index = 0;
5820 #if 0 /* REGC() is (currently) a NOP at the first pass.
5821 * Clever compilers notice this and complain. --jhi */
5822 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5825 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5827 RExC_lastparse=NULL;
5829 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5830 RExC_precomp = NULL;
5831 Safefree(pRExC_state->code_blocks);
5835 /* Here, finished first pass. Get rid of any added setjmp */
5841 PerlIO_printf(Perl_debug_log,
5842 "Required size %"IVdf" nodes\n"
5843 "Starting second pass (creation)\n",
5846 RExC_lastparse=NULL;
5849 /* The first pass could have found things that force Unicode semantics */
5850 if ((RExC_utf8 || RExC_uni_semantics)
5851 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5853 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5856 /* Small enough for pointer-storage convention?
5857 If extralen==0, this means that we will not need long jumps. */
5858 if (RExC_size >= 0x10000L && RExC_extralen)
5859 RExC_size += RExC_extralen;
5862 if (RExC_whilem_seen > 15)
5863 RExC_whilem_seen = 15;
5865 /* Allocate space and zero-initialize. Note, the two step process
5866 of zeroing when in debug mode, thus anything assigned has to
5867 happen after that */
5868 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5869 r = (struct regexp*)SvANY(rx);
5870 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5871 char, regexp_internal);
5872 if ( r == NULL || ri == NULL )
5873 FAIL("Regexp out of space");
5875 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5876 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5878 /* bulk initialize base fields with 0. */
5879 Zero(ri, sizeof(regexp_internal), char);
5882 /* non-zero initialization begins here */
5885 r->extflags = rx_flags;
5886 if (pm_flags & PMf_IS_QR) {
5887 ri->code_blocks = pRExC_state->code_blocks;
5888 ri->num_code_blocks = pRExC_state->num_code_blocks;
5891 SAVEFREEPV(pRExC_state->code_blocks);
5894 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5895 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5897 /* The caret is output if there are any defaults: if not all the STD
5898 * flags are set, or if no character set specifier is needed */
5900 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5902 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5903 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5904 >> RXf_PMf_STD_PMMOD_SHIFT);
5905 const char *fptr = STD_PAT_MODS; /*"msix"*/
5907 /* Allocate for the worst case, which is all the std flags are turned
5908 * on. If more precision is desired, we could do a population count of
5909 * the flags set. This could be done with a small lookup table, or by
5910 * shifting, masking and adding, or even, when available, assembly
5911 * language for a machine-language population count.
5912 * We never output a minus, as all those are defaults, so are
5913 * covered by the caret */
5914 const STRLEN wraplen = plen + has_p + has_runon
5915 + has_default /* If needs a caret */
5917 /* If needs a character set specifier */
5918 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5919 + (sizeof(STD_PAT_MODS) - 1)
5920 + (sizeof("(?:)") - 1);
5922 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5925 SvFLAGS(rx) |= SVf_UTF8;
5928 /* If a default, cover it using the caret */
5930 *p++= DEFAULT_PAT_MOD;
5934 const char* const name = get_regex_charset_name(r->extflags, &len);
5935 Copy(name, p, len, char);
5939 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5942 while((ch = *fptr++)) {
5950 Copy(RExC_precomp, p, plen, char);
5951 assert ((RX_WRAPPED(rx) - p) < 16);
5952 r->pre_prefix = p - RX_WRAPPED(rx);
5958 SvCUR_set(rx, p - SvPVX_const(rx));
5962 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5964 if (RExC_seen & REG_SEEN_RECURSE) {
5965 Newxz(RExC_open_parens, RExC_npar,regnode *);
5966 SAVEFREEPV(RExC_open_parens);
5967 Newxz(RExC_close_parens,RExC_npar,regnode *);
5968 SAVEFREEPV(RExC_close_parens);
5971 /* Useful during FAIL. */
5972 #ifdef RE_TRACK_PATTERN_OFFSETS
5973 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5974 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5975 "%s %"UVuf" bytes for offset annotations.\n",
5976 ri->u.offsets ? "Got" : "Couldn't get",
5977 (UV)((2*RExC_size+1) * sizeof(U32))));
5979 SetProgLen(ri,RExC_size);
5983 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5985 /* Second pass: emit code. */
5986 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5987 RExC_pm_flags = pm_flags;
5992 RExC_emit_start = ri->program;
5993 RExC_emit = ri->program;
5994 RExC_emit_bound = ri->program + RExC_size + 1;
5995 pRExC_state->code_index = 0;
5997 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5998 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6002 /* XXXX To minimize changes to RE engine we always allocate
6003 3-units-long substrs field. */
6004 Newx(r->substrs, 1, struct reg_substr_data);
6005 if (RExC_recurse_count) {
6006 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6007 SAVEFREEPV(RExC_recurse);
6011 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6012 Zero(r->substrs, 1, struct reg_substr_data);
6014 #ifdef TRIE_STUDY_OPT
6016 StructCopy(&zero_scan_data, &data, scan_data_t);
6017 copyRExC_state = RExC_state;
6020 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6022 RExC_state = copyRExC_state;
6023 if (seen & REG_TOP_LEVEL_BRANCHES)
6024 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6026 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6027 if (data.last_found) {
6028 SvREFCNT_dec(data.longest_fixed);
6029 SvREFCNT_dec(data.longest_float);
6030 SvREFCNT_dec(data.last_found);
6032 StructCopy(&zero_scan_data, &data, scan_data_t);
6035 StructCopy(&zero_scan_data, &data, scan_data_t);
6038 /* Dig out information for optimizations. */
6039 r->extflags = RExC_flags; /* was pm_op */
6040 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6043 SvUTF8_on(rx); /* Unicode in it? */
6044 ri->regstclass = NULL;
6045 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6046 r->intflags |= PREGf_NAUGHTY;
6047 scan = ri->program + 1; /* First BRANCH. */
6049 /* testing for BRANCH here tells us whether there is "must appear"
6050 data in the pattern. If there is then we can use it for optimisations */
6051 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6053 STRLEN longest_float_length, longest_fixed_length;
6054 struct regnode_charclass_class ch_class; /* pointed to by data */
6056 I32 last_close = 0; /* pointed to by data */
6057 regnode *first= scan;
6058 regnode *first_next= regnext(first);
6060 * Skip introductions and multiplicators >= 1
6061 * so that we can extract the 'meat' of the pattern that must
6062 * match in the large if() sequence following.
6063 * NOTE that EXACT is NOT covered here, as it is normally
6064 * picked up by the optimiser separately.
6066 * This is unfortunate as the optimiser isnt handling lookahead
6067 * properly currently.
6070 while ((OP(first) == OPEN && (sawopen = 1)) ||
6071 /* An OR of *one* alternative - should not happen now. */
6072 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6073 /* for now we can't handle lookbehind IFMATCH*/
6074 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6075 (OP(first) == PLUS) ||
6076 (OP(first) == MINMOD) ||
6077 /* An {n,m} with n>0 */
6078 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6079 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6082 * the only op that could be a regnode is PLUS, all the rest
6083 * will be regnode_1 or regnode_2.
6086 if (OP(first) == PLUS)
6089 first += regarglen[OP(first)];
6091 first = NEXTOPER(first);
6092 first_next= regnext(first);
6095 /* Starting-point info. */
6097 DEBUG_PEEP("first:",first,0);
6098 /* Ignore EXACT as we deal with it later. */
6099 if (PL_regkind[OP(first)] == EXACT) {
6100 if (OP(first) == EXACT)
6101 NOOP; /* Empty, get anchored substr later. */
6103 ri->regstclass = first;
6106 else if (PL_regkind[OP(first)] == TRIE &&
6107 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6110 /* this can happen only on restudy */
6111 if ( OP(first) == TRIE ) {
6112 struct regnode_1 *trieop = (struct regnode_1 *)
6113 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6114 StructCopy(first,trieop,struct regnode_1);
6115 trie_op=(regnode *)trieop;
6117 struct regnode_charclass *trieop = (struct regnode_charclass *)
6118 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6119 StructCopy(first,trieop,struct regnode_charclass);
6120 trie_op=(regnode *)trieop;
6123 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6124 ri->regstclass = trie_op;
6127 else if (REGNODE_SIMPLE(OP(first)))
6128 ri->regstclass = first;
6129 else if (PL_regkind[OP(first)] == BOUND ||
6130 PL_regkind[OP(first)] == NBOUND)
6131 ri->regstclass = first;
6132 else if (PL_regkind[OP(first)] == BOL) {
6133 r->extflags |= (OP(first) == MBOL
6135 : (OP(first) == SBOL
6138 first = NEXTOPER(first);
6141 else if (OP(first) == GPOS) {
6142 r->extflags |= RXf_ANCH_GPOS;
6143 first = NEXTOPER(first);
6146 else if ((!sawopen || !RExC_sawback) &&
6147 (OP(first) == STAR &&
6148 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6149 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6151 /* turn .* into ^.* with an implied $*=1 */
6153 (OP(NEXTOPER(first)) == REG_ANY)
6156 r->extflags |= type;
6157 r->intflags |= PREGf_IMPLICIT;
6158 first = NEXTOPER(first);
6161 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6162 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6163 /* x+ must match at the 1st pos of run of x's */
6164 r->intflags |= PREGf_SKIP;
6166 /* Scan is after the zeroth branch, first is atomic matcher. */
6167 #ifdef TRIE_STUDY_OPT
6170 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6171 (IV)(first - scan + 1))
6175 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6176 (IV)(first - scan + 1))
6182 * If there's something expensive in the r.e., find the
6183 * longest literal string that must appear and make it the
6184 * regmust. Resolve ties in favor of later strings, since
6185 * the regstart check works with the beginning of the r.e.
6186 * and avoiding duplication strengthens checking. Not a
6187 * strong reason, but sufficient in the absence of others.
6188 * [Now we resolve ties in favor of the earlier string if
6189 * it happens that c_offset_min has been invalidated, since the
6190 * earlier string may buy us something the later one won't.]
6193 data.longest_fixed = newSVpvs("");
6194 data.longest_float = newSVpvs("");
6195 data.last_found = newSVpvs("");
6196 data.longest = &(data.longest_fixed);
6198 if (!ri->regstclass) {
6199 cl_init(pRExC_state, &ch_class);
6200 data.start_class = &ch_class;
6201 stclass_flag = SCF_DO_STCLASS_AND;
6202 } else /* XXXX Check for BOUND? */
6204 data.last_closep = &last_close;
6206 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6207 &data, -1, NULL, NULL,
6208 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6214 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6215 && data.last_start_min == 0 && data.last_end > 0
6216 && !RExC_seen_zerolen
6217 && !(RExC_seen & REG_SEEN_VERBARG)
6218 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6219 r->extflags |= RXf_CHECK_ALL;
6220 scan_commit(pRExC_state, &data,&minlen,0);
6221 SvREFCNT_dec(data.last_found);
6223 longest_float_length = CHR_SVLEN(data.longest_float);
6225 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6226 && data.offset_fixed == data.offset_float_min
6227 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6228 && S_setup_longest (aTHX_ pRExC_state,
6232 &(r->float_end_shift),
6233 data.lookbehind_float,
6234 data.offset_float_min,
6236 longest_float_length,
6237 data.flags & SF_FL_BEFORE_EOL,
6238 data.flags & SF_FL_BEFORE_MEOL))
6240 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6241 r->float_max_offset = data.offset_float_max;
6242 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6243 r->float_max_offset -= data.lookbehind_float;
6246 r->float_substr = r->float_utf8 = NULL;
6247 SvREFCNT_dec(data.longest_float);
6248 longest_float_length = 0;
6251 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6253 if (S_setup_longest (aTHX_ pRExC_state,
6255 &(r->anchored_utf8),
6256 &(r->anchored_substr),
6257 &(r->anchored_end_shift),
6258 data.lookbehind_fixed,
6261 longest_fixed_length,
6262 data.flags & SF_FIX_BEFORE_EOL,
6263 data.flags & SF_FIX_BEFORE_MEOL))
6265 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6268 r->anchored_substr = r->anchored_utf8 = NULL;
6269 SvREFCNT_dec(data.longest_fixed);
6270 longest_fixed_length = 0;
6274 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6275 ri->regstclass = NULL;
6277 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6279 && !(data.start_class->flags & ANYOF_EOS)
6280 && !cl_is_anything(data.start_class))
6282 const U32 n = add_data(pRExC_state, 1, "f");
6283 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6285 Newx(RExC_rxi->data->data[n], 1,
6286 struct regnode_charclass_class);
6287 StructCopy(data.start_class,
6288 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6289 struct regnode_charclass_class);
6290 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6291 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6292 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6293 regprop(r, sv, (regnode*)data.start_class);
6294 PerlIO_printf(Perl_debug_log,
6295 "synthetic stclass \"%s\".\n",
6296 SvPVX_const(sv));});
6299 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6300 if (longest_fixed_length > longest_float_length) {
6301 r->check_end_shift = r->anchored_end_shift;
6302 r->check_substr = r->anchored_substr;
6303 r->check_utf8 = r->anchored_utf8;
6304 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6305 if (r->extflags & RXf_ANCH_SINGLE)
6306 r->extflags |= RXf_NOSCAN;
6309 r->check_end_shift = r->float_end_shift;
6310 r->check_substr = r->float_substr;
6311 r->check_utf8 = r->float_utf8;
6312 r->check_offset_min = r->float_min_offset;
6313 r->check_offset_max = r->float_max_offset;
6315 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6316 This should be changed ASAP! */
6317 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6318 r->extflags |= RXf_USE_INTUIT;
6319 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6320 r->extflags |= RXf_INTUIT_TAIL;
6322 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6323 if ( (STRLEN)minlen < longest_float_length )
6324 minlen= longest_float_length;
6325 if ( (STRLEN)minlen < longest_fixed_length )
6326 minlen= longest_fixed_length;
6330 /* Several toplevels. Best we can is to set minlen. */
6332 struct regnode_charclass_class ch_class;
6335 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6337 scan = ri->program + 1;
6338 cl_init(pRExC_state, &ch_class);
6339 data.start_class = &ch_class;
6340 data.last_closep = &last_close;
6343 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6344 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6348 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6349 = r->float_substr = r->float_utf8 = NULL;
6351 if (!(data.start_class->flags & ANYOF_EOS)
6352 && !cl_is_anything(data.start_class))
6354 const U32 n = add_data(pRExC_state, 1, "f");
6355 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6357 Newx(RExC_rxi->data->data[n], 1,
6358 struct regnode_charclass_class);
6359 StructCopy(data.start_class,
6360 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6361 struct regnode_charclass_class);
6362 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6363 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6364 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6365 regprop(r, sv, (regnode*)data.start_class);
6366 PerlIO_printf(Perl_debug_log,
6367 "synthetic stclass \"%s\".\n",
6368 SvPVX_const(sv));});
6372 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6373 the "real" pattern. */
6375 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6376 (IV)minlen, (IV)r->minlen);
6378 r->minlenret = minlen;
6379 if (r->minlen < minlen)
6382 if (RExC_seen & REG_SEEN_GPOS)
6383 r->extflags |= RXf_GPOS_SEEN;
6384 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6385 r->extflags |= RXf_LOOKBEHIND_SEEN;
6386 if (pRExC_state->num_code_blocks)
6387 r->extflags |= RXf_EVAL_SEEN;
6388 if (RExC_seen & REG_SEEN_CANY)
6389 r->extflags |= RXf_CANY_SEEN;
6390 if (RExC_seen & REG_SEEN_VERBARG)
6391 r->intflags |= PREGf_VERBARG_SEEN;
6392 if (RExC_seen & REG_SEEN_CUTGROUP)
6393 r->intflags |= PREGf_CUTGROUP_SEEN;
6394 if (pm_flags & PMf_USE_RE_EVAL)
6395 r->intflags |= PREGf_USE_RE_EVAL;
6396 if (RExC_paren_names)
6397 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6399 RXp_PAREN_NAMES(r) = NULL;
6401 #ifdef STUPID_PATTERN_CHECKS
6402 if (RX_PRELEN(rx) == 0)
6403 r->extflags |= RXf_NULL;
6404 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6405 /* XXX: this should happen BEFORE we compile */
6406 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6407 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6408 r->extflags |= RXf_WHITE;
6409 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6410 r->extflags |= RXf_START_ONLY;
6412 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6413 /* XXX: this should happen BEFORE we compile */
6414 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6416 regnode *first = ri->program + 1;
6419 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6420 r->extflags |= RXf_NULL;
6421 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6422 r->extflags |= RXf_START_ONLY;
6423 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6424 && OP(regnext(first)) == END)
6425 r->extflags |= RXf_WHITE;
6429 if (RExC_paren_names) {
6430 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6431 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6434 ri->name_list_idx = 0;
6436 if (RExC_recurse_count) {
6437 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6438 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6439 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6442 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6443 /* assume we don't need to swap parens around before we match */
6446 PerlIO_printf(Perl_debug_log,"Final program:\n");
6449 #ifdef RE_TRACK_PATTERN_OFFSETS
6450 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6451 const U32 len = ri->u.offsets[0];
6453 GET_RE_DEBUG_FLAGS_DECL;
6454 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6455 for (i = 1; i <= len; i++) {
6456 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6457 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6458 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6460 PerlIO_printf(Perl_debug_log, "\n");
6468 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6471 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6473 PERL_UNUSED_ARG(value);
6475 if (flags & RXapif_FETCH) {
6476 return reg_named_buff_fetch(rx, key, flags);
6477 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6478 Perl_croak_no_modify(aTHX);
6480 } else if (flags & RXapif_EXISTS) {
6481 return reg_named_buff_exists(rx, key, flags)
6484 } else if (flags & RXapif_REGNAMES) {
6485 return reg_named_buff_all(rx, flags);
6486 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6487 return reg_named_buff_scalar(rx, flags);
6489 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6495 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6498 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6499 PERL_UNUSED_ARG(lastkey);
6501 if (flags & RXapif_FIRSTKEY)
6502 return reg_named_buff_firstkey(rx, flags);
6503 else if (flags & RXapif_NEXTKEY)
6504 return reg_named_buff_nextkey(rx, flags);
6506 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6512 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6515 AV *retarray = NULL;
6517 struct regexp *const rx = (struct regexp *)SvANY(r);
6519 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6521 if (flags & RXapif_ALL)
6524 if (rx && RXp_PAREN_NAMES(rx)) {
6525 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6528 SV* sv_dat=HeVAL(he_str);
6529 I32 *nums=(I32*)SvPVX(sv_dat);
6530 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6531 if ((I32)(rx->nparens) >= nums[i]
6532 && rx->offs[nums[i]].start != -1
6533 && rx->offs[nums[i]].end != -1)
6536 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6541 ret = newSVsv(&PL_sv_undef);
6544 av_push(retarray, ret);
6547 return newRV_noinc(MUTABLE_SV(retarray));
6554 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6557 struct regexp *const rx = (struct regexp *)SvANY(r);
6559 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6561 if (rx && RXp_PAREN_NAMES(rx)) {
6562 if (flags & RXapif_ALL) {
6563 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6565 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6579 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6581 struct regexp *const rx = (struct regexp *)SvANY(r);
6583 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6585 if ( rx && RXp_PAREN_NAMES(rx) ) {
6586 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6588 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6595 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6597 struct regexp *const rx = (struct regexp *)SvANY(r);
6598 GET_RE_DEBUG_FLAGS_DECL;
6600 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6602 if (rx && RXp_PAREN_NAMES(rx)) {
6603 HV *hv = RXp_PAREN_NAMES(rx);
6605 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6608 SV* sv_dat = HeVAL(temphe);
6609 I32 *nums = (I32*)SvPVX(sv_dat);
6610 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6611 if ((I32)(rx->lastparen) >= nums[i] &&
6612 rx->offs[nums[i]].start != -1 &&
6613 rx->offs[nums[i]].end != -1)
6619 if (parno || flags & RXapif_ALL) {
6620 return newSVhek(HeKEY_hek(temphe));
6628 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6633 struct regexp *const rx = (struct regexp *)SvANY(r);
6635 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6637 if (rx && RXp_PAREN_NAMES(rx)) {
6638 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6639 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6640 } else if (flags & RXapif_ONE) {
6641 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6642 av = MUTABLE_AV(SvRV(ret));
6643 length = av_len(av);
6645 return newSViv(length + 1);
6647 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6651 return &PL_sv_undef;
6655 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6657 struct regexp *const rx = (struct regexp *)SvANY(r);
6660 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6662 if (rx && RXp_PAREN_NAMES(rx)) {
6663 HV *hv= RXp_PAREN_NAMES(rx);
6665 (void)hv_iterinit(hv);
6666 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6669 SV* sv_dat = HeVAL(temphe);
6670 I32 *nums = (I32*)SvPVX(sv_dat);
6671 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6672 if ((I32)(rx->lastparen) >= nums[i] &&
6673 rx->offs[nums[i]].start != -1 &&
6674 rx->offs[nums[i]].end != -1)
6680 if (parno || flags & RXapif_ALL) {
6681 av_push(av, newSVhek(HeKEY_hek(temphe)));
6686 return newRV_noinc(MUTABLE_SV(av));
6690 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6693 struct regexp *const rx = (struct regexp *)SvANY(r);
6698 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6701 sv_setsv(sv,&PL_sv_undef);
6705 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6707 i = rx->offs[0].start;
6711 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6713 s = rx->subbeg + rx->offs[0].end;
6714 i = rx->sublen - rx->offs[0].end;
6717 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6718 (s1 = rx->offs[paren].start) != -1 &&
6719 (t1 = rx->offs[paren].end) != -1)
6723 s = rx->subbeg + s1;
6725 sv_setsv(sv,&PL_sv_undef);
6728 assert(rx->sublen >= (s - rx->subbeg) + i );
6730 const int oldtainted = PL_tainted;
6732 sv_setpvn(sv, s, i);
6733 PL_tainted = oldtainted;
6734 if ( (rx->extflags & RXf_CANY_SEEN)
6735 ? (RXp_MATCH_UTF8(rx)
6736 && (!i || is_utf8_string((U8*)s, i)))
6737 : (RXp_MATCH_UTF8(rx)) )
6744 if (RXp_MATCH_TAINTED(rx)) {
6745 if (SvTYPE(sv) >= SVt_PVMG) {
6746 MAGIC* const mg = SvMAGIC(sv);
6749 SvMAGIC_set(sv, mg->mg_moremagic);
6751 if ((mgt = SvMAGIC(sv))) {
6752 mg->mg_moremagic = mgt;
6753 SvMAGIC_set(sv, mg);
6763 sv_setsv(sv,&PL_sv_undef);
6769 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6770 SV const * const value)
6772 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6774 PERL_UNUSED_ARG(rx);
6775 PERL_UNUSED_ARG(paren);
6776 PERL_UNUSED_ARG(value);
6779 Perl_croak_no_modify(aTHX);
6783 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6786 struct regexp *const rx = (struct regexp *)SvANY(r);
6790 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6792 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6794 /* $` / ${^PREMATCH} */
6795 case RX_BUFF_IDX_PREMATCH:
6796 if (rx->offs[0].start != -1) {
6797 i = rx->offs[0].start;
6805 /* $' / ${^POSTMATCH} */
6806 case RX_BUFF_IDX_POSTMATCH:
6807 if (rx->offs[0].end != -1) {
6808 i = rx->sublen - rx->offs[0].end;
6810 s1 = rx->offs[0].end;
6816 /* $& / ${^MATCH}, $1, $2, ... */
6818 if (paren <= (I32)rx->nparens &&
6819 (s1 = rx->offs[paren].start) != -1 &&
6820 (t1 = rx->offs[paren].end) != -1)
6825 if (ckWARN(WARN_UNINITIALIZED))
6826 report_uninit((const SV *)sv);
6831 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6832 const char * const s = rx->subbeg + s1;
6837 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6844 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6846 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6847 PERL_UNUSED_ARG(rx);
6851 return newSVpvs("Regexp");
6854 /* Scans the name of a named buffer from the pattern.
6855 * If flags is REG_RSN_RETURN_NULL returns null.
6856 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6857 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6858 * to the parsed name as looked up in the RExC_paren_names hash.
6859 * If there is an error throws a vFAIL().. type exception.
6862 #define REG_RSN_RETURN_NULL 0
6863 #define REG_RSN_RETURN_NAME 1
6864 #define REG_RSN_RETURN_DATA 2
6867 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6869 char *name_start = RExC_parse;
6871 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6873 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6874 /* skip IDFIRST by using do...while */
6877 RExC_parse += UTF8SKIP(RExC_parse);
6878 } while (isALNUM_utf8((U8*)RExC_parse));
6882 } while (isALNUM(*RExC_parse));
6884 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6885 vFAIL("Group name must start with a non-digit word character");
6889 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6890 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6891 if ( flags == REG_RSN_RETURN_NAME)
6893 else if (flags==REG_RSN_RETURN_DATA) {
6896 if ( ! sv_name ) /* should not happen*/
6897 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6898 if (RExC_paren_names)
6899 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6901 sv_dat = HeVAL(he_str);
6903 vFAIL("Reference to nonexistent named group");
6907 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6908 (unsigned long) flags);
6910 assert(0); /* NOT REACHED */
6915 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6916 int rem=(int)(RExC_end - RExC_parse); \
6925 if (RExC_lastparse!=RExC_parse) \
6926 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6929 iscut ? "..." : "<" \
6932 PerlIO_printf(Perl_debug_log,"%16s",""); \
6935 num = RExC_size + 1; \
6937 num=REG_NODE_NUM(RExC_emit); \
6938 if (RExC_lastnum!=num) \
6939 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6941 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6942 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6943 (int)((depth*2)), "", \
6947 RExC_lastparse=RExC_parse; \
6952 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6953 DEBUG_PARSE_MSG((funcname)); \
6954 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6956 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6957 DEBUG_PARSE_MSG((funcname)); \
6958 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6961 /* This section of code defines the inversion list object and its methods. The
6962 * interfaces are highly subject to change, so as much as possible is static to
6963 * this file. An inversion list is here implemented as a malloc'd C UV array
6964 * with some added info that is placed as UVs at the beginning in a header
6965 * portion. An inversion list for Unicode is an array of code points, sorted
6966 * by ordinal number. The zeroth element is the first code point in the list.
6967 * The 1th element is the first element beyond that not in the list. In other
6968 * words, the first range is
6969 * invlist[0]..(invlist[1]-1)
6970 * The other ranges follow. Thus every element whose index is divisible by two
6971 * marks the beginning of a range that is in the list, and every element not
6972 * divisible by two marks the beginning of a range not in the list. A single
6973 * element inversion list that contains the single code point N generally
6974 * consists of two elements
6977 * (The exception is when N is the highest representable value on the
6978 * machine, in which case the list containing just it would be a single
6979 * element, itself. By extension, if the last range in the list extends to
6980 * infinity, then the first element of that range will be in the inversion list
6981 * at a position that is divisible by two, and is the final element in the
6983 * Taking the complement (inverting) an inversion list is quite simple, if the
6984 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6985 * This implementation reserves an element at the beginning of each inversion list
6986 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6987 * beginning of the list is either that element if 0, or the next one if 1.
6989 * More about inversion lists can be found in "Unicode Demystified"
6990 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6991 * More will be coming when functionality is added later.
6993 * The inversion list data structure is currently implemented as an SV pointing
6994 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6995 * array of UV whose memory management is automatically handled by the existing
6996 * facilities for SV's.
6998 * Some of the methods should always be private to the implementation, and some
6999 * should eventually be made public */
7001 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
7002 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
7004 /* This is a combination of a version and data structure type, so that one
7005 * being passed in can be validated to be an inversion list of the correct
7006 * vintage. When the structure of the header is changed, a new random number
7007 * in the range 2**31-1 should be generated and the new() method changed to
7008 * insert that at this location. Then, if an auxiliary program doesn't change
7009 * correspondingly, it will be discovered immediately */
7010 #define INVLIST_VERSION_ID_OFFSET 2
7011 #define INVLIST_VERSION_ID 1064334010
7013 /* For safety, when adding new elements, remember to #undef them at the end of
7014 * the inversion list code section */
7016 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
7017 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
7018 * contains the code point U+00000, and begins here. If 1, the inversion list
7019 * doesn't contain U+0000, and it begins at the next UV in the array.
7020 * Inverting an inversion list consists of adding or removing the 0 at the
7021 * beginning of it. By reserving a space for that 0, inversion can be made
7024 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7026 /* Internally things are UVs */
7027 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7028 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7030 #define INVLIST_INITIAL_LEN 10
7032 PERL_STATIC_INLINE UV*
7033 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7035 /* Returns a pointer to the first element in the inversion list's array.
7036 * This is called upon initialization of an inversion list. Where the
7037 * array begins depends on whether the list has the code point U+0000
7038 * in it or not. The other parameter tells it whether the code that
7039 * follows this call is about to put a 0 in the inversion list or not.
7040 * The first element is either the element with 0, if 0, or the next one,
7043 UV* zero = get_invlist_zero_addr(invlist);
7045 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7048 assert(! *get_invlist_len_addr(invlist));
7050 /* 1^1 = 0; 1^0 = 1 */
7051 *zero = 1 ^ will_have_0;
7052 return zero + *zero;
7055 PERL_STATIC_INLINE UV*
7056 S_invlist_array(pTHX_ SV* const invlist)
7058 /* Returns the pointer to the inversion list's array. Every time the
7059 * length changes, this needs to be called in case malloc or realloc moved
7062 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7064 /* Must not be empty. If these fail, you probably didn't check for <len>
7065 * being non-zero before trying to get the array */
7066 assert(*get_invlist_len_addr(invlist));
7067 assert(*get_invlist_zero_addr(invlist) == 0
7068 || *get_invlist_zero_addr(invlist) == 1);
7070 /* The array begins either at the element reserved for zero if the
7071 * list contains 0 (that element will be set to 0), or otherwise the next
7072 * element (in which case the reserved element will be set to 1). */
7073 return (UV *) (get_invlist_zero_addr(invlist)
7074 + *get_invlist_zero_addr(invlist));
7077 PERL_STATIC_INLINE UV*
7078 S_get_invlist_len_addr(pTHX_ SV* invlist)
7080 /* Return the address of the UV that contains the current number
7081 * of used elements in the inversion list */
7083 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7085 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7088 PERL_STATIC_INLINE UV
7089 S_invlist_len(pTHX_ SV* const invlist)
7091 /* Returns the current number of elements stored in the inversion list's
7094 PERL_ARGS_ASSERT_INVLIST_LEN;
7096 return *get_invlist_len_addr(invlist);
7099 PERL_STATIC_INLINE void
7100 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7102 /* Sets the current number of elements stored in the inversion list */
7104 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7106 *get_invlist_len_addr(invlist) = len;
7108 assert(len <= SvLEN(invlist));
7110 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7111 /* If the list contains U+0000, that element is part of the header,
7112 * and should not be counted as part of the array. It will contain
7113 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7115 * SvCUR_set(invlist,
7116 * TO_INTERNAL_SIZE(len
7117 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7118 * But, this is only valid if len is not 0. The consequences of not doing
7119 * this is that the memory allocation code may think that 1 more UV is
7120 * being used than actually is, and so might do an unnecessary grow. That
7121 * seems worth not bothering to make this the precise amount.
7123 * Note that when inverting, SvCUR shouldn't change */
7126 PERL_STATIC_INLINE UV
7127 S_invlist_max(pTHX_ SV* const invlist)
7129 /* Returns the maximum number of elements storable in the inversion list's
7130 * array, without having to realloc() */
7132 PERL_ARGS_ASSERT_INVLIST_MAX;
7134 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7137 PERL_STATIC_INLINE UV*
7138 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7140 /* Return the address of the UV that is reserved to hold 0 if the inversion
7141 * list contains 0. This has to be the last element of the heading, as the
7142 * list proper starts with either it if 0, or the next element if not.
7143 * (But we force it to contain either 0 or 1) */
7145 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7147 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7150 #ifndef PERL_IN_XSUB_RE
7152 Perl__new_invlist(pTHX_ IV initial_size)
7155 /* Return a pointer to a newly constructed inversion list, with enough
7156 * space to store 'initial_size' elements. If that number is negative, a
7157 * system default is used instead */
7161 if (initial_size < 0) {
7162 initial_size = INVLIST_INITIAL_LEN;
7165 /* Allocate the initial space */
7166 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7167 invlist_set_len(new_list, 0);
7169 /* Force iterinit() to be used to get iteration to work */
7170 *get_invlist_iter_addr(new_list) = UV_MAX;
7172 /* This should force a segfault if a method doesn't initialize this
7174 *get_invlist_zero_addr(new_list) = UV_MAX;
7176 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7177 #if HEADER_LENGTH != 4
7178 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7186 S__new_invlist_C_array(pTHX_ UV* list)
7188 /* Return a pointer to a newly constructed inversion list, initialized to
7189 * point to <list>, which has to be in the exact correct inversion list
7190 * form, including internal fields. Thus this is a dangerous routine that
7191 * should not be used in the wrong hands */
7193 SV* invlist = newSV_type(SVt_PV);
7195 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7197 SvPV_set(invlist, (char *) list);
7198 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7199 shouldn't touch it */
7200 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7202 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7203 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7210 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7212 /* Grow the maximum size of an inversion list */
7214 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7216 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7219 PERL_STATIC_INLINE void
7220 S_invlist_trim(pTHX_ SV* const invlist)
7222 PERL_ARGS_ASSERT_INVLIST_TRIM;
7224 /* Change the length of the inversion list to how many entries it currently
7227 SvPV_shrink_to_cur((SV *) invlist);
7230 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7232 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7233 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7235 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7238 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7240 /* Subject to change or removal. Append the range from 'start' to 'end' at
7241 * the end of the inversion list. The range must be above any existing
7245 UV max = invlist_max(invlist);
7246 UV len = invlist_len(invlist);
7248 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7250 if (len == 0) { /* Empty lists must be initialized */
7251 array = _invlist_array_init(invlist, start == 0);
7254 /* Here, the existing list is non-empty. The current max entry in the
7255 * list is generally the first value not in the set, except when the
7256 * set extends to the end of permissible values, in which case it is
7257 * the first entry in that final set, and so this call is an attempt to
7258 * append out-of-order */
7260 UV final_element = len - 1;
7261 array = invlist_array(invlist);
7262 if (array[final_element] > start
7263 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7265 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7266 array[final_element], start,
7267 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7270 /* Here, it is a legal append. If the new range begins with the first
7271 * value not in the set, it is extending the set, so the new first
7272 * value not in the set is one greater than the newly extended range.
7274 if (array[final_element] == start) {
7275 if (end != UV_MAX) {
7276 array[final_element] = end + 1;
7279 /* But if the end is the maximum representable on the machine,
7280 * just let the range that this would extend to have no end */
7281 invlist_set_len(invlist, len - 1);
7287 /* Here the new range doesn't extend any existing set. Add it */
7289 len += 2; /* Includes an element each for the start and end of range */
7291 /* If overflows the existing space, extend, which may cause the array to be
7294 invlist_extend(invlist, len);
7295 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7296 failure in invlist_array() */
7297 array = invlist_array(invlist);
7300 invlist_set_len(invlist, len);
7303 /* The next item on the list starts the range, the one after that is
7304 * one past the new range. */
7305 array[len - 2] = start;
7306 if (end != UV_MAX) {
7307 array[len - 1] = end + 1;
7310 /* But if the end is the maximum representable on the machine, just let
7311 * the range have no end */
7312 invlist_set_len(invlist, len - 1);
7316 #ifndef PERL_IN_XSUB_RE
7319 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7321 /* Searches the inversion list for the entry that contains the input code
7322 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7323 * return value is the index into the list's array of the range that
7327 IV high = invlist_len(invlist);
7328 const UV * const array = invlist_array(invlist);
7330 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7332 /* If list is empty or the code point is before the first element, return
7334 if (high == 0 || cp < array[0]) {
7338 /* Binary search. What we are looking for is <i> such that
7339 * array[i] <= cp < array[i+1]
7340 * The loop below converges on the i+1. */
7341 while (low < high) {
7342 IV mid = (low + high) / 2;
7343 if (array[mid] <= cp) {
7346 /* We could do this extra test to exit the loop early.
7347 if (cp < array[low]) {
7352 else { /* cp < array[mid] */
7361 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7363 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7364 * but is used when the swash has an inversion list. This makes this much
7365 * faster, as it uses a binary search instead of a linear one. This is
7366 * intimately tied to that function, and perhaps should be in utf8.c,
7367 * except it is intimately tied to inversion lists as well. It assumes
7368 * that <swatch> is all 0's on input */
7371 const IV len = invlist_len(invlist);
7375 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7377 if (len == 0) { /* Empty inversion list */
7381 array = invlist_array(invlist);
7383 /* Find which element it is */
7384 i = _invlist_search(invlist, start);
7386 /* We populate from <start> to <end> */
7387 while (current < end) {
7390 /* The inversion list gives the results for every possible code point
7391 * after the first one in the list. Only those ranges whose index is
7392 * even are ones that the inversion list matches. For the odd ones,
7393 * and if the initial code point is not in the list, we have to skip
7394 * forward to the next element */
7395 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7397 if (i >= len) { /* Finished if beyond the end of the array */
7401 if (current >= end) { /* Finished if beyond the end of what we
7406 assert(current >= start);
7408 /* The current range ends one below the next one, except don't go past
7411 upper = (i < len && array[i] < end) ? array[i] : end;
7413 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7414 * for each code point in it */
7415 for (; current < upper; current++) {
7416 const STRLEN offset = (STRLEN)(current - start);
7417 swatch[offset >> 3] |= 1 << (offset & 7);
7420 /* Quit if at the end of the list */
7423 /* But first, have to deal with the highest possible code point on
7424 * the platform. The previous code assumes that <end> is one
7425 * beyond where we want to populate, but that is impossible at the
7426 * platform's infinity, so have to handle it specially */
7427 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7429 const STRLEN offset = (STRLEN)(end - start);
7430 swatch[offset >> 3] |= 1 << (offset & 7);
7435 /* Advance to the next range, which will be for code points not in the
7444 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7446 /* Take the union of two inversion lists and point <output> to it. *output
7447 * should be defined upon input, and if it points to one of the two lists,
7448 * the reference count to that list will be decremented. The first list,
7449 * <a>, may be NULL, in which case a copy of the second list is returned.
7450 * If <complement_b> is TRUE, the union is taken of the complement
7451 * (inversion) of <b> instead of b itself.
7453 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7454 * Richard Gillam, published by Addison-Wesley, and explained at some
7455 * length there. The preface says to incorporate its examples into your
7456 * code at your own risk.
7458 * The algorithm is like a merge sort.
7460 * XXX A potential performance improvement is to keep track as we go along
7461 * if only one of the inputs contributes to the result, meaning the other
7462 * is a subset of that one. In that case, we can skip the final copy and
7463 * return the larger of the input lists, but then outside code might need
7464 * to keep track of whether to free the input list or not */
7466 UV* array_a; /* a's array */
7468 UV len_a; /* length of a's array */
7471 SV* u; /* the resulting union */
7475 UV i_a = 0; /* current index into a's array */
7479 /* running count, as explained in the algorithm source book; items are
7480 * stopped accumulating and are output when the count changes to/from 0.
7481 * The count is incremented when we start a range that's in the set, and
7482 * decremented when we start a range that's not in the set. So its range
7483 * is 0 to 2. Only when the count is zero is something not in the set.
7487 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7490 /* If either one is empty, the union is the other one */
7491 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7498 *output = invlist_clone(b);
7500 _invlist_invert(*output);
7502 } /* else *output already = b; */
7505 else if ((len_b = invlist_len(b)) == 0) {
7510 /* The complement of an empty list is a list that has everything in it,
7511 * so the union with <a> includes everything too */
7516 *output = _new_invlist(1);
7517 _append_range_to_invlist(*output, 0, UV_MAX);
7519 else if (*output != a) {
7520 *output = invlist_clone(a);
7522 /* else *output already = a; */
7526 /* Here both lists exist and are non-empty */
7527 array_a = invlist_array(a);
7528 array_b = invlist_array(b);
7530 /* If are to take the union of 'a' with the complement of b, set it
7531 * up so are looking at b's complement. */
7534 /* To complement, we invert: if the first element is 0, remove it. To
7535 * do this, we just pretend the array starts one later, and clear the
7536 * flag as we don't have to do anything else later */
7537 if (array_b[0] == 0) {
7540 complement_b = FALSE;
7544 /* But if the first element is not zero, we unshift a 0 before the
7545 * array. The data structure reserves a space for that 0 (which
7546 * should be a '1' right now), so physical shifting is unneeded,
7547 * but temporarily change that element to 0. Before exiting the
7548 * routine, we must restore the element to '1' */
7555 /* Size the union for the worst case: that the sets are completely
7557 u = _new_invlist(len_a + len_b);
7559 /* Will contain U+0000 if either component does */
7560 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7561 || (len_b > 0 && array_b[0] == 0));
7563 /* Go through each list item by item, stopping when exhausted one of
7565 while (i_a < len_a && i_b < len_b) {
7566 UV cp; /* The element to potentially add to the union's array */
7567 bool cp_in_set; /* is it in the the input list's set or not */
7569 /* We need to take one or the other of the two inputs for the union.
7570 * Since we are merging two sorted lists, we take the smaller of the
7571 * next items. In case of a tie, we take the one that is in its set
7572 * first. If we took one not in the set first, it would decrement the
7573 * count, possibly to 0 which would cause it to be output as ending the
7574 * range, and the next time through we would take the same number, and
7575 * output it again as beginning the next range. By doing it the
7576 * opposite way, there is no possibility that the count will be
7577 * momentarily decremented to 0, and thus the two adjoining ranges will
7578 * be seamlessly merged. (In a tie and both are in the set or both not
7579 * in the set, it doesn't matter which we take first.) */
7580 if (array_a[i_a] < array_b[i_b]
7581 || (array_a[i_a] == array_b[i_b]
7582 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7584 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7588 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7592 /* Here, have chosen which of the two inputs to look at. Only output
7593 * if the running count changes to/from 0, which marks the
7594 * beginning/end of a range in that's in the set */
7597 array_u[i_u++] = cp;
7604 array_u[i_u++] = cp;
7609 /* Here, we are finished going through at least one of the lists, which
7610 * means there is something remaining in at most one. We check if the list
7611 * that hasn't been exhausted is positioned such that we are in the middle
7612 * of a range in its set or not. (i_a and i_b point to the element beyond
7613 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7614 * is potentially more to output.
7615 * There are four cases:
7616 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7617 * in the union is entirely from the non-exhausted set.
7618 * 2) Both were in their sets, count is 2. Nothing further should
7619 * be output, as everything that remains will be in the exhausted
7620 * list's set, hence in the union; decrementing to 1 but not 0 insures
7622 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7623 * Nothing further should be output because the union includes
7624 * everything from the exhausted set. Not decrementing ensures that.
7625 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7626 * decrementing to 0 insures that we look at the remainder of the
7627 * non-exhausted set */
7628 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7629 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7634 /* The final length is what we've output so far, plus what else is about to
7635 * be output. (If 'count' is non-zero, then the input list we exhausted
7636 * has everything remaining up to the machine's limit in its set, and hence
7637 * in the union, so there will be no further output. */
7640 /* At most one of the subexpressions will be non-zero */
7641 len_u += (len_a - i_a) + (len_b - i_b);
7644 /* Set result to final length, which can change the pointer to array_u, so
7646 if (len_u != invlist_len(u)) {
7647 invlist_set_len(u, len_u);
7649 array_u = invlist_array(u);
7652 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7653 * the other) ended with everything above it not in its set. That means
7654 * that the remaining part of the union is precisely the same as the
7655 * non-exhausted list, so can just copy it unchanged. (If both list were
7656 * exhausted at the same time, then the operations below will be both 0.)
7659 IV copy_count; /* At most one will have a non-zero copy count */
7660 if ((copy_count = len_a - i_a) > 0) {
7661 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7663 else if ((copy_count = len_b - i_b) > 0) {
7664 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7668 /* We may be removing a reference to one of the inputs */
7669 if (a == *output || b == *output) {
7670 SvREFCNT_dec(*output);
7673 /* If we've changed b, restore it */
7683 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7685 /* Take the intersection of two inversion lists and point <i> to it. *i
7686 * should be defined upon input, and if it points to one of the two lists,
7687 * the reference count to that list will be decremented.
7688 * If <complement_b> is TRUE, the result will be the intersection of <a>
7689 * and the complement (or inversion) of <b> instead of <b> directly.
7691 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7692 * Richard Gillam, published by Addison-Wesley, and explained at some
7693 * length there. The preface says to incorporate its examples into your
7694 * code at your own risk. In fact, it had bugs
7696 * The algorithm is like a merge sort, and is essentially the same as the
7700 UV* array_a; /* a's array */
7702 UV len_a; /* length of a's array */
7705 SV* r; /* the resulting intersection */
7709 UV i_a = 0; /* current index into a's array */
7713 /* running count, as explained in the algorithm source book; items are
7714 * stopped accumulating and are output when the count changes to/from 2.
7715 * The count is incremented when we start a range that's in the set, and
7716 * decremented when we start a range that's not in the set. So its range
7717 * is 0 to 2. Only when the count is 2 is something in the intersection.
7721 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7724 /* Special case if either one is empty */
7725 len_a = invlist_len(a);
7726 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7728 if (len_a != 0 && complement_b) {
7730 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7731 * be empty. Here, also we are using 'b's complement, which hence
7732 * must be every possible code point. Thus the intersection is
7735 *i = invlist_clone(a);
7741 /* else *i is already 'a' */
7745 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7746 * intersection must be empty */
7753 *i = _new_invlist(0);
7757 /* Here both lists exist and are non-empty */
7758 array_a = invlist_array(a);
7759 array_b = invlist_array(b);
7761 /* If are to take the intersection of 'a' with the complement of b, set it
7762 * up so are looking at b's complement. */
7765 /* To complement, we invert: if the first element is 0, remove it. To
7766 * do this, we just pretend the array starts one later, and clear the
7767 * flag as we don't have to do anything else later */
7768 if (array_b[0] == 0) {
7771 complement_b = FALSE;
7775 /* But if the first element is not zero, we unshift a 0 before the
7776 * array. The data structure reserves a space for that 0 (which
7777 * should be a '1' right now), so physical shifting is unneeded,
7778 * but temporarily change that element to 0. Before exiting the
7779 * routine, we must restore the element to '1' */
7786 /* Size the intersection for the worst case: that the intersection ends up
7787 * fragmenting everything to be completely disjoint */
7788 r= _new_invlist(len_a + len_b);
7790 /* Will contain U+0000 iff both components do */
7791 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7792 && len_b > 0 && array_b[0] == 0);
7794 /* Go through each list item by item, stopping when exhausted one of
7796 while (i_a < len_a && i_b < len_b) {
7797 UV cp; /* The element to potentially add to the intersection's
7799 bool cp_in_set; /* Is it in the input list's set or not */
7801 /* We need to take one or the other of the two inputs for the
7802 * intersection. Since we are merging two sorted lists, we take the
7803 * smaller of the next items. In case of a tie, we take the one that
7804 * is not in its set first (a difference from the union algorithm). If
7805 * we took one in the set first, it would increment the count, possibly
7806 * to 2 which would cause it to be output as starting a range in the
7807 * intersection, and the next time through we would take that same
7808 * number, and output it again as ending the set. By doing it the
7809 * opposite of this, there is no possibility that the count will be
7810 * momentarily incremented to 2. (In a tie and both are in the set or
7811 * both not in the set, it doesn't matter which we take first.) */
7812 if (array_a[i_a] < array_b[i_b]
7813 || (array_a[i_a] == array_b[i_b]
7814 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7816 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7820 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7824 /* Here, have chosen which of the two inputs to look at. Only output
7825 * if the running count changes to/from 2, which marks the
7826 * beginning/end of a range that's in the intersection */
7830 array_r[i_r++] = cp;
7835 array_r[i_r++] = cp;
7841 /* Here, we are finished going through at least one of the lists, which
7842 * means there is something remaining in at most one. We check if the list
7843 * that has been exhausted is positioned such that we are in the middle
7844 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7845 * the ones we care about.) There are four cases:
7846 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7847 * nothing left in the intersection.
7848 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7849 * above 2. What should be output is exactly that which is in the
7850 * non-exhausted set, as everything it has is also in the intersection
7851 * set, and everything it doesn't have can't be in the intersection
7852 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7853 * gets incremented to 2. Like the previous case, the intersection is
7854 * everything that remains in the non-exhausted set.
7855 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7856 * remains 1. And the intersection has nothing more. */
7857 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7858 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7863 /* The final length is what we've output so far plus what else is in the
7864 * intersection. At most one of the subexpressions below will be non-zero */
7867 len_r += (len_a - i_a) + (len_b - i_b);
7870 /* Set result to final length, which can change the pointer to array_r, so
7872 if (len_r != invlist_len(r)) {
7873 invlist_set_len(r, len_r);
7875 array_r = invlist_array(r);
7878 /* Finish outputting any remaining */
7879 if (count >= 2) { /* At most one will have a non-zero copy count */
7881 if ((copy_count = len_a - i_a) > 0) {
7882 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7884 else if ((copy_count = len_b - i_b) > 0) {
7885 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7889 /* We may be removing a reference to one of the inputs */
7890 if (a == *i || b == *i) {
7894 /* If we've changed b, restore it */
7904 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7906 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7907 * set. A pointer to the inversion list is returned. This may actually be
7908 * a new list, in which case the passed in one has been destroyed. The
7909 * passed in inversion list can be NULL, in which case a new one is created
7910 * with just the one range in it */
7915 if (invlist == NULL) {
7916 invlist = _new_invlist(2);
7920 len = invlist_len(invlist);
7923 /* If comes after the final entry, can just append it to the end */
7925 || start >= invlist_array(invlist)
7926 [invlist_len(invlist) - 1])
7928 _append_range_to_invlist(invlist, start, end);
7932 /* Here, can't just append things, create and return a new inversion list
7933 * which is the union of this range and the existing inversion list */
7934 range_invlist = _new_invlist(2);
7935 _append_range_to_invlist(range_invlist, start, end);
7937 _invlist_union(invlist, range_invlist, &invlist);
7939 /* The temporary can be freed */
7940 SvREFCNT_dec(range_invlist);
7947 PERL_STATIC_INLINE bool
7948 S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
7950 /* Does <invlist> contain code point <cp> as part of the set? */
7952 IV index = _invlist_search(invlist, cp);
7954 PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
7956 return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index);
7959 PERL_STATIC_INLINE SV*
7960 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7961 return _add_range_to_invlist(invlist, cp, cp);
7964 #ifndef PERL_IN_XSUB_RE
7966 Perl__invlist_invert(pTHX_ SV* const invlist)
7968 /* Complement the input inversion list. This adds a 0 if the list didn't
7969 * have a zero; removes it otherwise. As described above, the data
7970 * structure is set up so that this is very efficient */
7972 UV* len_pos = get_invlist_len_addr(invlist);
7974 PERL_ARGS_ASSERT__INVLIST_INVERT;
7976 /* The inverse of matching nothing is matching everything */
7977 if (*len_pos == 0) {
7978 _append_range_to_invlist(invlist, 0, UV_MAX);
7982 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7983 * zero element was a 0, so it is being removed, so the length decrements
7984 * by 1; and vice-versa. SvCUR is unaffected */
7985 if (*get_invlist_zero_addr(invlist) ^= 1) {
7994 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7996 /* Complement the input inversion list (which must be a Unicode property,
7997 * all of which don't match above the Unicode maximum code point.) And
7998 * Perl has chosen to not have the inversion match above that either. This
7999 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8005 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8007 _invlist_invert(invlist);
8009 len = invlist_len(invlist);
8011 if (len != 0) { /* If empty do nothing */
8012 array = invlist_array(invlist);
8013 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8014 /* Add 0x110000. First, grow if necessary */
8016 if (invlist_max(invlist) < len) {
8017 invlist_extend(invlist, len);
8018 array = invlist_array(invlist);
8020 invlist_set_len(invlist, len);
8021 array[len - 1] = PERL_UNICODE_MAX + 1;
8023 else { /* Remove the 0x110000 */
8024 invlist_set_len(invlist, len - 1);
8032 PERL_STATIC_INLINE SV*
8033 S_invlist_clone(pTHX_ SV* const invlist)
8036 /* Return a new inversion list that is a copy of the input one, which is
8039 /* Need to allocate extra space to accommodate Perl's addition of a
8040 * trailing NUL to SvPV's, since it thinks they are always strings */
8041 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8042 STRLEN length = SvCUR(invlist);
8044 PERL_ARGS_ASSERT_INVLIST_CLONE;
8046 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8047 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8052 PERL_STATIC_INLINE UV*
8053 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8055 /* Return the address of the UV that contains the current iteration
8058 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8060 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8063 PERL_STATIC_INLINE UV*
8064 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8066 /* Return the address of the UV that contains the version id. */
8068 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8070 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8073 PERL_STATIC_INLINE void
8074 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8076 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8078 *get_invlist_iter_addr(invlist) = 0;
8082 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8084 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8085 * This call sets in <*start> and <*end>, the next range in <invlist>.
8086 * Returns <TRUE> if successful and the next call will return the next
8087 * range; <FALSE> if was already at the end of the list. If the latter,
8088 * <*start> and <*end> are unchanged, and the next call to this function
8089 * will start over at the beginning of the list */
8091 UV* pos = get_invlist_iter_addr(invlist);
8092 UV len = invlist_len(invlist);
8095 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8098 *pos = UV_MAX; /* Force iternit() to be required next time */
8102 array = invlist_array(invlist);
8104 *start = array[(*pos)++];
8110 *end = array[(*pos)++] - 1;
8116 PERL_STATIC_INLINE UV
8117 S_invlist_highest(pTHX_ SV* const invlist)
8119 /* Returns the highest code point that matches an inversion list. This API
8120 * has an ambiguity, as it returns 0 under either the highest is actually
8121 * 0, or if the list is empty. If this distinction matters to you, check
8122 * for emptiness before calling this function */
8124 UV len = invlist_len(invlist);
8127 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8133 array = invlist_array(invlist);
8135 /* The last element in the array in the inversion list always starts a
8136 * range that goes to infinity. That range may be for code points that are
8137 * matched in the inversion list, or it may be for ones that aren't
8138 * matched. In the latter case, the highest code point in the set is one
8139 * less than the beginning of this range; otherwise it is the final element
8140 * of this range: infinity */
8141 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8143 : array[len - 1] - 1;
8146 #ifndef PERL_IN_XSUB_RE
8148 Perl__invlist_contents(pTHX_ SV* const invlist)
8150 /* Get the contents of an inversion list into a string SV so that they can
8151 * be printed out. It uses the format traditionally done for debug tracing
8155 SV* output = newSVpvs("\n");
8157 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8159 invlist_iterinit(invlist);
8160 while (invlist_iternext(invlist, &start, &end)) {
8161 if (end == UV_MAX) {
8162 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8164 else if (end != start) {
8165 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8169 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8179 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8181 /* Dumps out the ranges in an inversion list. The string 'header'
8182 * if present is output on a line before the first range */
8186 if (header && strlen(header)) {
8187 PerlIO_printf(Perl_debug_log, "%s\n", header);
8189 invlist_iterinit(invlist);
8190 while (invlist_iternext(invlist, &start, &end)) {
8191 if (end == UV_MAX) {
8192 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8195 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8203 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8205 /* Return a boolean as to if the two passed in inversion lists are
8206 * identical. The final argument, if TRUE, says to take the complement of
8207 * the second inversion list before doing the comparison */
8209 UV* array_a = invlist_array(a);
8210 UV* array_b = invlist_array(b);
8211 UV len_a = invlist_len(a);
8212 UV len_b = invlist_len(b);
8214 UV i = 0; /* current index into the arrays */
8215 bool retval = TRUE; /* Assume are identical until proven otherwise */
8217 PERL_ARGS_ASSERT__INVLISTEQ;
8219 /* If are to compare 'a' with the complement of b, set it
8220 * up so are looking at b's complement. */
8223 /* The complement of nothing is everything, so <a> would have to have
8224 * just one element, starting at zero (ending at infinity) */
8226 return (len_a == 1 && array_a[0] == 0);
8228 else if (array_b[0] == 0) {
8230 /* Otherwise, to complement, we invert. Here, the first element is
8231 * 0, just remove it. To do this, we just pretend the array starts
8232 * one later, and clear the flag as we don't have to do anything
8237 complement_b = FALSE;
8241 /* But if the first element is not zero, we unshift a 0 before the
8242 * array. The data structure reserves a space for that 0 (which
8243 * should be a '1' right now), so physical shifting is unneeded,
8244 * but temporarily change that element to 0. Before exiting the
8245 * routine, we must restore the element to '1' */
8252 /* Make sure that the lengths are the same, as well as the final element
8253 * before looping through the remainder. (Thus we test the length, final,
8254 * and first elements right off the bat) */
8255 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8258 else for (i = 0; i < len_a - 1; i++) {
8259 if (array_a[i] != array_b[i]) {
8272 #undef HEADER_LENGTH
8273 #undef INVLIST_INITIAL_LENGTH
8274 #undef TO_INTERNAL_SIZE
8275 #undef FROM_INTERNAL_SIZE
8276 #undef INVLIST_LEN_OFFSET
8277 #undef INVLIST_ZERO_OFFSET
8278 #undef INVLIST_ITER_OFFSET
8279 #undef INVLIST_VERSION_ID
8281 /* End of inversion list object */
8284 - reg - regular expression, i.e. main body or parenthesized thing
8286 * Caller must absorb opening parenthesis.
8288 * Combining parenthesis handling with the base level of regular expression
8289 * is a trifle forced, but the need to tie the tails of the branches to what
8290 * follows makes it hard to avoid.
8292 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8294 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8296 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8300 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8301 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8304 regnode *ret; /* Will be the head of the group. */
8307 regnode *ender = NULL;
8310 U32 oregflags = RExC_flags;
8311 bool have_branch = 0;
8313 I32 freeze_paren = 0;
8314 I32 after_freeze = 0;
8316 /* for (?g), (?gc), and (?o) warnings; warning
8317 about (?c) will warn about (?g) -- japhy */
8319 #define WASTED_O 0x01
8320 #define WASTED_G 0x02
8321 #define WASTED_C 0x04
8322 #define WASTED_GC (0x02|0x04)
8323 I32 wastedflags = 0x00;
8325 char * parse_start = RExC_parse; /* MJD */
8326 char * const oregcomp_parse = RExC_parse;
8328 GET_RE_DEBUG_FLAGS_DECL;
8330 PERL_ARGS_ASSERT_REG;
8331 DEBUG_PARSE("reg ");
8333 *flagp = 0; /* Tentatively. */
8336 /* Make an OPEN node, if parenthesized. */
8338 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8339 char *start_verb = RExC_parse;
8340 STRLEN verb_len = 0;
8341 char *start_arg = NULL;
8342 unsigned char op = 0;
8344 int internal_argval = 0; /* internal_argval is only useful if !argok */
8345 while ( *RExC_parse && *RExC_parse != ')' ) {
8346 if ( *RExC_parse == ':' ) {
8347 start_arg = RExC_parse + 1;
8353 verb_len = RExC_parse - start_verb;
8356 while ( *RExC_parse && *RExC_parse != ')' )
8358 if ( *RExC_parse != ')' )
8359 vFAIL("Unterminated verb pattern argument");
8360 if ( RExC_parse == start_arg )
8363 if ( *RExC_parse != ')' )
8364 vFAIL("Unterminated verb pattern");
8367 switch ( *start_verb ) {
8368 case 'A': /* (*ACCEPT) */
8369 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8371 internal_argval = RExC_nestroot;
8374 case 'C': /* (*COMMIT) */
8375 if ( memEQs(start_verb,verb_len,"COMMIT") )
8378 case 'F': /* (*FAIL) */
8379 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8384 case ':': /* (*:NAME) */
8385 case 'M': /* (*MARK:NAME) */
8386 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8391 case 'P': /* (*PRUNE) */
8392 if ( memEQs(start_verb,verb_len,"PRUNE") )
8395 case 'S': /* (*SKIP) */
8396 if ( memEQs(start_verb,verb_len,"SKIP") )
8399 case 'T': /* (*THEN) */
8400 /* [19:06] <TimToady> :: is then */
8401 if ( memEQs(start_verb,verb_len,"THEN") ) {
8403 RExC_seen |= REG_SEEN_CUTGROUP;
8409 vFAIL3("Unknown verb pattern '%.*s'",
8410 verb_len, start_verb);
8413 if ( start_arg && internal_argval ) {
8414 vFAIL3("Verb pattern '%.*s' may not have an argument",
8415 verb_len, start_verb);
8416 } else if ( argok < 0 && !start_arg ) {
8417 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8418 verb_len, start_verb);
8420 ret = reganode(pRExC_state, op, internal_argval);
8421 if ( ! internal_argval && ! SIZE_ONLY ) {
8423 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8424 ARG(ret) = add_data( pRExC_state, 1, "S" );
8425 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8432 if (!internal_argval)
8433 RExC_seen |= REG_SEEN_VERBARG;
8434 } else if ( start_arg ) {
8435 vFAIL3("Verb pattern '%.*s' may not have an argument",
8436 verb_len, start_verb);
8438 ret = reg_node(pRExC_state, op);
8440 nextchar(pRExC_state);
8443 if (*RExC_parse == '?') { /* (?...) */
8444 bool is_logical = 0;
8445 const char * const seqstart = RExC_parse;
8446 bool has_use_defaults = FALSE;
8449 paren = *RExC_parse++;
8450 ret = NULL; /* For look-ahead/behind. */
8453 case 'P': /* (?P...) variants for those used to PCRE/Python */
8454 paren = *RExC_parse++;
8455 if ( paren == '<') /* (?P<...>) named capture */
8457 else if (paren == '>') { /* (?P>name) named recursion */
8458 goto named_recursion;
8460 else if (paren == '=') { /* (?P=...) named backref */
8461 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8462 you change this make sure you change that */
8463 char* name_start = RExC_parse;
8465 SV *sv_dat = reg_scan_name(pRExC_state,
8466 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8467 if (RExC_parse == name_start || *RExC_parse != ')')
8468 vFAIL2("Sequence %.3s... not terminated",parse_start);
8471 num = add_data( pRExC_state, 1, "S" );
8472 RExC_rxi->data->data[num]=(void*)sv_dat;
8473 SvREFCNT_inc_simple_void(sv_dat);
8476 ret = reganode(pRExC_state,
8479 : (ASCII_FOLD_RESTRICTED)
8481 : (AT_LEAST_UNI_SEMANTICS)
8489 Set_Node_Offset(ret, parse_start+1);
8490 Set_Node_Cur_Length(ret); /* MJD */
8492 nextchar(pRExC_state);
8496 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8498 case '<': /* (?<...) */
8499 if (*RExC_parse == '!')
8501 else if (*RExC_parse != '=')
8507 case '\'': /* (?'...') */
8508 name_start= RExC_parse;
8509 svname = reg_scan_name(pRExC_state,
8510 SIZE_ONLY ? /* reverse test from the others */
8511 REG_RSN_RETURN_NAME :
8512 REG_RSN_RETURN_NULL);
8513 if (RExC_parse == name_start) {
8515 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8518 if (*RExC_parse != paren)
8519 vFAIL2("Sequence (?%c... not terminated",
8520 paren=='>' ? '<' : paren);
8524 if (!svname) /* shouldn't happen */
8526 "panic: reg_scan_name returned NULL");
8527 if (!RExC_paren_names) {
8528 RExC_paren_names= newHV();
8529 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8531 RExC_paren_name_list= newAV();
8532 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8535 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8537 sv_dat = HeVAL(he_str);
8539 /* croak baby croak */
8541 "panic: paren_name hash element allocation failed");
8542 } else if ( SvPOK(sv_dat) ) {
8543 /* (?|...) can mean we have dupes so scan to check
8544 its already been stored. Maybe a flag indicating
8545 we are inside such a construct would be useful,
8546 but the arrays are likely to be quite small, so
8547 for now we punt -- dmq */
8548 IV count = SvIV(sv_dat);
8549 I32 *pv = (I32*)SvPVX(sv_dat);
8551 for ( i = 0 ; i < count ; i++ ) {
8552 if ( pv[i] == RExC_npar ) {
8558 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8559 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8560 pv[count] = RExC_npar;
8561 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8564 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8565 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8567 SvIV_set(sv_dat, 1);
8570 /* Yes this does cause a memory leak in debugging Perls */
8571 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8572 SvREFCNT_dec(svname);
8575 /*sv_dump(sv_dat);*/
8577 nextchar(pRExC_state);
8579 goto capturing_parens;
8581 RExC_seen |= REG_SEEN_LOOKBEHIND;
8582 RExC_in_lookbehind++;
8584 case '=': /* (?=...) */
8585 RExC_seen_zerolen++;
8587 case '!': /* (?!...) */
8588 RExC_seen_zerolen++;
8589 if (*RExC_parse == ')') {
8590 ret=reg_node(pRExC_state, OPFAIL);
8591 nextchar(pRExC_state);
8595 case '|': /* (?|...) */
8596 /* branch reset, behave like a (?:...) except that
8597 buffers in alternations share the same numbers */
8599 after_freeze = freeze_paren = RExC_npar;
8601 case ':': /* (?:...) */
8602 case '>': /* (?>...) */
8604 case '$': /* (?$...) */
8605 case '@': /* (?@...) */
8606 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8608 case '#': /* (?#...) */
8609 while (*RExC_parse && *RExC_parse != ')')
8611 if (*RExC_parse != ')')
8612 FAIL("Sequence (?#... not terminated");
8613 nextchar(pRExC_state);
8616 case '0' : /* (?0) */
8617 case 'R' : /* (?R) */
8618 if (*RExC_parse != ')')
8619 FAIL("Sequence (?R) not terminated");
8620 ret = reg_node(pRExC_state, GOSTART);
8621 *flagp |= POSTPONED;
8622 nextchar(pRExC_state);
8625 { /* named and numeric backreferences */
8627 case '&': /* (?&NAME) */
8628 parse_start = RExC_parse - 1;
8631 SV *sv_dat = reg_scan_name(pRExC_state,
8632 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8633 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8635 goto gen_recurse_regop;
8636 assert(0); /* NOT REACHED */
8638 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8640 vFAIL("Illegal pattern");
8642 goto parse_recursion;
8644 case '-': /* (?-1) */
8645 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8646 RExC_parse--; /* rewind to let it be handled later */
8650 case '1': case '2': case '3': case '4': /* (?1) */
8651 case '5': case '6': case '7': case '8': case '9':
8654 num = atoi(RExC_parse);
8655 parse_start = RExC_parse - 1; /* MJD */
8656 if (*RExC_parse == '-')
8658 while (isDIGIT(*RExC_parse))
8660 if (*RExC_parse!=')')
8661 vFAIL("Expecting close bracket");
8664 if ( paren == '-' ) {
8666 Diagram of capture buffer numbering.
8667 Top line is the normal capture buffer numbers
8668 Bottom line is the negative indexing as from
8672 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8676 num = RExC_npar + num;
8679 vFAIL("Reference to nonexistent group");
8681 } else if ( paren == '+' ) {
8682 num = RExC_npar + num - 1;
8685 ret = reganode(pRExC_state, GOSUB, num);
8687 if (num > (I32)RExC_rx->nparens) {
8689 vFAIL("Reference to nonexistent group");
8691 ARG2L_SET( ret, RExC_recurse_count++);
8693 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8694 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8698 RExC_seen |= REG_SEEN_RECURSE;
8699 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8700 Set_Node_Offset(ret, parse_start); /* MJD */
8702 *flagp |= POSTPONED;
8703 nextchar(pRExC_state);
8705 } /* named and numeric backreferences */
8706 assert(0); /* NOT REACHED */
8708 case '?': /* (??...) */
8710 if (*RExC_parse != '{') {
8712 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8715 *flagp |= POSTPONED;
8716 paren = *RExC_parse++;
8718 case '{': /* (?{...}) */
8721 struct reg_code_block *cb;
8723 RExC_seen_zerolen++;
8725 if ( !pRExC_state->num_code_blocks
8726 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8727 || pRExC_state->code_blocks[pRExC_state->code_index].start
8728 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8731 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8732 FAIL("panic: Sequence (?{...}): no code block found\n");
8733 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8735 /* this is a pre-compiled code block (?{...}) */
8736 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8737 RExC_parse = RExC_start + cb->end;
8740 if (cb->src_regex) {
8741 n = add_data(pRExC_state, 2, "rl");
8742 RExC_rxi->data->data[n] =
8743 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8744 RExC_rxi->data->data[n+1] = (void*)o;
8747 n = add_data(pRExC_state, 1,
8748 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8749 RExC_rxi->data->data[n] = (void*)o;
8752 pRExC_state->code_index++;
8753 nextchar(pRExC_state);
8757 ret = reg_node(pRExC_state, LOGICAL);
8758 eval = reganode(pRExC_state, EVAL, n);
8761 /* for later propagation into (??{}) return value */
8762 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8764 REGTAIL(pRExC_state, ret, eval);
8765 /* deal with the length of this later - MJD */
8768 ret = reganode(pRExC_state, EVAL, n);
8769 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8770 Set_Node_Offset(ret, parse_start);
8773 case '(': /* (?(?{...})...) and (?(?=...)...) */
8776 if (RExC_parse[0] == '?') { /* (?(?...)) */
8777 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8778 || RExC_parse[1] == '<'
8779 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8782 ret = reg_node(pRExC_state, LOGICAL);
8785 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8789 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8790 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8792 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8793 char *name_start= RExC_parse++;
8795 SV *sv_dat=reg_scan_name(pRExC_state,
8796 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8797 if (RExC_parse == name_start || *RExC_parse != ch)
8798 vFAIL2("Sequence (?(%c... not terminated",
8799 (ch == '>' ? '<' : ch));
8802 num = add_data( pRExC_state, 1, "S" );
8803 RExC_rxi->data->data[num]=(void*)sv_dat;
8804 SvREFCNT_inc_simple_void(sv_dat);
8806 ret = reganode(pRExC_state,NGROUPP,num);
8807 goto insert_if_check_paren;
8809 else if (RExC_parse[0] == 'D' &&
8810 RExC_parse[1] == 'E' &&
8811 RExC_parse[2] == 'F' &&
8812 RExC_parse[3] == 'I' &&
8813 RExC_parse[4] == 'N' &&
8814 RExC_parse[5] == 'E')
8816 ret = reganode(pRExC_state,DEFINEP,0);
8819 goto insert_if_check_paren;
8821 else if (RExC_parse[0] == 'R') {
8824 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8825 parno = atoi(RExC_parse++);
8826 while (isDIGIT(*RExC_parse))
8828 } else if (RExC_parse[0] == '&') {
8831 sv_dat = reg_scan_name(pRExC_state,
8832 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8833 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8835 ret = reganode(pRExC_state,INSUBP,parno);
8836 goto insert_if_check_paren;
8838 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8841 parno = atoi(RExC_parse++);
8843 while (isDIGIT(*RExC_parse))
8845 ret = reganode(pRExC_state, GROUPP, parno);
8847 insert_if_check_paren:
8848 if ((c = *nextchar(pRExC_state)) != ')')
8849 vFAIL("Switch condition not recognized");
8851 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8852 br = regbranch(pRExC_state, &flags, 1,depth+1);
8854 br = reganode(pRExC_state, LONGJMP, 0);
8856 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8857 c = *nextchar(pRExC_state);
8862 vFAIL("(?(DEFINE)....) does not allow branches");
8863 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8864 regbranch(pRExC_state, &flags, 1,depth+1);
8865 REGTAIL(pRExC_state, ret, lastbr);
8868 c = *nextchar(pRExC_state);
8873 vFAIL("Switch (?(condition)... contains too many branches");
8874 ender = reg_node(pRExC_state, TAIL);
8875 REGTAIL(pRExC_state, br, ender);
8877 REGTAIL(pRExC_state, lastbr, ender);
8878 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8881 REGTAIL(pRExC_state, ret, ender);
8882 RExC_size++; /* XXX WHY do we need this?!!
8883 For large programs it seems to be required
8884 but I can't figure out why. -- dmq*/
8888 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8892 RExC_parse--; /* for vFAIL to print correctly */
8893 vFAIL("Sequence (? incomplete");
8895 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8897 has_use_defaults = TRUE;
8898 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8899 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8900 ? REGEX_UNICODE_CHARSET
8901 : REGEX_DEPENDS_CHARSET);
8905 parse_flags: /* (?i) */
8907 U32 posflags = 0, negflags = 0;
8908 U32 *flagsp = &posflags;
8909 char has_charset_modifier = '\0';
8910 regex_charset cs = get_regex_charset(RExC_flags);
8911 if (cs == REGEX_DEPENDS_CHARSET
8912 && (RExC_utf8 || RExC_uni_semantics))
8914 cs = REGEX_UNICODE_CHARSET;
8917 while (*RExC_parse) {
8918 /* && strchr("iogcmsx", *RExC_parse) */
8919 /* (?g), (?gc) and (?o) are useless here
8920 and must be globally applied -- japhy */
8921 switch (*RExC_parse) {
8922 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8923 case LOCALE_PAT_MOD:
8924 if (has_charset_modifier) {
8925 goto excess_modifier;
8927 else if (flagsp == &negflags) {
8930 cs = REGEX_LOCALE_CHARSET;
8931 has_charset_modifier = LOCALE_PAT_MOD;
8932 RExC_contains_locale = 1;
8934 case UNICODE_PAT_MOD:
8935 if (has_charset_modifier) {
8936 goto excess_modifier;
8938 else if (flagsp == &negflags) {
8941 cs = REGEX_UNICODE_CHARSET;
8942 has_charset_modifier = UNICODE_PAT_MOD;
8944 case ASCII_RESTRICT_PAT_MOD:
8945 if (flagsp == &negflags) {
8948 if (has_charset_modifier) {
8949 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8950 goto excess_modifier;
8952 /* Doubled modifier implies more restricted */
8953 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8956 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8958 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8960 case DEPENDS_PAT_MOD:
8961 if (has_use_defaults) {
8962 goto fail_modifiers;
8964 else if (flagsp == &negflags) {
8967 else if (has_charset_modifier) {
8968 goto excess_modifier;
8971 /* The dual charset means unicode semantics if the
8972 * pattern (or target, not known until runtime) are
8973 * utf8, or something in the pattern indicates unicode
8975 cs = (RExC_utf8 || RExC_uni_semantics)
8976 ? REGEX_UNICODE_CHARSET
8977 : REGEX_DEPENDS_CHARSET;
8978 has_charset_modifier = DEPENDS_PAT_MOD;
8982 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8983 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8985 else if (has_charset_modifier == *(RExC_parse - 1)) {
8986 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8989 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8994 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8996 case ONCE_PAT_MOD: /* 'o' */
8997 case GLOBAL_PAT_MOD: /* 'g' */
8998 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8999 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9000 if (! (wastedflags & wflagbit) ) {
9001 wastedflags |= wflagbit;
9004 "Useless (%s%c) - %suse /%c modifier",
9005 flagsp == &negflags ? "?-" : "?",
9007 flagsp == &negflags ? "don't " : "",
9014 case CONTINUE_PAT_MOD: /* 'c' */
9015 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9016 if (! (wastedflags & WASTED_C) ) {
9017 wastedflags |= WASTED_GC;
9020 "Useless (%sc) - %suse /gc modifier",
9021 flagsp == &negflags ? "?-" : "?",
9022 flagsp == &negflags ? "don't " : ""
9027 case KEEPCOPY_PAT_MOD: /* 'p' */
9028 if (flagsp == &negflags) {
9030 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9032 *flagsp |= RXf_PMf_KEEPCOPY;
9036 /* A flag is a default iff it is following a minus, so
9037 * if there is a minus, it means will be trying to
9038 * re-specify a default which is an error */
9039 if (has_use_defaults || flagsp == &negflags) {
9042 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9046 wastedflags = 0; /* reset so (?g-c) warns twice */
9052 RExC_flags |= posflags;
9053 RExC_flags &= ~negflags;
9054 set_regex_charset(&RExC_flags, cs);
9056 oregflags |= posflags;
9057 oregflags &= ~negflags;
9058 set_regex_charset(&oregflags, cs);
9060 nextchar(pRExC_state);
9071 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9076 }} /* one for the default block, one for the switch */
9083 ret = reganode(pRExC_state, OPEN, parno);
9086 RExC_nestroot = parno;
9087 if (RExC_seen & REG_SEEN_RECURSE
9088 && !RExC_open_parens[parno-1])
9090 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9091 "Setting open paren #%"IVdf" to %d\n",
9092 (IV)parno, REG_NODE_NUM(ret)));
9093 RExC_open_parens[parno-1]= ret;
9096 Set_Node_Length(ret, 1); /* MJD */
9097 Set_Node_Offset(ret, RExC_parse); /* MJD */
9105 /* Pick up the branches, linking them together. */
9106 parse_start = RExC_parse; /* MJD */
9107 br = regbranch(pRExC_state, &flags, 1,depth+1);
9109 /* branch_len = (paren != 0); */
9113 if (*RExC_parse == '|') {
9114 if (!SIZE_ONLY && RExC_extralen) {
9115 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9118 reginsert(pRExC_state, BRANCH, br, depth+1);
9119 Set_Node_Length(br, paren != 0);
9120 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9124 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9126 else if (paren == ':') {
9127 *flagp |= flags&SIMPLE;
9129 if (is_open) { /* Starts with OPEN. */
9130 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9132 else if (paren != '?') /* Not Conditional */
9134 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9136 while (*RExC_parse == '|') {
9137 if (!SIZE_ONLY && RExC_extralen) {
9138 ender = reganode(pRExC_state, LONGJMP,0);
9139 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9142 RExC_extralen += 2; /* Account for LONGJMP. */
9143 nextchar(pRExC_state);
9145 if (RExC_npar > after_freeze)
9146 after_freeze = RExC_npar;
9147 RExC_npar = freeze_paren;
9149 br = regbranch(pRExC_state, &flags, 0, depth+1);
9153 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9155 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9158 if (have_branch || paren != ':') {
9159 /* Make a closing node, and hook it on the end. */
9162 ender = reg_node(pRExC_state, TAIL);
9165 ender = reganode(pRExC_state, CLOSE, parno);
9166 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9167 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9168 "Setting close paren #%"IVdf" to %d\n",
9169 (IV)parno, REG_NODE_NUM(ender)));
9170 RExC_close_parens[parno-1]= ender;
9171 if (RExC_nestroot == parno)
9174 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9175 Set_Node_Length(ender,1); /* MJD */
9181 *flagp &= ~HASWIDTH;
9184 ender = reg_node(pRExC_state, SUCCEED);
9187 ender = reg_node(pRExC_state, END);
9189 assert(!RExC_opend); /* there can only be one! */
9194 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9195 SV * const mysv_val1=sv_newmortal();
9196 SV * const mysv_val2=sv_newmortal();
9197 DEBUG_PARSE_MSG("lsbr");
9198 regprop(RExC_rx, mysv_val1, lastbr);
9199 regprop(RExC_rx, mysv_val2, ender);
9200 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9201 SvPV_nolen_const(mysv_val1),
9202 (IV)REG_NODE_NUM(lastbr),
9203 SvPV_nolen_const(mysv_val2),
9204 (IV)REG_NODE_NUM(ender),
9205 (IV)(ender - lastbr)
9208 REGTAIL(pRExC_state, lastbr, ender);
9210 if (have_branch && !SIZE_ONLY) {
9213 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9215 /* Hook the tails of the branches to the closing node. */
9216 for (br = ret; br; br = regnext(br)) {
9217 const U8 op = PL_regkind[OP(br)];
9219 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9220 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9223 else if (op == BRANCHJ) {
9224 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9225 /* for now we always disable this optimisation * /
9226 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9232 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9233 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9234 SV * const mysv_val1=sv_newmortal();
9235 SV * const mysv_val2=sv_newmortal();
9236 DEBUG_PARSE_MSG("NADA");
9237 regprop(RExC_rx, mysv_val1, ret);
9238 regprop(RExC_rx, mysv_val2, ender);
9239 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9240 SvPV_nolen_const(mysv_val1),
9241 (IV)REG_NODE_NUM(ret),
9242 SvPV_nolen_const(mysv_val2),
9243 (IV)REG_NODE_NUM(ender),
9248 if (OP(ender) == TAIL) {
9253 for ( opt= br + 1; opt < ender ; opt++ )
9255 NEXT_OFF(br)= ender - br;
9263 static const char parens[] = "=!<,>";
9265 if (paren && (p = strchr(parens, paren))) {
9266 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9267 int flag = (p - parens) > 1;
9270 node = SUSPEND, flag = 0;
9271 reginsert(pRExC_state, node,ret, depth+1);
9272 Set_Node_Cur_Length(ret);
9273 Set_Node_Offset(ret, parse_start + 1);
9275 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9279 /* Check for proper termination. */
9281 RExC_flags = oregflags;
9282 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9283 RExC_parse = oregcomp_parse;
9284 vFAIL("Unmatched (");
9287 else if (!paren && RExC_parse < RExC_end) {
9288 if (*RExC_parse == ')') {
9290 vFAIL("Unmatched )");
9293 FAIL("Junk on end of regexp"); /* "Can't happen". */
9294 assert(0); /* NOTREACHED */
9297 if (RExC_in_lookbehind) {
9298 RExC_in_lookbehind--;
9300 if (after_freeze > RExC_npar)
9301 RExC_npar = after_freeze;
9306 - regbranch - one alternative of an | operator
9308 * Implements the concatenation operator.
9311 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9315 regnode *chain = NULL;
9317 I32 flags = 0, c = 0;
9318 GET_RE_DEBUG_FLAGS_DECL;
9320 PERL_ARGS_ASSERT_REGBRANCH;
9322 DEBUG_PARSE("brnc");
9327 if (!SIZE_ONLY && RExC_extralen)
9328 ret = reganode(pRExC_state, BRANCHJ,0);
9330 ret = reg_node(pRExC_state, BRANCH);
9331 Set_Node_Length(ret, 1);
9335 if (!first && SIZE_ONLY)
9336 RExC_extralen += 1; /* BRANCHJ */
9338 *flagp = WORST; /* Tentatively. */
9341 nextchar(pRExC_state);
9342 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9344 latest = regpiece(pRExC_state, &flags,depth+1);
9345 if (latest == NULL) {
9346 if (flags & TRYAGAIN)
9350 else if (ret == NULL)
9352 *flagp |= flags&(HASWIDTH|POSTPONED);
9353 if (chain == NULL) /* First piece. */
9354 *flagp |= flags&SPSTART;
9357 REGTAIL(pRExC_state, chain, latest);
9362 if (chain == NULL) { /* Loop ran zero times. */
9363 chain = reg_node(pRExC_state, NOTHING);
9368 *flagp |= flags&SIMPLE;
9375 - regpiece - something followed by possible [*+?]
9377 * Note that the branching code sequences used for ? and the general cases
9378 * of * and + are somewhat optimized: they use the same NOTHING node as
9379 * both the endmarker for their branch list and the body of the last branch.
9380 * It might seem that this node could be dispensed with entirely, but the
9381 * endmarker role is not redundant.
9384 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9391 const char * const origparse = RExC_parse;
9393 I32 max = REG_INFTY;
9394 #ifdef RE_TRACK_PATTERN_OFFSETS
9397 const char *maxpos = NULL;
9398 GET_RE_DEBUG_FLAGS_DECL;
9400 PERL_ARGS_ASSERT_REGPIECE;
9402 DEBUG_PARSE("piec");
9404 ret = regatom(pRExC_state, &flags,depth+1);
9406 if (flags & TRYAGAIN)
9413 if (op == '{' && regcurly(RExC_parse)) {
9415 #ifdef RE_TRACK_PATTERN_OFFSETS
9416 parse_start = RExC_parse; /* MJD */
9418 next = RExC_parse + 1;
9419 while (isDIGIT(*next) || *next == ',') {
9428 if (*next == '}') { /* got one */
9432 min = atoi(RExC_parse);
9436 maxpos = RExC_parse;
9438 if (!max && *maxpos != '0')
9439 max = REG_INFTY; /* meaning "infinity" */
9440 else if (max >= REG_INFTY)
9441 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9443 nextchar(pRExC_state);
9446 if ((flags&SIMPLE)) {
9447 RExC_naughty += 2 + RExC_naughty / 2;
9448 reginsert(pRExC_state, CURLY, ret, depth+1);
9449 Set_Node_Offset(ret, parse_start+1); /* MJD */
9450 Set_Node_Cur_Length(ret);
9453 regnode * const w = reg_node(pRExC_state, WHILEM);
9456 REGTAIL(pRExC_state, ret, w);
9457 if (!SIZE_ONLY && RExC_extralen) {
9458 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9459 reginsert(pRExC_state, NOTHING,ret, depth+1);
9460 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9462 reginsert(pRExC_state, CURLYX,ret, depth+1);
9464 Set_Node_Offset(ret, parse_start+1);
9465 Set_Node_Length(ret,
9466 op == '{' ? (RExC_parse - parse_start) : 1);
9468 if (!SIZE_ONLY && RExC_extralen)
9469 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9470 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9472 RExC_whilem_seen++, RExC_extralen += 3;
9473 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9482 vFAIL("Can't do {n,m} with n > m");
9484 ARG1_SET(ret, (U16)min);
9485 ARG2_SET(ret, (U16)max);
9497 #if 0 /* Now runtime fix should be reliable. */
9499 /* if this is reinstated, don't forget to put this back into perldiag:
9501 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9503 (F) The part of the regexp subject to either the * or + quantifier
9504 could match an empty string. The {#} shows in the regular
9505 expression about where the problem was discovered.
9509 if (!(flags&HASWIDTH) && op != '?')
9510 vFAIL("Regexp *+ operand could be empty");
9513 #ifdef RE_TRACK_PATTERN_OFFSETS
9514 parse_start = RExC_parse;
9516 nextchar(pRExC_state);
9518 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9520 if (op == '*' && (flags&SIMPLE)) {
9521 reginsert(pRExC_state, STAR, ret, depth+1);
9525 else if (op == '*') {
9529 else if (op == '+' && (flags&SIMPLE)) {
9530 reginsert(pRExC_state, PLUS, ret, depth+1);
9534 else if (op == '+') {
9538 else if (op == '?') {
9543 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9544 ckWARN3reg(RExC_parse,
9545 "%.*s matches null string many times",
9546 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9550 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9551 nextchar(pRExC_state);
9552 reginsert(pRExC_state, MINMOD, ret, depth+1);
9553 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9555 #ifndef REG_ALLOW_MINMOD_SUSPEND
9558 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9560 nextchar(pRExC_state);
9561 ender = reg_node(pRExC_state, SUCCEED);
9562 REGTAIL(pRExC_state, ret, ender);
9563 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9565 ender = reg_node(pRExC_state, TAIL);
9566 REGTAIL(pRExC_state, ret, ender);
9570 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9572 vFAIL("Nested quantifiers");
9579 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9582 /* This is expected to be called by a parser routine that has recognized '\N'
9583 and needs to handle the rest. RExC_parse is expected to point at the first
9584 char following the N at the time of the call. On successful return,
9585 RExC_parse has been updated to point to just after the sequence identified
9586 by this routine, and <*flagp> has been updated.
9588 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9591 \N may begin either a named sequence, or if outside a character class, mean
9592 to match a non-newline. For non single-quoted regexes, the tokenizer has
9593 attempted to decide which, and in the case of a named sequence, converted it
9594 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9595 where c1... are the characters in the sequence. For single-quoted regexes,
9596 the tokenizer passes the \N sequence through unchanged; this code will not
9597 attempt to determine this nor expand those, instead raising a syntax error.
9598 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9599 or there is no '}', it signals that this \N occurrence means to match a
9602 Only the \N{U+...} form should occur in a character class, for the same
9603 reason that '.' inside a character class means to just match a period: it
9604 just doesn't make sense.
9606 The function raises an error (via vFAIL), and doesn't return for various
9607 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9608 success; it returns FALSE otherwise.
9610 If <valuep> is non-null, it means the caller can accept an input sequence
9611 consisting of a just a single code point; <*valuep> is set to that value
9612 if the input is such.
9614 If <node_p> is non-null it signifies that the caller can accept any other
9615 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9617 1) \N means not-a-NL: points to a newly created REG_ANY node;
9618 2) \N{}: points to a new NOTHING node;
9619 3) otherwise: points to a new EXACT node containing the resolved
9621 Note that FALSE is returned for single code point sequences if <valuep> is
9625 char * endbrace; /* '}' following the name */
9627 char *endchar; /* Points to '.' or '}' ending cur char in the input
9629 bool has_multiple_chars; /* true if the input stream contains a sequence of
9630 more than one character */
9632 GET_RE_DEBUG_FLAGS_DECL;
9634 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9638 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9640 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9641 * modifier. The other meaning does not */
9642 p = (RExC_flags & RXf_PMf_EXTENDED)
9643 ? regwhite( pRExC_state, RExC_parse )
9646 /* Disambiguate between \N meaning a named character versus \N meaning
9647 * [^\n]. The former is assumed when it can't be the latter. */
9648 if (*p != '{' || regcurly(p)) {
9651 /* no bare \N in a charclass */
9652 if (in_char_class) {
9653 vFAIL("\\N in a character class must be a named character: \\N{...}");
9657 nextchar(pRExC_state);
9658 *node_p = reg_node(pRExC_state, REG_ANY);
9659 *flagp |= HASWIDTH|SIMPLE;
9662 Set_Node_Length(*node_p, 1); /* MJD */
9666 /* Here, we have decided it should be a named character or sequence */
9668 /* The test above made sure that the next real character is a '{', but
9669 * under the /x modifier, it could be separated by space (or a comment and
9670 * \n) and this is not allowed (for consistency with \x{...} and the
9671 * tokenizer handling of \N{NAME}). */
9672 if (*RExC_parse != '{') {
9673 vFAIL("Missing braces on \\N{}");
9676 RExC_parse++; /* Skip past the '{' */
9678 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9679 || ! (endbrace == RExC_parse /* nothing between the {} */
9680 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9681 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9683 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9684 vFAIL("\\N{NAME} must be resolved by the lexer");
9687 if (endbrace == RExC_parse) { /* empty: \N{} */
9690 *node_p = reg_node(pRExC_state,NOTHING);
9692 else if (in_char_class) {
9693 if (SIZE_ONLY && in_char_class) {
9694 ckWARNreg(RExC_parse,
9695 "Ignoring zero length \\N{} in character class"
9703 nextchar(pRExC_state);
9707 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9708 RExC_parse += 2; /* Skip past the 'U+' */
9710 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9712 /* Code points are separated by dots. If none, there is only one code
9713 * point, and is terminated by the brace */
9714 has_multiple_chars = (endchar < endbrace);
9716 if (valuep && (! has_multiple_chars || in_char_class)) {
9717 /* We only pay attention to the first char of
9718 multichar strings being returned in char classes. I kinda wonder
9719 if this makes sense as it does change the behaviour
9720 from earlier versions, OTOH that behaviour was broken
9721 as well. XXX Solution is to recharacterize as
9722 [rest-of-class]|multi1|multi2... */
9724 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9725 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9726 | PERL_SCAN_DISALLOW_PREFIX
9727 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9729 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9731 /* The tokenizer should have guaranteed validity, but it's possible to
9732 * bypass it by using single quoting, so check */
9733 if (length_of_hex == 0
9734 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9736 RExC_parse += length_of_hex; /* Includes all the valid */
9737 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9738 ? UTF8SKIP(RExC_parse)
9740 /* Guard against malformed utf8 */
9741 if (RExC_parse >= endchar) {
9742 RExC_parse = endchar;
9744 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9747 if (in_char_class && has_multiple_chars) {
9748 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9750 RExC_parse = endbrace + 1;
9752 else if (! node_p || ! has_multiple_chars) {
9754 /* Here, the input is legal, but not according to the caller's
9755 * options. We fail without advancing the parse, so that the
9756 * caller can try again */
9762 /* What is done here is to convert this to a sub-pattern of the form
9763 * (?:\x{char1}\x{char2}...)
9764 * and then call reg recursively. That way, it retains its atomicness,
9765 * while not having to worry about special handling that some code
9766 * points may have. toke.c has converted the original Unicode values
9767 * to native, so that we can just pass on the hex values unchanged. We
9768 * do have to set a flag to keep recoding from happening in the
9771 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9773 char *orig_end = RExC_end;
9776 while (RExC_parse < endbrace) {
9778 /* Convert to notation the rest of the code understands */
9779 sv_catpv(substitute_parse, "\\x{");
9780 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9781 sv_catpv(substitute_parse, "}");
9783 /* Point to the beginning of the next character in the sequence. */
9784 RExC_parse = endchar + 1;
9785 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9787 sv_catpv(substitute_parse, ")");
9789 RExC_parse = SvPV(substitute_parse, len);
9791 /* Don't allow empty number */
9793 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9795 RExC_end = RExC_parse + len;
9797 /* The values are Unicode, and therefore not subject to recoding */
9798 RExC_override_recoding = 1;
9800 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9801 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9803 RExC_parse = endbrace;
9804 RExC_end = orig_end;
9805 RExC_override_recoding = 0;
9807 nextchar(pRExC_state);
9817 * It returns the code point in utf8 for the value in *encp.
9818 * value: a code value in the source encoding
9819 * encp: a pointer to an Encode object
9821 * If the result from Encode is not a single character,
9822 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9825 S_reg_recode(pTHX_ const char value, SV **encp)
9828 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9829 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9830 const STRLEN newlen = SvCUR(sv);
9831 UV uv = UNICODE_REPLACEMENT;
9833 PERL_ARGS_ASSERT_REG_RECODE;
9837 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9840 if (!newlen || numlen != newlen) {
9841 uv = UNICODE_REPLACEMENT;
9847 PERL_STATIC_INLINE U8
9848 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9852 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9858 op = get_regex_charset(RExC_flags);
9859 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9860 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9861 been, so there is no hole */
9867 PERL_STATIC_INLINE void
9868 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9870 /* This knows the details about sizing an EXACTish node, setting flags for
9871 * it (by setting <*flagp>, and potentially populating it with a single
9874 * If <len> is non-zero, this function assumes that the node has already
9875 * been populated, and just does the sizing. In this case <code_point>
9876 * should be the final code point that has already been placed into the
9877 * node. This value will be ignored except that under some circumstances
9878 * <*flagp> is set based on it.
9880 * If <len is zero, the function assumes that the node is to contain only
9881 * the single character given by <code_point> and calculates what <len>
9882 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9883 * additionally will populate the node's STRING with <code_point>, if <len>
9884 * is 0. In both cases <*flagp> is appropriately set
9886 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9887 * folded (the latter only when the rules indicate it can match 'ss') */
9889 bool len_passed_in = cBOOL(len != 0);
9890 U8 character[UTF8_MAXBYTES_CASE+1];
9892 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9894 if (! len_passed_in) {
9897 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9900 uvchr_to_utf8( character, code_point);
9901 len = UTF8SKIP(character);
9905 || code_point != LATIN_SMALL_LETTER_SHARP_S
9906 || ASCII_FOLD_RESTRICTED
9907 || ! AT_LEAST_UNI_SEMANTICS)
9909 *character = (U8) code_point;
9914 *(character + 1) = 's';
9920 RExC_size += STR_SZ(len);
9923 RExC_emit += STR_SZ(len);
9924 STR_LEN(node) = len;
9925 if (! len_passed_in) {
9926 Copy((char *) character, STRING(node), len, char);
9931 if (len == 1 && UNI_IS_INVARIANT(code_point))
9936 - regatom - the lowest level
9938 Try to identify anything special at the start of the pattern. If there
9939 is, then handle it as required. This may involve generating a single regop,
9940 such as for an assertion; or it may involve recursing, such as to
9941 handle a () structure.
9943 If the string doesn't start with something special then we gobble up
9944 as much literal text as we can.
9946 Once we have been able to handle whatever type of thing started the
9947 sequence, we return.
9949 Note: we have to be careful with escapes, as they can be both literal
9950 and special, and in the case of \10 and friends, context determines which.
9952 A summary of the code structure is:
9954 switch (first_byte) {
9955 cases for each special:
9956 handle this special;
9960 cases for each unambiguous special:
9961 handle this special;
9963 cases for each ambigous special/literal:
9965 if (special) handle here
9967 default: // unambiguously literal:
9970 default: // is a literal char
9973 create EXACTish node for literal;
9974 while (more input and node isn't full) {
9975 switch (input_byte) {
9976 cases for each special;
9977 make sure parse pointer is set so that the next call to
9978 regatom will see this special first
9979 goto loopdone; // EXACTish node terminated by prev. char
9981 append char to EXACTISH node;
9983 get next input byte;
9987 return the generated node;
9989 Specifically there are two separate switches for handling
9990 escape sequences, with the one for handling literal escapes requiring
9991 a dummy entry for all of the special escapes that are actually handled
9996 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9999 regnode *ret = NULL;
10001 char *parse_start = RExC_parse;
10003 GET_RE_DEBUG_FLAGS_DECL;
10004 DEBUG_PARSE("atom");
10005 *flagp = WORST; /* Tentatively. */
10007 PERL_ARGS_ASSERT_REGATOM;
10010 switch ((U8)*RExC_parse) {
10012 RExC_seen_zerolen++;
10013 nextchar(pRExC_state);
10014 if (RExC_flags & RXf_PMf_MULTILINE)
10015 ret = reg_node(pRExC_state, MBOL);
10016 else if (RExC_flags & RXf_PMf_SINGLELINE)
10017 ret = reg_node(pRExC_state, SBOL);
10019 ret = reg_node(pRExC_state, BOL);
10020 Set_Node_Length(ret, 1); /* MJD */
10023 nextchar(pRExC_state);
10025 RExC_seen_zerolen++;
10026 if (RExC_flags & RXf_PMf_MULTILINE)
10027 ret = reg_node(pRExC_state, MEOL);
10028 else if (RExC_flags & RXf_PMf_SINGLELINE)
10029 ret = reg_node(pRExC_state, SEOL);
10031 ret = reg_node(pRExC_state, EOL);
10032 Set_Node_Length(ret, 1); /* MJD */
10035 nextchar(pRExC_state);
10036 if (RExC_flags & RXf_PMf_SINGLELINE)
10037 ret = reg_node(pRExC_state, SANY);
10039 ret = reg_node(pRExC_state, REG_ANY);
10040 *flagp |= HASWIDTH|SIMPLE;
10042 Set_Node_Length(ret, 1); /* MJD */
10046 char * const oregcomp_parse = ++RExC_parse;
10047 ret = regclass(pRExC_state, flagp,depth+1);
10048 if (*RExC_parse != ']') {
10049 RExC_parse = oregcomp_parse;
10050 vFAIL("Unmatched [");
10052 nextchar(pRExC_state);
10053 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10057 nextchar(pRExC_state);
10058 ret = reg(pRExC_state, 1, &flags,depth+1);
10060 if (flags & TRYAGAIN) {
10061 if (RExC_parse == RExC_end) {
10062 /* Make parent create an empty node if needed. */
10063 *flagp |= TRYAGAIN;
10070 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10074 if (flags & TRYAGAIN) {
10075 *flagp |= TRYAGAIN;
10078 vFAIL("Internal urp");
10079 /* Supposed to be caught earlier. */
10085 vFAIL("Quantifier follows nothing");
10090 This switch handles escape sequences that resolve to some kind
10091 of special regop and not to literal text. Escape sequnces that
10092 resolve to literal text are handled below in the switch marked
10095 Every entry in this switch *must* have a corresponding entry
10096 in the literal escape switch. However, the opposite is not
10097 required, as the default for this switch is to jump to the
10098 literal text handling code.
10100 switch ((U8)*++RExC_parse) {
10101 /* Special Escapes */
10103 RExC_seen_zerolen++;
10104 ret = reg_node(pRExC_state, SBOL);
10106 goto finish_meta_pat;
10108 ret = reg_node(pRExC_state, GPOS);
10109 RExC_seen |= REG_SEEN_GPOS;
10111 goto finish_meta_pat;
10113 RExC_seen_zerolen++;
10114 ret = reg_node(pRExC_state, KEEPS);
10116 /* XXX:dmq : disabling in-place substitution seems to
10117 * be necessary here to avoid cases of memory corruption, as
10118 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10120 RExC_seen |= REG_SEEN_LOOKBEHIND;
10121 goto finish_meta_pat;
10123 ret = reg_node(pRExC_state, SEOL);
10125 RExC_seen_zerolen++; /* Do not optimize RE away */
10126 goto finish_meta_pat;
10128 ret = reg_node(pRExC_state, EOS);
10130 RExC_seen_zerolen++; /* Do not optimize RE away */
10131 goto finish_meta_pat;
10133 ret = reg_node(pRExC_state, CANY);
10134 RExC_seen |= REG_SEEN_CANY;
10135 *flagp |= HASWIDTH|SIMPLE;
10136 goto finish_meta_pat;
10138 ret = reg_node(pRExC_state, CLUMP);
10139 *flagp |= HASWIDTH;
10140 goto finish_meta_pat;
10142 op = ALNUM + get_regex_charset(RExC_flags);
10143 if (op > ALNUMA) { /* /aa is same as /a */
10146 ret = reg_node(pRExC_state, op);
10147 *flagp |= HASWIDTH|SIMPLE;
10148 goto finish_meta_pat;
10150 op = NALNUM + get_regex_charset(RExC_flags);
10151 if (op > NALNUMA) { /* /aa is same as /a */
10154 ret = reg_node(pRExC_state, op);
10155 *flagp |= HASWIDTH|SIMPLE;
10156 goto finish_meta_pat;
10158 RExC_seen_zerolen++;
10159 RExC_seen |= REG_SEEN_LOOKBEHIND;
10160 op = BOUND + get_regex_charset(RExC_flags);
10161 if (op > BOUNDA) { /* /aa is same as /a */
10164 ret = reg_node(pRExC_state, op);
10165 FLAGS(ret) = get_regex_charset(RExC_flags);
10167 goto finish_meta_pat;
10169 RExC_seen_zerolen++;
10170 RExC_seen |= REG_SEEN_LOOKBEHIND;
10171 op = NBOUND + get_regex_charset(RExC_flags);
10172 if (op > NBOUNDA) { /* /aa is same as /a */
10175 ret = reg_node(pRExC_state, op);
10176 FLAGS(ret) = get_regex_charset(RExC_flags);
10178 goto finish_meta_pat;
10180 op = SPACE + get_regex_charset(RExC_flags);
10181 if (op > SPACEA) { /* /aa is same as /a */
10184 ret = reg_node(pRExC_state, op);
10185 *flagp |= HASWIDTH|SIMPLE;
10186 goto finish_meta_pat;
10188 op = NSPACE + get_regex_charset(RExC_flags);
10189 if (op > NSPACEA) { /* /aa is same as /a */
10192 ret = reg_node(pRExC_state, op);
10193 *flagp |= HASWIDTH|SIMPLE;
10194 goto finish_meta_pat;
10202 U8 offset = get_regex_charset(RExC_flags);
10203 if (offset == REGEX_UNICODE_CHARSET) {
10204 offset = REGEX_DEPENDS_CHARSET;
10206 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10207 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10211 ret = reg_node(pRExC_state, op);
10212 *flagp |= HASWIDTH|SIMPLE;
10213 goto finish_meta_pat;
10215 ret = reg_node(pRExC_state, LNBREAK);
10216 *flagp |= HASWIDTH|SIMPLE;
10217 goto finish_meta_pat;
10219 ret = reg_node(pRExC_state, HORIZWS);
10220 *flagp |= HASWIDTH|SIMPLE;
10221 goto finish_meta_pat;
10223 ret = reg_node(pRExC_state, NHORIZWS);
10224 *flagp |= HASWIDTH|SIMPLE;
10225 goto finish_meta_pat;
10227 ret = reg_node(pRExC_state, VERTWS);
10228 *flagp |= HASWIDTH|SIMPLE;
10229 goto finish_meta_pat;
10231 ret = reg_node(pRExC_state, NVERTWS);
10232 *flagp |= HASWIDTH|SIMPLE;
10234 nextchar(pRExC_state);
10235 Set_Node_Length(ret, 2); /* MJD */
10240 char* const oldregxend = RExC_end;
10242 char* parse_start = RExC_parse - 2;
10245 if (RExC_parse[1] == '{') {
10246 /* a lovely hack--pretend we saw [\pX] instead */
10247 RExC_end = strchr(RExC_parse, '}');
10249 const U8 c = (U8)*RExC_parse;
10251 RExC_end = oldregxend;
10252 vFAIL2("Missing right brace on \\%c{}", c);
10257 RExC_end = RExC_parse + 2;
10258 if (RExC_end > oldregxend)
10259 RExC_end = oldregxend;
10263 ret = regclass(pRExC_state, flagp,depth+1);
10265 RExC_end = oldregxend;
10268 Set_Node_Offset(ret, parse_start + 2);
10269 Set_Node_Cur_Length(ret);
10270 nextchar(pRExC_state);
10274 /* Handle \N and \N{NAME} with multiple code points here and not
10275 * below because it can be multicharacter. join_exact() will join
10276 * them up later on. Also this makes sure that things like
10277 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10278 * The options to the grok function call causes it to fail if the
10279 * sequence is just a single code point. We then go treat it as
10280 * just another character in the current EXACT node, and hence it
10281 * gets uniform treatment with all the other characters. The
10282 * special treatment for quantifiers is not needed for such single
10283 * character sequences */
10285 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10290 case 'k': /* Handle \k<NAME> and \k'NAME' */
10293 char ch= RExC_parse[1];
10294 if (ch != '<' && ch != '\'' && ch != '{') {
10296 vFAIL2("Sequence %.2s... not terminated",parse_start);
10298 /* this pretty much dupes the code for (?P=...) in reg(), if
10299 you change this make sure you change that */
10300 char* name_start = (RExC_parse += 2);
10302 SV *sv_dat = reg_scan_name(pRExC_state,
10303 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10304 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10305 if (RExC_parse == name_start || *RExC_parse != ch)
10306 vFAIL2("Sequence %.3s... not terminated",parse_start);
10309 num = add_data( pRExC_state, 1, "S" );
10310 RExC_rxi->data->data[num]=(void*)sv_dat;
10311 SvREFCNT_inc_simple_void(sv_dat);
10315 ret = reganode(pRExC_state,
10318 : (ASCII_FOLD_RESTRICTED)
10320 : (AT_LEAST_UNI_SEMANTICS)
10326 *flagp |= HASWIDTH;
10328 /* override incorrect value set in reganode MJD */
10329 Set_Node_Offset(ret, parse_start+1);
10330 Set_Node_Cur_Length(ret); /* MJD */
10331 nextchar(pRExC_state);
10337 case '1': case '2': case '3': case '4':
10338 case '5': case '6': case '7': case '8': case '9':
10341 bool isg = *RExC_parse == 'g';
10346 if (*RExC_parse == '{') {
10350 if (*RExC_parse == '-') {
10354 if (hasbrace && !isDIGIT(*RExC_parse)) {
10355 if (isrel) RExC_parse--;
10357 goto parse_named_seq;
10359 num = atoi(RExC_parse);
10360 if (isg && num == 0)
10361 vFAIL("Reference to invalid group 0");
10363 num = RExC_npar - num;
10365 vFAIL("Reference to nonexistent or unclosed group");
10367 if (!isg && num > 9 && num >= RExC_npar)
10368 /* Probably a character specified in octal, e.g. \35 */
10371 char * const parse_start = RExC_parse - 1; /* MJD */
10372 while (isDIGIT(*RExC_parse))
10374 if (parse_start == RExC_parse - 1)
10375 vFAIL("Unterminated \\g... pattern");
10377 if (*RExC_parse != '}')
10378 vFAIL("Unterminated \\g{...} pattern");
10382 if (num > (I32)RExC_rx->nparens)
10383 vFAIL("Reference to nonexistent group");
10386 ret = reganode(pRExC_state,
10389 : (ASCII_FOLD_RESTRICTED)
10391 : (AT_LEAST_UNI_SEMANTICS)
10397 *flagp |= HASWIDTH;
10399 /* override incorrect value set in reganode MJD */
10400 Set_Node_Offset(ret, parse_start+1);
10401 Set_Node_Cur_Length(ret); /* MJD */
10403 nextchar(pRExC_state);
10408 if (RExC_parse >= RExC_end)
10409 FAIL("Trailing \\");
10412 /* Do not generate "unrecognized" warnings here, we fall
10413 back into the quick-grab loop below */
10420 if (RExC_flags & RXf_PMf_EXTENDED) {
10421 if ( reg_skipcomment( pRExC_state ) )
10428 parse_start = RExC_parse - 1;
10437 #define MAX_NODE_STRING_SIZE 127
10438 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10440 U8 upper_parse = MAX_NODE_STRING_SIZE;
10443 bool next_is_quantifier;
10447 node_type = compute_EXACTish(pRExC_state);
10448 ret = reg_node(pRExC_state, node_type);
10450 /* In pass1, folded, we use a temporary buffer instead of the
10451 * actual node, as the node doesn't exist yet */
10452 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10458 /* XXX The node can hold up to 255 bytes, yet this only goes to
10459 * 127. I (khw) do not know why. Keeping it somewhat less than
10460 * 255 allows us to not have to worry about overflow due to
10461 * converting to utf8 and fold expansion, but that value is
10462 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10463 * split up by this limit into a single one using the real max of
10464 * 255. Even at 127, this breaks under rare circumstances. If
10465 * folding, we do not want to split a node at a character that is a
10466 * non-final in a multi-char fold, as an input string could just
10467 * happen to want to match across the node boundary. The join
10468 * would solve that problem if the join actually happens. But a
10469 * series of more than two nodes in a row each of 127 would cause
10470 * the first join to succeed to get to 254, but then there wouldn't
10471 * be room for the next one, which could at be one of those split
10472 * multi-char folds. I don't know of any fool-proof solution. One
10473 * could back off to end with only a code point that isn't such a
10474 * non-final, but it is possible for there not to be any in the
10476 for (p = RExC_parse - 1;
10477 len < upper_parse && p < RExC_end;
10482 if (RExC_flags & RXf_PMf_EXTENDED)
10483 p = regwhite( pRExC_state, p );
10494 /* Literal Escapes Switch
10496 This switch is meant to handle escape sequences that
10497 resolve to a literal character.
10499 Every escape sequence that represents something
10500 else, like an assertion or a char class, is handled
10501 in the switch marked 'Special Escapes' above in this
10502 routine, but also has an entry here as anything that
10503 isn't explicitly mentioned here will be treated as
10504 an unescaped equivalent literal.
10507 switch ((U8)*++p) {
10508 /* These are all the special escapes. */
10509 case 'A': /* Start assertion */
10510 case 'b': case 'B': /* Word-boundary assertion*/
10511 case 'C': /* Single char !DANGEROUS! */
10512 case 'd': case 'D': /* digit class */
10513 case 'g': case 'G': /* generic-backref, pos assertion */
10514 case 'h': case 'H': /* HORIZWS */
10515 case 'k': case 'K': /* named backref, keep marker */
10516 case 'p': case 'P': /* Unicode property */
10517 case 'R': /* LNBREAK */
10518 case 's': case 'S': /* space class */
10519 case 'v': case 'V': /* VERTWS */
10520 case 'w': case 'W': /* word class */
10521 case 'X': /* eXtended Unicode "combining character sequence" */
10522 case 'z': case 'Z': /* End of line/string assertion */
10526 /* Anything after here is an escape that resolves to a
10527 literal. (Except digits, which may or may not)
10533 case 'N': /* Handle a single-code point named character. */
10534 /* The options cause it to fail if a multiple code
10535 * point sequence. Handle those in the switch() above
10537 RExC_parse = p + 1;
10538 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10539 flagp, depth, FALSE))
10541 RExC_parse = p = oldp;
10545 if (ender > 0xff) {
10562 ender = ASCII_TO_NATIVE('\033');
10566 ender = ASCII_TO_NATIVE('\007');
10571 STRLEN brace_len = len;
10573 const char* error_msg;
10575 bool valid = grok_bslash_o(p,
10582 RExC_parse = p; /* going to die anyway; point
10583 to exact spot of failure */
10590 if (PL_encoding && ender < 0x100) {
10591 goto recode_encoding;
10593 if (ender > 0xff) {
10600 STRLEN brace_len = len;
10602 const char* error_msg;
10604 bool valid = grok_bslash_x(p,
10611 RExC_parse = p; /* going to die anyway; point
10612 to exact spot of failure */
10618 if (PL_encoding && ender < 0x100) {
10619 goto recode_encoding;
10621 if (ender > 0xff) {
10628 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10630 case '0': case '1': case '2': case '3':case '4':
10631 case '5': case '6': case '7':
10633 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10635 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10637 ender = grok_oct(p, &numlen, &flags, NULL);
10638 if (ender > 0xff) {
10647 if (PL_encoding && ender < 0x100)
10648 goto recode_encoding;
10651 if (! RExC_override_recoding) {
10652 SV* enc = PL_encoding;
10653 ender = reg_recode((const char)(U8)ender, &enc);
10654 if (!enc && SIZE_ONLY)
10655 ckWARNreg(p, "Invalid escape in the specified encoding");
10661 FAIL("Trailing \\");
10664 if (!SIZE_ONLY&& isALNUMC(*p)) {
10665 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10667 goto normal_default;
10671 /* Currently we don't warn when the lbrace is at the start
10672 * of a construct. This catches it in the middle of a
10673 * literal string, or when its the first thing after
10674 * something like "\b" */
10676 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10678 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10683 if (UTF8_IS_START(*p) && UTF) {
10685 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10686 &numlen, UTF8_ALLOW_DEFAULT);
10692 } /* End of switch on the literal */
10694 /* Here, have looked at the literal character and <ender>
10695 * contains its ordinal, <p> points to the character after it
10698 if ( RExC_flags & RXf_PMf_EXTENDED)
10699 p = regwhite( pRExC_state, p );
10701 /* If the next thing is a quantifier, it applies to this
10702 * character only, which means that this character has to be in
10703 * its own node and can't just be appended to the string in an
10704 * existing node, so if there are already other characters in
10705 * the node, close the node with just them, and set up to do
10706 * this character again next time through, when it will be the
10707 * only thing in its new node */
10708 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10716 /* See comments for join_exact() as to why we fold
10717 * this non-UTF at compile time */
10718 || (node_type == EXACTFU
10719 && ender == LATIN_SMALL_LETTER_SHARP_S))
10723 /* Prime the casefolded buffer. Locale rules, which
10724 * apply only to code points < 256, aren't known until
10725 * execution, so for them, just output the original
10726 * character using utf8. If we start to fold non-UTF
10727 * patterns, be sure to update join_exact() */
10728 if (LOC && ender < 256) {
10729 if (UNI_IS_INVARIANT(ender)) {
10733 *s = UTF8_TWO_BYTE_HI(ender);
10734 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10739 ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
10741 | ((LOC) ? FOLD_FLAGS_LOCALE
10742 : (ASCII_FOLD_RESTRICTED)
10743 ? FOLD_FLAGS_NOMIX_ASCII
10749 /* The loop increments <len> each time, as all but this
10750 * path (and the one just below for UTF) through it add
10751 * a single byte to the EXACTish node. But this one
10752 * has changed len to be the correct final value, so
10753 * subtract one to cancel out the increment that
10755 len += foldlen - 1;
10762 const STRLEN unilen = reguni(pRExC_state, ender, s);
10768 /* See comment just above for - 1 */
10772 REGC((char)ender, s++);
10775 if (next_is_quantifier) {
10777 /* Here, the next input is a quantifier, and to get here,
10778 * the current character is the only one in the node.
10779 * Also, here <len> doesn't include the final byte for this
10785 } /* End of loop through literal characters */
10787 /* Here we have either exhausted the input or ran out of room in
10788 * the node. (If we encountered a character that can't be in the
10789 * node, transfer is made directly to <loopdone>, and so we
10790 * wouldn't have fallen off the end of the loop.) In the latter
10791 * case, we artificially have to split the node into two, because
10792 * we just don't have enough space to hold everything. This
10793 * creates a problem if the final character participates in a
10794 * multi-character fold in the non-final position, as a match that
10795 * should have occurred won't, due to the way nodes are matched,
10796 * and our artificial boundary. So back off until we find a non-
10797 * problematic character -- one that isn't at the beginning or
10798 * middle of such a fold. (Either it doesn't participate in any
10799 * folds, or appears only in the final position of all the folds it
10800 * does participate in.) A better solution with far fewer false
10801 * positives, and that would fill the nodes more completely, would
10802 * be to actually have available all the multi-character folds to
10803 * test against, and to back-off only far enough to be sure that
10804 * this node isn't ending with a partial one. <upper_parse> is set
10805 * further below (if we need to reparse the node) to include just
10806 * up through that final non-problematic character that this code
10807 * identifies, so when it is set to less than the full node, we can
10808 * skip the rest of this */
10809 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10811 const STRLEN full_len = len;
10813 assert(len >= MAX_NODE_STRING_SIZE);
10815 /* Here, <s> points to the final byte of the final character.
10816 * Look backwards through the string until find a non-
10817 * problematic character */
10821 /* These two have no multi-char folds to non-UTF characters
10823 if (ASCII_FOLD_RESTRICTED || LOC) {
10827 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10831 if (! PL_NonL1NonFinalFold) {
10832 PL_NonL1NonFinalFold = _new_invlist_C_array(
10833 NonL1_Perl_Non_Final_Folds_invlist);
10836 /* Point to the first byte of the final character */
10837 s = (char *) utf8_hop((U8 *) s, -1);
10839 while (s >= s0) { /* Search backwards until find
10840 non-problematic char */
10841 if (UTF8_IS_INVARIANT(*s)) {
10843 /* There are no ascii characters that participate
10844 * in multi-char folds under /aa. In EBCDIC, the
10845 * non-ascii invariants are all control characters,
10846 * so don't ever participate in any folds. */
10847 if (ASCII_FOLD_RESTRICTED
10848 || ! IS_NON_FINAL_FOLD(*s))
10853 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10855 /* No Latin1 characters participate in multi-char
10856 * folds under /l */
10858 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10864 else if (! _invlist_contains_cp(
10865 PL_NonL1NonFinalFold,
10866 valid_utf8_to_uvchr((U8 *) s, NULL)))
10871 /* Here, the current character is problematic in that
10872 * it does occur in the non-final position of some
10873 * fold, so try the character before it, but have to
10874 * special case the very first byte in the string, so
10875 * we don't read outside the string */
10876 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10877 } /* End of loop backwards through the string */
10879 /* If there were only problematic characters in the string,
10880 * <s> will point to before s0, in which case the length
10881 * should be 0, otherwise include the length of the
10882 * non-problematic character just found */
10883 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10886 /* Here, have found the final character, if any, that is
10887 * non-problematic as far as ending the node without splitting
10888 * it across a potential multi-char fold. <len> contains the
10889 * number of bytes in the node up-to and including that
10890 * character, or is 0 if there is no such character, meaning
10891 * the whole node contains only problematic characters. In
10892 * this case, give up and just take the node as-is. We can't
10898 /* Here, the node does contain some characters that aren't
10899 * problematic. If one such is the final character in the
10900 * node, we are done */
10901 if (len == full_len) {
10904 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
10906 /* If the final character is problematic, but the
10907 * penultimate is not, back-off that last character to
10908 * later start a new node with it */
10913 /* Here, the final non-problematic character is earlier
10914 * in the input than the penultimate character. What we do
10915 * is reparse from the beginning, going up only as far as
10916 * this final ok one, thus guaranteeing that the node ends
10917 * in an acceptable character. The reason we reparse is
10918 * that we know how far in the character is, but we don't
10919 * know how to correlate its position with the input parse.
10920 * An alternate implementation would be to build that
10921 * correlation as we go along during the original parse,
10922 * but that would entail extra work for every node, whereas
10923 * this code gets executed only when the string is too
10924 * large for the node, and the final two characters are
10925 * problematic, an infrequent occurrence. Yet another
10926 * possible strategy would be to save the tail of the
10927 * string, and the next time regatom is called, initialize
10928 * with that. The problem with this is that unless you
10929 * back off one more character, you won't be guaranteed
10930 * regatom will get called again, unless regbranch,
10931 * regpiece ... are also changed. If you do back off that
10932 * extra character, so that there is input guaranteed to
10933 * force calling regatom, you can't handle the case where
10934 * just the first character in the node is acceptable. I
10935 * (khw) decided to try this method which doesn't have that
10936 * pitfall; if performance issues are found, we can do a
10937 * combination of the current approach plus that one */
10943 } /* End of verifying node ends with an appropriate char */
10945 loopdone: /* Jumped to when encounters something that shouldn't be in
10948 /* I (khw) don't know if you can get here with zero length, but the
10949 * old code handled this situation by creating a zero-length EXACT
10950 * node. Might as well be NOTHING instead */
10955 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
10958 RExC_parse = p - 1;
10959 Set_Node_Cur_Length(ret); /* MJD */
10960 nextchar(pRExC_state);
10962 /* len is STRLEN which is unsigned, need to copy to signed */
10965 vFAIL("Internal disaster");
10968 } /* End of label 'defchar:' */
10970 } /* End of giant switch on input character */
10976 S_regwhite( RExC_state_t *pRExC_state, char *p )
10978 const char *e = RExC_end;
10980 PERL_ARGS_ASSERT_REGWHITE;
10985 else if (*p == '#') {
10988 if (*p++ == '\n') {
10994 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11002 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11003 Character classes ([:foo:]) can also be negated ([:^foo:]).
11004 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11005 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11006 but trigger failures because they are currently unimplemented. */
11008 #define POSIXCC_DONE(c) ((c) == ':')
11009 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11010 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11013 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11016 I32 namedclass = OOB_NAMEDCLASS;
11018 PERL_ARGS_ASSERT_REGPPOSIXCC;
11020 if (value == '[' && RExC_parse + 1 < RExC_end &&
11021 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11022 POSIXCC(UCHARAT(RExC_parse))) {
11023 const char c = UCHARAT(RExC_parse);
11024 char* const s = RExC_parse++;
11026 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11028 if (RExC_parse == RExC_end)
11029 /* Grandfather lone [:, [=, [. */
11032 const char* const t = RExC_parse++; /* skip over the c */
11035 if (UCHARAT(RExC_parse) == ']') {
11036 const char *posixcc = s + 1;
11037 RExC_parse++; /* skip over the ending ] */
11040 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11041 const I32 skip = t - posixcc;
11043 /* Initially switch on the length of the name. */
11046 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11047 namedclass = ANYOF_ALNUM;
11050 /* Names all of length 5. */
11051 /* alnum alpha ascii blank cntrl digit graph lower
11052 print punct space upper */
11053 /* Offset 4 gives the best switch position. */
11054 switch (posixcc[4]) {
11056 if (memEQ(posixcc, "alph", 4)) /* alpha */
11057 namedclass = ANYOF_ALPHA;
11060 if (memEQ(posixcc, "spac", 4)) /* space */
11061 namedclass = ANYOF_PSXSPC;
11064 if (memEQ(posixcc, "grap", 4)) /* graph */
11065 namedclass = ANYOF_GRAPH;
11068 if (memEQ(posixcc, "asci", 4)) /* ascii */
11069 namedclass = ANYOF_ASCII;
11072 if (memEQ(posixcc, "blan", 4)) /* blank */
11073 namedclass = ANYOF_BLANK;
11076 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11077 namedclass = ANYOF_CNTRL;
11080 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11081 namedclass = ANYOF_ALNUMC;
11084 if (memEQ(posixcc, "lowe", 4)) /* lower */
11085 namedclass = ANYOF_LOWER;
11086 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11087 namedclass = ANYOF_UPPER;
11090 if (memEQ(posixcc, "digi", 4)) /* digit */
11091 namedclass = ANYOF_DIGIT;
11092 else if (memEQ(posixcc, "prin", 4)) /* print */
11093 namedclass = ANYOF_PRINT;
11094 else if (memEQ(posixcc, "punc", 4)) /* punct */
11095 namedclass = ANYOF_PUNCT;
11100 if (memEQ(posixcc, "xdigit", 6))
11101 namedclass = ANYOF_XDIGIT;
11105 if (namedclass == OOB_NAMEDCLASS)
11106 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11109 /* The #defines are structured so each complement is +1 to
11110 * the normal one */
11114 assert (posixcc[skip] == ':');
11115 assert (posixcc[skip+1] == ']');
11116 } else if (!SIZE_ONLY) {
11117 /* [[=foo=]] and [[.foo.]] are still future. */
11119 /* adjust RExC_parse so the warning shows after
11120 the class closes */
11121 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11123 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11126 /* Maternal grandfather:
11127 * "[:" ending in ":" but not in ":]" */
11137 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11141 PERL_ARGS_ASSERT_CHECKPOSIXCC;
11143 if (POSIXCC(UCHARAT(RExC_parse))) {
11144 const char *s = RExC_parse;
11145 const char c = *s++;
11147 while (isALNUM(*s))
11149 if (*s && c == *s && s[1] == ']') {
11151 "POSIX syntax [%c %c] belongs inside character classes",
11154 /* [[=foo=]] and [[.foo.]] are still future. */
11155 if (POSIXCC_NOTYET(c)) {
11156 /* adjust RExC_parse so the error shows after
11157 the class closes */
11158 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11160 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11166 /* Generate the code to add a full posix character <class> to the bracketed
11167 * character class given by <node>. (<node> is needed only under locale rules)
11168 * destlist is the inversion list for non-locale rules that this class is
11170 * sourcelist is the ASCII-range inversion list to add under /a rules
11171 * Xsourcelist is the full Unicode range list to use otherwise. */
11172 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11174 SV* scratch_list = NULL; \
11176 /* Set this class in the node for runtime matching */ \
11177 ANYOF_CLASS_SET(node, class); \
11179 /* For above Latin1 code points, we use the full Unicode range */ \
11180 _invlist_intersection(PL_AboveLatin1, \
11183 /* And set the output to it, adding instead if there already is an \
11184 * output. Checking if <destlist> is NULL first saves an extra \
11185 * clone. Its reference count will be decremented at the next \
11186 * union, etc, or if this is the only instance, at the end of the \
11188 if (! destlist) { \
11189 destlist = scratch_list; \
11192 _invlist_union(destlist, scratch_list, &destlist); \
11193 SvREFCNT_dec(scratch_list); \
11197 /* For non-locale, just add it to any existing list */ \
11198 _invlist_union(destlist, \
11199 (AT_LEAST_ASCII_RESTRICTED) \
11205 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11207 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11209 SV* scratch_list = NULL; \
11210 ANYOF_CLASS_SET(node, class); \
11211 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11212 if (! destlist) { \
11213 destlist = scratch_list; \
11216 _invlist_union(destlist, scratch_list, &destlist); \
11217 SvREFCNT_dec(scratch_list); \
11221 _invlist_union_complement_2nd(destlist, \
11222 (AT_LEAST_ASCII_RESTRICTED) \
11226 /* Under /d, everything in the upper half of the Latin1 range \
11227 * matches this complement */ \
11228 if (DEPENDS_SEMANTICS) { \
11229 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11233 /* Generate the code to add a posix character <class> to the bracketed
11234 * character class given by <node>. (<node> is needed only under locale rules)
11235 * destlist is the inversion list for non-locale rules that this class is
11237 * sourcelist is the ASCII-range inversion list to add under /a rules
11238 * l1_sourcelist is the Latin1 range list to use otherwise.
11239 * Xpropertyname is the name to add to <run_time_list> of the property to
11240 * specify the code points above Latin1 that will have to be
11241 * determined at run-time
11242 * run_time_list is a SV* that contains text names of properties that are to
11243 * be computed at run time. This concatenates <Xpropertyname>
11244 * to it, appropriately
11245 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11247 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11248 l1_sourcelist, Xpropertyname, run_time_list) \
11249 /* First, resolve whether to use the ASCII-only list or the L1 \
11251 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11252 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11253 Xpropertyname, run_time_list)
11255 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11256 Xpropertyname, run_time_list) \
11257 /* If not /a matching, there are going to be code points we will have \
11258 * to defer to runtime to look-up */ \
11259 if (! AT_LEAST_ASCII_RESTRICTED) { \
11260 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11263 ANYOF_CLASS_SET(node, class); \
11266 _invlist_union(destlist, sourcelist, &destlist); \
11269 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11270 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11272 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11273 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11274 if (AT_LEAST_ASCII_RESTRICTED) { \
11275 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11278 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11279 matches_above_unicode = TRUE; \
11281 ANYOF_CLASS_SET(node, namedclass); \
11284 SV* scratch_list = NULL; \
11285 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11286 if (! destlist) { \
11287 destlist = scratch_list; \
11290 _invlist_union(destlist, scratch_list, &destlist); \
11291 SvREFCNT_dec(scratch_list); \
11293 if (DEPENDS_SEMANTICS) { \
11294 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11300 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
11302 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
11303 * alternate list, pointed to by 'alternate_ptr'. This is an array of
11304 * the multi-character folds of characters in the node */
11307 PERL_ARGS_ASSERT_ADD_ALTERNATE;
11309 if (! *alternate_ptr) {
11310 *alternate_ptr = newAV();
11312 sv = newSVpvn_utf8((char*)string, len, TRUE);
11313 av_push(*alternate_ptr, sv);
11317 /* The names of properties whose definitions are not known at compile time are
11318 * stored in this SV, after a constant heading. So if the length has been
11319 * changed since initialization, then there is a run-time definition. */
11320 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11322 /* This converts the named class defined in regcomp.h to its equivalent class
11323 * number defined in handy.h. */
11324 #define namedclass_to_classnum(class) ((class) / 2)
11327 parse a class specification and produce either an ANYOF node that
11328 matches the pattern or perhaps will be optimized into an EXACTish node
11329 instead. The node contains a bit map for the first 256 characters, with the
11330 corresponding bit set if that character is in the list. For characters
11331 above 255, a range list is used */
11334 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11338 UV prevvalue = OOB_UNICODE;
11343 IV namedclass = OOB_NAMEDCLASS;
11344 char *rangebegin = NULL;
11345 bool need_class = 0;
11346 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
11348 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11349 than just initialized. */
11350 SV* properties = NULL; /* Code points that match \p{} \P{} */
11351 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11352 extended beyond the Latin1 range */
11353 UV element_count = 0; /* Number of distinct elements in the class.
11354 Optimizations may be possible if this is tiny */
11357 /* Unicode properties are stored in a swash; this holds the current one
11358 * being parsed. If this swash is the only above-latin1 component of the
11359 * character class, an optimization is to pass it directly on to the
11360 * execution engine. Otherwise, it is set to NULL to indicate that there
11361 * are other things in the class that have to be dealt with at execution
11363 SV* swash = NULL; /* Code points that match \p{} \P{} */
11365 /* Set if a component of this character class is user-defined; just passed
11366 * on to the engine */
11367 bool has_user_defined_property = FALSE;
11369 /* inversion list of code points this node matches only when the target
11370 * string is in UTF-8. (Because is under /d) */
11371 SV* depends_list = NULL;
11373 /* inversion list of code points this node matches. For much of the
11374 * function, it includes only those that match regardless of the utf8ness
11375 * of the target string */
11376 SV* cp_list = NULL;
11378 /* List of multi-character folds that are matched by this node */
11379 AV* unicode_alternate = NULL;
11381 /* In a range, counts how many 0-2 of the ends of it came from literals,
11382 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11383 UV literal_endpoint = 0;
11385 bool invert = FALSE; /* Is this class to be complemented */
11387 /* Is there any thing like \W or [:^digit:] that matches above the legal
11388 * Unicode range? */
11389 bool runtime_posix_matches_above_Unicode = FALSE;
11391 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11392 case we need to change the emitted regop to an EXACT. */
11393 const char * orig_parse = RExC_parse;
11394 const I32 orig_size = RExC_size;
11395 GET_RE_DEBUG_FLAGS_DECL;
11397 PERL_ARGS_ASSERT_REGCLASS;
11399 PERL_UNUSED_ARG(depth);
11402 DEBUG_PARSE("clas");
11404 /* Assume we are going to generate an ANYOF node. */
11405 ret = reganode(pRExC_state, ANYOF, 0);
11409 ANYOF_FLAGS(ret) = 0;
11412 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11417 /* We have decided to not allow multi-char folds in inverted character
11418 * classes, due to the confusion that can happen, especially with
11419 * classes that are designed for a non-Unicode world: You have the
11420 * peculiar case that:
11421 "s s" =~ /^[^\xDF]+$/i => Y
11422 "ss" =~ /^[^\xDF]+$/i => N
11424 * See [perl #89750] */
11425 allow_full_fold = FALSE;
11429 RExC_size += ANYOF_SKIP;
11430 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11433 RExC_emit += ANYOF_SKIP;
11435 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11437 listsv = newSVpvs("# comment\n");
11438 initial_listsv_len = SvCUR(listsv);
11441 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11443 if (!SIZE_ONLY && POSIXCC(nextvalue))
11444 checkposixcc(pRExC_state);
11446 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11447 if (UCHARAT(RExC_parse) == ']')
11448 goto charclassloop;
11451 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11455 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11458 rangebegin = RExC_parse;
11462 value = utf8n_to_uvchr((U8*)RExC_parse,
11463 RExC_end - RExC_parse,
11464 &numlen, UTF8_ALLOW_DEFAULT);
11465 RExC_parse += numlen;
11468 value = UCHARAT(RExC_parse++);
11470 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11471 if (value == '[' && POSIXCC(nextvalue))
11472 namedclass = regpposixcc(pRExC_state, value);
11473 else if (value == '\\') {
11475 value = utf8n_to_uvchr((U8*)RExC_parse,
11476 RExC_end - RExC_parse,
11477 &numlen, UTF8_ALLOW_DEFAULT);
11478 RExC_parse += numlen;
11481 value = UCHARAT(RExC_parse++);
11482 /* Some compilers cannot handle switching on 64-bit integer
11483 * values, therefore value cannot be an UV. Yes, this will
11484 * be a problem later if we want switch on Unicode.
11485 * A similar issue a little bit later when switching on
11486 * namedclass. --jhi */
11487 switch ((I32)value) {
11488 case 'w': namedclass = ANYOF_ALNUM; break;
11489 case 'W': namedclass = ANYOF_NALNUM; break;
11490 case 's': namedclass = ANYOF_SPACE; break;
11491 case 'S': namedclass = ANYOF_NSPACE; break;
11492 case 'd': namedclass = ANYOF_DIGIT; break;
11493 case 'D': namedclass = ANYOF_NDIGIT; break;
11494 case 'v': namedclass = ANYOF_VERTWS; break;
11495 case 'V': namedclass = ANYOF_NVERTWS; break;
11496 case 'h': namedclass = ANYOF_HORIZWS; break;
11497 case 'H': namedclass = ANYOF_NHORIZWS; break;
11498 case 'N': /* Handle \N{NAME} in class */
11500 /* We only pay attention to the first char of
11501 multichar strings being returned. I kinda wonder
11502 if this makes sense as it does change the behaviour
11503 from earlier versions, OTOH that behaviour was broken
11505 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11506 TRUE /* => charclass */))
11516 if (RExC_parse >= RExC_end)
11517 vFAIL2("Empty \\%c{}", (U8)value);
11518 if (*RExC_parse == '{') {
11519 const U8 c = (U8)value;
11520 e = strchr(RExC_parse++, '}');
11522 vFAIL2("Missing right brace on \\%c{}", c);
11523 while (isSPACE(UCHARAT(RExC_parse)))
11525 if (e == RExC_parse)
11526 vFAIL2("Empty \\%c{}", c);
11527 n = e - RExC_parse;
11528 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11539 if (UCHARAT(RExC_parse) == '^') {
11542 value = value == 'p' ? 'P' : 'p'; /* toggle */
11543 while (isSPACE(UCHARAT(RExC_parse))) {
11548 /* Try to get the definition of the property into
11549 * <invlist>. If /i is in effect, the effective property
11550 * will have its name be <__NAME_i>. The design is
11551 * discussed in commit
11552 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11553 Newx(name, n + sizeof("_i__\n"), char);
11555 sprintf(name, "%s%.*s%s\n",
11556 (FOLD) ? "__" : "",
11562 /* Look up the property name, and get its swash and
11563 * inversion list, if the property is found */
11565 SvREFCNT_dec(swash);
11567 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11570 TRUE, /* this routine will handle
11571 undefined properties */
11572 NULL, FALSE /* No inversion list */
11576 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11577 || ! (invlist = _get_swash_invlist(swash)))
11580 SvREFCNT_dec(swash);
11584 /* Here didn't find it. It could be a user-defined
11585 * property that will be available at run-time. Add it
11586 * to the list to look up then */
11587 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11588 (value == 'p' ? '+' : '!'),
11590 has_user_defined_property = TRUE;
11592 /* We don't know yet, so have to assume that the
11593 * property could match something in the Latin1 range,
11594 * hence something that isn't utf8. Note that this
11595 * would cause things in <depends_list> to match
11596 * inappropriately, except that any \p{}, including
11597 * this one forces Unicode semantics, which means there
11598 * is <no depends_list> */
11599 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11603 /* Here, did get the swash and its inversion list. If
11604 * the swash is from a user-defined property, then this
11605 * whole character class should be regarded as such */
11606 has_user_defined_property =
11607 _is_swash_user_defined(swash);
11609 /* Invert if asking for the complement */
11610 if (value == 'P') {
11611 _invlist_union_complement_2nd(properties,
11615 /* The swash can't be used as-is, because we've
11616 * inverted things; delay removing it to here after
11617 * have copied its invlist above */
11618 SvREFCNT_dec(swash);
11622 _invlist_union(properties, invlist, &properties);
11627 RExC_parse = e + 1;
11628 namedclass = ANYOF_MAX; /* no official name, but it's named */
11630 /* \p means they want Unicode semantics */
11631 RExC_uni_semantics = 1;
11634 case 'n': value = '\n'; break;
11635 case 'r': value = '\r'; break;
11636 case 't': value = '\t'; break;
11637 case 'f': value = '\f'; break;
11638 case 'b': value = '\b'; break;
11639 case 'e': value = ASCII_TO_NATIVE('\033');break;
11640 case 'a': value = ASCII_TO_NATIVE('\007');break;
11642 RExC_parse--; /* function expects to be pointed at the 'o' */
11644 const char* error_msg;
11645 bool valid = grok_bslash_o(RExC_parse,
11650 RExC_parse += numlen;
11655 if (PL_encoding && value < 0x100) {
11656 goto recode_encoding;
11660 RExC_parse--; /* function expects to be pointed at the 'x' */
11662 const char* error_msg;
11663 bool valid = grok_bslash_x(RExC_parse,
11668 RExC_parse += numlen;
11673 if (PL_encoding && value < 0x100)
11674 goto recode_encoding;
11677 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11679 case '0': case '1': case '2': case '3': case '4':
11680 case '5': case '6': case '7':
11682 /* Take 1-3 octal digits */
11683 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11685 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11686 RExC_parse += numlen;
11687 if (PL_encoding && value < 0x100)
11688 goto recode_encoding;
11692 if (! RExC_override_recoding) {
11693 SV* enc = PL_encoding;
11694 value = reg_recode((const char)(U8)value, &enc);
11695 if (!enc && SIZE_ONLY)
11696 ckWARNreg(RExC_parse,
11697 "Invalid escape in the specified encoding");
11701 /* Allow \_ to not give an error */
11702 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11703 ckWARN2reg(RExC_parse,
11704 "Unrecognized escape \\%c in character class passed through",
11709 } /* end of \blah */
11712 literal_endpoint++;
11715 /* What matches in a locale is not known until runtime. This
11716 * includes what the Posix classes (like \w, [:space:]) match.
11717 * Room must be reserved (one time per class) to store such
11718 * classes, either if Perl is compiled so that locale nodes always
11719 * should have this space, or if there is such class info to be
11720 * stored. The space will contain a bit for each named class that
11721 * is to be matched against. This isn't needed for \p{} and
11722 * pseudo-classes, as they are not affected by locale, and hence
11723 * are dealt with separately */
11726 && (ANYOF_LOCALE == ANYOF_CLASS
11727 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11731 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11734 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11735 ANYOF_CLASS_ZERO(ret);
11737 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11740 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11742 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11743 * literal, as is the character that began the false range, i.e.
11744 * the 'a' in the examples */
11748 RExC_parse >= rangebegin ?
11749 RExC_parse - rangebegin : 0;
11750 ckWARN4reg(RExC_parse,
11751 "False [] range \"%*.*s\"",
11753 cp_list = add_cp_to_invlist(cp_list, '-');
11754 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11757 range = 0; /* this was not a true range */
11758 element_count += 2; /* So counts for three values */
11762 switch ((I32)namedclass) {
11764 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11765 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11766 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11768 case ANYOF_NALNUMC:
11769 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11770 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11771 runtime_posix_matches_above_Unicode);
11774 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11775 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11778 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11779 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11780 runtime_posix_matches_above_Unicode);
11784 ANYOF_CLASS_SET(ret, namedclass);
11787 _invlist_union(posixes, PL_ASCII, &posixes);
11792 ANYOF_CLASS_SET(ret, namedclass);
11795 _invlist_union_complement_2nd(posixes,
11796 PL_ASCII, &posixes);
11797 if (DEPENDS_SEMANTICS) {
11798 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11803 DO_POSIX(ret, namedclass, posixes,
11804 PL_PosixBlank, PL_XPosixBlank);
11807 DO_N_POSIX(ret, namedclass, posixes,
11808 PL_PosixBlank, PL_XPosixBlank);
11811 DO_POSIX(ret, namedclass, posixes,
11812 PL_PosixCntrl, PL_XPosixCntrl);
11815 DO_N_POSIX(ret, namedclass, posixes,
11816 PL_PosixCntrl, PL_XPosixCntrl);
11819 /* There are no digits in the Latin1 range outside of
11820 * ASCII, so call the macro that doesn't have to resolve
11822 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11823 PL_PosixDigit, "XPosixDigit", listsv);
11826 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11827 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11828 runtime_posix_matches_above_Unicode);
11831 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11832 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11835 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11836 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11837 runtime_posix_matches_above_Unicode);
11839 case ANYOF_HORIZWS:
11840 /* For these, we use the cp_list, as /d doesn't make a
11841 * difference in what these match. There would be problems
11842 * if these characters had folds other than themselves, as
11843 * cp_list is subject to folding. It turns out that \h
11844 * is just a synonym for XPosixBlank */
11845 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11847 case ANYOF_NHORIZWS:
11848 _invlist_union_complement_2nd(cp_list,
11849 PL_XPosixBlank, &cp_list);
11853 { /* These require special handling, as they differ under
11854 folding, matching Cased there (which in the ASCII range
11855 is the same as Alpha */
11861 if (FOLD && ! LOC) {
11862 ascii_source = PL_PosixAlpha;
11863 l1_source = PL_L1Cased;
11867 ascii_source = PL_PosixLower;
11868 l1_source = PL_L1PosixLower;
11869 Xname = "XPosixLower";
11871 if (namedclass == ANYOF_LOWER) {
11872 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11873 ascii_source, l1_source, Xname, listsv);
11876 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11877 posixes, ascii_source, l1_source, Xname, listsv,
11878 runtime_posix_matches_above_Unicode);
11883 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11884 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11887 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11888 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
11889 runtime_posix_matches_above_Unicode);
11892 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11893 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11896 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11897 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
11898 runtime_posix_matches_above_Unicode);
11901 DO_POSIX(ret, namedclass, posixes,
11902 PL_PosixSpace, PL_XPosixSpace);
11904 case ANYOF_NPSXSPC:
11905 DO_N_POSIX(ret, namedclass, posixes,
11906 PL_PosixSpace, PL_XPosixSpace);
11909 DO_POSIX(ret, namedclass, posixes,
11910 PL_PerlSpace, PL_XPerlSpace);
11913 DO_N_POSIX(ret, namedclass, posixes,
11914 PL_PerlSpace, PL_XPerlSpace);
11916 case ANYOF_UPPER: /* Same as LOWER, above */
11923 if (FOLD && ! LOC) {
11924 ascii_source = PL_PosixAlpha;
11925 l1_source = PL_L1Cased;
11929 ascii_source = PL_PosixUpper;
11930 l1_source = PL_L1PosixUpper;
11931 Xname = "XPosixUpper";
11933 if (namedclass == ANYOF_UPPER) {
11934 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11935 ascii_source, l1_source, Xname, listsv);
11938 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11939 posixes, ascii_source, l1_source, Xname, listsv,
11940 runtime_posix_matches_above_Unicode);
11944 case ANYOF_ALNUM: /* Really is 'Word' */
11945 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11946 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11949 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11950 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
11951 runtime_posix_matches_above_Unicode);
11954 /* For these, we use the cp_list, as /d doesn't make a
11955 * difference in what these match. There would be problems
11956 * if these characters had folds other than themselves, as
11957 * cp_list is subject to folding */
11958 _invlist_union(cp_list, PL_VertSpace, &cp_list);
11960 case ANYOF_NVERTWS:
11961 _invlist_union_complement_2nd(cp_list,
11962 PL_VertSpace, &cp_list);
11965 DO_POSIX(ret, namedclass, posixes,
11966 PL_PosixXDigit, PL_XPosixXDigit);
11968 case ANYOF_NXDIGIT:
11969 DO_N_POSIX(ret, namedclass, posixes,
11970 PL_PosixXDigit, PL_XPosixXDigit);
11973 /* this is to handle \p and \P */
11976 vFAIL("Invalid [::] class");
11980 continue; /* Go get next character */
11982 } /* end of namedclass \blah */
11985 if (prevvalue > value) /* b-a */ {
11986 const int w = RExC_parse - rangebegin;
11987 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11988 range = 0; /* not a valid range */
11992 prevvalue = value; /* save the beginning of the potential range */
11993 if (RExC_parse+1 < RExC_end
11994 && *RExC_parse == '-'
11995 && RExC_parse[1] != ']')
11999 /* a bad range like \w-, [:word:]- ? */
12000 if (namedclass > OOB_NAMEDCLASS) {
12001 if (ckWARN(WARN_REGEXP)) {
12003 RExC_parse >= rangebegin ?
12004 RExC_parse - rangebegin : 0;
12006 "False [] range \"%*.*s\"",
12010 cp_list = add_cp_to_invlist(cp_list, '-');
12014 range = 1; /* yeah, it's a range! */
12015 continue; /* but do it the next time */
12019 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12022 /* non-Latin1 code point implies unicode semantics. Must be set in
12023 * pass1 so is there for the whole of pass 2 */
12025 RExC_uni_semantics = 1;
12028 /* Ready to process either the single value, or the completed range */
12031 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12033 UV* this_range = _new_invlist(1);
12034 _append_range_to_invlist(this_range, prevvalue, value);
12036 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12037 * If this range was specified using something like 'i-j', we want
12038 * to include only the 'i' and the 'j', and not anything in
12039 * between, so exclude non-ASCII, non-alphabetics from it.
12040 * However, if the range was specified with something like
12041 * [\x89-\x91] or [\x89-j], all code points within it should be
12042 * included. literal_endpoint==2 means both ends of the range used
12043 * a literal character, not \x{foo} */
12044 if (literal_endpoint == 2
12045 && (prevvalue >= 'a' && value <= 'z')
12046 || (prevvalue >= 'A' && value <= 'Z'))
12048 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12049 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12051 _invlist_union(cp_list, this_range, &cp_list);
12052 literal_endpoint = 0;
12056 range = 0; /* this range (if it was one) is done now */
12057 } /* End of loop through all the text within the brackets */
12059 /* If the character class contains only a single element, it may be
12060 * optimizable into another node type which is smaller and runs faster.
12061 * Check if this is the case for this class */
12062 if (element_count == 1) {
12066 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12067 [:digit:] or \p{foo} */
12069 /* Certain named classes have equivalents that can appear outside a
12070 * character class, e.g. \w, \H. We use these instead of a
12071 * character class. */
12072 switch ((I32)namedclass) {
12075 /* The first group is for node types that depend on the charset
12076 * modifier to the regex. We first calculate the base node
12077 * type, and if it should be inverted */
12084 goto join_charset_classes;
12091 goto join_charset_classes;
12099 join_charset_classes:
12101 /* Now that we have the base node type, we take advantage
12102 * of the enum ordering of the charset modifiers to get the
12103 * exact node type, For example the base SPACE also has
12104 * SPACEL, SPACEU, and SPACEA */
12106 offset = get_regex_charset(RExC_flags);
12108 /* /aa is the same as /a for these */
12109 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12110 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12112 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12113 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12118 /* The number of varieties of each of these is the same,
12119 * hence, so is the delta between the normal and
12120 * complemented nodes */
12122 op += NALNUM - ALNUM;
12124 *flagp |= HASWIDTH|SIMPLE;
12127 /* The second group doesn't depend of the charset modifiers.
12128 * We just have normal and complemented */
12129 case ANYOF_NHORIZWS:
12132 case ANYOF_HORIZWS:
12134 op = (invert) ? NHORIZWS : HORIZWS;
12135 *flagp |= HASWIDTH|SIMPLE;
12138 case ANYOF_NVERTWS:
12142 op = (invert) ? NVERTWS : VERTWS;
12143 *flagp |= HASWIDTH|SIMPLE;
12153 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12158 /* A generic posix class. All the /a ones can be handled
12159 * by the POSIXA opcode. And all are closed under folding
12160 * in the ASCII range, so FOLD doesn't matter */
12161 if (AT_LEAST_ASCII_RESTRICTED
12162 || (! LOC && namedclass == ANYOF_ASCII))
12164 /* The odd numbered ones are the complements of the
12165 * next-lower even number one */
12166 if (namedclass % 2 == 1) {
12170 arg = namedclass_to_classnum(namedclass);
12171 op = (invert) ? NPOSIXA : POSIXA;
12176 else if (value == prevvalue) {
12178 /* Here, the class consists of just a single code point */
12181 if (! LOC && value == '\n') {
12182 op = REG_ANY; /* Optimize [^\n] */
12183 *flagp |= HASWIDTH|SIMPLE;
12187 else if (value < 256 || UTF) {
12189 /* Optimize a single value into an EXACTish node, but not if it
12190 * would require converting the pattern to UTF-8. */
12191 op = compute_EXACTish(pRExC_state);
12193 } /* Otherwise is a range */
12194 else if (! LOC) { /* locale could vary these */
12195 if (prevvalue == '0') {
12196 if (value == '9') {
12197 op = (invert) ? NDIGITA : DIGITA;
12198 *flagp |= HASWIDTH|SIMPLE;
12203 /* Here, we have changed <op> away from its initial value iff we found
12204 * an optimization */
12207 /* Throw away this ANYOF regnode, and emit the calculated one,
12208 * which should correspond to the beginning, not current, state of
12210 const char * cur_parse = RExC_parse;
12211 RExC_parse = (char *)orig_parse;
12215 /* To get locale nodes to not use the full ANYOF size would
12216 * require moving the code above that writes the portions
12217 * of it that aren't in other nodes to after this point.
12218 * e.g. ANYOF_CLASS_SET */
12219 RExC_size = orig_size;
12223 RExC_emit = (regnode *)orig_emit;
12226 ret = reg_node(pRExC_state, op);
12228 if (PL_regkind[op] == POSIXD) {
12232 *flagp |= HASWIDTH|SIMPLE;
12234 else if (PL_regkind[op] == EXACT) {
12235 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12238 RExC_parse = (char *) cur_parse;
12240 SvREFCNT_dec(listsv);
12247 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12249 /* If folding, we calculate all characters that could fold to or from the
12250 * ones already on the list */
12251 if (FOLD && cp_list) {
12252 UV start, end; /* End points of code point ranges */
12254 SV* fold_intersection = NULL;
12256 /* In the Latin1 range, the characters that can be folded-to or -from
12257 * are precisely the alphabetic characters. If the highest code point
12258 * is within Latin1, we can use the compiled-in list, and not have to
12259 * go out to disk. */
12260 if (invlist_highest(cp_list) < 256) {
12261 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12265 /* Here, there are non-Latin1 code points, so we will have to go
12266 * fetch the list of all the characters that participate in folds
12268 if (! PL_utf8_foldable) {
12269 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12270 &PL_sv_undef, 1, 0);
12271 PL_utf8_foldable = _get_swash_invlist(swash);
12272 SvREFCNT_dec(swash);
12275 /* This is a hash that for a particular fold gives all characters
12276 * that are involved in it */
12277 if (! PL_utf8_foldclosures) {
12279 /* If we were unable to find any folds, then we likely won't be
12280 * able to find the closures. So just create an empty list.
12281 * Folding will effectively be restricted to the non-Unicode
12282 * rules hard-coded into Perl. (This case happens legitimately
12283 * during compilation of Perl itself before the Unicode tables
12284 * are generated) */
12285 if (invlist_len(PL_utf8_foldable) == 0) {
12286 PL_utf8_foldclosures = newHV();
12289 /* If the folds haven't been read in, call a fold function
12291 if (! PL_utf8_tofold) {
12292 U8 dummy[UTF8_MAXBYTES+1];
12295 /* This particular string is above \xff in both UTF-8
12297 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
12298 assert(PL_utf8_tofold); /* Verify that worked */
12300 PL_utf8_foldclosures =
12301 _swash_inversion_hash(PL_utf8_tofold);
12305 /* Only the characters in this class that participate in folds need
12306 * be checked. Get the intersection of this class and all the
12307 * possible characters that are foldable. This can quickly narrow
12308 * down a large class */
12309 _invlist_intersection(PL_utf8_foldable, cp_list,
12310 &fold_intersection);
12313 /* Now look at the foldable characters in this class individually */
12314 invlist_iterinit(fold_intersection);
12315 while (invlist_iternext(fold_intersection, &start, &end)) {
12318 /* Locale folding for Latin1 characters is deferred until runtime */
12319 if (LOC && start < 256) {
12323 /* Look at every character in the range */
12324 for (j = start; j <= end; j++) {
12326 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12332 /* We have the latin1 folding rules hard-coded here so that
12333 * an innocent-looking character class, like /[ks]/i won't
12334 * have to go out to disk to find the possible matches.
12335 * XXX It would be better to generate these via regen, in
12336 * case a new version of the Unicode standard adds new
12337 * mappings, though that is not really likely, and may be
12338 * caught by the default: case of the switch below. */
12340 if (PL_fold_latin1[j] != j) {
12342 /* ASCII is always matched; non-ASCII is matched only
12343 * under Unicode rules */
12344 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12346 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12350 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12354 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12355 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12357 /* Certain Latin1 characters have matches outside
12358 * Latin1, or are multi-character. To get here, 'j' is
12359 * one of those characters. None of these matches is
12360 * valid for ASCII characters under /aa, which is why
12361 * the 'if' just above excludes those. The matches
12362 * fall into three categories:
12363 * 1) They are singly folded-to or -from an above 255
12364 * character, e.g., LATIN SMALL LETTER Y WITH
12365 * DIAERESIS and LATIN CAPITAL LETTER Y WITH
12367 * 2) They are part of a multi-char fold with another
12368 * latin1 character; only LATIN SMALL LETTER
12369 * SHARP S => "ss" fits this;
12370 * 3) They are part of a multi-char fold with a
12371 * character outside of Latin1, such as various
12373 * We aren't dealing fully with multi-char folds, except
12374 * we do deal with the pattern containing a character
12375 * that has a multi-char fold (not so much the inverse).
12376 * For types 1) and 3), the matches only happen when the
12377 * target string is utf8; that's not true for 2), and we
12378 * set a flag for it.
12380 * The code below adds the single fold closures for 'j'
12381 * to the inversion list. */
12386 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12390 cp_list = add_cp_to_invlist(cp_list,
12391 LATIN_SMALL_LETTER_LONG_S);
12394 cp_list = add_cp_to_invlist(cp_list,
12395 GREEK_CAPITAL_LETTER_MU);
12396 cp_list = add_cp_to_invlist(cp_list,
12397 GREEK_SMALL_LETTER_MU);
12399 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12400 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12402 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12404 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12405 cp_list = add_cp_to_invlist(cp_list,
12406 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12408 case LATIN_SMALL_LETTER_SHARP_S:
12409 cp_list = add_cp_to_invlist(cp_list,
12410 LATIN_CAPITAL_LETTER_SHARP_S);
12412 /* Under /a, /d, and /u, this can match the two
12414 if (! ASCII_FOLD_RESTRICTED) {
12415 add_alternate(&unicode_alternate,
12418 /* And under /u or /a, it can match even if
12419 * the target is not utf8 */
12420 if (AT_LEAST_UNI_SEMANTICS) {
12421 ANYOF_FLAGS(ret) |=
12422 ANYOF_NONBITMAP_NON_UTF8;
12426 case 'F': case 'f':
12427 case 'I': case 'i':
12428 case 'L': case 'l':
12429 case 'T': case 't':
12430 case 'A': case 'a':
12431 case 'H': case 'h':
12432 case 'J': case 'j':
12433 case 'N': case 'n':
12434 case 'W': case 'w':
12435 case 'Y': case 'y':
12436 /* These all are targets of multi-character
12437 * folds from code points that require UTF8 to
12438 * express, so they can't match unless the
12439 * target string is in UTF-8, so no action here
12440 * is necessary, as regexec.c properly handles
12441 * the general case for UTF-8 matching */
12444 /* Use deprecated warning to increase the
12445 * chances of this being output */
12446 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12453 /* Here is an above Latin1 character. We don't have the rules
12454 * hard-coded for it. First, get its fold */
12455 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12456 ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12458 ? FOLD_FLAGS_LOCALE
12459 : (ASCII_FOLD_RESTRICTED)
12460 ? FOLD_FLAGS_NOMIX_ASCII
12463 if (foldlen > (STRLEN)UNISKIP(f)) {
12465 /* Any multicharacter foldings (disallowed in lookbehind
12466 * patterns) require the following transform: [ABCDEF] ->
12467 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12468 * folds into "rst", all other characters fold to single
12469 * characters. We save away these multicharacter foldings,
12470 * to be later saved as part of the additional "s" data. */
12471 if (! RExC_in_lookbehind) {
12473 U8* e = foldbuf + foldlen;
12475 /* If any of the folded characters of this are in the
12476 * Latin1 range, tell the regex engine that this can
12477 * match a non-utf8 target string. */
12479 if (UTF8_IS_INVARIANT(*loc)
12480 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12483 |= ANYOF_NONBITMAP_NON_UTF8;
12486 loc += UTF8SKIP(loc);
12489 add_alternate(&unicode_alternate, foldbuf, foldlen);
12493 /* Single character fold of above Latin1. Add everything
12494 * in its fold closure to the list that this node should
12498 /* The fold closures data structure is a hash with the keys
12499 * being every character that is folded to, like 'k', and
12500 * the values each an array of everything that folds to its
12501 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
12502 if ((listp = hv_fetch(PL_utf8_foldclosures,
12503 (char *) foldbuf, foldlen, FALSE)))
12505 AV* list = (AV*) *listp;
12507 for (k = 0; k <= av_len(list); k++) {
12508 SV** c_p = av_fetch(list, k, FALSE);
12511 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12515 /* /aa doesn't allow folds between ASCII and non-;
12516 * /l doesn't allow them between above and below
12518 if ((ASCII_FOLD_RESTRICTED
12519 && (isASCII(c) != isASCII(j)))
12520 || (LOC && ((c < 256) != (j < 256))))
12525 /* Folds involving non-ascii Latin1 characters
12526 * under /d are added to a separate list */
12527 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12529 cp_list = add_cp_to_invlist(cp_list, c);
12532 depends_list = add_cp_to_invlist(depends_list, c);
12539 SvREFCNT_dec(fold_intersection);
12542 /* And combine the result (if any) with any inversion list from posix
12543 * classes. The lists are kept separate up to now because we don't want to
12544 * fold the classes (folding of those is automatically handled by the swash
12545 * fetching code) */
12547 if (! DEPENDS_SEMANTICS) {
12549 _invlist_union(cp_list, posixes, &cp_list);
12550 SvREFCNT_dec(posixes);
12557 /* Under /d, we put into a separate list the Latin1 things that
12558 * match only when the target string is utf8 */
12559 SV* nonascii_but_latin1_properties = NULL;
12560 _invlist_intersection(posixes, PL_Latin1,
12561 &nonascii_but_latin1_properties);
12562 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12563 &nonascii_but_latin1_properties);
12564 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12567 _invlist_union(cp_list, posixes, &cp_list);
12568 SvREFCNT_dec(posixes);
12574 if (depends_list) {
12575 _invlist_union(depends_list, nonascii_but_latin1_properties,
12577 SvREFCNT_dec(nonascii_but_latin1_properties);
12580 depends_list = nonascii_but_latin1_properties;
12585 /* And combine the result (if any) with any inversion list from properties.
12586 * The lists are kept separate up to now so that we can distinguish the two
12587 * in regards to matching above-Unicode. A run-time warning is generated
12588 * if a Unicode property is matched against a non-Unicode code point. But,
12589 * we allow user-defined properties to match anything, without any warning,
12590 * and we also suppress the warning if there is a portion of the character
12591 * class that isn't a Unicode property, and which matches above Unicode, \W
12592 * or [\x{110000}] for example.
12593 * (Note that in this case, unlike the Posix one above, there is no
12594 * <depends_list>, because having a Unicode property forces Unicode
12597 bool warn_super = ! has_user_defined_property;
12600 /* If it matters to the final outcome, see if a non-property
12601 * component of the class matches above Unicode. If so, the
12602 * warning gets suppressed. This is true even if just a single
12603 * such code point is specified, as though not strictly correct if
12604 * another such code point is matched against, the fact that they
12605 * are using above-Unicode code points indicates they should know
12606 * the issues involved */
12608 bool non_prop_matches_above_Unicode =
12609 runtime_posix_matches_above_Unicode
12610 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12612 non_prop_matches_above_Unicode =
12613 ! non_prop_matches_above_Unicode;
12615 warn_super = ! non_prop_matches_above_Unicode;
12618 _invlist_union(properties, cp_list, &cp_list);
12619 SvREFCNT_dec(properties);
12622 cp_list = properties;
12626 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12630 /* Here, we have calculated what code points should be in the character
12633 * Now we can see about various optimizations. Fold calculation (which we
12634 * did above) needs to take place before inversion. Otherwise /[^k]/i
12635 * would invert to include K, which under /i would match k, which it
12636 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12637 * folded until runtime */
12639 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12640 * at compile time. Besides not inverting folded locale now, we can't invert
12641 * if there are things such as \w, which aren't known until runtime */
12643 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12645 && ! unicode_alternate
12646 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12648 _invlist_invert(cp_list);
12650 /* Any swash can't be used as-is, because we've inverted things */
12652 SvREFCNT_dec(swash);
12656 /* Clear the invert flag since have just done it here */
12660 /* If we didn't do folding, it's because some information isn't available
12661 * until runtime; set the run-time fold flag for these. (We don't have to
12662 * worry about properties folding, as that is taken care of by the swash
12664 if (FOLD && (LOC || unicode_alternate))
12666 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12669 /* Some character classes are equivalent to other nodes. Such nodes take
12670 * up less room and generally fewer operations to execute than ANYOF nodes.
12671 * Above, we checked for and optimized into some such equivalents for
12672 * certain common classes that are easy to test. Getting to this point in
12673 * the code means that the class didn't get optimized there. Since this
12674 * code is only executed in Pass 2, it is too late to save space--it has
12675 * been allocated in Pass 1, and currently isn't given back. But turning
12676 * things into an EXACTish node can allow the optimizer to join it to any
12677 * adjacent such nodes. And if the class is equivalent to things like /./,
12678 * expensive run-time swashes can be avoided. Now that we have more
12679 * complete information, we can find things necessarily missed by the
12680 * earlier code. I (khw) am not sure how much to look for here. It would
12681 * be easy, but perhaps too slow, to check any candidates against all the
12682 * node types they could possibly match using _invlistEQ(). */
12685 && ! unicode_alternate
12688 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12689 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12692 U8 op = END; /* The optimzation node-type */
12693 const char * cur_parse= RExC_parse;
12695 invlist_iterinit(cp_list);
12696 if (! invlist_iternext(cp_list, &start, &end)) {
12698 /* Here, the list is empty. This happens, for example, when a
12699 * Unicode property is the only thing in the character class, and
12700 * it doesn't match anything. (perluniprops.pod notes such
12703 *flagp |= HASWIDTH|SIMPLE;
12705 else if (start == end) { /* The range is a single code point */
12706 if (! invlist_iternext(cp_list, &start, &end)
12708 /* Don't do this optimization if it would require changing
12709 * the pattern to UTF-8 */
12710 && (start < 256 || UTF))
12712 /* Here, the list contains a single code point. Can optimize
12713 * into an EXACT node */
12722 /* A locale node under folding with one code point can be
12723 * an EXACTFL, as its fold won't be calculated until
12729 /* Here, we are generally folding, but there is only one
12730 * code point to match. If we have to, we use an EXACT
12731 * node, but it would be better for joining with adjacent
12732 * nodes in the optimization pass if we used the same
12733 * EXACTFish node that any such are likely to be. We can
12734 * do this iff the code point doesn't participate in any
12735 * folds. For example, an EXACTF of a colon is the same as
12736 * an EXACT one, since nothing folds to or from a colon.
12737 * In the Latin1 range, being an alpha means that the
12738 * character participates in a fold (except for the
12739 * feminine and masculine ordinals, which I (khw) don't
12740 * think are worrying about optimizing for). */
12742 if (isALPHA_L1(value)) {
12747 if (! PL_utf8_foldable) {
12748 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12749 &PL_sv_undef, 1, 0);
12750 PL_utf8_foldable = _get_swash_invlist(swash);
12751 SvREFCNT_dec(swash);
12753 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
12758 /* If we haven't found the node type, above, it means we
12759 * can use the prevailing one */
12761 op = compute_EXACTish(pRExC_state);
12766 else if (start == 0) {
12767 if (end == UV_MAX) {
12769 *flagp |= HASWIDTH|SIMPLE;
12772 else if (end == '\n' - 1
12773 && invlist_iternext(cp_list, &start, &end)
12774 && start == '\n' + 1 && end == UV_MAX)
12777 *flagp |= HASWIDTH|SIMPLE;
12783 RExC_parse = (char *)orig_parse;
12784 RExC_emit = (regnode *)orig_emit;
12786 ret = reg_node(pRExC_state, op);
12788 RExC_parse = (char *)cur_parse;
12790 if (PL_regkind[op] == EXACT) {
12791 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12794 SvREFCNT_dec(listsv);
12799 /* Here, <cp_list> contains all the code points we can determine at
12800 * compile time that match under all conditions. Go through it, and
12801 * for things that belong in the bitmap, put them there, and delete from
12802 * <cp_list>. While we are at it, see if everything above 255 is in the
12803 * list, and if so, set a flag to speed up execution */
12804 ANYOF_BITMAP_ZERO(ret);
12807 /* This gets set if we actually need to modify things */
12808 bool change_invlist = FALSE;
12812 /* Start looking through <cp_list> */
12813 invlist_iterinit(cp_list);
12814 while (invlist_iternext(cp_list, &start, &end)) {
12818 if (end == UV_MAX && start <= 256) {
12819 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12822 /* Quit if are above what we should change */
12827 change_invlist = TRUE;
12829 /* Set all the bits in the range, up to the max that we are doing */
12830 high = (end < 255) ? end : 255;
12831 for (i = start; i <= (int) high; i++) {
12832 if (! ANYOF_BITMAP_TEST(ret, i)) {
12833 ANYOF_BITMAP_SET(ret, i);
12840 /* Done with loop; remove any code points that are in the bitmap from
12842 if (change_invlist) {
12843 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12846 /* If have completely emptied it, remove it completely */
12847 if (invlist_len(cp_list) == 0) {
12848 SvREFCNT_dec(cp_list);
12854 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12857 /* Here, the bitmap has been populated with all the Latin1 code points that
12858 * always match. Can now add to the overall list those that match only
12859 * when the target string is UTF-8 (<depends_list>). */
12860 if (depends_list) {
12862 _invlist_union(cp_list, depends_list, &cp_list);
12863 SvREFCNT_dec(depends_list);
12866 cp_list = depends_list;
12870 /* If there is a swash and more than one element, we can't use the swash in
12871 * the optimization below. */
12872 if (swash && element_count > 1) {
12873 SvREFCNT_dec(swash);
12878 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12879 && ! unicode_alternate)
12881 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12882 SvREFCNT_dec(listsv);
12883 SvREFCNT_dec(unicode_alternate);
12886 /* av[0] stores the character class description in its textual form:
12887 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12888 * appropriate swash, and is also useful for dumping the regnode.
12889 * av[1] if NULL, is a placeholder to later contain the swash computed
12890 * from av[0]. But if no further computation need be done, the
12891 * swash is stored there now.
12892 * av[2] stores the multicharacter foldings, used later in
12893 * regexec.c:S_reginclass().
12894 * av[3] stores the cp_list inversion list for use in addition or
12895 * instead of av[0]; used only if av[1] is NULL
12896 * av[4] is set if any component of the class is from a user-defined
12897 * property; used only if av[1] is NULL */
12898 AV * const av = newAV();
12901 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12905 av_store(av, 1, swash);
12906 SvREFCNT_dec(cp_list);
12909 av_store(av, 1, NULL);
12911 av_store(av, 3, cp_list);
12912 av_store(av, 4, newSVuv(has_user_defined_property));
12916 /* Store any computed multi-char folds only if we are allowing
12918 if (allow_full_fold) {
12919 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12920 if (unicode_alternate) { /* This node is variable length */
12925 av_store(av, 2, NULL);
12927 rv = newRV_noinc(MUTABLE_SV(av));
12928 n = add_data(pRExC_state, 1, "s");
12929 RExC_rxi->data->data[n] = (void*)rv;
12933 *flagp |= HASWIDTH|SIMPLE;
12936 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12939 /* reg_skipcomment()
12941 Absorbs an /x style # comments from the input stream.
12942 Returns true if there is more text remaining in the stream.
12943 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12944 terminates the pattern without including a newline.
12946 Note its the callers responsibility to ensure that we are
12947 actually in /x mode
12952 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12956 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12958 while (RExC_parse < RExC_end)
12959 if (*RExC_parse++ == '\n') {
12964 /* we ran off the end of the pattern without ending
12965 the comment, so we have to add an \n when wrapping */
12966 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12974 Advances the parse position, and optionally absorbs
12975 "whitespace" from the inputstream.
12977 Without /x "whitespace" means (?#...) style comments only,
12978 with /x this means (?#...) and # comments and whitespace proper.
12980 Returns the RExC_parse point from BEFORE the scan occurs.
12982 This is the /x friendly way of saying RExC_parse++.
12986 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12988 char* const retval = RExC_parse++;
12990 PERL_ARGS_ASSERT_NEXTCHAR;
12993 if (RExC_end - RExC_parse >= 3
12994 && *RExC_parse == '('
12995 && RExC_parse[1] == '?'
12996 && RExC_parse[2] == '#')
12998 while (*RExC_parse != ')') {
12999 if (RExC_parse == RExC_end)
13000 FAIL("Sequence (?#... not terminated");
13006 if (RExC_flags & RXf_PMf_EXTENDED) {
13007 if (isSPACE(*RExC_parse)) {
13011 else if (*RExC_parse == '#') {
13012 if ( reg_skipcomment( pRExC_state ) )
13021 - reg_node - emit a node
13023 STATIC regnode * /* Location. */
13024 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13028 regnode * const ret = RExC_emit;
13029 GET_RE_DEBUG_FLAGS_DECL;
13031 PERL_ARGS_ASSERT_REG_NODE;
13034 SIZE_ALIGN(RExC_size);
13038 if (RExC_emit >= RExC_emit_bound)
13039 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13040 op, RExC_emit, RExC_emit_bound);
13042 NODE_ALIGN_FILL(ret);
13044 FILL_ADVANCE_NODE(ptr, op);
13045 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13046 #ifdef RE_TRACK_PATTERN_OFFSETS
13047 if (RExC_offsets) { /* MJD */
13048 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13049 "reg_node", __LINE__,
13051 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13052 ? "Overwriting end of array!\n" : "OK",
13053 (UV)(RExC_emit - RExC_emit_start),
13054 (UV)(RExC_parse - RExC_start),
13055 (UV)RExC_offsets[0]));
13056 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13064 - reganode - emit a node with an argument
13066 STATIC regnode * /* Location. */
13067 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13071 regnode * const ret = RExC_emit;
13072 GET_RE_DEBUG_FLAGS_DECL;
13074 PERL_ARGS_ASSERT_REGANODE;
13077 SIZE_ALIGN(RExC_size);
13082 assert(2==regarglen[op]+1);
13084 Anything larger than this has to allocate the extra amount.
13085 If we changed this to be:
13087 RExC_size += (1 + regarglen[op]);
13089 then it wouldn't matter. Its not clear what side effect
13090 might come from that so its not done so far.
13095 if (RExC_emit >= RExC_emit_bound)
13096 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13097 op, RExC_emit, RExC_emit_bound);
13099 NODE_ALIGN_FILL(ret);
13101 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13102 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13103 #ifdef RE_TRACK_PATTERN_OFFSETS
13104 if (RExC_offsets) { /* MJD */
13105 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13109 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13110 "Overwriting end of array!\n" : "OK",
13111 (UV)(RExC_emit - RExC_emit_start),
13112 (UV)(RExC_parse - RExC_start),
13113 (UV)RExC_offsets[0]));
13114 Set_Cur_Node_Offset;
13122 - reguni - emit (if appropriate) a Unicode character
13125 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13129 PERL_ARGS_ASSERT_REGUNI;
13131 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13135 - reginsert - insert an operator in front of already-emitted operand
13137 * Means relocating the operand.
13140 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13146 const int offset = regarglen[(U8)op];
13147 const int size = NODE_STEP_REGNODE + offset;
13148 GET_RE_DEBUG_FLAGS_DECL;
13150 PERL_ARGS_ASSERT_REGINSERT;
13151 PERL_UNUSED_ARG(depth);
13152 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13153 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13162 if (RExC_open_parens) {
13164 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13165 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13166 if ( RExC_open_parens[paren] >= opnd ) {
13167 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13168 RExC_open_parens[paren] += size;
13170 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13172 if ( RExC_close_parens[paren] >= opnd ) {
13173 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13174 RExC_close_parens[paren] += size;
13176 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13181 while (src > opnd) {
13182 StructCopy(--src, --dst, regnode);
13183 #ifdef RE_TRACK_PATTERN_OFFSETS
13184 if (RExC_offsets) { /* MJD 20010112 */
13185 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13189 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13190 ? "Overwriting end of array!\n" : "OK",
13191 (UV)(src - RExC_emit_start),
13192 (UV)(dst - RExC_emit_start),
13193 (UV)RExC_offsets[0]));
13194 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13195 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13201 place = opnd; /* Op node, where operand used to be. */
13202 #ifdef RE_TRACK_PATTERN_OFFSETS
13203 if (RExC_offsets) { /* MJD */
13204 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13208 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13209 ? "Overwriting end of array!\n" : "OK",
13210 (UV)(place - RExC_emit_start),
13211 (UV)(RExC_parse - RExC_start),
13212 (UV)RExC_offsets[0]));
13213 Set_Node_Offset(place, RExC_parse);
13214 Set_Node_Length(place, 1);
13217 src = NEXTOPER(place);
13218 FILL_ADVANCE_NODE(place, op);
13219 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13220 Zero(src, offset, regnode);
13224 - regtail - set the next-pointer at the end of a node chain of p to val.
13225 - SEE ALSO: regtail_study
13227 /* TODO: All three parms should be const */
13229 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13233 GET_RE_DEBUG_FLAGS_DECL;
13235 PERL_ARGS_ASSERT_REGTAIL;
13237 PERL_UNUSED_ARG(depth);
13243 /* Find last node. */
13246 regnode * const temp = regnext(scan);
13248 SV * const mysv=sv_newmortal();
13249 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13250 regprop(RExC_rx, mysv, scan);
13251 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13252 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13253 (temp == NULL ? "->" : ""),
13254 (temp == NULL ? PL_reg_name[OP(val)] : "")
13262 if (reg_off_by_arg[OP(scan)]) {
13263 ARG_SET(scan, val - scan);
13266 NEXT_OFF(scan) = val - scan;
13272 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13273 - Look for optimizable sequences at the same time.
13274 - currently only looks for EXACT chains.
13276 This is experimental code. The idea is to use this routine to perform
13277 in place optimizations on branches and groups as they are constructed,
13278 with the long term intention of removing optimization from study_chunk so
13279 that it is purely analytical.
13281 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13282 to control which is which.
13285 /* TODO: All four parms should be const */
13288 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13293 #ifdef EXPERIMENTAL_INPLACESCAN
13296 GET_RE_DEBUG_FLAGS_DECL;
13298 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13304 /* Find last node. */
13308 regnode * const temp = regnext(scan);
13309 #ifdef EXPERIMENTAL_INPLACESCAN
13310 if (PL_regkind[OP(scan)] == EXACT) {
13311 bool has_exactf_sharp_s; /* Unexamined in this routine */
13312 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13317 switch (OP(scan)) {
13323 case EXACTFU_TRICKYFOLD:
13325 if( exact == PSEUDO )
13327 else if ( exact != OP(scan) )
13336 SV * const mysv=sv_newmortal();
13337 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13338 regprop(RExC_rx, mysv, scan);
13339 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13340 SvPV_nolen_const(mysv),
13341 REG_NODE_NUM(scan),
13342 PL_reg_name[exact]);
13349 SV * const mysv_val=sv_newmortal();
13350 DEBUG_PARSE_MSG("");
13351 regprop(RExC_rx, mysv_val, val);
13352 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13353 SvPV_nolen_const(mysv_val),
13354 (IV)REG_NODE_NUM(val),
13358 if (reg_off_by_arg[OP(scan)]) {
13359 ARG_SET(scan, val - scan);
13362 NEXT_OFF(scan) = val - scan;
13370 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13374 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13380 for (bit=0; bit<32; bit++) {
13381 if (flags & (1<<bit)) {
13382 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13385 if (!set++ && lead)
13386 PerlIO_printf(Perl_debug_log, "%s",lead);
13387 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13390 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13391 if (!set++ && lead) {
13392 PerlIO_printf(Perl_debug_log, "%s",lead);
13395 case REGEX_UNICODE_CHARSET:
13396 PerlIO_printf(Perl_debug_log, "UNICODE");
13398 case REGEX_LOCALE_CHARSET:
13399 PerlIO_printf(Perl_debug_log, "LOCALE");
13401 case REGEX_ASCII_RESTRICTED_CHARSET:
13402 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13404 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13405 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13408 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13414 PerlIO_printf(Perl_debug_log, "\n");
13416 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13422 Perl_regdump(pTHX_ const regexp *r)
13426 SV * const sv = sv_newmortal();
13427 SV *dsv= sv_newmortal();
13428 RXi_GET_DECL(r,ri);
13429 GET_RE_DEBUG_FLAGS_DECL;
13431 PERL_ARGS_ASSERT_REGDUMP;
13433 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13435 /* Header fields of interest. */
13436 if (r->anchored_substr) {
13437 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13438 RE_SV_DUMPLEN(r->anchored_substr), 30);
13439 PerlIO_printf(Perl_debug_log,
13440 "anchored %s%s at %"IVdf" ",
13441 s, RE_SV_TAIL(r->anchored_substr),
13442 (IV)r->anchored_offset);
13443 } else if (r->anchored_utf8) {
13444 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13445 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13446 PerlIO_printf(Perl_debug_log,
13447 "anchored utf8 %s%s at %"IVdf" ",
13448 s, RE_SV_TAIL(r->anchored_utf8),
13449 (IV)r->anchored_offset);
13451 if (r->float_substr) {
13452 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13453 RE_SV_DUMPLEN(r->float_substr), 30);
13454 PerlIO_printf(Perl_debug_log,
13455 "floating %s%s at %"IVdf"..%"UVuf" ",
13456 s, RE_SV_TAIL(r->float_substr),
13457 (IV)r->float_min_offset, (UV)r->float_max_offset);
13458 } else if (r->float_utf8) {
13459 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13460 RE_SV_DUMPLEN(r->float_utf8), 30);
13461 PerlIO_printf(Perl_debug_log,
13462 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13463 s, RE_SV_TAIL(r->float_utf8),
13464 (IV)r->float_min_offset, (UV)r->float_max_offset);
13466 if (r->check_substr || r->check_utf8)
13467 PerlIO_printf(Perl_debug_log,
13469 (r->check_substr == r->float_substr
13470 && r->check_utf8 == r->float_utf8
13471 ? "(checking floating" : "(checking anchored"));
13472 if (r->extflags & RXf_NOSCAN)
13473 PerlIO_printf(Perl_debug_log, " noscan");
13474 if (r->extflags & RXf_CHECK_ALL)
13475 PerlIO_printf(Perl_debug_log, " isall");
13476 if (r->check_substr || r->check_utf8)
13477 PerlIO_printf(Perl_debug_log, ") ");
13479 if (ri->regstclass) {
13480 regprop(r, sv, ri->regstclass);
13481 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13483 if (r->extflags & RXf_ANCH) {
13484 PerlIO_printf(Perl_debug_log, "anchored");
13485 if (r->extflags & RXf_ANCH_BOL)
13486 PerlIO_printf(Perl_debug_log, "(BOL)");
13487 if (r->extflags & RXf_ANCH_MBOL)
13488 PerlIO_printf(Perl_debug_log, "(MBOL)");
13489 if (r->extflags & RXf_ANCH_SBOL)
13490 PerlIO_printf(Perl_debug_log, "(SBOL)");
13491 if (r->extflags & RXf_ANCH_GPOS)
13492 PerlIO_printf(Perl_debug_log, "(GPOS)");
13493 PerlIO_putc(Perl_debug_log, ' ');
13495 if (r->extflags & RXf_GPOS_SEEN)
13496 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13497 if (r->intflags & PREGf_SKIP)
13498 PerlIO_printf(Perl_debug_log, "plus ");
13499 if (r->intflags & PREGf_IMPLICIT)
13500 PerlIO_printf(Perl_debug_log, "implicit ");
13501 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13502 if (r->extflags & RXf_EVAL_SEEN)
13503 PerlIO_printf(Perl_debug_log, "with eval ");
13504 PerlIO_printf(Perl_debug_log, "\n");
13505 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13507 PERL_ARGS_ASSERT_REGDUMP;
13508 PERL_UNUSED_CONTEXT;
13509 PERL_UNUSED_ARG(r);
13510 #endif /* DEBUGGING */
13514 - regprop - printable representation of opcode
13516 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13519 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13520 if (flags & ANYOF_INVERT) \
13521 /*make sure the invert info is in each */ \
13522 sv_catpvs(sv, "^"); \
13528 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13534 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13535 static const char * const anyofs[] = {
13567 RXi_GET_DECL(prog,progi);
13568 GET_RE_DEBUG_FLAGS_DECL;
13570 PERL_ARGS_ASSERT_REGPROP;
13574 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13575 /* It would be nice to FAIL() here, but this may be called from
13576 regexec.c, and it would be hard to supply pRExC_state. */
13577 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13578 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13580 k = PL_regkind[OP(o)];
13583 sv_catpvs(sv, " ");
13584 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13585 * is a crude hack but it may be the best for now since
13586 * we have no flag "this EXACTish node was UTF-8"
13588 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13589 PERL_PV_ESCAPE_UNI_DETECT |
13590 PERL_PV_ESCAPE_NONASCII |
13591 PERL_PV_PRETTY_ELLIPSES |
13592 PERL_PV_PRETTY_LTGT |
13593 PERL_PV_PRETTY_NOCLEAR
13595 } else if (k == TRIE) {
13596 /* print the details of the trie in dumpuntil instead, as
13597 * progi->data isn't available here */
13598 const char op = OP(o);
13599 const U32 n = ARG(o);
13600 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13601 (reg_ac_data *)progi->data->data[n] :
13603 const reg_trie_data * const trie
13604 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13606 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13607 DEBUG_TRIE_COMPILE_r(
13608 Perl_sv_catpvf(aTHX_ sv,
13609 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13610 (UV)trie->startstate,
13611 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13612 (UV)trie->wordcount,
13615 (UV)TRIE_CHARCOUNT(trie),
13616 (UV)trie->uniquecharcount
13619 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13621 int rangestart = -1;
13622 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13623 sv_catpvs(sv, "[");
13624 for (i = 0; i <= 256; i++) {
13625 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13626 if (rangestart == -1)
13628 } else if (rangestart != -1) {
13629 if (i <= rangestart + 3)
13630 for (; rangestart < i; rangestart++)
13631 put_byte(sv, rangestart);
13633 put_byte(sv, rangestart);
13634 sv_catpvs(sv, "-");
13635 put_byte(sv, i - 1);
13640 sv_catpvs(sv, "]");
13643 } else if (k == CURLY) {
13644 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13645 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13646 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13648 else if (k == WHILEM && o->flags) /* Ordinal/of */
13649 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13650 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13651 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13652 if ( RXp_PAREN_NAMES(prog) ) {
13653 if ( k != REF || (OP(o) < NREF)) {
13654 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13655 SV **name= av_fetch(list, ARG(o), 0 );
13657 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13660 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13661 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13662 I32 *nums=(I32*)SvPVX(sv_dat);
13663 SV **name= av_fetch(list, nums[0], 0 );
13666 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13667 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13668 (n ? "," : ""), (IV)nums[n]);
13670 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13674 } else if (k == GOSUB)
13675 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13676 else if (k == VERB) {
13678 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13679 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13680 } else if (k == LOGICAL)
13681 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13682 else if (k == ANYOF) {
13683 int i, rangestart = -1;
13684 const U8 flags = ANYOF_FLAGS(o);
13688 if (flags & ANYOF_LOCALE)
13689 sv_catpvs(sv, "{loc}");
13690 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13691 sv_catpvs(sv, "{i}");
13692 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13693 if (flags & ANYOF_INVERT)
13694 sv_catpvs(sv, "^");
13696 /* output what the standard cp 0-255 bitmap matches */
13697 for (i = 0; i <= 256; i++) {
13698 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13699 if (rangestart == -1)
13701 } else if (rangestart != -1) {
13702 if (i <= rangestart + 3)
13703 for (; rangestart < i; rangestart++)
13704 put_byte(sv, rangestart);
13706 put_byte(sv, rangestart);
13707 sv_catpvs(sv, "-");
13708 put_byte(sv, i - 1);
13715 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13716 /* output any special charclass tests (used entirely under use locale) */
13717 if (ANYOF_CLASS_TEST_ANY_SET(o))
13718 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13719 if (ANYOF_CLASS_TEST(o,i)) {
13720 sv_catpv(sv, anyofs[i]);
13724 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13726 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13727 sv_catpvs(sv, "{non-utf8-latin1-all}");
13730 /* output information about the unicode matching */
13731 if (flags & ANYOF_UNICODE_ALL)
13732 sv_catpvs(sv, "{unicode_all}");
13733 else if (ANYOF_NONBITMAP(o))
13734 sv_catpvs(sv, "{unicode}");
13735 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13736 sv_catpvs(sv, "{outside bitmap}");
13738 if (ANYOF_NONBITMAP(o)) {
13739 SV *lv; /* Set if there is something outside the bit map */
13740 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13741 bool byte_output = FALSE; /* If something in the bitmap has been
13744 if (lv && lv != &PL_sv_undef) {
13746 U8 s[UTF8_MAXBYTES_CASE+1];
13748 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13749 uvchr_to_utf8(s, i);
13752 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13756 && swash_fetch(sw, s, TRUE))
13758 if (rangestart == -1)
13760 } else if (rangestart != -1) {
13761 byte_output = TRUE;
13762 if (i <= rangestart + 3)
13763 for (; rangestart < i; rangestart++) {
13764 put_byte(sv, rangestart);
13767 put_byte(sv, rangestart);
13768 sv_catpvs(sv, "-");
13777 char *s = savesvpv(lv);
13778 char * const origs = s;
13780 while (*s && *s != '\n')
13784 const char * const t = ++s;
13787 sv_catpvs(sv, " ");
13793 /* Truncate very long output */
13794 if (s - origs > 256) {
13795 Perl_sv_catpvf(aTHX_ sv,
13797 (int) (s - origs - 1),
13803 else if (*s == '\t') {
13822 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13824 else if (k == POSIXD) {
13825 U8 index = FLAGS(o) * 2;
13826 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
13827 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
13830 sv_catpv(sv, anyofs[index]);
13833 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13834 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13836 PERL_UNUSED_CONTEXT;
13837 PERL_UNUSED_ARG(sv);
13838 PERL_UNUSED_ARG(o);
13839 PERL_UNUSED_ARG(prog);
13840 #endif /* DEBUGGING */
13844 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13845 { /* Assume that RE_INTUIT is set */
13847 struct regexp *const prog = (struct regexp *)SvANY(r);
13848 GET_RE_DEBUG_FLAGS_DECL;
13850 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13851 PERL_UNUSED_CONTEXT;
13855 const char * const s = SvPV_nolen_const(prog->check_substr
13856 ? prog->check_substr : prog->check_utf8);
13858 if (!PL_colorset) reginitcolors();
13859 PerlIO_printf(Perl_debug_log,
13860 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13862 prog->check_substr ? "" : "utf8 ",
13863 PL_colors[5],PL_colors[0],
13866 (strlen(s) > 60 ? "..." : ""));
13869 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13875 handles refcounting and freeing the perl core regexp structure. When
13876 it is necessary to actually free the structure the first thing it
13877 does is call the 'free' method of the regexp_engine associated to
13878 the regexp, allowing the handling of the void *pprivate; member
13879 first. (This routine is not overridable by extensions, which is why
13880 the extensions free is called first.)
13882 See regdupe and regdupe_internal if you change anything here.
13884 #ifndef PERL_IN_XSUB_RE
13886 Perl_pregfree(pTHX_ REGEXP *r)
13892 Perl_pregfree2(pTHX_ REGEXP *rx)
13895 struct regexp *const r = (struct regexp *)SvANY(rx);
13896 GET_RE_DEBUG_FLAGS_DECL;
13898 PERL_ARGS_ASSERT_PREGFREE2;
13900 if (r->mother_re) {
13901 ReREFCNT_dec(r->mother_re);
13903 CALLREGFREE_PVT(rx); /* free the private data */
13904 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13907 SvREFCNT_dec(r->anchored_substr);
13908 SvREFCNT_dec(r->anchored_utf8);
13909 SvREFCNT_dec(r->float_substr);
13910 SvREFCNT_dec(r->float_utf8);
13911 Safefree(r->substrs);
13913 RX_MATCH_COPY_FREE(rx);
13914 #ifdef PERL_OLD_COPY_ON_WRITE
13915 SvREFCNT_dec(r->saved_copy);
13918 SvREFCNT_dec(r->qr_anoncv);
13923 This is a hacky workaround to the structural issue of match results
13924 being stored in the regexp structure which is in turn stored in
13925 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13926 could be PL_curpm in multiple contexts, and could require multiple
13927 result sets being associated with the pattern simultaneously, such
13928 as when doing a recursive match with (??{$qr})
13930 The solution is to make a lightweight copy of the regexp structure
13931 when a qr// is returned from the code executed by (??{$qr}) this
13932 lightweight copy doesn't actually own any of its data except for
13933 the starp/end and the actual regexp structure itself.
13939 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13941 struct regexp *ret;
13942 struct regexp *const r = (struct regexp *)SvANY(rx);
13944 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13947 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13948 ret = (struct regexp *)SvANY(ret_x);
13950 (void)ReREFCNT_inc(rx);
13951 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13952 by pointing directly at the buffer, but flagging that the allocated
13953 space in the copy is zero. As we've just done a struct copy, it's now
13954 a case of zero-ing that, rather than copying the current length. */
13955 SvPV_set(ret_x, RX_WRAPPED(rx));
13956 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13957 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13958 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13959 SvLEN_set(ret_x, 0);
13960 SvSTASH_set(ret_x, NULL);
13961 SvMAGIC_set(ret_x, NULL);
13963 const I32 npar = r->nparens+1;
13964 Newx(ret->offs, npar, regexp_paren_pair);
13965 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13968 Newx(ret->substrs, 1, struct reg_substr_data);
13969 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13971 SvREFCNT_inc_void(ret->anchored_substr);
13972 SvREFCNT_inc_void(ret->anchored_utf8);
13973 SvREFCNT_inc_void(ret->float_substr);
13974 SvREFCNT_inc_void(ret->float_utf8);
13976 /* check_substr and check_utf8, if non-NULL, point to either their
13977 anchored or float namesakes, and don't hold a second reference. */
13979 RX_MATCH_COPIED_off(ret_x);
13980 #ifdef PERL_OLD_COPY_ON_WRITE
13981 ret->saved_copy = NULL;
13983 ret->mother_re = rx;
13984 SvREFCNT_inc_void(ret->qr_anoncv);
13990 /* regfree_internal()
13992 Free the private data in a regexp. This is overloadable by
13993 extensions. Perl takes care of the regexp structure in pregfree(),
13994 this covers the *pprivate pointer which technically perl doesn't
13995 know about, however of course we have to handle the
13996 regexp_internal structure when no extension is in use.
13998 Note this is called before freeing anything in the regexp
14003 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14006 struct regexp *const r = (struct regexp *)SvANY(rx);
14007 RXi_GET_DECL(r,ri);
14008 GET_RE_DEBUG_FLAGS_DECL;
14010 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14016 SV *dsv= sv_newmortal();
14017 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14018 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14019 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14020 PL_colors[4],PL_colors[5],s);
14023 #ifdef RE_TRACK_PATTERN_OFFSETS
14025 Safefree(ri->u.offsets); /* 20010421 MJD */
14027 if (ri->code_blocks) {
14029 for (n = 0; n < ri->num_code_blocks; n++)
14030 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14031 Safefree(ri->code_blocks);
14035 int n = ri->data->count;
14038 /* If you add a ->what type here, update the comment in regcomp.h */
14039 switch (ri->data->what[n]) {
14045 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14048 Safefree(ri->data->data[n]);
14054 { /* Aho Corasick add-on structure for a trie node.
14055 Used in stclass optimization only */
14057 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14059 refcount = --aho->refcount;
14062 PerlMemShared_free(aho->states);
14063 PerlMemShared_free(aho->fail);
14064 /* do this last!!!! */
14065 PerlMemShared_free(ri->data->data[n]);
14066 PerlMemShared_free(ri->regstclass);
14072 /* trie structure. */
14074 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14076 refcount = --trie->refcount;
14079 PerlMemShared_free(trie->charmap);
14080 PerlMemShared_free(trie->states);
14081 PerlMemShared_free(trie->trans);
14083 PerlMemShared_free(trie->bitmap);
14085 PerlMemShared_free(trie->jump);
14086 PerlMemShared_free(trie->wordinfo);
14087 /* do this last!!!! */
14088 PerlMemShared_free(ri->data->data[n]);
14093 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14096 Safefree(ri->data->what);
14097 Safefree(ri->data);
14103 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14104 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14105 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14108 re_dup - duplicate a regexp.
14110 This routine is expected to clone a given regexp structure. It is only
14111 compiled under USE_ITHREADS.
14113 After all of the core data stored in struct regexp is duplicated
14114 the regexp_engine.dupe method is used to copy any private data
14115 stored in the *pprivate pointer. This allows extensions to handle
14116 any duplication it needs to do.
14118 See pregfree() and regfree_internal() if you change anything here.
14120 #if defined(USE_ITHREADS)
14121 #ifndef PERL_IN_XSUB_RE
14123 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14127 const struct regexp *r = (const struct regexp *)SvANY(sstr);
14128 struct regexp *ret = (struct regexp *)SvANY(dstr);
14130 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14132 npar = r->nparens+1;
14133 Newx(ret->offs, npar, regexp_paren_pair);
14134 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14136 /* no need to copy these */
14137 Newx(ret->swap, npar, regexp_paren_pair);
14140 if (ret->substrs) {
14141 /* Do it this way to avoid reading from *r after the StructCopy().
14142 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14143 cache, it doesn't matter. */
14144 const bool anchored = r->check_substr
14145 ? r->check_substr == r->anchored_substr
14146 : r->check_utf8 == r->anchored_utf8;
14147 Newx(ret->substrs, 1, struct reg_substr_data);
14148 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14150 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14151 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14152 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14153 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14155 /* check_substr and check_utf8, if non-NULL, point to either their
14156 anchored or float namesakes, and don't hold a second reference. */
14158 if (ret->check_substr) {
14160 assert(r->check_utf8 == r->anchored_utf8);
14161 ret->check_substr = ret->anchored_substr;
14162 ret->check_utf8 = ret->anchored_utf8;
14164 assert(r->check_substr == r->float_substr);
14165 assert(r->check_utf8 == r->float_utf8);
14166 ret->check_substr = ret->float_substr;
14167 ret->check_utf8 = ret->float_utf8;
14169 } else if (ret->check_utf8) {
14171 ret->check_utf8 = ret->anchored_utf8;
14173 ret->check_utf8 = ret->float_utf8;
14178 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14179 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14182 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14184 if (RX_MATCH_COPIED(dstr))
14185 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14187 ret->subbeg = NULL;
14188 #ifdef PERL_OLD_COPY_ON_WRITE
14189 ret->saved_copy = NULL;
14192 if (ret->mother_re) {
14193 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14194 /* Our storage points directly to our mother regexp, but that's
14195 1: a buffer in a different thread
14196 2: something we no longer hold a reference on
14197 so we need to copy it locally. */
14198 /* Note we need to use SvCUR(), rather than
14199 SvLEN(), on our mother_re, because it, in
14200 turn, may well be pointing to its own mother_re. */
14201 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14202 SvCUR(ret->mother_re)+1));
14203 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14205 ret->mother_re = NULL;
14209 #endif /* PERL_IN_XSUB_RE */
14214 This is the internal complement to regdupe() which is used to copy
14215 the structure pointed to by the *pprivate pointer in the regexp.
14216 This is the core version of the extension overridable cloning hook.
14217 The regexp structure being duplicated will be copied by perl prior
14218 to this and will be provided as the regexp *r argument, however
14219 with the /old/ structures pprivate pointer value. Thus this routine
14220 may override any copying normally done by perl.
14222 It returns a pointer to the new regexp_internal structure.
14226 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14229 struct regexp *const r = (struct regexp *)SvANY(rx);
14230 regexp_internal *reti;
14232 RXi_GET_DECL(r,ri);
14234 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14238 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14239 Copy(ri->program, reti->program, len+1, regnode);
14241 reti->num_code_blocks = ri->num_code_blocks;
14242 if (ri->code_blocks) {
14244 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14245 struct reg_code_block);
14246 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14247 struct reg_code_block);
14248 for (n = 0; n < ri->num_code_blocks; n++)
14249 reti->code_blocks[n].src_regex = (REGEXP*)
14250 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14253 reti->code_blocks = NULL;
14255 reti->regstclass = NULL;
14258 struct reg_data *d;
14259 const int count = ri->data->count;
14262 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14263 char, struct reg_data);
14264 Newx(d->what, count, U8);
14267 for (i = 0; i < count; i++) {
14268 d->what[i] = ri->data->what[i];
14269 switch (d->what[i]) {
14270 /* see also regcomp.h and regfree_internal() */
14271 case 'a': /* actually an AV, but the dup function is identical. */
14275 case 'u': /* actually an HV, but the dup function is identical. */
14276 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14279 /* This is cheating. */
14280 Newx(d->data[i], 1, struct regnode_charclass_class);
14281 StructCopy(ri->data->data[i], d->data[i],
14282 struct regnode_charclass_class);
14283 reti->regstclass = (regnode*)d->data[i];
14286 /* Trie stclasses are readonly and can thus be shared
14287 * without duplication. We free the stclass in pregfree
14288 * when the corresponding reg_ac_data struct is freed.
14290 reti->regstclass= ri->regstclass;
14294 ((reg_trie_data*)ri->data->data[i])->refcount++;
14299 d->data[i] = ri->data->data[i];
14302 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14311 reti->name_list_idx = ri->name_list_idx;
14313 #ifdef RE_TRACK_PATTERN_OFFSETS
14314 if (ri->u.offsets) {
14315 Newx(reti->u.offsets, 2*len+1, U32);
14316 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14319 SetProgLen(reti,len);
14322 return (void*)reti;
14325 #endif /* USE_ITHREADS */
14327 #ifndef PERL_IN_XSUB_RE
14330 - regnext - dig the "next" pointer out of a node
14333 Perl_regnext(pTHX_ register regnode *p)
14341 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14342 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14345 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14354 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14357 STRLEN l1 = strlen(pat1);
14358 STRLEN l2 = strlen(pat2);
14361 const char *message;
14363 PERL_ARGS_ASSERT_RE_CROAK2;
14369 Copy(pat1, buf, l1 , char);
14370 Copy(pat2, buf + l1, l2 , char);
14371 buf[l1 + l2] = '\n';
14372 buf[l1 + l2 + 1] = '\0';
14374 /* ANSI variant takes additional second argument */
14375 va_start(args, pat2);
14379 msv = vmess(buf, &args);
14381 message = SvPV_const(msv,l1);
14384 Copy(message, buf, l1 , char);
14385 buf[l1-1] = '\0'; /* Overwrite \n */
14386 Perl_croak(aTHX_ "%s", buf);
14389 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14391 #ifndef PERL_IN_XSUB_RE
14393 Perl_save_re_context(pTHX)
14397 struct re_save_state *state;
14399 SAVEVPTR(PL_curcop);
14400 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14402 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14403 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14404 SSPUSHUV(SAVEt_RE_STATE);
14406 Copy(&PL_reg_state, state, 1, struct re_save_state);
14408 PL_reg_oldsaved = NULL;
14409 PL_reg_oldsavedlen = 0;
14410 PL_reg_maxiter = 0;
14411 PL_reg_leftiter = 0;
14412 PL_reg_poscache = NULL;
14413 PL_reg_poscache_size = 0;
14414 #ifdef PERL_OLD_COPY_ON_WRITE
14418 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14420 const REGEXP * const rx = PM_GETRE(PL_curpm);
14423 for (i = 1; i <= RX_NPARENS(rx); i++) {
14424 char digits[TYPE_CHARS(long)];
14425 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14426 GV *const *const gvp
14427 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14430 GV * const gv = *gvp;
14431 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14441 clear_re(pTHX_ void *r)
14444 ReREFCNT_dec((REGEXP *)r);
14450 S_put_byte(pTHX_ SV *sv, int c)
14452 PERL_ARGS_ASSERT_PUT_BYTE;
14454 /* Our definition of isPRINT() ignores locales, so only bytes that are
14455 not part of UTF-8 are considered printable. I assume that the same
14456 holds for UTF-EBCDIC.
14457 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14458 which Wikipedia says:
14460 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14461 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14462 identical, to the ASCII delete (DEL) or rubout control character.
14463 ) So the old condition can be simplified to !isPRINT(c) */
14466 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14469 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14473 const char string = c;
14474 if (c == '-' || c == ']' || c == '\\' || c == '^')
14475 sv_catpvs(sv, "\\");
14476 sv_catpvn(sv, &string, 1);
14481 #define CLEAR_OPTSTART \
14482 if (optstart) STMT_START { \
14483 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14487 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14489 STATIC const regnode *
14490 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14491 const regnode *last, const regnode *plast,
14492 SV* sv, I32 indent, U32 depth)
14495 U8 op = PSEUDO; /* Arbitrary non-END op. */
14496 const regnode *next;
14497 const regnode *optstart= NULL;
14499 RXi_GET_DECL(r,ri);
14500 GET_RE_DEBUG_FLAGS_DECL;
14502 PERL_ARGS_ASSERT_DUMPUNTIL;
14504 #ifdef DEBUG_DUMPUNTIL
14505 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14506 last ? last-start : 0,plast ? plast-start : 0);
14509 if (plast && plast < last)
14512 while (PL_regkind[op] != END && (!last || node < last)) {
14513 /* While that wasn't END last time... */
14516 if (op == CLOSE || op == WHILEM)
14518 next = regnext((regnode *)node);
14521 if (OP(node) == OPTIMIZED) {
14522 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14529 regprop(r, sv, node);
14530 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14531 (int)(2*indent + 1), "", SvPVX_const(sv));
14533 if (OP(node) != OPTIMIZED) {
14534 if (next == NULL) /* Next ptr. */
14535 PerlIO_printf(Perl_debug_log, " (0)");
14536 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14537 PerlIO_printf(Perl_debug_log, " (FAIL)");
14539 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14540 (void)PerlIO_putc(Perl_debug_log, '\n');
14544 if (PL_regkind[(U8)op] == BRANCHJ) {
14547 const regnode *nnode = (OP(next) == LONGJMP
14548 ? regnext((regnode *)next)
14550 if (last && nnode > last)
14552 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14555 else if (PL_regkind[(U8)op] == BRANCH) {
14557 DUMPUNTIL(NEXTOPER(node), next);
14559 else if ( PL_regkind[(U8)op] == TRIE ) {
14560 const regnode *this_trie = node;
14561 const char op = OP(node);
14562 const U32 n = ARG(node);
14563 const reg_ac_data * const ac = op>=AHOCORASICK ?
14564 (reg_ac_data *)ri->data->data[n] :
14566 const reg_trie_data * const trie =
14567 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14569 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14571 const regnode *nextbranch= NULL;
14574 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14575 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14577 PerlIO_printf(Perl_debug_log, "%*s%s ",
14578 (int)(2*(indent+3)), "",
14579 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14580 PL_colors[0], PL_colors[1],
14581 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14582 PERL_PV_PRETTY_ELLIPSES |
14583 PERL_PV_PRETTY_LTGT
14588 U16 dist= trie->jump[word_idx+1];
14589 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14590 (UV)((dist ? this_trie + dist : next) - start));
14593 nextbranch= this_trie + trie->jump[0];
14594 DUMPUNTIL(this_trie + dist, nextbranch);
14596 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14597 nextbranch= regnext((regnode *)nextbranch);
14599 PerlIO_printf(Perl_debug_log, "\n");
14602 if (last && next > last)
14607 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14608 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14609 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14611 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14613 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14615 else if ( op == PLUS || op == STAR) {
14616 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14618 else if (PL_regkind[(U8)op] == ANYOF) {
14619 /* arglen 1 + class block */
14620 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14621 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14622 node = NEXTOPER(node);
14624 else if (PL_regkind[(U8)op] == EXACT) {
14625 /* Literal string, where present. */
14626 node += NODE_SZ_STR(node) - 1;
14627 node = NEXTOPER(node);
14630 node = NEXTOPER(node);
14631 node += regarglen[(U8)op];
14633 if (op == CURLYX || op == OPEN)
14637 #ifdef DEBUG_DUMPUNTIL
14638 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14643 #endif /* DEBUGGING */
14647 * c-indentation-style: bsd
14648 * c-basic-offset: 4
14649 * indent-tabs-mode: nil
14652 * ex: set ts=8 sts=4 sw=4 et: