5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
122 typedef struct RExC_state_t {
123 U32 flags; /* RXf_* are we folding, multilining? */
124 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
125 char *precomp; /* uncompiled string. */
126 REGEXP *rx_sv; /* The SV that is the regexp. */
127 regexp *rx; /* perl core regexp structure */
128 regexp_internal *rxi; /* internal data for regexp object pprivate field */
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 I32 whilem_seen; /* number of WHILEM in this expr */
133 regnode *emit_start; /* Start of emitted-code area */
134 regnode *emit_bound; /* First regnode outside of the allocated space */
135 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
136 I32 naughty; /* How bad is this pattern? */
137 I32 sawback; /* Did we see \1, ...? */
139 I32 size; /* Code size. */
140 I32 npar; /* Capture buffer count, (OPEN). */
141 I32 cpar; /* Capture buffer count, (CLOSE). */
142 I32 nestroot; /* root parens we are in - used by accept */
145 regnode **open_parens; /* pointers to open parens */
146 regnode **close_parens; /* pointers to close parens */
147 regnode *opend; /* END node in program */
148 I32 utf8; /* whether the pattern is utf8 or not */
149 I32 orig_utf8; /* whether the pattern was originally in utf8 */
150 /* XXX use this for future optimisation of case
151 * where pattern must be upgraded to utf8. */
152 I32 uni_semantics; /* If a d charset modifier should use unicode
153 rules, even if the pattern is not in
155 HV *paren_names; /* Paren names */
157 regnode **recurse; /* Recurse regops */
158 I32 recurse_count; /* Number of recurse regops */
161 I32 override_recoding;
162 I32 in_multi_char_class;
163 struct reg_code_block *code_blocks; /* positions of literal (?{})
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
168 char *starttry; /* -Dr: where regtry was called. */
169 #define RExC_starttry (pRExC_state->starttry)
171 SV *runtime_code_qr; /* qr with the runtime code blocks */
173 const char *lastparse;
175 AV *paren_name_list; /* idx -> name */
176 #define RExC_lastparse (pRExC_state->lastparse)
177 #define RExC_lastnum (pRExC_state->lastnum)
178 #define RExC_paren_name_list (pRExC_state->paren_name_list)
182 #define RExC_flags (pRExC_state->flags)
183 #define RExC_pm_flags (pRExC_state->pm_flags)
184 #define RExC_precomp (pRExC_state->precomp)
185 #define RExC_rx_sv (pRExC_state->rx_sv)
186 #define RExC_rx (pRExC_state->rx)
187 #define RExC_rxi (pRExC_state->rxi)
188 #define RExC_start (pRExC_state->start)
189 #define RExC_end (pRExC_state->end)
190 #define RExC_parse (pRExC_state->parse)
191 #define RExC_whilem_seen (pRExC_state->whilem_seen)
192 #ifdef RE_TRACK_PATTERN_OFFSETS
193 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
195 #define RExC_emit (pRExC_state->emit)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_npar (pRExC_state->npar)
203 #define RExC_nestroot (pRExC_state->nestroot)
204 #define RExC_extralen (pRExC_state->extralen)
205 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
206 #define RExC_utf8 (pRExC_state->utf8)
207 #define RExC_uni_semantics (pRExC_state->uni_semantics)
208 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
209 #define RExC_open_parens (pRExC_state->open_parens)
210 #define RExC_close_parens (pRExC_state->close_parens)
211 #define RExC_opend (pRExC_state->opend)
212 #define RExC_paren_names (pRExC_state->paren_names)
213 #define RExC_recurse (pRExC_state->recurse)
214 #define RExC_recurse_count (pRExC_state->recurse_count)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_override_recoding (pRExC_state->override_recoding)
218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
221 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
222 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
223 ((*s) == '{' && regcurly(s)))
226 #undef SPSTART /* dratted cpp namespace... */
229 * Flags to be passed up and down.
231 #define WORST 0 /* Worst case. */
232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
235 * character. (There needs to be a case: in the switch statement in regexec.c
236 * for any node marked SIMPLE.) Note that this is not the same thing as
239 #define SPSTART 0x04 /* Starts with * or + */
240 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
241 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
243 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
245 /* whether trie related optimizations are enabled */
246 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
247 #define TRIE_STUDY_OPT
248 #define FULL_TRIE_STUDY
254 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
255 #define PBITVAL(paren) (1 << ((paren) & 7))
256 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
257 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
258 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
260 /* If not already in utf8, do a longjmp back to the beginning */
261 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
262 #define REQUIRE_UTF8 STMT_START { \
263 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
266 /* About scan_data_t.
268 During optimisation we recurse through the regexp program performing
269 various inplace (keyhole style) optimisations. In addition study_chunk
270 and scan_commit populate this data structure with information about
271 what strings MUST appear in the pattern. We look for the longest
272 string that must appear at a fixed location, and we look for the
273 longest string that may appear at a floating location. So for instance
278 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
279 strings (because they follow a .* construct). study_chunk will identify
280 both FOO and BAR as being the longest fixed and floating strings respectively.
282 The strings can be composites, for instance
286 will result in a composite fixed substring 'foo'.
288 For each string some basic information is maintained:
290 - offset or min_offset
291 This is the position the string must appear at, or not before.
292 It also implicitly (when combined with minlenp) tells us how many
293 characters must match before the string we are searching for.
294 Likewise when combined with minlenp and the length of the string it
295 tells us how many characters must appear after the string we have
299 Only used for floating strings. This is the rightmost point that
300 the string can appear at. If set to I32 max it indicates that the
301 string can occur infinitely far to the right.
304 A pointer to the minimum number of characters of the pattern that the
305 string was found inside. This is important as in the case of positive
306 lookahead or positive lookbehind we can have multiple patterns
311 The minimum length of the pattern overall is 3, the minimum length
312 of the lookahead part is 3, but the minimum length of the part that
313 will actually match is 1. So 'FOO's minimum length is 3, but the
314 minimum length for the F is 1. This is important as the minimum length
315 is used to determine offsets in front of and behind the string being
316 looked for. Since strings can be composites this is the length of the
317 pattern at the time it was committed with a scan_commit. Note that
318 the length is calculated by study_chunk, so that the minimum lengths
319 are not known until the full pattern has been compiled, thus the
320 pointer to the value.
324 In the case of lookbehind the string being searched for can be
325 offset past the start point of the final matching string.
326 If this value was just blithely removed from the min_offset it would
327 invalidate some of the calculations for how many chars must match
328 before or after (as they are derived from min_offset and minlen and
329 the length of the string being searched for).
330 When the final pattern is compiled and the data is moved from the
331 scan_data_t structure into the regexp structure the information
332 about lookbehind is factored in, with the information that would
333 have been lost precalculated in the end_shift field for the
336 The fields pos_min and pos_delta are used to store the minimum offset
337 and the delta to the maximum offset at the current point in the pattern.
341 typedef struct scan_data_t {
342 /*I32 len_min; unused */
343 /*I32 len_delta; unused */
347 I32 last_end; /* min value, <0 unless valid. */
350 SV **longest; /* Either &l_fixed, or &l_float. */
351 SV *longest_fixed; /* longest fixed string found in pattern */
352 I32 offset_fixed; /* offset where it starts */
353 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
354 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
355 SV *longest_float; /* longest floating string found in pattern */
356 I32 offset_float_min; /* earliest point in string it can appear */
357 I32 offset_float_max; /* latest point in string it can appear */
358 I32 *minlen_float; /* pointer to the minlen relevant to the string */
359 I32 lookbehind_float; /* is the position of the string modified by LB */
363 struct regnode_charclass_class *start_class;
367 * Forward declarations for pregcomp()'s friends.
370 static const scan_data_t zero_scan_data =
371 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
373 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
374 #define SF_BEFORE_SEOL 0x0001
375 #define SF_BEFORE_MEOL 0x0002
376 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
377 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
380 # define SF_FIX_SHIFT_EOL (0+2)
381 # define SF_FL_SHIFT_EOL (0+4)
383 # define SF_FIX_SHIFT_EOL (+2)
384 # define SF_FL_SHIFT_EOL (+4)
387 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
388 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
390 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
391 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
392 #define SF_IS_INF 0x0040
393 #define SF_HAS_PAR 0x0080
394 #define SF_IN_PAR 0x0100
395 #define SF_HAS_EVAL 0x0200
396 #define SCF_DO_SUBSTR 0x0400
397 #define SCF_DO_STCLASS_AND 0x0800
398 #define SCF_DO_STCLASS_OR 0x1000
399 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
400 #define SCF_WHILEM_VISITED_POS 0x2000
402 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
403 #define SCF_SEEN_ACCEPT 0x8000
405 #define UTF cBOOL(RExC_utf8)
407 /* The enums for all these are ordered so things work out correctly */
408 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
409 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
410 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
411 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
412 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
413 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
414 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
416 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
418 #define OOB_NAMEDCLASS -1
420 /* There is no code point that is out-of-bounds, so this is problematic. But
421 * its only current use is to initialize a variable that is always set before
423 #define OOB_UNICODE 0xDEADBEEF
425 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
426 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
429 /* length of regex to show in messages that don't mark a position within */
430 #define RegexLengthToShowInErrorMessages 127
433 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
434 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
435 * op/pragma/warn/regcomp.
437 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
438 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
440 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
443 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
444 * arg. Show regex, up to a maximum length. If it's too long, chop and add
447 #define _FAIL(code) STMT_START { \
448 const char *ellipses = ""; \
449 IV len = RExC_end - RExC_precomp; \
452 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
453 if (len > RegexLengthToShowInErrorMessages) { \
454 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
455 len = RegexLengthToShowInErrorMessages - 10; \
461 #define FAIL(msg) _FAIL( \
462 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
463 msg, (int)len, RExC_precomp, ellipses))
465 #define FAIL2(msg,arg) _FAIL( \
466 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
467 arg, (int)len, RExC_precomp, ellipses))
470 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
472 #define Simple_vFAIL(m) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
475 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
481 #define vFAIL(m) STMT_START { \
483 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
488 * Like Simple_vFAIL(), but accepts two arguments.
490 #define Simple_vFAIL2(m,a1) STMT_START { \
491 const IV offset = RExC_parse - RExC_precomp; \
492 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
493 (int)offset, RExC_precomp, RExC_precomp + offset); \
497 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
499 #define vFAIL2(m,a1) STMT_START { \
501 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
502 Simple_vFAIL2(m, a1); \
507 * Like Simple_vFAIL(), but accepts three arguments.
509 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
510 const IV offset = RExC_parse - RExC_precomp; \
511 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
512 (int)offset, RExC_precomp, RExC_precomp + offset); \
516 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
518 #define vFAIL3(m,a1,a2) STMT_START { \
520 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
521 Simple_vFAIL3(m, a1, a2); \
525 * Like Simple_vFAIL(), but accepts four arguments.
527 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
528 const IV offset = RExC_parse - RExC_precomp; \
529 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
530 (int)offset, RExC_precomp, RExC_precomp + offset); \
533 #define ckWARNreg(loc,m) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 (int)offset, RExC_precomp, RExC_precomp + offset); \
539 #define ckWARNregdep(loc,m) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
543 (int)offset, RExC_precomp, RExC_precomp + offset); \
546 #define ckWARN2regdep(loc,m, a1) STMT_START { \
547 const IV offset = loc - RExC_precomp; \
548 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
550 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
553 #define ckWARN2reg(loc, m, a1) STMT_START { \
554 const IV offset = loc - RExC_precomp; \
555 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
559 #define vWARN3(loc, m, a1, a2) STMT_START { \
560 const IV offset = loc - RExC_precomp; \
561 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
562 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
566 const IV offset = loc - RExC_precomp; \
567 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
568 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
574 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
577 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
578 const IV offset = loc - RExC_precomp; \
579 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
580 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
583 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
584 const IV offset = loc - RExC_precomp; \
585 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
586 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
590 /* Allow for side effects in s */
591 #define REGC(c,s) STMT_START { \
592 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
595 /* Macros for recording node offsets. 20001227 mjd@plover.com
596 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
597 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
598 * Element 0 holds the number n.
599 * Position is 1 indexed.
601 #ifndef RE_TRACK_PATTERN_OFFSETS
602 #define Set_Node_Offset_To_R(node,byte)
603 #define Set_Node_Offset(node,byte)
604 #define Set_Cur_Node_Offset
605 #define Set_Node_Length_To_R(node,len)
606 #define Set_Node_Length(node,len)
607 #define Set_Node_Cur_Length(node)
608 #define Node_Offset(n)
609 #define Node_Length(n)
610 #define Set_Node_Offset_Length(node,offset,len)
611 #define ProgLen(ri) ri->u.proglen
612 #define SetProgLen(ri,x) ri->u.proglen = x
614 #define ProgLen(ri) ri->u.offsets[0]
615 #define SetProgLen(ri,x) ri->u.offsets[0] = x
616 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
618 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
619 __LINE__, (int)(node), (int)(byte))); \
621 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
623 RExC_offsets[2*(node)-1] = (byte); \
628 #define Set_Node_Offset(node,byte) \
629 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
630 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
632 #define Set_Node_Length_To_R(node,len) STMT_START { \
634 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
635 __LINE__, (int)(node), (int)(len))); \
637 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
639 RExC_offsets[2*(node)] = (len); \
644 #define Set_Node_Length(node,len) \
645 Set_Node_Length_To_R((node)-RExC_emit_start, len)
646 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
647 #define Set_Node_Cur_Length(node) \
648 Set_Node_Length(node, RExC_parse - parse_start)
650 /* Get offsets and lengths */
651 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
652 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
654 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
655 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
656 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
660 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
661 #define EXPERIMENTAL_INPLACESCAN
662 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
664 #define DEBUG_STUDYDATA(str,data,depth) \
665 DEBUG_OPTIMISE_MORE_r(if(data){ \
666 PerlIO_printf(Perl_debug_log, \
667 "%*s" str "Pos:%"IVdf"/%"IVdf \
668 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
669 (int)(depth)*2, "", \
670 (IV)((data)->pos_min), \
671 (IV)((data)->pos_delta), \
672 (UV)((data)->flags), \
673 (IV)((data)->whilem_c), \
674 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
675 is_inf ? "INF " : "" \
677 if ((data)->last_found) \
678 PerlIO_printf(Perl_debug_log, \
679 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
680 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
681 SvPVX_const((data)->last_found), \
682 (IV)((data)->last_end), \
683 (IV)((data)->last_start_min), \
684 (IV)((data)->last_start_max), \
685 ((data)->longest && \
686 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
687 SvPVX_const((data)->longest_fixed), \
688 (IV)((data)->offset_fixed), \
689 ((data)->longest && \
690 (data)->longest==&((data)->longest_float)) ? "*" : "", \
691 SvPVX_const((data)->longest_float), \
692 (IV)((data)->offset_float_min), \
693 (IV)((data)->offset_float_max) \
695 PerlIO_printf(Perl_debug_log,"\n"); \
698 static void clear_re(pTHX_ void *r);
700 /* Mark that we cannot extend a found fixed substring at this point.
701 Update the longest found anchored substring and the longest found
702 floating substrings if needed. */
705 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
707 const STRLEN l = CHR_SVLEN(data->last_found);
708 const STRLEN old_l = CHR_SVLEN(*data->longest);
709 GET_RE_DEBUG_FLAGS_DECL;
711 PERL_ARGS_ASSERT_SCAN_COMMIT;
713 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
714 SvSetMagicSV(*data->longest, data->last_found);
715 if (*data->longest == data->longest_fixed) {
716 data->offset_fixed = l ? data->last_start_min : data->pos_min;
717 if (data->flags & SF_BEFORE_EOL)
719 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
721 data->flags &= ~SF_FIX_BEFORE_EOL;
722 data->minlen_fixed=minlenp;
723 data->lookbehind_fixed=0;
725 else { /* *data->longest == data->longest_float */
726 data->offset_float_min = l ? data->last_start_min : data->pos_min;
727 data->offset_float_max = (l
728 ? data->last_start_max
729 : data->pos_min + data->pos_delta);
730 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
731 data->offset_float_max = I32_MAX;
732 if (data->flags & SF_BEFORE_EOL)
734 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
736 data->flags &= ~SF_FL_BEFORE_EOL;
737 data->minlen_float=minlenp;
738 data->lookbehind_float=0;
741 SvCUR_set(data->last_found, 0);
743 SV * const sv = data->last_found;
744 if (SvUTF8(sv) && SvMAGICAL(sv)) {
745 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
751 data->flags &= ~SF_BEFORE_EOL;
752 DEBUG_STUDYDATA("commit: ",data,0);
755 /* Can match anything (initialization) */
757 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
759 PERL_ARGS_ASSERT_CL_ANYTHING;
761 ANYOF_BITMAP_SETALL(cl);
762 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
763 |ANYOF_NON_UTF8_LATIN1_ALL;
765 /* If any portion of the regex is to operate under locale rules,
766 * initialization includes it. The reason this isn't done for all regexes
767 * is that the optimizer was written under the assumption that locale was
768 * all-or-nothing. Given the complexity and lack of documentation in the
769 * optimizer, and that there are inadequate test cases for locale, so many
770 * parts of it may not work properly, it is safest to avoid locale unless
772 if (RExC_contains_locale) {
773 ANYOF_CLASS_SETALL(cl); /* /l uses class */
774 cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
777 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
781 /* Can match anything (initialization) */
783 S_cl_is_anything(const struct regnode_charclass_class *cl)
787 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
789 for (value = 0; value <= ANYOF_MAX; value += 2)
790 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
792 if (!(cl->flags & ANYOF_UNICODE_ALL))
794 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
799 /* Can match anything (initialization) */
801 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
803 PERL_ARGS_ASSERT_CL_INIT;
805 Zero(cl, 1, struct regnode_charclass_class);
807 cl_anything(pRExC_state, cl);
808 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
811 /* These two functions currently do the exact same thing */
812 #define cl_init_zero S_cl_init
814 /* 'AND' a given class with another one. Can create false positives. 'cl'
815 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
816 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
818 S_cl_and(struct regnode_charclass_class *cl,
819 const struct regnode_charclass_class *and_with)
821 PERL_ARGS_ASSERT_CL_AND;
823 assert(and_with->type == ANYOF);
825 /* I (khw) am not sure all these restrictions are necessary XXX */
826 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
827 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
828 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
829 && !(and_with->flags & ANYOF_LOC_FOLD)
830 && !(cl->flags & ANYOF_LOC_FOLD)) {
833 if (and_with->flags & ANYOF_INVERT)
834 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
835 cl->bitmap[i] &= ~and_with->bitmap[i];
837 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
838 cl->bitmap[i] &= and_with->bitmap[i];
839 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
841 if (and_with->flags & ANYOF_INVERT) {
843 /* Here, the and'ed node is inverted. Get the AND of the flags that
844 * aren't affected by the inversion. Those that are affected are
845 * handled individually below */
846 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
847 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
848 cl->flags |= affected_flags;
850 /* We currently don't know how to deal with things that aren't in the
851 * bitmap, but we know that the intersection is no greater than what
852 * is already in cl, so let there be false positives that get sorted
853 * out after the synthetic start class succeeds, and the node is
854 * matched for real. */
856 /* The inversion of these two flags indicate that the resulting
857 * intersection doesn't have them */
858 if (and_with->flags & ANYOF_UNICODE_ALL) {
859 cl->flags &= ~ANYOF_UNICODE_ALL;
861 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
862 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
865 else { /* and'd node is not inverted */
866 U8 outside_bitmap_but_not_utf8; /* Temp variable */
868 if (! ANYOF_NONBITMAP(and_with)) {
870 /* Here 'and_with' doesn't match anything outside the bitmap
871 * (except possibly ANYOF_UNICODE_ALL), which means the
872 * intersection can't either, except for ANYOF_UNICODE_ALL, in
873 * which case we don't know what the intersection is, but it's no
874 * greater than what cl already has, so can just leave it alone,
875 * with possible false positives */
876 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
877 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
878 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
881 else if (! ANYOF_NONBITMAP(cl)) {
883 /* Here, 'and_with' does match something outside the bitmap, and cl
884 * doesn't have a list of things to match outside the bitmap. If
885 * cl can match all code points above 255, the intersection will
886 * be those above-255 code points that 'and_with' matches. If cl
887 * can't match all Unicode code points, it means that it can't
888 * match anything outside the bitmap (since the 'if' that got us
889 * into this block tested for that), so we leave the bitmap empty.
891 if (cl->flags & ANYOF_UNICODE_ALL) {
892 ARG_SET(cl, ARG(and_with));
894 /* and_with's ARG may match things that don't require UTF8.
895 * And now cl's will too, in spite of this being an 'and'. See
896 * the comments below about the kludge */
897 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
901 /* Here, both 'and_with' and cl match something outside the
902 * bitmap. Currently we do not do the intersection, so just match
903 * whatever cl had at the beginning. */
907 /* Take the intersection of the two sets of flags. However, the
908 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
909 * kludge around the fact that this flag is not treated like the others
910 * which are initialized in cl_anything(). The way the optimizer works
911 * is that the synthetic start class (SSC) is initialized to match
912 * anything, and then the first time a real node is encountered, its
913 * values are AND'd with the SSC's with the result being the values of
914 * the real node. However, there are paths through the optimizer where
915 * the AND never gets called, so those initialized bits are set
916 * inappropriately, which is not usually a big deal, as they just cause
917 * false positives in the SSC, which will just mean a probably
918 * imperceptible slow down in execution. However this bit has a
919 * higher false positive consequence in that it can cause utf8.pm,
920 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
921 * bigger slowdown and also causes significant extra memory to be used.
922 * In order to prevent this, the code now takes a different tack. The
923 * bit isn't set unless some part of the regular expression needs it,
924 * but once set it won't get cleared. This means that these extra
925 * modules won't get loaded unless there was some path through the
926 * pattern that would have required them anyway, and so any false
927 * positives that occur by not ANDing them out when they could be
928 * aren't as severe as they would be if we treated this bit like all
930 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
931 & ANYOF_NONBITMAP_NON_UTF8;
932 cl->flags &= and_with->flags;
933 cl->flags |= outside_bitmap_but_not_utf8;
937 /* 'OR' a given class with another one. Can create false positives. 'cl'
938 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
939 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
941 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
943 PERL_ARGS_ASSERT_CL_OR;
945 if (or_with->flags & ANYOF_INVERT) {
947 /* Here, the or'd node is to be inverted. This means we take the
948 * complement of everything not in the bitmap, but currently we don't
949 * know what that is, so give up and match anything */
950 if (ANYOF_NONBITMAP(or_with)) {
951 cl_anything(pRExC_state, cl);
954 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
955 * <= (B1 | !B2) | (CL1 | !CL2)
956 * which is wasteful if CL2 is small, but we ignore CL2:
957 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
958 * XXXX Can we handle case-fold? Unclear:
959 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
960 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
962 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
963 && !(or_with->flags & ANYOF_LOC_FOLD)
964 && !(cl->flags & ANYOF_LOC_FOLD) ) {
967 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
968 cl->bitmap[i] |= ~or_with->bitmap[i];
969 } /* XXXX: logic is complicated otherwise */
971 cl_anything(pRExC_state, cl);
974 /* And, we can just take the union of the flags that aren't affected
975 * by the inversion */
976 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
978 /* For the remaining flags:
979 ANYOF_UNICODE_ALL and inverted means to not match anything above
980 255, which means that the union with cl should just be
981 what cl has in it, so can ignore this flag
982 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
983 is 127-255 to match them, but then invert that, so the
984 union with cl should just be what cl has in it, so can
987 } else { /* 'or_with' is not inverted */
988 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
989 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
990 && (!(or_with->flags & ANYOF_LOC_FOLD)
991 || (cl->flags & ANYOF_LOC_FOLD)) ) {
994 /* OR char bitmap and class bitmap separately */
995 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
996 cl->bitmap[i] |= or_with->bitmap[i];
997 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
998 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
999 cl->classflags[i] |= or_with->classflags[i];
1000 cl->flags |= ANYOF_CLASS;
1003 else { /* XXXX: logic is complicated, leave it along for a moment. */
1004 cl_anything(pRExC_state, cl);
1007 if (ANYOF_NONBITMAP(or_with)) {
1009 /* Use the added node's outside-the-bit-map match if there isn't a
1010 * conflict. If there is a conflict (both nodes match something
1011 * outside the bitmap, but what they match outside is not the same
1012 * pointer, and hence not easily compared until XXX we extend
1013 * inversion lists this far), give up and allow the start class to
1014 * match everything outside the bitmap. If that stuff is all above
1015 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1016 if (! ANYOF_NONBITMAP(cl)) {
1017 ARG_SET(cl, ARG(or_with));
1019 else if (ARG(cl) != ARG(or_with)) {
1021 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1022 cl_anything(pRExC_state, cl);
1025 cl->flags |= ANYOF_UNICODE_ALL;
1030 /* Take the union */
1031 cl->flags |= or_with->flags;
1035 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1036 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1037 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1038 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1043 dump_trie(trie,widecharmap,revcharmap)
1044 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1045 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1047 These routines dump out a trie in a somewhat readable format.
1048 The _interim_ variants are used for debugging the interim
1049 tables that are used to generate the final compressed
1050 representation which is what dump_trie expects.
1052 Part of the reason for their existence is to provide a form
1053 of documentation as to how the different representations function.
1058 Dumps the final compressed table form of the trie to Perl_debug_log.
1059 Used for debugging make_trie().
1063 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1064 AV *revcharmap, U32 depth)
1067 SV *sv=sv_newmortal();
1068 int colwidth= widecharmap ? 6 : 4;
1070 GET_RE_DEBUG_FLAGS_DECL;
1072 PERL_ARGS_ASSERT_DUMP_TRIE;
1074 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1075 (int)depth * 2 + 2,"",
1076 "Match","Base","Ofs" );
1078 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1079 SV ** const tmp = av_fetch( revcharmap, state, 0);
1081 PerlIO_printf( Perl_debug_log, "%*s",
1083 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1084 PL_colors[0], PL_colors[1],
1085 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1086 PERL_PV_ESCAPE_FIRSTCHAR
1091 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1092 (int)depth * 2 + 2,"");
1094 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1095 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1096 PerlIO_printf( Perl_debug_log, "\n");
1098 for( state = 1 ; state < trie->statecount ; state++ ) {
1099 const U32 base = trie->states[ state ].trans.base;
1101 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1103 if ( trie->states[ state ].wordnum ) {
1104 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1106 PerlIO_printf( Perl_debug_log, "%6s", "" );
1109 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1114 while( ( base + ofs < trie->uniquecharcount ) ||
1115 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1116 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1119 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1121 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1122 if ( ( base + ofs >= trie->uniquecharcount ) &&
1123 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1124 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1126 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1128 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1130 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1134 PerlIO_printf( Perl_debug_log, "]");
1137 PerlIO_printf( Perl_debug_log, "\n" );
1139 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1140 for (word=1; word <= trie->wordcount; word++) {
1141 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1142 (int)word, (int)(trie->wordinfo[word].prev),
1143 (int)(trie->wordinfo[word].len));
1145 PerlIO_printf(Perl_debug_log, "\n" );
1148 Dumps a fully constructed but uncompressed trie in list form.
1149 List tries normally only are used for construction when the number of
1150 possible chars (trie->uniquecharcount) is very high.
1151 Used for debugging make_trie().
1154 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1155 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1159 SV *sv=sv_newmortal();
1160 int colwidth= widecharmap ? 6 : 4;
1161 GET_RE_DEBUG_FLAGS_DECL;
1163 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1165 /* print out the table precompression. */
1166 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1167 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1168 "------:-----+-----------------\n" );
1170 for( state=1 ; state < next_alloc ; state ++ ) {
1173 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1174 (int)depth * 2 + 2,"", (UV)state );
1175 if ( ! trie->states[ state ].wordnum ) {
1176 PerlIO_printf( Perl_debug_log, "%5s| ","");
1178 PerlIO_printf( Perl_debug_log, "W%4x| ",
1179 trie->states[ state ].wordnum
1182 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1183 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1185 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1187 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1188 PL_colors[0], PL_colors[1],
1189 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1190 PERL_PV_ESCAPE_FIRSTCHAR
1192 TRIE_LIST_ITEM(state,charid).forid,
1193 (UV)TRIE_LIST_ITEM(state,charid).newstate
1196 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1197 (int)((depth * 2) + 14), "");
1200 PerlIO_printf( Perl_debug_log, "\n");
1205 Dumps a fully constructed but uncompressed trie in table form.
1206 This is the normal DFA style state transition table, with a few
1207 twists to facilitate compression later.
1208 Used for debugging make_trie().
1211 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1212 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1217 SV *sv=sv_newmortal();
1218 int colwidth= widecharmap ? 6 : 4;
1219 GET_RE_DEBUG_FLAGS_DECL;
1221 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1224 print out the table precompression so that we can do a visual check
1225 that they are identical.
1228 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1230 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1231 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1233 PerlIO_printf( Perl_debug_log, "%*s",
1235 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1236 PL_colors[0], PL_colors[1],
1237 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1238 PERL_PV_ESCAPE_FIRSTCHAR
1244 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1246 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1247 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1250 PerlIO_printf( Perl_debug_log, "\n" );
1252 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1254 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1255 (int)depth * 2 + 2,"",
1256 (UV)TRIE_NODENUM( state ) );
1258 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1259 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1261 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1263 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1265 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1266 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1268 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1269 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1277 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1278 startbranch: the first branch in the whole branch sequence
1279 first : start branch of sequence of branch-exact nodes.
1280 May be the same as startbranch
1281 last : Thing following the last branch.
1282 May be the same as tail.
1283 tail : item following the branch sequence
1284 count : words in the sequence
1285 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1286 depth : indent depth
1288 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1290 A trie is an N'ary tree where the branches are determined by digital
1291 decomposition of the key. IE, at the root node you look up the 1st character and
1292 follow that branch repeat until you find the end of the branches. Nodes can be
1293 marked as "accepting" meaning they represent a complete word. Eg:
1297 would convert into the following structure. Numbers represent states, letters
1298 following numbers represent valid transitions on the letter from that state, if
1299 the number is in square brackets it represents an accepting state, otherwise it
1300 will be in parenthesis.
1302 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1306 (1) +-i->(6)-+-s->[7]
1308 +-s->(3)-+-h->(4)-+-e->[5]
1310 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1312 This shows that when matching against the string 'hers' we will begin at state 1
1313 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1314 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1315 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1316 single traverse. We store a mapping from accepting to state to which word was
1317 matched, and then when we have multiple possibilities we try to complete the
1318 rest of the regex in the order in which they occured in the alternation.
1320 The only prior NFA like behaviour that would be changed by the TRIE support is
1321 the silent ignoring of duplicate alternations which are of the form:
1323 / (DUPE|DUPE) X? (?{ ... }) Y /x
1325 Thus EVAL blocks following a trie may be called a different number of times with
1326 and without the optimisation. With the optimisations dupes will be silently
1327 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1328 the following demonstrates:
1330 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1332 which prints out 'word' three times, but
1334 'words'=~/(word|word|word)(?{ print $1 })S/
1336 which doesnt print it out at all. This is due to other optimisations kicking in.
1338 Example of what happens on a structural level:
1340 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1342 1: CURLYM[1] {1,32767}(18)
1353 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1354 and should turn into:
1356 1: CURLYM[1] {1,32767}(18)
1358 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1366 Cases where tail != last would be like /(?foo|bar)baz/:
1376 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1377 and would end up looking like:
1380 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1387 d = uvuni_to_utf8_flags(d, uv, 0);
1389 is the recommended Unicode-aware way of saying
1394 #define TRIE_STORE_REVCHAR(val) \
1397 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1398 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1399 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1400 SvCUR_set(zlopp, kapow - flrbbbbb); \
1403 av_push(revcharmap, zlopp); \
1405 char ooooff = (char)val; \
1406 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1410 #define TRIE_READ_CHAR STMT_START { \
1413 /* if it is UTF then it is either already folded, or does not need folding */ \
1414 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1416 else if (folder == PL_fold_latin1) { \
1417 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1418 if ( foldlen > 0 ) { \
1419 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1425 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1426 skiplen = UNISKIP(uvc); \
1427 foldlen -= skiplen; \
1428 scan = foldbuf + skiplen; \
1431 /* raw data, will be folded later if needed */ \
1439 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1440 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1441 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1442 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1444 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1445 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1446 TRIE_LIST_CUR( state )++; \
1449 #define TRIE_LIST_NEW(state) STMT_START { \
1450 Newxz( trie->states[ state ].trans.list, \
1451 4, reg_trie_trans_le ); \
1452 TRIE_LIST_CUR( state ) = 1; \
1453 TRIE_LIST_LEN( state ) = 4; \
1456 #define TRIE_HANDLE_WORD(state) STMT_START { \
1457 U16 dupe= trie->states[ state ].wordnum; \
1458 regnode * const noper_next = regnext( noper ); \
1461 /* store the word for dumping */ \
1463 if (OP(noper) != NOTHING) \
1464 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1466 tmp = newSVpvn_utf8( "", 0, UTF ); \
1467 av_push( trie_words, tmp ); \
1471 trie->wordinfo[curword].prev = 0; \
1472 trie->wordinfo[curword].len = wordlen; \
1473 trie->wordinfo[curword].accept = state; \
1475 if ( noper_next < tail ) { \
1477 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1478 trie->jump[curword] = (U16)(noper_next - convert); \
1480 jumper = noper_next; \
1482 nextbranch= regnext(cur); \
1486 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1487 /* chain, so that when the bits of chain are later */\
1488 /* linked together, the dups appear in the chain */\
1489 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1490 trie->wordinfo[dupe].prev = curword; \
1492 /* we haven't inserted this word yet. */ \
1493 trie->states[ state ].wordnum = curword; \
1498 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1499 ( ( base + charid >= ucharcount \
1500 && base + charid < ubound \
1501 && state == trie->trans[ base - ucharcount + charid ].check \
1502 && trie->trans[ base - ucharcount + charid ].next ) \
1503 ? trie->trans[ base - ucharcount + charid ].next \
1504 : ( state==1 ? special : 0 ) \
1508 #define MADE_JUMP_TRIE 2
1509 #define MADE_EXACT_TRIE 4
1512 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1515 /* first pass, loop through and scan words */
1516 reg_trie_data *trie;
1517 HV *widecharmap = NULL;
1518 AV *revcharmap = newAV();
1520 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1525 regnode *jumper = NULL;
1526 regnode *nextbranch = NULL;
1527 regnode *convert = NULL;
1528 U32 *prev_states; /* temp array mapping each state to previous one */
1529 /* we just use folder as a flag in utf8 */
1530 const U8 * folder = NULL;
1533 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1534 AV *trie_words = NULL;
1535 /* along with revcharmap, this only used during construction but both are
1536 * useful during debugging so we store them in the struct when debugging.
1539 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1540 STRLEN trie_charcount=0;
1542 SV *re_trie_maxbuff;
1543 GET_RE_DEBUG_FLAGS_DECL;
1545 PERL_ARGS_ASSERT_MAKE_TRIE;
1547 PERL_UNUSED_ARG(depth);
1554 case EXACTFU_TRICKYFOLD:
1555 case EXACTFU: folder = PL_fold_latin1; break;
1556 case EXACTF: folder = PL_fold; break;
1557 case EXACTFL: folder = PL_fold_locale; break;
1558 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1561 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1563 trie->startstate = 1;
1564 trie->wordcount = word_count;
1565 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1566 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1568 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1569 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1570 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1573 trie_words = newAV();
1576 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1577 if (!SvIOK(re_trie_maxbuff)) {
1578 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1580 DEBUG_TRIE_COMPILE_r({
1581 PerlIO_printf( Perl_debug_log,
1582 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1583 (int)depth * 2 + 2, "",
1584 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1585 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1589 /* Find the node we are going to overwrite */
1590 if ( first == startbranch && OP( last ) != BRANCH ) {
1591 /* whole branch chain */
1594 /* branch sub-chain */
1595 convert = NEXTOPER( first );
1598 /* -- First loop and Setup --
1600 We first traverse the branches and scan each word to determine if it
1601 contains widechars, and how many unique chars there are, this is
1602 important as we have to build a table with at least as many columns as we
1605 We use an array of integers to represent the character codes 0..255
1606 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1607 native representation of the character value as the key and IV's for the
1610 *TODO* If we keep track of how many times each character is used we can
1611 remap the columns so that the table compression later on is more
1612 efficient in terms of memory by ensuring the most common value is in the
1613 middle and the least common are on the outside. IMO this would be better
1614 than a most to least common mapping as theres a decent chance the most
1615 common letter will share a node with the least common, meaning the node
1616 will not be compressible. With a middle is most common approach the worst
1617 case is when we have the least common nodes twice.
1621 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1622 regnode *noper = NEXTOPER( cur );
1623 const U8 *uc = (U8*)STRING( noper );
1624 const U8 *e = uc + STR_LEN( noper );
1626 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1628 const U8 *scan = (U8*)NULL;
1629 U32 wordlen = 0; /* required init */
1631 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1633 if (OP(noper) == NOTHING) {
1634 regnode *noper_next= regnext(noper);
1635 if (noper_next != tail && OP(noper_next) == flags) {
1637 uc= (U8*)STRING(noper);
1638 e= uc + STR_LEN(noper);
1639 trie->minlen= STR_LEN(noper);
1646 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1647 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1648 regardless of encoding */
1649 if (OP( noper ) == EXACTFU_SS) {
1650 /* false positives are ok, so just set this */
1651 TRIE_BITMAP_SET(trie,0xDF);
1654 for ( ; uc < e ; uc += len ) {
1655 TRIE_CHARCOUNT(trie)++;
1660 U8 folded= folder[ (U8) uvc ];
1661 if ( !trie->charmap[ folded ] ) {
1662 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1663 TRIE_STORE_REVCHAR( folded );
1666 if ( !trie->charmap[ uvc ] ) {
1667 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1668 TRIE_STORE_REVCHAR( uvc );
1671 /* store the codepoint in the bitmap, and its folded
1673 TRIE_BITMAP_SET(trie, uvc);
1675 /* store the folded codepoint */
1676 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1679 /* store first byte of utf8 representation of
1680 variant codepoints */
1681 if (! UNI_IS_INVARIANT(uvc)) {
1682 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1685 set_bit = 0; /* We've done our bit :-) */
1690 widecharmap = newHV();
1692 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1695 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1697 if ( !SvTRUE( *svpp ) ) {
1698 sv_setiv( *svpp, ++trie->uniquecharcount );
1699 TRIE_STORE_REVCHAR(uvc);
1703 if( cur == first ) {
1704 trie->minlen = chars;
1705 trie->maxlen = chars;
1706 } else if (chars < trie->minlen) {
1707 trie->minlen = chars;
1708 } else if (chars > trie->maxlen) {
1709 trie->maxlen = chars;
1711 if (OP( noper ) == EXACTFU_SS) {
1712 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1713 if (trie->minlen > 1)
1716 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1717 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1718 * - We assume that any such sequence might match a 2 byte string */
1719 if (trie->minlen > 2 )
1723 } /* end first pass */
1724 DEBUG_TRIE_COMPILE_r(
1725 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1726 (int)depth * 2 + 2,"",
1727 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1728 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1729 (int)trie->minlen, (int)trie->maxlen )
1733 We now know what we are dealing with in terms of unique chars and
1734 string sizes so we can calculate how much memory a naive
1735 representation using a flat table will take. If it's over a reasonable
1736 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1737 conservative but potentially much slower representation using an array
1740 At the end we convert both representations into the same compressed
1741 form that will be used in regexec.c for matching with. The latter
1742 is a form that cannot be used to construct with but has memory
1743 properties similar to the list form and access properties similar
1744 to the table form making it both suitable for fast searches and
1745 small enough that its feasable to store for the duration of a program.
1747 See the comment in the code where the compressed table is produced
1748 inplace from the flat tabe representation for an explanation of how
1749 the compression works.
1754 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1757 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1759 Second Pass -- Array Of Lists Representation
1761 Each state will be represented by a list of charid:state records
1762 (reg_trie_trans_le) the first such element holds the CUR and LEN
1763 points of the allocated array. (See defines above).
1765 We build the initial structure using the lists, and then convert
1766 it into the compressed table form which allows faster lookups
1767 (but cant be modified once converted).
1770 STRLEN transcount = 1;
1772 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1773 "%*sCompiling trie using list compiler\n",
1774 (int)depth * 2 + 2, ""));
1776 trie->states = (reg_trie_state *)
1777 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1778 sizeof(reg_trie_state) );
1782 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1784 regnode *noper = NEXTOPER( cur );
1785 U8 *uc = (U8*)STRING( noper );
1786 const U8 *e = uc + STR_LEN( noper );
1787 U32 state = 1; /* required init */
1788 U16 charid = 0; /* sanity init */
1789 U8 *scan = (U8*)NULL; /* sanity init */
1790 STRLEN foldlen = 0; /* required init */
1791 U32 wordlen = 0; /* required init */
1792 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1795 if (OP(noper) == NOTHING) {
1796 regnode *noper_next= regnext(noper);
1797 if (noper_next != tail && OP(noper_next) == flags) {
1799 uc= (U8*)STRING(noper);
1800 e= uc + STR_LEN(noper);
1804 if (OP(noper) != NOTHING) {
1805 for ( ; uc < e ; uc += len ) {
1810 charid = trie->charmap[ uvc ];
1812 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1816 charid=(U16)SvIV( *svpp );
1819 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1826 if ( !trie->states[ state ].trans.list ) {
1827 TRIE_LIST_NEW( state );
1829 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1830 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1831 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1836 newstate = next_alloc++;
1837 prev_states[newstate] = state;
1838 TRIE_LIST_PUSH( state, charid, newstate );
1843 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1847 TRIE_HANDLE_WORD(state);
1849 } /* end second pass */
1851 /* next alloc is the NEXT state to be allocated */
1852 trie->statecount = next_alloc;
1853 trie->states = (reg_trie_state *)
1854 PerlMemShared_realloc( trie->states,
1856 * sizeof(reg_trie_state) );
1858 /* and now dump it out before we compress it */
1859 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1860 revcharmap, next_alloc,
1864 trie->trans = (reg_trie_trans *)
1865 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1872 for( state=1 ; state < next_alloc ; state ++ ) {
1876 DEBUG_TRIE_COMPILE_MORE_r(
1877 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1881 if (trie->states[state].trans.list) {
1882 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1886 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1887 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1888 if ( forid < minid ) {
1890 } else if ( forid > maxid ) {
1894 if ( transcount < tp + maxid - minid + 1) {
1896 trie->trans = (reg_trie_trans *)
1897 PerlMemShared_realloc( trie->trans,
1899 * sizeof(reg_trie_trans) );
1900 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1902 base = trie->uniquecharcount + tp - minid;
1903 if ( maxid == minid ) {
1905 for ( ; zp < tp ; zp++ ) {
1906 if ( ! trie->trans[ zp ].next ) {
1907 base = trie->uniquecharcount + zp - minid;
1908 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1909 trie->trans[ zp ].check = state;
1915 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1916 trie->trans[ tp ].check = state;
1921 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1922 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1923 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1924 trie->trans[ tid ].check = state;
1926 tp += ( maxid - minid + 1 );
1928 Safefree(trie->states[ state ].trans.list);
1931 DEBUG_TRIE_COMPILE_MORE_r(
1932 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1935 trie->states[ state ].trans.base=base;
1937 trie->lasttrans = tp + 1;
1941 Second Pass -- Flat Table Representation.
1943 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1944 We know that we will need Charcount+1 trans at most to store the data
1945 (one row per char at worst case) So we preallocate both structures
1946 assuming worst case.
1948 We then construct the trie using only the .next slots of the entry
1951 We use the .check field of the first entry of the node temporarily to
1952 make compression both faster and easier by keeping track of how many non
1953 zero fields are in the node.
1955 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1958 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1959 number representing the first entry of the node, and state as a
1960 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1961 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1962 are 2 entrys per node. eg:
1970 The table is internally in the right hand, idx form. However as we also
1971 have to deal with the states array which is indexed by nodenum we have to
1972 use TRIE_NODENUM() to convert.
1975 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1976 "%*sCompiling trie using table compiler\n",
1977 (int)depth * 2 + 2, ""));
1979 trie->trans = (reg_trie_trans *)
1980 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1981 * trie->uniquecharcount + 1,
1982 sizeof(reg_trie_trans) );
1983 trie->states = (reg_trie_state *)
1984 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1985 sizeof(reg_trie_state) );
1986 next_alloc = trie->uniquecharcount + 1;
1989 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1991 regnode *noper = NEXTOPER( cur );
1992 const U8 *uc = (U8*)STRING( noper );
1993 const U8 *e = uc + STR_LEN( noper );
1995 U32 state = 1; /* required init */
1997 U16 charid = 0; /* sanity init */
1998 U32 accept_state = 0; /* sanity init */
1999 U8 *scan = (U8*)NULL; /* sanity init */
2001 STRLEN foldlen = 0; /* required init */
2002 U32 wordlen = 0; /* required init */
2004 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2006 if (OP(noper) == NOTHING) {
2007 regnode *noper_next= regnext(noper);
2008 if (noper_next != tail && OP(noper_next) == flags) {
2010 uc= (U8*)STRING(noper);
2011 e= uc + STR_LEN(noper);
2015 if ( OP(noper) != NOTHING ) {
2016 for ( ; uc < e ; uc += len ) {
2021 charid = trie->charmap[ uvc ];
2023 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2024 charid = svpp ? (U16)SvIV(*svpp) : 0;
2028 if ( !trie->trans[ state + charid ].next ) {
2029 trie->trans[ state + charid ].next = next_alloc;
2030 trie->trans[ state ].check++;
2031 prev_states[TRIE_NODENUM(next_alloc)]
2032 = TRIE_NODENUM(state);
2033 next_alloc += trie->uniquecharcount;
2035 state = trie->trans[ state + charid ].next;
2037 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2039 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2042 accept_state = TRIE_NODENUM( state );
2043 TRIE_HANDLE_WORD(accept_state);
2045 } /* end second pass */
2047 /* and now dump it out before we compress it */
2048 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2050 next_alloc, depth+1));
2054 * Inplace compress the table.*
2056 For sparse data sets the table constructed by the trie algorithm will
2057 be mostly 0/FAIL transitions or to put it another way mostly empty.
2058 (Note that leaf nodes will not contain any transitions.)
2060 This algorithm compresses the tables by eliminating most such
2061 transitions, at the cost of a modest bit of extra work during lookup:
2063 - Each states[] entry contains a .base field which indicates the
2064 index in the state[] array wheres its transition data is stored.
2066 - If .base is 0 there are no valid transitions from that node.
2068 - If .base is nonzero then charid is added to it to find an entry in
2071 -If trans[states[state].base+charid].check!=state then the
2072 transition is taken to be a 0/Fail transition. Thus if there are fail
2073 transitions at the front of the node then the .base offset will point
2074 somewhere inside the previous nodes data (or maybe even into a node
2075 even earlier), but the .check field determines if the transition is
2079 The following process inplace converts the table to the compressed
2080 table: We first do not compress the root node 1,and mark all its
2081 .check pointers as 1 and set its .base pointer as 1 as well. This
2082 allows us to do a DFA construction from the compressed table later,
2083 and ensures that any .base pointers we calculate later are greater
2086 - We set 'pos' to indicate the first entry of the second node.
2088 - We then iterate over the columns of the node, finding the first and
2089 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2090 and set the .check pointers accordingly, and advance pos
2091 appropriately and repreat for the next node. Note that when we copy
2092 the next pointers we have to convert them from the original
2093 NODEIDX form to NODENUM form as the former is not valid post
2096 - If a node has no transitions used we mark its base as 0 and do not
2097 advance the pos pointer.
2099 - If a node only has one transition we use a second pointer into the
2100 structure to fill in allocated fail transitions from other states.
2101 This pointer is independent of the main pointer and scans forward
2102 looking for null transitions that are allocated to a state. When it
2103 finds one it writes the single transition into the "hole". If the
2104 pointer doesnt find one the single transition is appended as normal.
2106 - Once compressed we can Renew/realloc the structures to release the
2109 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2110 specifically Fig 3.47 and the associated pseudocode.
2114 const U32 laststate = TRIE_NODENUM( next_alloc );
2117 trie->statecount = laststate;
2119 for ( state = 1 ; state < laststate ; state++ ) {
2121 const U32 stateidx = TRIE_NODEIDX( state );
2122 const U32 o_used = trie->trans[ stateidx ].check;
2123 U32 used = trie->trans[ stateidx ].check;
2124 trie->trans[ stateidx ].check = 0;
2126 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2127 if ( flag || trie->trans[ stateidx + charid ].next ) {
2128 if ( trie->trans[ stateidx + charid ].next ) {
2130 for ( ; zp < pos ; zp++ ) {
2131 if ( ! trie->trans[ zp ].next ) {
2135 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2136 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2137 trie->trans[ zp ].check = state;
2138 if ( ++zp > pos ) pos = zp;
2145 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2147 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2148 trie->trans[ pos ].check = state;
2153 trie->lasttrans = pos + 1;
2154 trie->states = (reg_trie_state *)
2155 PerlMemShared_realloc( trie->states, laststate
2156 * sizeof(reg_trie_state) );
2157 DEBUG_TRIE_COMPILE_MORE_r(
2158 PerlIO_printf( Perl_debug_log,
2159 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2160 (int)depth * 2 + 2,"",
2161 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2164 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2167 } /* end table compress */
2169 DEBUG_TRIE_COMPILE_MORE_r(
2170 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2171 (int)depth * 2 + 2, "",
2172 (UV)trie->statecount,
2173 (UV)trie->lasttrans)
2175 /* resize the trans array to remove unused space */
2176 trie->trans = (reg_trie_trans *)
2177 PerlMemShared_realloc( trie->trans, trie->lasttrans
2178 * sizeof(reg_trie_trans) );
2180 { /* Modify the program and insert the new TRIE node */
2181 U8 nodetype =(U8)(flags & 0xFF);
2185 regnode *optimize = NULL;
2186 #ifdef RE_TRACK_PATTERN_OFFSETS
2189 U32 mjd_nodelen = 0;
2190 #endif /* RE_TRACK_PATTERN_OFFSETS */
2191 #endif /* DEBUGGING */
2193 This means we convert either the first branch or the first Exact,
2194 depending on whether the thing following (in 'last') is a branch
2195 or not and whther first is the startbranch (ie is it a sub part of
2196 the alternation or is it the whole thing.)
2197 Assuming its a sub part we convert the EXACT otherwise we convert
2198 the whole branch sequence, including the first.
2200 /* Find the node we are going to overwrite */
2201 if ( first != startbranch || OP( last ) == BRANCH ) {
2202 /* branch sub-chain */
2203 NEXT_OFF( first ) = (U16)(last - first);
2204 #ifdef RE_TRACK_PATTERN_OFFSETS
2206 mjd_offset= Node_Offset((convert));
2207 mjd_nodelen= Node_Length((convert));
2210 /* whole branch chain */
2212 #ifdef RE_TRACK_PATTERN_OFFSETS
2215 const regnode *nop = NEXTOPER( convert );
2216 mjd_offset= Node_Offset((nop));
2217 mjd_nodelen= Node_Length((nop));
2221 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2222 (int)depth * 2 + 2, "",
2223 (UV)mjd_offset, (UV)mjd_nodelen)
2226 /* But first we check to see if there is a common prefix we can
2227 split out as an EXACT and put in front of the TRIE node. */
2228 trie->startstate= 1;
2229 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2231 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2235 const U32 base = trie->states[ state ].trans.base;
2237 if ( trie->states[state].wordnum )
2240 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2241 if ( ( base + ofs >= trie->uniquecharcount ) &&
2242 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2243 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2245 if ( ++count > 1 ) {
2246 SV **tmp = av_fetch( revcharmap, ofs, 0);
2247 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2248 if ( state == 1 ) break;
2250 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2252 PerlIO_printf(Perl_debug_log,
2253 "%*sNew Start State=%"UVuf" Class: [",
2254 (int)depth * 2 + 2, "",
2257 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2258 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2260 TRIE_BITMAP_SET(trie,*ch);
2262 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2264 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2268 TRIE_BITMAP_SET(trie,*ch);
2270 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2271 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2277 SV **tmp = av_fetch( revcharmap, idx, 0);
2279 char *ch = SvPV( *tmp, len );
2281 SV *sv=sv_newmortal();
2282 PerlIO_printf( Perl_debug_log,
2283 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2284 (int)depth * 2 + 2, "",
2286 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2287 PL_colors[0], PL_colors[1],
2288 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2289 PERL_PV_ESCAPE_FIRSTCHAR
2294 OP( convert ) = nodetype;
2295 str=STRING(convert);
2298 STR_LEN(convert) += len;
2304 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2309 trie->prefixlen = (state-1);
2311 regnode *n = convert+NODE_SZ_STR(convert);
2312 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2313 trie->startstate = state;
2314 trie->minlen -= (state - 1);
2315 trie->maxlen -= (state - 1);
2317 /* At least the UNICOS C compiler choked on this
2318 * being argument to DEBUG_r(), so let's just have
2321 #ifdef PERL_EXT_RE_BUILD
2327 regnode *fix = convert;
2328 U32 word = trie->wordcount;
2330 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2331 while( ++fix < n ) {
2332 Set_Node_Offset_Length(fix, 0, 0);
2335 SV ** const tmp = av_fetch( trie_words, word, 0 );
2337 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2338 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2340 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2348 NEXT_OFF(convert) = (U16)(tail - convert);
2349 DEBUG_r(optimize= n);
2355 if ( trie->maxlen ) {
2356 NEXT_OFF( convert ) = (U16)(tail - convert);
2357 ARG_SET( convert, data_slot );
2358 /* Store the offset to the first unabsorbed branch in
2359 jump[0], which is otherwise unused by the jump logic.
2360 We use this when dumping a trie and during optimisation. */
2362 trie->jump[0] = (U16)(nextbranch - convert);
2364 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2365 * and there is a bitmap
2366 * and the first "jump target" node we found leaves enough room
2367 * then convert the TRIE node into a TRIEC node, with the bitmap
2368 * embedded inline in the opcode - this is hypothetically faster.
2370 if ( !trie->states[trie->startstate].wordnum
2372 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2374 OP( convert ) = TRIEC;
2375 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2376 PerlMemShared_free(trie->bitmap);
2379 OP( convert ) = TRIE;
2381 /* store the type in the flags */
2382 convert->flags = nodetype;
2386 + regarglen[ OP( convert ) ];
2388 /* XXX We really should free up the resource in trie now,
2389 as we won't use them - (which resources?) dmq */
2391 /* needed for dumping*/
2392 DEBUG_r(if (optimize) {
2393 regnode *opt = convert;
2395 while ( ++opt < optimize) {
2396 Set_Node_Offset_Length(opt,0,0);
2399 Try to clean up some of the debris left after the
2402 while( optimize < jumper ) {
2403 mjd_nodelen += Node_Length((optimize));
2404 OP( optimize ) = OPTIMIZED;
2405 Set_Node_Offset_Length(optimize,0,0);
2408 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2410 } /* end node insert */
2411 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2413 /* Finish populating the prev field of the wordinfo array. Walk back
2414 * from each accept state until we find another accept state, and if
2415 * so, point the first word's .prev field at the second word. If the
2416 * second already has a .prev field set, stop now. This will be the
2417 * case either if we've already processed that word's accept state,
2418 * or that state had multiple words, and the overspill words were
2419 * already linked up earlier.
2426 for (word=1; word <= trie->wordcount; word++) {
2428 if (trie->wordinfo[word].prev)
2430 state = trie->wordinfo[word].accept;
2432 state = prev_states[state];
2435 prev = trie->states[state].wordnum;
2439 trie->wordinfo[word].prev = prev;
2441 Safefree(prev_states);
2445 /* and now dump out the compressed format */
2446 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2448 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2450 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2451 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2453 SvREFCNT_dec(revcharmap);
2457 : trie->startstate>1
2463 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2465 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2467 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2468 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2471 We find the fail state for each state in the trie, this state is the longest proper
2472 suffix of the current state's 'word' that is also a proper prefix of another word in our
2473 trie. State 1 represents the word '' and is thus the default fail state. This allows
2474 the DFA not to have to restart after its tried and failed a word at a given point, it
2475 simply continues as though it had been matching the other word in the first place.
2477 'abcdgu'=~/abcdefg|cdgu/
2478 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2479 fail, which would bring us to the state representing 'd' in the second word where we would
2480 try 'g' and succeed, proceeding to match 'cdgu'.
2482 /* add a fail transition */
2483 const U32 trie_offset = ARG(source);
2484 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2486 const U32 ucharcount = trie->uniquecharcount;
2487 const U32 numstates = trie->statecount;
2488 const U32 ubound = trie->lasttrans + ucharcount;
2492 U32 base = trie->states[ 1 ].trans.base;
2495 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2496 GET_RE_DEBUG_FLAGS_DECL;
2498 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2500 PERL_UNUSED_ARG(depth);
2504 ARG_SET( stclass, data_slot );
2505 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2506 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2507 aho->trie=trie_offset;
2508 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2509 Copy( trie->states, aho->states, numstates, reg_trie_state );
2510 Newxz( q, numstates, U32);
2511 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2514 /* initialize fail[0..1] to be 1 so that we always have
2515 a valid final fail state */
2516 fail[ 0 ] = fail[ 1 ] = 1;
2518 for ( charid = 0; charid < ucharcount ; charid++ ) {
2519 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2521 q[ q_write ] = newstate;
2522 /* set to point at the root */
2523 fail[ q[ q_write++ ] ]=1;
2526 while ( q_read < q_write) {
2527 const U32 cur = q[ q_read++ % numstates ];
2528 base = trie->states[ cur ].trans.base;
2530 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2531 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2533 U32 fail_state = cur;
2536 fail_state = fail[ fail_state ];
2537 fail_base = aho->states[ fail_state ].trans.base;
2538 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2540 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2541 fail[ ch_state ] = fail_state;
2542 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2544 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2546 q[ q_write++ % numstates] = ch_state;
2550 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2551 when we fail in state 1, this allows us to use the
2552 charclass scan to find a valid start char. This is based on the principle
2553 that theres a good chance the string being searched contains lots of stuff
2554 that cant be a start char.
2556 fail[ 0 ] = fail[ 1 ] = 0;
2557 DEBUG_TRIE_COMPILE_r({
2558 PerlIO_printf(Perl_debug_log,
2559 "%*sStclass Failtable (%"UVuf" states): 0",
2560 (int)(depth * 2), "", (UV)numstates
2562 for( q_read=1; q_read<numstates; q_read++ ) {
2563 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2565 PerlIO_printf(Perl_debug_log, "\n");
2568 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2573 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2574 * These need to be revisited when a newer toolchain becomes available.
2576 #if defined(__sparc64__) && defined(__GNUC__)
2577 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2578 # undef SPARC64_GCC_WORKAROUND
2579 # define SPARC64_GCC_WORKAROUND 1
2583 #define DEBUG_PEEP(str,scan,depth) \
2584 DEBUG_OPTIMISE_r({if (scan){ \
2585 SV * const mysv=sv_newmortal(); \
2586 regnode *Next = regnext(scan); \
2587 regprop(RExC_rx, mysv, scan); \
2588 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2589 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2590 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2594 /* The below joins as many adjacent EXACTish nodes as possible into a single
2595 * one. The regop may be changed if the node(s) contain certain sequences that
2596 * require special handling. The joining is only done if:
2597 * 1) there is room in the current conglomerated node to entirely contain the
2599 * 2) they are the exact same node type
2601 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2602 * these get optimized out
2604 * If a node is to match under /i (folded), the number of characters it matches
2605 * can be different than its character length if it contains a multi-character
2606 * fold. *min_subtract is set to the total delta of the input nodes.
2608 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2609 * and contains LATIN SMALL LETTER SHARP S
2611 * This is as good a place as any to discuss the design of handling these
2612 * multi-character fold sequences. It's been wrong in Perl for a very long
2613 * time. There are three code points in Unicode whose multi-character folds
2614 * were long ago discovered to mess things up. The previous designs for
2615 * dealing with these involved assigning a special node for them. This
2616 * approach doesn't work, as evidenced by this example:
2617 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2618 * Both these fold to "sss", but if the pattern is parsed to create a node that
2619 * would match just the \xDF, it won't be able to handle the case where a
2620 * successful match would have to cross the node's boundary. The new approach
2621 * that hopefully generally solves the problem generates an EXACTFU_SS node
2624 * It turns out that there are problems with all multi-character folds, and not
2625 * just these three. Now the code is general, for all such cases, but the
2626 * three still have some special handling. The approach taken is:
2627 * 1) This routine examines each EXACTFish node that could contain multi-
2628 * character fold sequences. It returns in *min_subtract how much to
2629 * subtract from the the actual length of the string to get a real minimum
2630 * match length; it is 0 if there are no multi-char folds. This delta is
2631 * used by the caller to adjust the min length of the match, and the delta
2632 * between min and max, so that the optimizer doesn't reject these
2633 * possibilities based on size constraints.
2634 * 2) Certain of these sequences require special handling by the trie code,
2635 * so, if found, this code changes the joined node type to special ops:
2636 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2637 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2638 * is used for an EXACTFU node that contains at least one "ss" sequence in
2639 * it. For non-UTF-8 patterns and strings, this is the only case where
2640 * there is a possible fold length change. That means that a regular
2641 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2642 * with length changes, and so can be processed faster. regexec.c takes
2643 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2644 * pre-folded by regcomp.c. This saves effort in regex matching.
2645 * However, the pre-folding isn't done for non-UTF8 patterns because the
2646 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2647 * down by forcing the pattern into UTF8 unless necessary. Also what
2648 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2649 * possibilities for the non-UTF8 patterns are quite simple, except for
2650 * the sharp s. All the ones that don't involve a UTF-8 target string are
2651 * members of a fold-pair, and arrays are set up for all of them so that
2652 * the other member of the pair can be found quickly. Code elsewhere in
2653 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2654 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2655 * described in the next item.
2656 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2657 * 'ss' or not is not knowable at compile time. It will match iff the
2658 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2659 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2660 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2661 * described in item 3). An assumption that the optimizer part of
2662 * regexec.c (probably unwittingly) makes is that a character in the
2663 * pattern corresponds to at most a single character in the target string.
2664 * (And I do mean character, and not byte here, unlike other parts of the
2665 * documentation that have never been updated to account for multibyte
2666 * Unicode.) This assumption is wrong only in this case, as all other
2667 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2668 * virtue of having this file pre-fold UTF-8 patterns. I'm
2669 * reluctant to try to change this assumption, so instead the code punts.
2670 * This routine examines EXACTF nodes for the sharp s, and returns a
2671 * boolean indicating whether or not the node is an EXACTF node that
2672 * contains a sharp s. When it is true, the caller sets a flag that later
2673 * causes the optimizer in this file to not set values for the floating
2674 * and fixed string lengths, and thus avoids the optimizer code in
2675 * regexec.c that makes the invalid assumption. Thus, there is no
2676 * optimization based on string lengths for EXACTF nodes that contain the
2677 * sharp s. This only happens for /id rules (which means the pattern
2681 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2682 if (PL_regkind[OP(scan)] == EXACT) \
2683 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2686 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2687 /* Merge several consecutive EXACTish nodes into one. */
2688 regnode *n = regnext(scan);
2690 regnode *next = scan + NODE_SZ_STR(scan);
2694 regnode *stop = scan;
2695 GET_RE_DEBUG_FLAGS_DECL;
2697 PERL_UNUSED_ARG(depth);
2700 PERL_ARGS_ASSERT_JOIN_EXACT;
2701 #ifndef EXPERIMENTAL_INPLACESCAN
2702 PERL_UNUSED_ARG(flags);
2703 PERL_UNUSED_ARG(val);
2705 DEBUG_PEEP("join",scan,depth);
2707 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2708 * EXACT ones that are mergeable to the current one. */
2710 && (PL_regkind[OP(n)] == NOTHING
2711 || (stringok && OP(n) == OP(scan)))
2713 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2716 if (OP(n) == TAIL || n > next)
2718 if (PL_regkind[OP(n)] == NOTHING) {
2719 DEBUG_PEEP("skip:",n,depth);
2720 NEXT_OFF(scan) += NEXT_OFF(n);
2721 next = n + NODE_STEP_REGNODE;
2728 else if (stringok) {
2729 const unsigned int oldl = STR_LEN(scan);
2730 regnode * const nnext = regnext(n);
2732 /* XXX I (khw) kind of doubt that this works on platforms where
2733 * U8_MAX is above 255 because of lots of other assumptions */
2734 if (oldl + STR_LEN(n) > U8_MAX)
2737 DEBUG_PEEP("merg",n,depth);
2740 NEXT_OFF(scan) += NEXT_OFF(n);
2741 STR_LEN(scan) += STR_LEN(n);
2742 next = n + NODE_SZ_STR(n);
2743 /* Now we can overwrite *n : */
2744 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2752 #ifdef EXPERIMENTAL_INPLACESCAN
2753 if (flags && !NEXT_OFF(n)) {
2754 DEBUG_PEEP("atch", val, depth);
2755 if (reg_off_by_arg[OP(n)]) {
2756 ARG_SET(n, val - n);
2759 NEXT_OFF(n) = val - n;
2767 *has_exactf_sharp_s = FALSE;
2769 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2770 * can now analyze for sequences of problematic code points. (Prior to
2771 * this final joining, sequences could have been split over boundaries, and
2772 * hence missed). The sequences only happen in folding, hence for any
2773 * non-EXACT EXACTish node */
2774 if (OP(scan) != EXACT) {
2775 const U8 * const s0 = (U8*) STRING(scan);
2777 const U8 * const s_end = s0 + STR_LEN(scan);
2779 /* One pass is made over the node's string looking for all the
2780 * possibilities. to avoid some tests in the loop, there are two main
2781 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2785 /* Examine the string for a multi-character fold sequence. UTF-8
2786 * patterns have all characters pre-folded by the time this code is
2788 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2789 length sequence we are looking for is 2 */
2792 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2793 if (! len) { /* Not a multi-char fold: get next char */
2798 /* Nodes with 'ss' require special handling, except for EXACTFL
2799 * and EXACTFA for which there is no multi-char fold to this */
2800 if (len == 2 && *s == 's' && *(s+1) == 's'
2801 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2804 OP(scan) = EXACTFU_SS;
2807 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2808 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2809 COMBINING_DIAERESIS_UTF8
2810 COMBINING_ACUTE_ACCENT_UTF8,
2812 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2813 COMBINING_DIAERESIS_UTF8
2814 COMBINING_ACUTE_ACCENT_UTF8,
2819 /* These two folds require special handling by trie's, so
2820 * change the node type to indicate this. If EXACTFA and
2821 * EXACTFL were ever to be handled by trie's, this would
2822 * have to be changed. If this node has already been
2823 * changed to EXACTFU_SS in this loop, leave it as is. (I
2824 * (khw) think it doesn't matter in regexec.c for UTF
2825 * patterns, but no need to change it */
2826 if (OP(scan) == EXACTFU) {
2827 OP(scan) = EXACTFU_TRICKYFOLD;
2831 else { /* Here is a generic multi-char fold. */
2832 const U8* multi_end = s + len;
2834 /* Count how many characters in it. In the case of /l and
2835 * /aa, no folds which contain ASCII code points are
2836 * allowed, so check for those, and skip if found. (In
2837 * EXACTFL, no folds are allowed to any Latin1 code point,
2838 * not just ASCII. But there aren't any of these
2839 * currently, nor ever likely, so don't take the time to
2840 * test for them. The code that generates the
2841 * is_MULTI_foo() macros croaks should one actually get put
2842 * into Unicode .) */
2843 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2844 count = utf8_length(s, multi_end);
2848 while (s < multi_end) {
2851 goto next_iteration;
2861 /* The delta is how long the sequence is minus 1 (1 is how long
2862 * the character that folds to the sequence is) */
2863 *min_subtract += count - 1;
2867 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2869 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2870 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2871 * nodes can't have multi-char folds to this range (and there are
2872 * no existing ones in the upper latin1 range). In the EXACTF
2873 * case we look also for the sharp s, which can be in the final
2874 * position. Otherwise we can stop looking 1 byte earlier because
2875 * have to find at least two characters for a multi-fold */
2876 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2878 /* The below is perhaps overboard, but this allows us to save a
2879 * test each time through the loop at the expense of a mask. This
2880 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2881 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2882 * are 64. This uses an exclusive 'or' to find that bit and then
2883 * inverts it to form a mask, with just a single 0, in the bit
2884 * position where 'S' and 's' differ. */
2885 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2886 const U8 s_masked = 's' & S_or_s_mask;
2889 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2890 if (! len) { /* Not a multi-char fold. */
2891 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2893 *has_exactf_sharp_s = TRUE;
2900 && ((*s & S_or_s_mask) == s_masked)
2901 && ((*(s+1) & S_or_s_mask) == s_masked))
2904 /* EXACTF nodes need to know that the minimum length
2905 * changed so that a sharp s in the string can match this
2906 * ss in the pattern, but they remain EXACTF nodes, as they
2907 * won't match this unless the target string is is UTF-8,
2908 * which we don't know until runtime */
2909 if (OP(scan) != EXACTF) {
2910 OP(scan) = EXACTFU_SS;
2914 *min_subtract += len - 1;
2921 /* Allow dumping but overwriting the collection of skipped
2922 * ops and/or strings with fake optimized ops */
2923 n = scan + NODE_SZ_STR(scan);
2931 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2935 /* REx optimizer. Converts nodes into quicker variants "in place".
2936 Finds fixed substrings. */
2938 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2939 to the position after last scanned or to NULL. */
2941 #define INIT_AND_WITHP \
2942 assert(!and_withp); \
2943 Newx(and_withp,1,struct regnode_charclass_class); \
2944 SAVEFREEPV(and_withp)
2946 /* this is a chain of data about sub patterns we are processing that
2947 need to be handled separately/specially in study_chunk. Its so
2948 we can simulate recursion without losing state. */
2950 typedef struct scan_frame {
2951 regnode *last; /* last node to process in this frame */
2952 regnode *next; /* next node to process when last is reached */
2953 struct scan_frame *prev; /*previous frame*/
2954 I32 stop; /* what stopparen do we use */
2958 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2960 #define CASE_SYNST_FNC(nAmE) \
2962 if (flags & SCF_DO_STCLASS_AND) { \
2963 for (value = 0; value < 256; value++) \
2964 if (!is_ ## nAmE ## _cp(value)) \
2965 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2968 for (value = 0; value < 256; value++) \
2969 if (is_ ## nAmE ## _cp(value)) \
2970 ANYOF_BITMAP_SET(data->start_class, value); \
2974 if (flags & SCF_DO_STCLASS_AND) { \
2975 for (value = 0; value < 256; value++) \
2976 if (is_ ## nAmE ## _cp(value)) \
2977 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2980 for (value = 0; value < 256; value++) \
2981 if (!is_ ## nAmE ## _cp(value)) \
2982 ANYOF_BITMAP_SET(data->start_class, value); \
2989 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2990 I32 *minlenp, I32 *deltap,
2995 struct regnode_charclass_class *and_withp,
2996 U32 flags, U32 depth)
2997 /* scanp: Start here (read-write). */
2998 /* deltap: Write maxlen-minlen here. */
2999 /* last: Stop before this one. */
3000 /* data: string data about the pattern */
3001 /* stopparen: treat close N as END */
3002 /* recursed: which subroutines have we recursed into */
3003 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3006 I32 min = 0; /* There must be at least this number of characters to match */
3008 regnode *scan = *scanp, *next;
3010 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3011 int is_inf_internal = 0; /* The studied chunk is infinite */
3012 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3013 scan_data_t data_fake;
3014 SV *re_trie_maxbuff = NULL;
3015 regnode *first_non_open = scan;
3016 I32 stopmin = I32_MAX;
3017 scan_frame *frame = NULL;
3018 GET_RE_DEBUG_FLAGS_DECL;
3020 PERL_ARGS_ASSERT_STUDY_CHUNK;
3023 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3027 while (first_non_open && OP(first_non_open) == OPEN)
3028 first_non_open=regnext(first_non_open);
3033 while ( scan && OP(scan) != END && scan < last ){
3034 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3035 node length to get a real minimum (because
3036 the folded version may be shorter) */
3037 bool has_exactf_sharp_s = FALSE;
3038 /* Peephole optimizer: */
3039 DEBUG_STUDYDATA("Peep:", data,depth);
3040 DEBUG_PEEP("Peep",scan,depth);
3042 /* Its not clear to khw or hv why this is done here, and not in the
3043 * clauses that deal with EXACT nodes. khw's guess is that it's
3044 * because of a previous design */
3045 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3047 /* Follow the next-chain of the current node and optimize
3048 away all the NOTHINGs from it. */
3049 if (OP(scan) != CURLYX) {
3050 const int max = (reg_off_by_arg[OP(scan)]
3052 /* I32 may be smaller than U16 on CRAYs! */
3053 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3054 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3058 /* Skip NOTHING and LONGJMP. */
3059 while ((n = regnext(n))
3060 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3061 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3062 && off + noff < max)
3064 if (reg_off_by_arg[OP(scan)])
3067 NEXT_OFF(scan) = off;
3072 /* The principal pseudo-switch. Cannot be a switch, since we
3073 look into several different things. */
3074 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3075 || OP(scan) == IFTHEN) {
3076 next = regnext(scan);
3078 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3080 if (OP(next) == code || code == IFTHEN) {
3081 /* NOTE - There is similar code to this block below for handling
3082 TRIE nodes on a re-study. If you change stuff here check there
3084 I32 max1 = 0, min1 = I32_MAX, num = 0;
3085 struct regnode_charclass_class accum;
3086 regnode * const startbranch=scan;
3088 if (flags & SCF_DO_SUBSTR)
3089 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3090 if (flags & SCF_DO_STCLASS)
3091 cl_init_zero(pRExC_state, &accum);
3093 while (OP(scan) == code) {
3094 I32 deltanext, minnext, f = 0, fake;
3095 struct regnode_charclass_class this_class;
3098 data_fake.flags = 0;
3100 data_fake.whilem_c = data->whilem_c;
3101 data_fake.last_closep = data->last_closep;
3104 data_fake.last_closep = &fake;
3106 data_fake.pos_delta = delta;
3107 next = regnext(scan);
3108 scan = NEXTOPER(scan);
3110 scan = NEXTOPER(scan);
3111 if (flags & SCF_DO_STCLASS) {
3112 cl_init(pRExC_state, &this_class);
3113 data_fake.start_class = &this_class;
3114 f = SCF_DO_STCLASS_AND;
3116 if (flags & SCF_WHILEM_VISITED_POS)
3117 f |= SCF_WHILEM_VISITED_POS;
3119 /* we suppose the run is continuous, last=next...*/
3120 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3122 stopparen, recursed, NULL, f,depth+1);
3125 if (max1 < minnext + deltanext)
3126 max1 = minnext + deltanext;
3127 if (deltanext == I32_MAX)
3128 is_inf = is_inf_internal = 1;
3130 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3132 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3133 if ( stopmin > minnext)
3134 stopmin = min + min1;
3135 flags &= ~SCF_DO_SUBSTR;
3137 data->flags |= SCF_SEEN_ACCEPT;
3140 if (data_fake.flags & SF_HAS_EVAL)
3141 data->flags |= SF_HAS_EVAL;
3142 data->whilem_c = data_fake.whilem_c;
3144 if (flags & SCF_DO_STCLASS)
3145 cl_or(pRExC_state, &accum, &this_class);
3147 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3149 if (flags & SCF_DO_SUBSTR) {
3150 data->pos_min += min1;
3151 data->pos_delta += max1 - min1;
3152 if (max1 != min1 || is_inf)
3153 data->longest = &(data->longest_float);
3156 delta += max1 - min1;
3157 if (flags & SCF_DO_STCLASS_OR) {
3158 cl_or(pRExC_state, data->start_class, &accum);
3160 cl_and(data->start_class, and_withp);
3161 flags &= ~SCF_DO_STCLASS;
3164 else if (flags & SCF_DO_STCLASS_AND) {
3166 cl_and(data->start_class, &accum);
3167 flags &= ~SCF_DO_STCLASS;
3170 /* Switch to OR mode: cache the old value of
3171 * data->start_class */
3173 StructCopy(data->start_class, and_withp,
3174 struct regnode_charclass_class);
3175 flags &= ~SCF_DO_STCLASS_AND;
3176 StructCopy(&accum, data->start_class,
3177 struct regnode_charclass_class);
3178 flags |= SCF_DO_STCLASS_OR;
3179 data->start_class->flags |= ANYOF_EOS;
3183 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3186 Assuming this was/is a branch we are dealing with: 'scan' now
3187 points at the item that follows the branch sequence, whatever
3188 it is. We now start at the beginning of the sequence and look
3195 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3197 If we can find such a subsequence we need to turn the first
3198 element into a trie and then add the subsequent branch exact
3199 strings to the trie.
3203 1. patterns where the whole set of branches can be converted.
3205 2. patterns where only a subset can be converted.
3207 In case 1 we can replace the whole set with a single regop
3208 for the trie. In case 2 we need to keep the start and end
3211 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3212 becomes BRANCH TRIE; BRANCH X;
3214 There is an additional case, that being where there is a
3215 common prefix, which gets split out into an EXACT like node
3216 preceding the TRIE node.
3218 If x(1..n)==tail then we can do a simple trie, if not we make
3219 a "jump" trie, such that when we match the appropriate word
3220 we "jump" to the appropriate tail node. Essentially we turn
3221 a nested if into a case structure of sorts.
3226 if (!re_trie_maxbuff) {
3227 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3228 if (!SvIOK(re_trie_maxbuff))
3229 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3231 if ( SvIV(re_trie_maxbuff)>=0 ) {
3233 regnode *first = (regnode *)NULL;
3234 regnode *last = (regnode *)NULL;
3235 regnode *tail = scan;
3240 SV * const mysv = sv_newmortal(); /* for dumping */
3242 /* var tail is used because there may be a TAIL
3243 regop in the way. Ie, the exacts will point to the
3244 thing following the TAIL, but the last branch will
3245 point at the TAIL. So we advance tail. If we
3246 have nested (?:) we may have to move through several
3250 while ( OP( tail ) == TAIL ) {
3251 /* this is the TAIL generated by (?:) */
3252 tail = regnext( tail );
3256 DEBUG_TRIE_COMPILE_r({
3257 regprop(RExC_rx, mysv, tail );
3258 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3259 (int)depth * 2 + 2, "",
3260 "Looking for TRIE'able sequences. Tail node is: ",
3261 SvPV_nolen_const( mysv )
3267 Step through the branches
3268 cur represents each branch,
3269 noper is the first thing to be matched as part of that branch
3270 noper_next is the regnext() of that node.
3272 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3273 via a "jump trie" but we also support building with NOJUMPTRIE,
3274 which restricts the trie logic to structures like /FOO|BAR/.
3276 If noper is a trieable nodetype then the branch is a possible optimization
3277 target. If we are building under NOJUMPTRIE then we require that noper_next
3278 is the same as scan (our current position in the regex program).
3280 Once we have two or more consecutive such branches we can create a
3281 trie of the EXACT's contents and stitch it in place into the program.
3283 If the sequence represents all of the branches in the alternation we
3284 replace the entire thing with a single TRIE node.
3286 Otherwise when it is a subsequence we need to stitch it in place and
3287 replace only the relevant branches. This means the first branch has
3288 to remain as it is used by the alternation logic, and its next pointer,
3289 and needs to be repointed at the item on the branch chain following
3290 the last branch we have optimized away.
3292 This could be either a BRANCH, in which case the subsequence is internal,
3293 or it could be the item following the branch sequence in which case the
3294 subsequence is at the end (which does not necessarily mean the first node
3295 is the start of the alternation).
3297 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3300 ----------------+-----------
3304 EXACTFU_SS | EXACTFU
3305 EXACTFU_TRICKYFOLD | EXACTFU
3310 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3311 ( EXACT == (X) ) ? EXACT : \
3312 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3315 /* dont use tail as the end marker for this traverse */
3316 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3317 regnode * const noper = NEXTOPER( cur );
3318 U8 noper_type = OP( noper );
3319 U8 noper_trietype = TRIE_TYPE( noper_type );
3320 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3321 regnode * const noper_next = regnext( noper );
3322 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3323 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3326 DEBUG_TRIE_COMPILE_r({
3327 regprop(RExC_rx, mysv, cur);
3328 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3329 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3331 regprop(RExC_rx, mysv, noper);
3332 PerlIO_printf( Perl_debug_log, " -> %s",
3333 SvPV_nolen_const(mysv));
3336 regprop(RExC_rx, mysv, noper_next );
3337 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3338 SvPV_nolen_const(mysv));
3340 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3341 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3342 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3346 /* Is noper a trieable nodetype that can be merged with the
3347 * current trie (if there is one)? */
3351 ( noper_trietype == NOTHING)
3352 || ( trietype == NOTHING )
3353 || ( trietype == noper_trietype )
3356 && noper_next == tail
3360 /* Handle mergable triable node
3361 * Either we are the first node in a new trieable sequence,
3362 * in which case we do some bookkeeping, otherwise we update
3363 * the end pointer. */
3366 if ( noper_trietype == NOTHING ) {
3367 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3368 regnode * const noper_next = regnext( noper );
3369 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3370 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3373 if ( noper_next_trietype ) {
3374 trietype = noper_next_trietype;
3375 } else if (noper_next_type) {
3376 /* a NOTHING regop is 1 regop wide. We need at least two
3377 * for a trie so we can't merge this in */
3381 trietype = noper_trietype;
3384 if ( trietype == NOTHING )
3385 trietype = noper_trietype;
3390 } /* end handle mergable triable node */
3392 /* handle unmergable node -
3393 * noper may either be a triable node which can not be tried
3394 * together with the current trie, or a non triable node */
3396 /* If last is set and trietype is not NOTHING then we have found
3397 * at least two triable branch sequences in a row of a similar
3398 * trietype so we can turn them into a trie. If/when we
3399 * allow NOTHING to start a trie sequence this condition will be
3400 * required, and it isn't expensive so we leave it in for now. */
3401 if ( trietype && trietype != NOTHING )
3402 make_trie( pRExC_state,
3403 startbranch, first, cur, tail, count,
3404 trietype, depth+1 );
3405 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3409 && noper_next == tail
3412 /* noper is triable, so we can start a new trie sequence */
3415 trietype = noper_trietype;
3417 /* if we already saw a first but the current node is not triable then we have
3418 * to reset the first information. */
3423 } /* end handle unmergable node */
3424 } /* loop over branches */
3425 DEBUG_TRIE_COMPILE_r({
3426 regprop(RExC_rx, mysv, cur);
3427 PerlIO_printf( Perl_debug_log,
3428 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3429 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3432 if ( last && trietype ) {
3433 if ( trietype != NOTHING ) {
3434 /* the last branch of the sequence was part of a trie,
3435 * so we have to construct it here outside of the loop
3437 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3438 #ifdef TRIE_STUDY_OPT
3439 if ( ((made == MADE_EXACT_TRIE &&
3440 startbranch == first)
3441 || ( first_non_open == first )) &&
3443 flags |= SCF_TRIE_RESTUDY;
3444 if ( startbranch == first
3447 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3452 /* at this point we know whatever we have is a NOTHING sequence/branch
3453 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3455 if ( startbranch == first ) {
3457 /* the entire thing is a NOTHING sequence, something like this:
3458 * (?:|) So we can turn it into a plain NOTHING op. */
3459 DEBUG_TRIE_COMPILE_r({
3460 regprop(RExC_rx, mysv, cur);
3461 PerlIO_printf( Perl_debug_log,
3462 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3463 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3466 OP(startbranch)= NOTHING;
3467 NEXT_OFF(startbranch)= tail - startbranch;
3468 for ( opt= startbranch + 1; opt < tail ; opt++ )
3472 } /* end if ( last) */
3473 } /* TRIE_MAXBUF is non zero */
3478 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3479 scan = NEXTOPER(NEXTOPER(scan));
3480 } else /* single branch is optimized. */
3481 scan = NEXTOPER(scan);
3483 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3484 scan_frame *newframe = NULL;
3489 if (OP(scan) != SUSPEND) {
3490 /* set the pointer */
3491 if (OP(scan) == GOSUB) {
3493 RExC_recurse[ARG2L(scan)] = scan;
3494 start = RExC_open_parens[paren-1];
3495 end = RExC_close_parens[paren-1];
3498 start = RExC_rxi->program + 1;
3502 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3503 SAVEFREEPV(recursed);
3505 if (!PAREN_TEST(recursed,paren+1)) {
3506 PAREN_SET(recursed,paren+1);
3507 Newx(newframe,1,scan_frame);
3509 if (flags & SCF_DO_SUBSTR) {
3510 SCAN_COMMIT(pRExC_state,data,minlenp);
3511 data->longest = &(data->longest_float);
3513 is_inf = is_inf_internal = 1;
3514 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3515 cl_anything(pRExC_state, data->start_class);
3516 flags &= ~SCF_DO_STCLASS;
3519 Newx(newframe,1,scan_frame);
3522 end = regnext(scan);
3527 SAVEFREEPV(newframe);
3528 newframe->next = regnext(scan);
3529 newframe->last = last;
3530 newframe->stop = stopparen;
3531 newframe->prev = frame;
3541 else if (OP(scan) == EXACT) {
3542 I32 l = STR_LEN(scan);
3545 const U8 * const s = (U8*)STRING(scan);
3546 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3547 l = utf8_length(s, s + l);
3549 uc = *((U8*)STRING(scan));
3552 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3553 /* The code below prefers earlier match for fixed
3554 offset, later match for variable offset. */
3555 if (data->last_end == -1) { /* Update the start info. */
3556 data->last_start_min = data->pos_min;
3557 data->last_start_max = is_inf
3558 ? I32_MAX : data->pos_min + data->pos_delta;
3560 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3562 SvUTF8_on(data->last_found);
3564 SV * const sv = data->last_found;
3565 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3566 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3567 if (mg && mg->mg_len >= 0)
3568 mg->mg_len += utf8_length((U8*)STRING(scan),
3569 (U8*)STRING(scan)+STR_LEN(scan));
3571 data->last_end = data->pos_min + l;
3572 data->pos_min += l; /* As in the first entry. */
3573 data->flags &= ~SF_BEFORE_EOL;
3575 if (flags & SCF_DO_STCLASS_AND) {
3576 /* Check whether it is compatible with what we know already! */
3580 /* If compatible, we or it in below. It is compatible if is
3581 * in the bitmp and either 1) its bit or its fold is set, or 2)
3582 * it's for a locale. Even if there isn't unicode semantics
3583 * here, at runtime there may be because of matching against a
3584 * utf8 string, so accept a possible false positive for
3585 * latin1-range folds */
3587 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3588 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3589 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3590 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3595 ANYOF_CLASS_ZERO(data->start_class);
3596 ANYOF_BITMAP_ZERO(data->start_class);
3598 ANYOF_BITMAP_SET(data->start_class, uc);
3599 else if (uc >= 0x100) {
3602 /* Some Unicode code points fold to the Latin1 range; as
3603 * XXX temporary code, instead of figuring out if this is
3604 * one, just assume it is and set all the start class bits
3605 * that could be some such above 255 code point's fold
3606 * which will generate fals positives. As the code
3607 * elsewhere that does compute the fold settles down, it
3608 * can be extracted out and re-used here */
3609 for (i = 0; i < 256; i++){
3610 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3611 ANYOF_BITMAP_SET(data->start_class, i);
3615 data->start_class->flags &= ~ANYOF_EOS;
3617 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3619 else if (flags & SCF_DO_STCLASS_OR) {
3620 /* false positive possible if the class is case-folded */
3622 ANYOF_BITMAP_SET(data->start_class, uc);
3624 data->start_class->flags |= ANYOF_UNICODE_ALL;
3625 data->start_class->flags &= ~ANYOF_EOS;
3626 cl_and(data->start_class, and_withp);
3628 flags &= ~SCF_DO_STCLASS;
3630 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3631 I32 l = STR_LEN(scan);
3632 UV uc = *((U8*)STRING(scan));
3634 /* Search for fixed substrings supports EXACT only. */
3635 if (flags & SCF_DO_SUBSTR) {
3637 SCAN_COMMIT(pRExC_state, data, minlenp);
3640 const U8 * const s = (U8 *)STRING(scan);
3641 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3642 l = utf8_length(s, s + l);
3644 if (has_exactf_sharp_s) {
3645 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3647 min += l - min_subtract;
3649 delta += min_subtract;
3650 if (flags & SCF_DO_SUBSTR) {
3651 data->pos_min += l - min_subtract;
3652 if (data->pos_min < 0) {
3655 data->pos_delta += min_subtract;
3657 data->longest = &(data->longest_float);
3660 if (flags & SCF_DO_STCLASS_AND) {
3661 /* Check whether it is compatible with what we know already! */
3664 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3665 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3666 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3670 ANYOF_CLASS_ZERO(data->start_class);
3671 ANYOF_BITMAP_ZERO(data->start_class);
3673 ANYOF_BITMAP_SET(data->start_class, uc);
3674 data->start_class->flags &= ~ANYOF_EOS;
3675 if (OP(scan) == EXACTFL) {
3676 /* XXX This set is probably no longer necessary, and
3677 * probably wrong as LOCALE now is on in the initial
3679 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3683 /* Also set the other member of the fold pair. In case
3684 * that unicode semantics is called for at runtime, use
3685 * the full latin1 fold. (Can't do this for locale,
3686 * because not known until runtime) */
3687 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3689 /* All other (EXACTFL handled above) folds except under
3690 * /iaa that include s, S, and sharp_s also may include
3692 if (OP(scan) != EXACTFA) {
3693 if (uc == 's' || uc == 'S') {
3694 ANYOF_BITMAP_SET(data->start_class,
3695 LATIN_SMALL_LETTER_SHARP_S);
3697 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3698 ANYOF_BITMAP_SET(data->start_class, 's');
3699 ANYOF_BITMAP_SET(data->start_class, 'S');
3704 else if (uc >= 0x100) {
3706 for (i = 0; i < 256; i++){
3707 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3708 ANYOF_BITMAP_SET(data->start_class, i);
3713 else if (flags & SCF_DO_STCLASS_OR) {
3714 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3715 /* false positive possible if the class is case-folded.
3716 Assume that the locale settings are the same... */
3718 ANYOF_BITMAP_SET(data->start_class, uc);
3719 if (OP(scan) != EXACTFL) {
3721 /* And set the other member of the fold pair, but
3722 * can't do that in locale because not known until
3724 ANYOF_BITMAP_SET(data->start_class,
3725 PL_fold_latin1[uc]);
3727 /* All folds except under /iaa that include s, S,
3728 * and sharp_s also may include the others */
3729 if (OP(scan) != EXACTFA) {
3730 if (uc == 's' || uc == 'S') {
3731 ANYOF_BITMAP_SET(data->start_class,
3732 LATIN_SMALL_LETTER_SHARP_S);
3734 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3735 ANYOF_BITMAP_SET(data->start_class, 's');
3736 ANYOF_BITMAP_SET(data->start_class, 'S');
3741 data->start_class->flags &= ~ANYOF_EOS;
3743 cl_and(data->start_class, and_withp);
3745 flags &= ~SCF_DO_STCLASS;
3747 else if (REGNODE_VARIES(OP(scan))) {
3748 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3749 I32 f = flags, pos_before = 0;
3750 regnode * const oscan = scan;
3751 struct regnode_charclass_class this_class;
3752 struct regnode_charclass_class *oclass = NULL;
3753 I32 next_is_eval = 0;
3755 switch (PL_regkind[OP(scan)]) {
3756 case WHILEM: /* End of (?:...)* . */
3757 scan = NEXTOPER(scan);
3760 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3761 next = NEXTOPER(scan);
3762 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3764 maxcount = REG_INFTY;
3765 next = regnext(scan);
3766 scan = NEXTOPER(scan);
3770 if (flags & SCF_DO_SUBSTR)
3775 if (flags & SCF_DO_STCLASS) {
3777 maxcount = REG_INFTY;
3778 next = regnext(scan);
3779 scan = NEXTOPER(scan);
3782 is_inf = is_inf_internal = 1;
3783 scan = regnext(scan);
3784 if (flags & SCF_DO_SUBSTR) {
3785 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3786 data->longest = &(data->longest_float);
3788 goto optimize_curly_tail;
3790 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3791 && (scan->flags == stopparen))
3796 mincount = ARG1(scan);
3797 maxcount = ARG2(scan);
3799 next = regnext(scan);
3800 if (OP(scan) == CURLYX) {
3801 I32 lp = (data ? *(data->last_closep) : 0);
3802 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3804 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3805 next_is_eval = (OP(scan) == EVAL);
3807 if (flags & SCF_DO_SUBSTR) {
3808 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3809 pos_before = data->pos_min;
3813 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3815 data->flags |= SF_IS_INF;
3817 if (flags & SCF_DO_STCLASS) {
3818 cl_init(pRExC_state, &this_class);
3819 oclass = data->start_class;
3820 data->start_class = &this_class;
3821 f |= SCF_DO_STCLASS_AND;
3822 f &= ~SCF_DO_STCLASS_OR;
3824 /* Exclude from super-linear cache processing any {n,m}
3825 regops for which the combination of input pos and regex
3826 pos is not enough information to determine if a match
3829 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3830 regex pos at the \s*, the prospects for a match depend not
3831 only on the input position but also on how many (bar\s*)
3832 repeats into the {4,8} we are. */
3833 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3834 f &= ~SCF_WHILEM_VISITED_POS;
3836 /* This will finish on WHILEM, setting scan, or on NULL: */
3837 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3838 last, data, stopparen, recursed, NULL,
3840 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3842 if (flags & SCF_DO_STCLASS)
3843 data->start_class = oclass;
3844 if (mincount == 0 || minnext == 0) {
3845 if (flags & SCF_DO_STCLASS_OR) {
3846 cl_or(pRExC_state, data->start_class, &this_class);
3848 else if (flags & SCF_DO_STCLASS_AND) {
3849 /* Switch to OR mode: cache the old value of
3850 * data->start_class */
3852 StructCopy(data->start_class, and_withp,
3853 struct regnode_charclass_class);
3854 flags &= ~SCF_DO_STCLASS_AND;
3855 StructCopy(&this_class, data->start_class,
3856 struct regnode_charclass_class);
3857 flags |= SCF_DO_STCLASS_OR;
3858 data->start_class->flags |= ANYOF_EOS;
3860 } else { /* Non-zero len */
3861 if (flags & SCF_DO_STCLASS_OR) {
3862 cl_or(pRExC_state, data->start_class, &this_class);
3863 cl_and(data->start_class, and_withp);
3865 else if (flags & SCF_DO_STCLASS_AND)
3866 cl_and(data->start_class, &this_class);
3867 flags &= ~SCF_DO_STCLASS;
3869 if (!scan) /* It was not CURLYX, but CURLY. */
3871 if ( /* ? quantifier ok, except for (?{ ... }) */
3872 (next_is_eval || !(mincount == 0 && maxcount == 1))
3873 && (minnext == 0) && (deltanext == 0)
3874 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3875 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3877 ckWARNreg(RExC_parse,
3878 "Quantifier unexpected on zero-length expression");
3881 min += minnext * mincount;
3882 is_inf_internal |= ((maxcount == REG_INFTY
3883 && (minnext + deltanext) > 0)
3884 || deltanext == I32_MAX);
3885 is_inf |= is_inf_internal;
3886 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3888 /* Try powerful optimization CURLYX => CURLYN. */
3889 if ( OP(oscan) == CURLYX && data
3890 && data->flags & SF_IN_PAR
3891 && !(data->flags & SF_HAS_EVAL)
3892 && !deltanext && minnext == 1 ) {
3893 /* Try to optimize to CURLYN. */
3894 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3895 regnode * const nxt1 = nxt;
3902 if (!REGNODE_SIMPLE(OP(nxt))
3903 && !(PL_regkind[OP(nxt)] == EXACT
3904 && STR_LEN(nxt) == 1))
3910 if (OP(nxt) != CLOSE)
3912 if (RExC_open_parens) {
3913 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3914 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3916 /* Now we know that nxt2 is the only contents: */
3917 oscan->flags = (U8)ARG(nxt);
3919 OP(nxt1) = NOTHING; /* was OPEN. */
3922 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3923 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3924 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3925 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3926 OP(nxt + 1) = OPTIMIZED; /* was count. */
3927 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3932 /* Try optimization CURLYX => CURLYM. */
3933 if ( OP(oscan) == CURLYX && data
3934 && !(data->flags & SF_HAS_PAR)
3935 && !(data->flags & SF_HAS_EVAL)
3936 && !deltanext /* atom is fixed width */
3937 && minnext != 0 /* CURLYM can't handle zero width */
3938 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3940 /* XXXX How to optimize if data == 0? */
3941 /* Optimize to a simpler form. */
3942 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3946 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3947 && (OP(nxt2) != WHILEM))
3949 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3950 /* Need to optimize away parenths. */
3951 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3952 /* Set the parenth number. */
3953 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3955 oscan->flags = (U8)ARG(nxt);
3956 if (RExC_open_parens) {
3957 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3958 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3960 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3961 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3964 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3965 OP(nxt + 1) = OPTIMIZED; /* was count. */
3966 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3967 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3970 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3971 regnode *nnxt = regnext(nxt1);
3973 if (reg_off_by_arg[OP(nxt1)])
3974 ARG_SET(nxt1, nxt2 - nxt1);
3975 else if (nxt2 - nxt1 < U16_MAX)
3976 NEXT_OFF(nxt1) = nxt2 - nxt1;
3978 OP(nxt) = NOTHING; /* Cannot beautify */
3983 /* Optimize again: */
3984 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3985 NULL, stopparen, recursed, NULL, 0,depth+1);
3990 else if ((OP(oscan) == CURLYX)
3991 && (flags & SCF_WHILEM_VISITED_POS)
3992 /* See the comment on a similar expression above.
3993 However, this time it's not a subexpression
3994 we care about, but the expression itself. */
3995 && (maxcount == REG_INFTY)
3996 && data && ++data->whilem_c < 16) {
3997 /* This stays as CURLYX, we can put the count/of pair. */
3998 /* Find WHILEM (as in regexec.c) */
3999 regnode *nxt = oscan + NEXT_OFF(oscan);
4001 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4003 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4004 | (RExC_whilem_seen << 4)); /* On WHILEM */
4006 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4008 if (flags & SCF_DO_SUBSTR) {
4009 SV *last_str = NULL;
4010 int counted = mincount != 0;
4012 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4013 #if defined(SPARC64_GCC_WORKAROUND)
4016 const char *s = NULL;
4019 if (pos_before >= data->last_start_min)
4022 b = data->last_start_min;
4025 s = SvPV_const(data->last_found, l);
4026 old = b - data->last_start_min;
4029 I32 b = pos_before >= data->last_start_min
4030 ? pos_before : data->last_start_min;
4032 const char * const s = SvPV_const(data->last_found, l);
4033 I32 old = b - data->last_start_min;
4037 old = utf8_hop((U8*)s, old) - (U8*)s;
4039 /* Get the added string: */
4040 last_str = newSVpvn_utf8(s + old, l, UTF);
4041 if (deltanext == 0 && pos_before == b) {
4042 /* What was added is a constant string */
4044 SvGROW(last_str, (mincount * l) + 1);
4045 repeatcpy(SvPVX(last_str) + l,
4046 SvPVX_const(last_str), l, mincount - 1);
4047 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4048 /* Add additional parts. */
4049 SvCUR_set(data->last_found,
4050 SvCUR(data->last_found) - l);
4051 sv_catsv(data->last_found, last_str);
4053 SV * sv = data->last_found;
4055 SvUTF8(sv) && SvMAGICAL(sv) ?
4056 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4057 if (mg && mg->mg_len >= 0)
4058 mg->mg_len += CHR_SVLEN(last_str) - l;
4060 data->last_end += l * (mincount - 1);
4063 /* start offset must point into the last copy */
4064 data->last_start_min += minnext * (mincount - 1);
4065 data->last_start_max += is_inf ? I32_MAX
4066 : (maxcount - 1) * (minnext + data->pos_delta);
4069 /* It is counted once already... */
4070 data->pos_min += minnext * (mincount - counted);
4071 data->pos_delta += - counted * deltanext +
4072 (minnext + deltanext) * maxcount - minnext * mincount;
4073 if (mincount != maxcount) {
4074 /* Cannot extend fixed substrings found inside
4076 SCAN_COMMIT(pRExC_state,data,minlenp);
4077 if (mincount && last_str) {
4078 SV * const sv = data->last_found;
4079 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4080 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4084 sv_setsv(sv, last_str);
4085 data->last_end = data->pos_min;
4086 data->last_start_min =
4087 data->pos_min - CHR_SVLEN(last_str);
4088 data->last_start_max = is_inf
4090 : data->pos_min + data->pos_delta
4091 - CHR_SVLEN(last_str);
4093 data->longest = &(data->longest_float);
4095 SvREFCNT_dec(last_str);
4097 if (data && (fl & SF_HAS_EVAL))
4098 data->flags |= SF_HAS_EVAL;
4099 optimize_curly_tail:
4100 if (OP(oscan) != CURLYX) {
4101 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4103 NEXT_OFF(oscan) += NEXT_OFF(next);
4106 default: /* REF, ANYOFV, and CLUMP only? */
4107 if (flags & SCF_DO_SUBSTR) {
4108 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4109 data->longest = &(data->longest_float);
4111 is_inf = is_inf_internal = 1;
4112 if (flags & SCF_DO_STCLASS_OR)
4113 cl_anything(pRExC_state, data->start_class);
4114 flags &= ~SCF_DO_STCLASS;
4118 else if (OP(scan) == LNBREAK) {
4119 if (flags & SCF_DO_STCLASS) {
4121 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4122 if (flags & SCF_DO_STCLASS_AND) {
4123 for (value = 0; value < 256; value++)
4124 if (!is_VERTWS_cp(value))
4125 ANYOF_BITMAP_CLEAR(data->start_class, value);
4128 for (value = 0; value < 256; value++)
4129 if (is_VERTWS_cp(value))
4130 ANYOF_BITMAP_SET(data->start_class, value);
4132 if (flags & SCF_DO_STCLASS_OR)
4133 cl_and(data->start_class, and_withp);
4134 flags &= ~SCF_DO_STCLASS;
4137 delta++; /* Because of the 2 char string cr-lf */
4138 if (flags & SCF_DO_SUBSTR) {
4139 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4141 data->pos_delta += 1;
4142 data->longest = &(data->longest_float);
4145 else if (REGNODE_SIMPLE(OP(scan))) {
4148 if (flags & SCF_DO_SUBSTR) {
4149 SCAN_COMMIT(pRExC_state,data,minlenp);
4153 if (flags & SCF_DO_STCLASS) {
4154 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4156 /* Some of the logic below assumes that switching
4157 locale on will only add false positives. */
4158 switch (PL_regkind[OP(scan)]) {
4162 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4163 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4164 cl_anything(pRExC_state, data->start_class);
4167 if (OP(scan) == SANY)
4169 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4170 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4171 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4172 cl_anything(pRExC_state, data->start_class);
4174 if (flags & SCF_DO_STCLASS_AND || !value)
4175 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4178 if (flags & SCF_DO_STCLASS_AND)
4179 cl_and(data->start_class,
4180 (struct regnode_charclass_class*)scan);
4182 cl_or(pRExC_state, data->start_class,
4183 (struct regnode_charclass_class*)scan);
4186 if (flags & SCF_DO_STCLASS_AND) {
4187 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4188 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4189 if (OP(scan) == ALNUMU) {
4190 for (value = 0; value < 256; value++) {
4191 if (!isWORDCHAR_L1(value)) {
4192 ANYOF_BITMAP_CLEAR(data->start_class, value);
4196 for (value = 0; value < 256; value++) {
4197 if (!isALNUM(value)) {
4198 ANYOF_BITMAP_CLEAR(data->start_class, value);
4205 if (data->start_class->flags & ANYOF_LOCALE)
4206 ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4208 /* Even if under locale, set the bits for non-locale
4209 * in case it isn't a true locale-node. This will
4210 * create false positives if it truly is locale */
4211 if (OP(scan) == ALNUMU) {
4212 for (value = 0; value < 256; value++) {
4213 if (isWORDCHAR_L1(value)) {
4214 ANYOF_BITMAP_SET(data->start_class, value);
4218 for (value = 0; value < 256; value++) {
4219 if (isALNUM(value)) {
4220 ANYOF_BITMAP_SET(data->start_class, value);
4227 if (flags & SCF_DO_STCLASS_AND) {
4228 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4229 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4230 if (OP(scan) == NALNUMU) {
4231 for (value = 0; value < 256; value++) {
4232 if (isWORDCHAR_L1(value)) {
4233 ANYOF_BITMAP_CLEAR(data->start_class, value);
4237 for (value = 0; value < 256; value++) {
4238 if (isALNUM(value)) {
4239 ANYOF_BITMAP_CLEAR(data->start_class, value);
4246 if (data->start_class->flags & ANYOF_LOCALE)
4247 ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4249 /* Even if under locale, set the bits for non-locale in
4250 * case it isn't a true locale-node. This will create
4251 * false positives if it truly is locale */
4252 if (OP(scan) == NALNUMU) {
4253 for (value = 0; value < 256; value++) {
4254 if (! isWORDCHAR_L1(value)) {
4255 ANYOF_BITMAP_SET(data->start_class, value);
4259 for (value = 0; value < 256; value++) {
4260 if (! isALNUM(value)) {
4261 ANYOF_BITMAP_SET(data->start_class, value);
4268 if (flags & SCF_DO_STCLASS_AND) {
4269 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4270 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4271 if (OP(scan) == SPACEU) {
4272 for (value = 0; value < 256; value++) {
4273 if (!isSPACE_L1(value)) {
4274 ANYOF_BITMAP_CLEAR(data->start_class, value);
4278 for (value = 0; value < 256; value++) {
4279 if (!isSPACE(value)) {
4280 ANYOF_BITMAP_CLEAR(data->start_class, value);
4287 if (data->start_class->flags & ANYOF_LOCALE) {
4288 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4290 if (OP(scan) == SPACEU) {
4291 for (value = 0; value < 256; value++) {
4292 if (isSPACE_L1(value)) {
4293 ANYOF_BITMAP_SET(data->start_class, value);
4297 for (value = 0; value < 256; value++) {
4298 if (isSPACE(value)) {
4299 ANYOF_BITMAP_SET(data->start_class, value);
4306 if (flags & SCF_DO_STCLASS_AND) {
4307 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4308 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4309 if (OP(scan) == NSPACEU) {
4310 for (value = 0; value < 256; value++) {
4311 if (isSPACE_L1(value)) {
4312 ANYOF_BITMAP_CLEAR(data->start_class, value);
4316 for (value = 0; value < 256; value++) {
4317 if (isSPACE(value)) {
4318 ANYOF_BITMAP_CLEAR(data->start_class, value);
4325 if (data->start_class->flags & ANYOF_LOCALE)
4326 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4327 if (OP(scan) == NSPACEU) {
4328 for (value = 0; value < 256; value++) {
4329 if (!isSPACE_L1(value)) {
4330 ANYOF_BITMAP_SET(data->start_class, value);
4335 for (value = 0; value < 256; value++) {
4336 if (!isSPACE(value)) {
4337 ANYOF_BITMAP_SET(data->start_class, value);
4344 if (flags & SCF_DO_STCLASS_AND) {
4345 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4346 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4347 for (value = 0; value < 256; value++)
4348 if (!isDIGIT(value))
4349 ANYOF_BITMAP_CLEAR(data->start_class, value);
4353 if (data->start_class->flags & ANYOF_LOCALE)
4354 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4355 for (value = 0; value < 256; value++)
4357 ANYOF_BITMAP_SET(data->start_class, value);
4361 if (flags & SCF_DO_STCLASS_AND) {
4362 if (!(data->start_class->flags & ANYOF_LOCALE))
4363 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4364 for (value = 0; value < 256; value++)
4366 ANYOF_BITMAP_CLEAR(data->start_class, value);
4369 if (data->start_class->flags & ANYOF_LOCALE)
4370 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4371 for (value = 0; value < 256; value++)
4372 if (!isDIGIT(value))
4373 ANYOF_BITMAP_SET(data->start_class, value);
4376 CASE_SYNST_FNC(VERTWS);
4377 CASE_SYNST_FNC(HORIZWS);
4380 if (flags & SCF_DO_STCLASS_OR)
4381 cl_and(data->start_class, and_withp);
4382 flags &= ~SCF_DO_STCLASS;
4385 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4386 data->flags |= (OP(scan) == MEOL
4389 SCAN_COMMIT(pRExC_state, data, minlenp);
4392 else if ( PL_regkind[OP(scan)] == BRANCHJ
4393 /* Lookbehind, or need to calculate parens/evals/stclass: */
4394 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4395 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4396 if ( OP(scan) == UNLESSM &&
4398 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4399 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4402 regnode *upto= regnext(scan);
4404 SV * const mysv_val=sv_newmortal();
4405 DEBUG_STUDYDATA("OPFAIL",data,depth);
4407 /*DEBUG_PARSE_MSG("opfail");*/
4408 regprop(RExC_rx, mysv_val, upto);
4409 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4410 SvPV_nolen_const(mysv_val),
4411 (IV)REG_NODE_NUM(upto),
4416 NEXT_OFF(scan) = upto - scan;
4417 for (opt= scan + 1; opt < upto ; opt++)
4418 OP(opt) = OPTIMIZED;
4422 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4423 || OP(scan) == UNLESSM )
4425 /* Negative Lookahead/lookbehind
4426 In this case we can't do fixed string optimisation.
4429 I32 deltanext, minnext, fake = 0;
4431 struct regnode_charclass_class intrnl;
4434 data_fake.flags = 0;
4436 data_fake.whilem_c = data->whilem_c;
4437 data_fake.last_closep = data->last_closep;
4440 data_fake.last_closep = &fake;
4441 data_fake.pos_delta = delta;
4442 if ( flags & SCF_DO_STCLASS && !scan->flags
4443 && OP(scan) == IFMATCH ) { /* Lookahead */
4444 cl_init(pRExC_state, &intrnl);
4445 data_fake.start_class = &intrnl;
4446 f |= SCF_DO_STCLASS_AND;
4448 if (flags & SCF_WHILEM_VISITED_POS)
4449 f |= SCF_WHILEM_VISITED_POS;
4450 next = regnext(scan);
4451 nscan = NEXTOPER(NEXTOPER(scan));
4452 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4453 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4456 FAIL("Variable length lookbehind not implemented");
4458 else if (minnext > (I32)U8_MAX) {
4459 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4461 scan->flags = (U8)minnext;
4464 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4466 if (data_fake.flags & SF_HAS_EVAL)
4467 data->flags |= SF_HAS_EVAL;
4468 data->whilem_c = data_fake.whilem_c;
4470 if (f & SCF_DO_STCLASS_AND) {
4471 if (flags & SCF_DO_STCLASS_OR) {
4472 /* OR before, AND after: ideally we would recurse with
4473 * data_fake to get the AND applied by study of the
4474 * remainder of the pattern, and then derecurse;
4475 * *** HACK *** for now just treat as "no information".
4476 * See [perl #56690].
4478 cl_init(pRExC_state, data->start_class);
4480 /* AND before and after: combine and continue */
4481 const int was = (data->start_class->flags & ANYOF_EOS);
4483 cl_and(data->start_class, &intrnl);
4485 data->start_class->flags |= ANYOF_EOS;
4489 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4491 /* Positive Lookahead/lookbehind
4492 In this case we can do fixed string optimisation,
4493 but we must be careful about it. Note in the case of
4494 lookbehind the positions will be offset by the minimum
4495 length of the pattern, something we won't know about
4496 until after the recurse.
4498 I32 deltanext, fake = 0;
4500 struct regnode_charclass_class intrnl;
4502 /* We use SAVEFREEPV so that when the full compile
4503 is finished perl will clean up the allocated
4504 minlens when it's all done. This way we don't
4505 have to worry about freeing them when we know
4506 they wont be used, which would be a pain.
4509 Newx( minnextp, 1, I32 );
4510 SAVEFREEPV(minnextp);
4513 StructCopy(data, &data_fake, scan_data_t);
4514 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4517 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4518 data_fake.last_found=newSVsv(data->last_found);
4522 data_fake.last_closep = &fake;
4523 data_fake.flags = 0;
4524 data_fake.pos_delta = delta;
4526 data_fake.flags |= SF_IS_INF;
4527 if ( flags & SCF_DO_STCLASS && !scan->flags
4528 && OP(scan) == IFMATCH ) { /* Lookahead */
4529 cl_init(pRExC_state, &intrnl);
4530 data_fake.start_class = &intrnl;
4531 f |= SCF_DO_STCLASS_AND;
4533 if (flags & SCF_WHILEM_VISITED_POS)
4534 f |= SCF_WHILEM_VISITED_POS;
4535 next = regnext(scan);
4536 nscan = NEXTOPER(NEXTOPER(scan));
4538 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4539 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4542 FAIL("Variable length lookbehind not implemented");
4544 else if (*minnextp > (I32)U8_MAX) {
4545 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4547 scan->flags = (U8)*minnextp;
4552 if (f & SCF_DO_STCLASS_AND) {
4553 const int was = (data->start_class->flags & ANYOF_EOS);
4555 cl_and(data->start_class, &intrnl);
4557 data->start_class->flags |= ANYOF_EOS;
4560 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4562 if (data_fake.flags & SF_HAS_EVAL)
4563 data->flags |= SF_HAS_EVAL;
4564 data->whilem_c = data_fake.whilem_c;
4565 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4566 if (RExC_rx->minlen<*minnextp)
4567 RExC_rx->minlen=*minnextp;
4568 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4569 SvREFCNT_dec(data_fake.last_found);
4571 if ( data_fake.minlen_fixed != minlenp )
4573 data->offset_fixed= data_fake.offset_fixed;
4574 data->minlen_fixed= data_fake.minlen_fixed;
4575 data->lookbehind_fixed+= scan->flags;
4577 if ( data_fake.minlen_float != minlenp )
4579 data->minlen_float= data_fake.minlen_float;
4580 data->offset_float_min=data_fake.offset_float_min;
4581 data->offset_float_max=data_fake.offset_float_max;
4582 data->lookbehind_float+= scan->flags;
4589 else if (OP(scan) == OPEN) {
4590 if (stopparen != (I32)ARG(scan))
4593 else if (OP(scan) == CLOSE) {
4594 if (stopparen == (I32)ARG(scan)) {
4597 if ((I32)ARG(scan) == is_par) {
4598 next = regnext(scan);
4600 if ( next && (OP(next) != WHILEM) && next < last)
4601 is_par = 0; /* Disable optimization */
4604 *(data->last_closep) = ARG(scan);
4606 else if (OP(scan) == EVAL) {
4608 data->flags |= SF_HAS_EVAL;
4610 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4611 if (flags & SCF_DO_SUBSTR) {
4612 SCAN_COMMIT(pRExC_state,data,minlenp);
4613 flags &= ~SCF_DO_SUBSTR;
4615 if (data && OP(scan)==ACCEPT) {
4616 data->flags |= SCF_SEEN_ACCEPT;
4621 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4623 if (flags & SCF_DO_SUBSTR) {
4624 SCAN_COMMIT(pRExC_state,data,minlenp);
4625 data->longest = &(data->longest_float);
4627 is_inf = is_inf_internal = 1;
4628 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4629 cl_anything(pRExC_state, data->start_class);
4630 flags &= ~SCF_DO_STCLASS;
4632 else if (OP(scan) == GPOS) {
4633 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4634 !(delta || is_inf || (data && data->pos_delta)))
4636 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4637 RExC_rx->extflags |= RXf_ANCH_GPOS;
4638 if (RExC_rx->gofs < (U32)min)
4639 RExC_rx->gofs = min;
4641 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4645 #ifdef TRIE_STUDY_OPT
4646 #ifdef FULL_TRIE_STUDY
4647 else if (PL_regkind[OP(scan)] == TRIE) {
4648 /* NOTE - There is similar code to this block above for handling
4649 BRANCH nodes on the initial study. If you change stuff here
4651 regnode *trie_node= scan;
4652 regnode *tail= regnext(scan);
4653 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4654 I32 max1 = 0, min1 = I32_MAX;
4655 struct regnode_charclass_class accum;
4657 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4658 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4659 if (flags & SCF_DO_STCLASS)
4660 cl_init_zero(pRExC_state, &accum);
4666 const regnode *nextbranch= NULL;
4669 for ( word=1 ; word <= trie->wordcount ; word++)
4671 I32 deltanext=0, minnext=0, f = 0, fake;
4672 struct regnode_charclass_class this_class;
4674 data_fake.flags = 0;
4676 data_fake.whilem_c = data->whilem_c;
4677 data_fake.last_closep = data->last_closep;
4680 data_fake.last_closep = &fake;
4681 data_fake.pos_delta = delta;
4682 if (flags & SCF_DO_STCLASS) {
4683 cl_init(pRExC_state, &this_class);
4684 data_fake.start_class = &this_class;
4685 f = SCF_DO_STCLASS_AND;
4687 if (flags & SCF_WHILEM_VISITED_POS)
4688 f |= SCF_WHILEM_VISITED_POS;
4690 if (trie->jump[word]) {
4692 nextbranch = trie_node + trie->jump[0];
4693 scan= trie_node + trie->jump[word];
4694 /* We go from the jump point to the branch that follows
4695 it. Note this means we need the vestigal unused branches
4696 even though they arent otherwise used.
4698 minnext = study_chunk(pRExC_state, &scan, minlenp,
4699 &deltanext, (regnode *)nextbranch, &data_fake,
4700 stopparen, recursed, NULL, f,depth+1);
4702 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4703 nextbranch= regnext((regnode*)nextbranch);
4705 if (min1 > (I32)(minnext + trie->minlen))
4706 min1 = minnext + trie->minlen;
4707 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4708 max1 = minnext + deltanext + trie->maxlen;
4709 if (deltanext == I32_MAX)
4710 is_inf = is_inf_internal = 1;
4712 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4714 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4715 if ( stopmin > min + min1)
4716 stopmin = min + min1;
4717 flags &= ~SCF_DO_SUBSTR;
4719 data->flags |= SCF_SEEN_ACCEPT;
4722 if (data_fake.flags & SF_HAS_EVAL)
4723 data->flags |= SF_HAS_EVAL;
4724 data->whilem_c = data_fake.whilem_c;
4726 if (flags & SCF_DO_STCLASS)
4727 cl_or(pRExC_state, &accum, &this_class);
4730 if (flags & SCF_DO_SUBSTR) {
4731 data->pos_min += min1;
4732 data->pos_delta += max1 - min1;
4733 if (max1 != min1 || is_inf)
4734 data->longest = &(data->longest_float);
4737 delta += max1 - min1;
4738 if (flags & SCF_DO_STCLASS_OR) {
4739 cl_or(pRExC_state, data->start_class, &accum);
4741 cl_and(data->start_class, and_withp);
4742 flags &= ~SCF_DO_STCLASS;
4745 else if (flags & SCF_DO_STCLASS_AND) {
4747 cl_and(data->start_class, &accum);
4748 flags &= ~SCF_DO_STCLASS;
4751 /* Switch to OR mode: cache the old value of
4752 * data->start_class */
4754 StructCopy(data->start_class, and_withp,
4755 struct regnode_charclass_class);
4756 flags &= ~SCF_DO_STCLASS_AND;
4757 StructCopy(&accum, data->start_class,
4758 struct regnode_charclass_class);
4759 flags |= SCF_DO_STCLASS_OR;
4760 data->start_class->flags |= ANYOF_EOS;
4767 else if (PL_regkind[OP(scan)] == TRIE) {
4768 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4771 min += trie->minlen;
4772 delta += (trie->maxlen - trie->minlen);
4773 flags &= ~SCF_DO_STCLASS; /* xxx */
4774 if (flags & SCF_DO_SUBSTR) {
4775 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4776 data->pos_min += trie->minlen;
4777 data->pos_delta += (trie->maxlen - trie->minlen);
4778 if (trie->maxlen != trie->minlen)
4779 data->longest = &(data->longest_float);
4781 if (trie->jump) /* no more substrings -- for now /grr*/
4782 flags &= ~SCF_DO_SUBSTR;
4784 #endif /* old or new */
4785 #endif /* TRIE_STUDY_OPT */
4787 /* Else: zero-length, ignore. */
4788 scan = regnext(scan);
4793 stopparen = frame->stop;
4794 frame = frame->prev;
4795 goto fake_study_recurse;
4800 DEBUG_STUDYDATA("pre-fin:",data,depth);
4803 *deltap = is_inf_internal ? I32_MAX : delta;
4804 if (flags & SCF_DO_SUBSTR && is_inf)
4805 data->pos_delta = I32_MAX - data->pos_min;
4806 if (is_par > (I32)U8_MAX)
4808 if (is_par && pars==1 && data) {
4809 data->flags |= SF_IN_PAR;
4810 data->flags &= ~SF_HAS_PAR;
4812 else if (pars && data) {
4813 data->flags |= SF_HAS_PAR;
4814 data->flags &= ~SF_IN_PAR;
4816 if (flags & SCF_DO_STCLASS_OR)
4817 cl_and(data->start_class, and_withp);
4818 if (flags & SCF_TRIE_RESTUDY)
4819 data->flags |= SCF_TRIE_RESTUDY;
4821 DEBUG_STUDYDATA("post-fin:",data,depth);
4823 return min < stopmin ? min : stopmin;
4827 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4829 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4831 PERL_ARGS_ASSERT_ADD_DATA;
4833 Renewc(RExC_rxi->data,
4834 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4835 char, struct reg_data);
4837 Renew(RExC_rxi->data->what, count + n, U8);
4839 Newx(RExC_rxi->data->what, n, U8);
4840 RExC_rxi->data->count = count + n;
4841 Copy(s, RExC_rxi->data->what + count, n, U8);
4845 /*XXX: todo make this not included in a non debugging perl */
4846 #ifndef PERL_IN_XSUB_RE
4848 Perl_reginitcolors(pTHX)
4851 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4853 char *t = savepv(s);
4857 t = strchr(t, '\t');
4863 PL_colors[i] = t = (char *)"";
4868 PL_colors[i++] = (char *)"";
4875 #ifdef TRIE_STUDY_OPT
4876 #define CHECK_RESTUDY_GOTO \
4878 (data.flags & SCF_TRIE_RESTUDY) \
4882 #define CHECK_RESTUDY_GOTO
4886 * pregcomp - compile a regular expression into internal code
4888 * Decides which engine's compiler to call based on the hint currently in
4892 #ifndef PERL_IN_XSUB_RE
4894 /* return the currently in-scope regex engine (or the default if none) */
4896 regexp_engine const *
4897 Perl_current_re_engine(pTHX)
4901 if (IN_PERL_COMPILETIME) {
4902 HV * const table = GvHV(PL_hintgv);
4906 return &reh_regexp_engine;
4907 ptr = hv_fetchs(table, "regcomp", FALSE);
4908 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4909 return &reh_regexp_engine;
4910 return INT2PTR(regexp_engine*,SvIV(*ptr));
4914 if (!PL_curcop->cop_hints_hash)
4915 return &reh_regexp_engine;
4916 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4917 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4918 return &reh_regexp_engine;
4919 return INT2PTR(regexp_engine*,SvIV(ptr));
4925 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4928 regexp_engine const *eng = current_re_engine();
4929 GET_RE_DEBUG_FLAGS_DECL;
4931 PERL_ARGS_ASSERT_PREGCOMP;
4933 /* Dispatch a request to compile a regexp to correct regexp engine. */
4935 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4938 return CALLREGCOMP_ENG(eng, pattern, flags);
4942 /* public(ish) entry point for the perl core's own regex compiling code.
4943 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4944 * pattern rather than a list of OPs, and uses the internal engine rather
4945 * than the current one */
4948 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4950 SV *pat = pattern; /* defeat constness! */
4951 PERL_ARGS_ASSERT_RE_COMPILE;
4952 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4953 #ifdef PERL_IN_XSUB_RE
4958 NULL, NULL, rx_flags, 0);
4961 /* see if there are any run-time code blocks in the pattern.
4962 * False positives are allowed */
4965 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4966 U32 pm_flags, char *pat, STRLEN plen)
4971 /* avoid infinitely recursing when we recompile the pattern parcelled up
4972 * as qr'...'. A single constant qr// string can't have have any
4973 * run-time component in it, and thus, no runtime code. (A non-qr
4974 * string, however, can, e.g. $x =~ '(?{})') */
4975 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4978 for (s = 0; s < plen; s++) {
4979 if (n < pRExC_state->num_code_blocks
4980 && s == pRExC_state->code_blocks[n].start)
4982 s = pRExC_state->code_blocks[n].end;
4986 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4988 if (pat[s] == '(' && pat[s+1] == '?' &&
4989 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4996 /* Handle run-time code blocks. We will already have compiled any direct
4997 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4998 * copy of it, but with any literal code blocks blanked out and
4999 * appropriate chars escaped; then feed it into
5001 * eval "qr'modified_pattern'"
5005 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5009 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5011 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5012 * and merge them with any code blocks of the original regexp.
5014 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5015 * instead, just save the qr and return FALSE; this tells our caller that
5016 * the original pattern needs upgrading to utf8.
5020 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5021 char *pat, STRLEN plen)
5025 GET_RE_DEBUG_FLAGS_DECL;
5027 if (pRExC_state->runtime_code_qr) {
5028 /* this is the second time we've been called; this should
5029 * only happen if the main pattern got upgraded to utf8
5030 * during compilation; re-use the qr we compiled first time
5031 * round (which should be utf8 too)
5033 qr = pRExC_state->runtime_code_qr;
5034 pRExC_state->runtime_code_qr = NULL;
5035 assert(RExC_utf8 && SvUTF8(qr));
5041 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5045 /* determine how many extra chars we need for ' and \ escaping */
5046 for (s = 0; s < plen; s++) {
5047 if (pat[s] == '\'' || pat[s] == '\\')
5051 Newx(newpat, newlen, char);
5053 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5055 for (s = 0; s < plen; s++) {
5056 if (n < pRExC_state->num_code_blocks
5057 && s == pRExC_state->code_blocks[n].start)
5059 /* blank out literal code block */
5060 assert(pat[s] == '(');
5061 while (s <= pRExC_state->code_blocks[n].end) {
5069 if (pat[s] == '\'' || pat[s] == '\\')
5074 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5078 PerlIO_printf(Perl_debug_log,
5079 "%sre-parsing pattern for runtime code:%s %s\n",
5080 PL_colors[4],PL_colors[5],newpat);
5083 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5089 PUSHSTACKi(PERLSI_REQUIRE);
5090 /* this causes the toker to collapse \\ into \ when parsing
5091 * qr''; normally only q'' does this. It also alters hints
5093 PL_reg_state.re_reparsing = TRUE;
5094 eval_sv(sv, G_SCALAR);
5101 Safefree(pRExC_state->code_blocks);
5102 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5104 assert(SvROK(qr_ref));
5106 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5107 /* the leaving below frees the tmp qr_ref.
5108 * Give qr a life of its own */
5116 if (!RExC_utf8 && SvUTF8(qr)) {
5117 /* first time through; the pattern got upgraded; save the
5118 * qr for the next time through */
5119 assert(!pRExC_state->runtime_code_qr);
5120 pRExC_state->runtime_code_qr = qr;
5125 /* extract any code blocks within the returned qr// */
5128 /* merge the main (r1) and run-time (r2) code blocks into one */
5130 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5131 struct reg_code_block *new_block, *dst;
5132 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5135 if (!r2->num_code_blocks) /* we guessed wrong */
5142 r1->num_code_blocks + r2->num_code_blocks,
5143 struct reg_code_block);
5146 while ( i1 < r1->num_code_blocks
5147 || i2 < r2->num_code_blocks)
5149 struct reg_code_block *src;
5152 if (i1 == r1->num_code_blocks) {
5153 src = &r2->code_blocks[i2++];
5156 else if (i2 == r2->num_code_blocks)
5157 src = &r1->code_blocks[i1++];
5158 else if ( r1->code_blocks[i1].start
5159 < r2->code_blocks[i2].start)
5161 src = &r1->code_blocks[i1++];
5162 assert(src->end < r2->code_blocks[i2].start);
5165 assert( r1->code_blocks[i1].start
5166 > r2->code_blocks[i2].start);
5167 src = &r2->code_blocks[i2++];
5169 assert(src->end < r1->code_blocks[i1].start);
5172 assert(pat[src->start] == '(');
5173 assert(pat[src->end] == ')');
5174 dst->start = src->start;
5175 dst->end = src->end;
5176 dst->block = src->block;
5177 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5181 r1->num_code_blocks += r2->num_code_blocks;
5182 Safefree(r1->code_blocks);
5183 r1->code_blocks = new_block;
5192 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5194 /* This is the common code for setting up the floating and fixed length
5195 * string data extracted from Perlre_op_compile() below. Returns a boolean
5196 * as to whether succeeded or not */
5200 if (! (longest_length
5201 || (eol /* Can't have SEOL and MULTI */
5202 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5204 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5205 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5210 /* copy the information about the longest from the reg_scan_data
5211 over to the program. */
5212 if (SvUTF8(sv_longest)) {
5213 *rx_utf8 = sv_longest;
5216 *rx_substr = sv_longest;
5219 /* end_shift is how many chars that must be matched that
5220 follow this item. We calculate it ahead of time as once the
5221 lookbehind offset is added in we lose the ability to correctly
5223 ml = minlen ? *(minlen) : (I32)longest_length;
5224 *rx_end_shift = ml - offset
5225 - longest_length + (SvTAIL(sv_longest) != 0)
5228 t = (eol/* Can't have SEOL and MULTI */
5229 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5230 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5236 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5237 * regular expression into internal code.
5238 * The pattern may be passed either as:
5239 * a list of SVs (patternp plus pat_count)
5240 * a list of OPs (expr)
5241 * If both are passed, the SV list is used, but the OP list indicates
5242 * which SVs are actually pre-compiled code blocks
5244 * The SVs in the list have magic and qr overloading applied to them (and
5245 * the list may be modified in-place with replacement SVs in the latter
5248 * If the pattern hasn't changed from old_re, then old_re will be
5251 * eng is the current engine. If that engine has an op_comp method, then
5252 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5253 * do the initial concatenation of arguments and pass on to the external
5256 * If is_bare_re is not null, set it to a boolean indicating whether the
5257 * arg list reduced (after overloading) to a single bare regex which has
5258 * been returned (i.e. /$qr/).
5260 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5262 * pm_flags contains the PMf_* flags, typically based on those from the
5263 * pm_flags field of the related PMOP. Currently we're only interested in
5264 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5266 * We can't allocate space until we know how big the compiled form will be,
5267 * but we can't compile it (and thus know how big it is) until we've got a
5268 * place to put the code. So we cheat: we compile it twice, once with code
5269 * generation turned off and size counting turned on, and once "for real".
5270 * This also means that we don't allocate space until we are sure that the
5271 * thing really will compile successfully, and we never have to move the
5272 * code and thus invalidate pointers into it. (Note that it has to be in
5273 * one piece because free() must be able to free it all.) [NB: not true in perl]
5275 * Beware that the optimization-preparation code in here knows about some
5276 * of the structure of the compiled regexp. [I'll say.]
5280 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5281 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5282 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5287 regexp_internal *ri;
5296 SV * VOL code_blocksv = NULL;
5298 /* these are all flags - maybe they should be turned
5299 * into a single int with different bit masks */
5300 I32 sawlookahead = 0;
5303 bool used_setjump = FALSE;
5304 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5305 bool code_is_utf8 = 0;
5306 bool VOL recompile = 0;
5307 bool runtime_code = 0;
5311 RExC_state_t RExC_state;
5312 RExC_state_t * const pRExC_state = &RExC_state;
5313 #ifdef TRIE_STUDY_OPT
5315 RExC_state_t copyRExC_state;
5317 GET_RE_DEBUG_FLAGS_DECL;
5319 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5321 DEBUG_r(if (!PL_colorset) reginitcolors());
5323 #ifndef PERL_IN_XSUB_RE
5324 /* Initialize these here instead of as-needed, as is quick and avoids
5325 * having to test them each time otherwise */
5326 if (! PL_AboveLatin1) {
5327 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5328 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5329 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5331 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5332 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5334 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5335 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5337 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5338 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5340 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5342 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5343 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5345 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5347 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5348 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5350 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5351 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5353 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5354 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5356 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5357 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5359 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5360 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5362 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5363 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5365 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5366 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5368 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5370 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5371 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5373 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5374 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5376 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5380 pRExC_state->code_blocks = NULL;
5381 pRExC_state->num_code_blocks = 0;
5384 *is_bare_re = FALSE;
5386 if (expr && (expr->op_type == OP_LIST ||
5387 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5389 /* is the source UTF8, and how many code blocks are there? */
5393 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5394 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5396 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5397 /* count of DO blocks */
5401 pRExC_state->num_code_blocks = ncode;
5402 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5407 /* handle a list of SVs */
5411 /* apply magic and RE overloading to each arg */
5412 for (svp = patternp; svp < patternp + pat_count; svp++) {
5415 if (SvROK(rx) && SvAMAGIC(rx)) {
5416 SV *sv = AMG_CALLunary(rx, regexp_amg);
5420 if (SvTYPE(sv) != SVt_REGEXP)
5421 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5427 if (pat_count > 1) {
5428 /* concat multiple args and find any code block indexes */
5433 STRLEN orig_patlen = 0;
5435 if (pRExC_state->num_code_blocks) {
5436 o = cLISTOPx(expr)->op_first;
5437 assert( o->op_type == OP_PUSHMARK
5438 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5439 || o->op_type == OP_PADRANGE);
5443 pat = newSVpvn("", 0);
5446 /* determine if the pattern is going to be utf8 (needed
5447 * in advance to align code block indices correctly).
5448 * XXX This could fail to be detected for an arg with
5449 * overloading but not concat overloading; but the main effect
5450 * in this obscure case is to need a 'use re eval' for a
5451 * literal code block */
5452 for (svp = patternp; svp < patternp + pat_count; svp++) {
5459 for (svp = patternp; svp < patternp + pat_count; svp++) {
5460 SV *sv, *msv = *svp;
5463 /* we make the assumption here that each op in the list of
5464 * op_siblings maps to one SV pushed onto the stack,
5465 * except for code blocks, with have both an OP_NULL and
5467 * This allows us to match up the list of SVs against the
5468 * list of OPs to find the next code block.
5470 * Note that PUSHMARK PADSV PADSV ..
5472 * PADRANGE NULL NULL ..
5473 * so the alignment still works. */
5475 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5476 assert(n < pRExC_state->num_code_blocks);
5477 pRExC_state->code_blocks[n].start = SvCUR(pat);
5478 pRExC_state->code_blocks[n].block = o;
5479 pRExC_state->code_blocks[n].src_regex = NULL;
5482 o = o->op_sibling; /* skip CONST */
5488 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5489 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5492 /* overloading involved: all bets are off over literal
5493 * code. Pretend we haven't seen it */
5494 pRExC_state->num_code_blocks -= n;
5500 while (SvAMAGIC(msv)
5501 && (sv = AMG_CALLunary(msv, string_amg))
5505 && SvRV(msv) == SvRV(sv))
5510 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5512 orig_patlen = SvCUR(pat);
5513 sv_catsv_nomg(pat, msv);
5516 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5519 /* extract any code blocks within any embedded qr//'s */
5520 if (rx && SvTYPE(rx) == SVt_REGEXP
5521 && RX_ENGINE((REGEXP*)rx)->op_comp)
5524 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5525 if (ri->num_code_blocks) {
5527 /* the presence of an embedded qr// with code means
5528 * we should always recompile: the text of the
5529 * qr// may not have changed, but it may be a
5530 * different closure than last time */
5532 Renew(pRExC_state->code_blocks,
5533 pRExC_state->num_code_blocks + ri->num_code_blocks,
5534 struct reg_code_block);
5535 pRExC_state->num_code_blocks += ri->num_code_blocks;
5536 for (i=0; i < ri->num_code_blocks; i++) {
5537 struct reg_code_block *src, *dst;
5538 STRLEN offset = orig_patlen
5539 + ReANY((REGEXP *)rx)->pre_prefix;
5540 assert(n < pRExC_state->num_code_blocks);
5541 src = &ri->code_blocks[i];
5542 dst = &pRExC_state->code_blocks[n];
5543 dst->start = src->start + offset;
5544 dst->end = src->end + offset;
5545 dst->block = src->block;
5546 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5560 while (SvAMAGIC(pat)
5561 && (sv = AMG_CALLunary(pat, string_amg))
5569 /* handle bare regex: foo =~ $re */
5574 if (SvTYPE(re) == SVt_REGEXP) {
5578 Safefree(pRExC_state->code_blocks);
5584 /* not a list of SVs, so must be a list of OPs */
5586 if (expr->op_type == OP_LIST) {
5591 pat = newSVpvn("", 0);
5596 /* given a list of CONSTs and DO blocks in expr, append all
5597 * the CONSTs to pat, and record the start and end of each
5598 * code block in code_blocks[] (each DO{} op is followed by an
5599 * OP_CONST containing the corresponding literal '(?{...})
5602 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5603 if (o->op_type == OP_CONST) {
5604 sv_catsv(pat, cSVOPo_sv);
5606 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5610 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5611 assert(i+1 < pRExC_state->num_code_blocks);
5612 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5613 pRExC_state->code_blocks[i].block = o;
5614 pRExC_state->code_blocks[i].src_regex = NULL;
5620 assert(expr->op_type == OP_CONST);
5621 pat = cSVOPx_sv(expr);
5625 exp = SvPV_nomg(pat, plen);
5627 if (!eng->op_comp) {
5628 if ((SvUTF8(pat) && IN_BYTES)
5629 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5631 /* make a temporary copy; either to convert to bytes,
5632 * or to avoid repeating get-magic / overloaded stringify */
5633 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5634 (IN_BYTES ? 0 : SvUTF8(pat)));
5636 Safefree(pRExC_state->code_blocks);
5637 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5640 /* ignore the utf8ness if the pattern is 0 length */
5641 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5642 RExC_uni_semantics = 0;
5643 RExC_contains_locale = 0;
5644 pRExC_state->runtime_code_qr = NULL;
5646 /****************** LONG JUMP TARGET HERE***********************/
5647 /* Longjmp back to here if have to switch in midstream to utf8 */
5648 if (! RExC_orig_utf8) {
5649 JMPENV_PUSH(jump_ret);
5650 used_setjump = TRUE;
5653 if (jump_ret == 0) { /* First time through */
5657 SV *dsv= sv_newmortal();
5658 RE_PV_QUOTED_DECL(s, RExC_utf8,
5659 dsv, exp, plen, 60);
5660 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5661 PL_colors[4],PL_colors[5],s);
5664 else { /* longjumped back */
5667 STRLEN s = 0, d = 0;
5670 /* If the cause for the longjmp was other than changing to utf8, pop
5671 * our own setjmp, and longjmp to the correct handler */
5672 if (jump_ret != UTF8_LONGJMP) {
5674 JMPENV_JUMP(jump_ret);
5679 /* It's possible to write a regexp in ascii that represents Unicode
5680 codepoints outside of the byte range, such as via \x{100}. If we
5681 detect such a sequence we have to convert the entire pattern to utf8
5682 and then recompile, as our sizing calculation will have been based
5683 on 1 byte == 1 character, but we will need to use utf8 to encode
5684 at least some part of the pattern, and therefore must convert the whole
5687 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5688 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5690 /* upgrade pattern to UTF8, and if there are code blocks,
5691 * recalculate the indices.
5692 * This is essentially an unrolled Perl_bytes_to_utf8() */
5694 src = (U8*)SvPV_nomg(pat, plen);
5695 Newx(dst, plen * 2 + 1, U8);
5698 const UV uv = NATIVE_TO_ASCII(src[s]);
5699 if (UNI_IS_INVARIANT(uv))
5700 dst[d] = (U8)UTF_TO_NATIVE(uv);
5702 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5703 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5705 if (n < pRExC_state->num_code_blocks) {
5706 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5707 pRExC_state->code_blocks[n].start = d;
5708 assert(dst[d] == '(');
5711 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5712 pRExC_state->code_blocks[n].end = d;
5713 assert(dst[d] == ')');
5726 RExC_orig_utf8 = RExC_utf8 = 1;
5729 /* return old regex if pattern hasn't changed */
5733 && !!RX_UTF8(old_re) == !!RExC_utf8
5734 && RX_PRECOMP(old_re)
5735 && RX_PRELEN(old_re) == plen
5736 && memEQ(RX_PRECOMP(old_re), exp, plen))
5738 /* with runtime code, always recompile */
5739 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5741 if (!runtime_code) {
5745 Safefree(pRExC_state->code_blocks);
5749 else if ((pm_flags & PMf_USE_RE_EVAL)
5750 /* this second condition covers the non-regex literal case,
5751 * i.e. $foo =~ '(?{})'. */
5752 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5753 && (PL_hints & HINT_RE_EVAL))
5755 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5758 #ifdef TRIE_STUDY_OPT
5762 rx_flags = orig_rx_flags;
5764 if (initial_charset == REGEX_LOCALE_CHARSET) {
5765 RExC_contains_locale = 1;
5767 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5769 /* Set to use unicode semantics if the pattern is in utf8 and has the
5770 * 'depends' charset specified, as it means unicode when utf8 */
5771 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5775 RExC_flags = rx_flags;
5776 RExC_pm_flags = pm_flags;
5779 if (TAINTING_get && TAINT_get)
5780 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5782 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5783 /* whoops, we have a non-utf8 pattern, whilst run-time code
5784 * got compiled as utf8. Try again with a utf8 pattern */
5785 JMPENV_JUMP(UTF8_LONGJMP);
5788 assert(!pRExC_state->runtime_code_qr);
5793 RExC_in_lookbehind = 0;
5794 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5796 RExC_override_recoding = 0;
5797 RExC_in_multi_char_class = 0;
5799 /* First pass: determine size, legality. */
5807 RExC_emit = &PL_regdummy;
5808 RExC_whilem_seen = 0;
5809 RExC_open_parens = NULL;
5810 RExC_close_parens = NULL;
5812 RExC_paren_names = NULL;
5814 RExC_paren_name_list = NULL;
5816 RExC_recurse = NULL;
5817 RExC_recurse_count = 0;
5818 pRExC_state->code_index = 0;
5820 #if 0 /* REGC() is (currently) a NOP at the first pass.
5821 * Clever compilers notice this and complain. --jhi */
5822 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5825 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5827 RExC_lastparse=NULL;
5829 /* reg may croak on us, not giving us a chance to free
5830 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5831 need it to survive as long as the regexp (qr/(?{})/).
5832 We must check that code_blocksv is not already set, because we may
5833 have longjmped back. */
5834 if (pRExC_state->code_blocks && !code_blocksv) {
5835 code_blocksv = newSV_type(SVt_PV);
5836 SAVEFREESV(code_blocksv);
5837 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5838 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5840 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5841 RExC_precomp = NULL;
5845 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5847 /* Here, finished first pass. Get rid of any added setjmp */
5853 PerlIO_printf(Perl_debug_log,
5854 "Required size %"IVdf" nodes\n"
5855 "Starting second pass (creation)\n",
5858 RExC_lastparse=NULL;
5861 /* The first pass could have found things that force Unicode semantics */
5862 if ((RExC_utf8 || RExC_uni_semantics)
5863 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5865 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5868 /* Small enough for pointer-storage convention?
5869 If extralen==0, this means that we will not need long jumps. */
5870 if (RExC_size >= 0x10000L && RExC_extralen)
5871 RExC_size += RExC_extralen;
5874 if (RExC_whilem_seen > 15)
5875 RExC_whilem_seen = 15;
5877 /* Allocate space and zero-initialize. Note, the two step process
5878 of zeroing when in debug mode, thus anything assigned has to
5879 happen after that */
5880 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5882 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5883 char, regexp_internal);
5884 if ( r == NULL || ri == NULL )
5885 FAIL("Regexp out of space");
5887 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5888 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5890 /* bulk initialize base fields with 0. */
5891 Zero(ri, sizeof(regexp_internal), char);
5894 /* non-zero initialization begins here */
5897 r->extflags = rx_flags;
5898 if (pm_flags & PMf_IS_QR) {
5899 ri->code_blocks = pRExC_state->code_blocks;
5900 ri->num_code_blocks = pRExC_state->num_code_blocks;
5905 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5906 if (pRExC_state->code_blocks[n].src_regex)
5907 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5908 SAVEFREEPV(pRExC_state->code_blocks);
5912 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5913 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5915 /* The caret is output if there are any defaults: if not all the STD
5916 * flags are set, or if no character set specifier is needed */
5918 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5920 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5921 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5922 >> RXf_PMf_STD_PMMOD_SHIFT);
5923 const char *fptr = STD_PAT_MODS; /*"msix"*/
5925 /* Allocate for the worst case, which is all the std flags are turned
5926 * on. If more precision is desired, we could do a population count of
5927 * the flags set. This could be done with a small lookup table, or by
5928 * shifting, masking and adding, or even, when available, assembly
5929 * language for a machine-language population count.
5930 * We never output a minus, as all those are defaults, so are
5931 * covered by the caret */
5932 const STRLEN wraplen = plen + has_p + has_runon
5933 + has_default /* If needs a caret */
5935 /* If needs a character set specifier */
5936 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5937 + (sizeof(STD_PAT_MODS) - 1)
5938 + (sizeof("(?:)") - 1);
5940 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5941 r->xpv_len_u.xpvlenu_pv = p;
5943 SvFLAGS(rx) |= SVf_UTF8;
5946 /* If a default, cover it using the caret */
5948 *p++= DEFAULT_PAT_MOD;
5952 const char* const name = get_regex_charset_name(r->extflags, &len);
5953 Copy(name, p, len, char);
5957 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5960 while((ch = *fptr++)) {
5968 Copy(RExC_precomp, p, plen, char);
5969 assert ((RX_WRAPPED(rx) - p) < 16);
5970 r->pre_prefix = p - RX_WRAPPED(rx);
5976 SvCUR_set(rx, p - RX_WRAPPED(rx));
5980 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5982 if (RExC_seen & REG_SEEN_RECURSE) {
5983 Newxz(RExC_open_parens, RExC_npar,regnode *);
5984 SAVEFREEPV(RExC_open_parens);
5985 Newxz(RExC_close_parens,RExC_npar,regnode *);
5986 SAVEFREEPV(RExC_close_parens);
5989 /* Useful during FAIL. */
5990 #ifdef RE_TRACK_PATTERN_OFFSETS
5991 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5992 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5993 "%s %"UVuf" bytes for offset annotations.\n",
5994 ri->u.offsets ? "Got" : "Couldn't get",
5995 (UV)((2*RExC_size+1) * sizeof(U32))));
5997 SetProgLen(ri,RExC_size);
6001 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6003 /* Second pass: emit code. */
6004 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6005 RExC_pm_flags = pm_flags;
6010 RExC_emit_start = ri->program;
6011 RExC_emit = ri->program;
6012 RExC_emit_bound = ri->program + RExC_size + 1;
6013 pRExC_state->code_index = 0;
6015 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6016 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6020 /* XXXX To minimize changes to RE engine we always allocate
6021 3-units-long substrs field. */
6022 Newx(r->substrs, 1, struct reg_substr_data);
6023 if (RExC_recurse_count) {
6024 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6025 SAVEFREEPV(RExC_recurse);
6029 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6030 Zero(r->substrs, 1, struct reg_substr_data);
6032 #ifdef TRIE_STUDY_OPT
6034 StructCopy(&zero_scan_data, &data, scan_data_t);
6035 copyRExC_state = RExC_state;
6038 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6040 RExC_state = copyRExC_state;
6041 if (seen & REG_TOP_LEVEL_BRANCHES)
6042 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6044 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6045 if (data.last_found) {
6046 SvREFCNT_dec(data.longest_fixed);
6047 SvREFCNT_dec(data.longest_float);
6048 SvREFCNT_dec(data.last_found);
6050 StructCopy(&zero_scan_data, &data, scan_data_t);
6053 StructCopy(&zero_scan_data, &data, scan_data_t);
6056 /* Dig out information for optimizations. */
6057 r->extflags = RExC_flags; /* was pm_op */
6058 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6061 SvUTF8_on(rx); /* Unicode in it? */
6062 ri->regstclass = NULL;
6063 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6064 r->intflags |= PREGf_NAUGHTY;
6065 scan = ri->program + 1; /* First BRANCH. */
6067 /* testing for BRANCH here tells us whether there is "must appear"
6068 data in the pattern. If there is then we can use it for optimisations */
6069 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6071 STRLEN longest_float_length, longest_fixed_length;
6072 struct regnode_charclass_class ch_class; /* pointed to by data */
6074 I32 last_close = 0; /* pointed to by data */
6075 regnode *first= scan;
6076 regnode *first_next= regnext(first);
6078 * Skip introductions and multiplicators >= 1
6079 * so that we can extract the 'meat' of the pattern that must
6080 * match in the large if() sequence following.
6081 * NOTE that EXACT is NOT covered here, as it is normally
6082 * picked up by the optimiser separately.
6084 * This is unfortunate as the optimiser isnt handling lookahead
6085 * properly currently.
6088 while ((OP(first) == OPEN && (sawopen = 1)) ||
6089 /* An OR of *one* alternative - should not happen now. */
6090 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6091 /* for now we can't handle lookbehind IFMATCH*/
6092 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6093 (OP(first) == PLUS) ||
6094 (OP(first) == MINMOD) ||
6095 /* An {n,m} with n>0 */
6096 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6097 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6100 * the only op that could be a regnode is PLUS, all the rest
6101 * will be regnode_1 or regnode_2.
6104 if (OP(first) == PLUS)
6107 first += regarglen[OP(first)];
6109 first = NEXTOPER(first);
6110 first_next= regnext(first);
6113 /* Starting-point info. */
6115 DEBUG_PEEP("first:",first,0);
6116 /* Ignore EXACT as we deal with it later. */
6117 if (PL_regkind[OP(first)] == EXACT) {
6118 if (OP(first) == EXACT)
6119 NOOP; /* Empty, get anchored substr later. */
6121 ri->regstclass = first;
6124 else if (PL_regkind[OP(first)] == TRIE &&
6125 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6128 /* this can happen only on restudy */
6129 if ( OP(first) == TRIE ) {
6130 struct regnode_1 *trieop = (struct regnode_1 *)
6131 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6132 StructCopy(first,trieop,struct regnode_1);
6133 trie_op=(regnode *)trieop;
6135 struct regnode_charclass *trieop = (struct regnode_charclass *)
6136 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6137 StructCopy(first,trieop,struct regnode_charclass);
6138 trie_op=(regnode *)trieop;
6141 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6142 ri->regstclass = trie_op;
6145 else if (REGNODE_SIMPLE(OP(first)))
6146 ri->regstclass = first;
6147 else if (PL_regkind[OP(first)] == BOUND ||
6148 PL_regkind[OP(first)] == NBOUND)
6149 ri->regstclass = first;
6150 else if (PL_regkind[OP(first)] == BOL) {
6151 r->extflags |= (OP(first) == MBOL
6153 : (OP(first) == SBOL
6156 first = NEXTOPER(first);
6159 else if (OP(first) == GPOS) {
6160 r->extflags |= RXf_ANCH_GPOS;
6161 first = NEXTOPER(first);
6164 else if ((!sawopen || !RExC_sawback) &&
6165 (OP(first) == STAR &&
6166 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6167 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6169 /* turn .* into ^.* with an implied $*=1 */
6171 (OP(NEXTOPER(first)) == REG_ANY)
6174 r->extflags |= type;
6175 r->intflags |= PREGf_IMPLICIT;
6176 first = NEXTOPER(first);
6179 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6180 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6181 /* x+ must match at the 1st pos of run of x's */
6182 r->intflags |= PREGf_SKIP;
6184 /* Scan is after the zeroth branch, first is atomic matcher. */
6185 #ifdef TRIE_STUDY_OPT
6188 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6189 (IV)(first - scan + 1))
6193 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6194 (IV)(first - scan + 1))
6200 * If there's something expensive in the r.e., find the
6201 * longest literal string that must appear and make it the
6202 * regmust. Resolve ties in favor of later strings, since
6203 * the regstart check works with the beginning of the r.e.
6204 * and avoiding duplication strengthens checking. Not a
6205 * strong reason, but sufficient in the absence of others.
6206 * [Now we resolve ties in favor of the earlier string if
6207 * it happens that c_offset_min has been invalidated, since the
6208 * earlier string may buy us something the later one won't.]
6211 data.longest_fixed = newSVpvs("");
6212 data.longest_float = newSVpvs("");
6213 data.last_found = newSVpvs("");
6214 data.longest = &(data.longest_fixed);
6216 if (!ri->regstclass) {
6217 cl_init(pRExC_state, &ch_class);
6218 data.start_class = &ch_class;
6219 stclass_flag = SCF_DO_STCLASS_AND;
6220 } else /* XXXX Check for BOUND? */
6222 data.last_closep = &last_close;
6224 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6225 &data, -1, NULL, NULL,
6226 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6232 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6233 && data.last_start_min == 0 && data.last_end > 0
6234 && !RExC_seen_zerolen
6235 && !(RExC_seen & REG_SEEN_VERBARG)
6236 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6237 r->extflags |= RXf_CHECK_ALL;
6238 scan_commit(pRExC_state, &data,&minlen,0);
6239 SvREFCNT_dec(data.last_found);
6241 longest_float_length = CHR_SVLEN(data.longest_float);
6243 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6244 && data.offset_fixed == data.offset_float_min
6245 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6246 && S_setup_longest (aTHX_ pRExC_state,
6250 &(r->float_end_shift),
6251 data.lookbehind_float,
6252 data.offset_float_min,
6254 longest_float_length,
6255 data.flags & SF_FL_BEFORE_EOL,
6256 data.flags & SF_FL_BEFORE_MEOL))
6258 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6259 r->float_max_offset = data.offset_float_max;
6260 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6261 r->float_max_offset -= data.lookbehind_float;
6264 r->float_substr = r->float_utf8 = NULL;
6265 SvREFCNT_dec(data.longest_float);
6266 longest_float_length = 0;
6269 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6271 if (S_setup_longest (aTHX_ pRExC_state,
6273 &(r->anchored_utf8),
6274 &(r->anchored_substr),
6275 &(r->anchored_end_shift),
6276 data.lookbehind_fixed,
6279 longest_fixed_length,
6280 data.flags & SF_FIX_BEFORE_EOL,
6281 data.flags & SF_FIX_BEFORE_MEOL))
6283 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6286 r->anchored_substr = r->anchored_utf8 = NULL;
6287 SvREFCNT_dec(data.longest_fixed);
6288 longest_fixed_length = 0;
6292 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6293 ri->regstclass = NULL;
6295 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6297 && !(data.start_class->flags & ANYOF_EOS)
6298 && !cl_is_anything(data.start_class))
6300 const U32 n = add_data(pRExC_state, 1, "f");
6301 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6303 Newx(RExC_rxi->data->data[n], 1,
6304 struct regnode_charclass_class);
6305 StructCopy(data.start_class,
6306 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6307 struct regnode_charclass_class);
6308 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6309 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6310 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6311 regprop(r, sv, (regnode*)data.start_class);
6312 PerlIO_printf(Perl_debug_log,
6313 "synthetic stclass \"%s\".\n",
6314 SvPVX_const(sv));});
6317 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6318 if (longest_fixed_length > longest_float_length) {
6319 r->check_end_shift = r->anchored_end_shift;
6320 r->check_substr = r->anchored_substr;
6321 r->check_utf8 = r->anchored_utf8;
6322 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6323 if (r->extflags & RXf_ANCH_SINGLE)
6324 r->extflags |= RXf_NOSCAN;
6327 r->check_end_shift = r->float_end_shift;
6328 r->check_substr = r->float_substr;
6329 r->check_utf8 = r->float_utf8;
6330 r->check_offset_min = r->float_min_offset;
6331 r->check_offset_max = r->float_max_offset;
6333 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6334 This should be changed ASAP! */
6335 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6336 r->extflags |= RXf_USE_INTUIT;
6337 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6338 r->extflags |= RXf_INTUIT_TAIL;
6340 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6341 if ( (STRLEN)minlen < longest_float_length )
6342 minlen= longest_float_length;
6343 if ( (STRLEN)minlen < longest_fixed_length )
6344 minlen= longest_fixed_length;
6348 /* Several toplevels. Best we can is to set minlen. */
6350 struct regnode_charclass_class ch_class;
6353 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6355 scan = ri->program + 1;
6356 cl_init(pRExC_state, &ch_class);
6357 data.start_class = &ch_class;
6358 data.last_closep = &last_close;
6361 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6362 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6366 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6367 = r->float_substr = r->float_utf8 = NULL;
6369 if (!(data.start_class->flags & ANYOF_EOS)
6370 && !cl_is_anything(data.start_class))
6372 const U32 n = add_data(pRExC_state, 1, "f");
6373 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6375 Newx(RExC_rxi->data->data[n], 1,
6376 struct regnode_charclass_class);
6377 StructCopy(data.start_class,
6378 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6379 struct regnode_charclass_class);
6380 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6381 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6382 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6383 regprop(r, sv, (regnode*)data.start_class);
6384 PerlIO_printf(Perl_debug_log,
6385 "synthetic stclass \"%s\".\n",
6386 SvPVX_const(sv));});
6390 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6391 the "real" pattern. */
6393 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6394 (IV)minlen, (IV)r->minlen);
6396 r->minlenret = minlen;
6397 if (r->minlen < minlen)
6400 if (RExC_seen & REG_SEEN_GPOS)
6401 r->extflags |= RXf_GPOS_SEEN;
6402 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6403 r->extflags |= RXf_LOOKBEHIND_SEEN;
6404 if (pRExC_state->num_code_blocks)
6405 r->extflags |= RXf_EVAL_SEEN;
6406 if (RExC_seen & REG_SEEN_CANY)
6407 r->extflags |= RXf_CANY_SEEN;
6408 if (RExC_seen & REG_SEEN_VERBARG)
6410 r->intflags |= PREGf_VERBARG_SEEN;
6411 r->extflags |= RXf_MODIFIES_VARS;
6413 if (RExC_seen & REG_SEEN_CUTGROUP)
6414 r->intflags |= PREGf_CUTGROUP_SEEN;
6415 if (pm_flags & PMf_USE_RE_EVAL)
6416 r->intflags |= PREGf_USE_RE_EVAL;
6417 if (RExC_paren_names)
6418 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6420 RXp_PAREN_NAMES(r) = NULL;
6422 #ifdef STUPID_PATTERN_CHECKS
6423 if (RX_PRELEN(rx) == 0)
6424 r->extflags |= RXf_NULL;
6425 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6426 r->extflags |= RXf_WHITE;
6427 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6428 r->extflags |= RXf_START_ONLY;
6431 regnode *first = ri->program + 1;
6434 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6435 r->extflags |= RXf_NULL;
6436 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6437 r->extflags |= RXf_START_ONLY;
6438 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6439 && OP(regnext(first)) == END)
6440 r->extflags |= RXf_WHITE;
6444 if (RExC_paren_names) {
6445 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6446 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6449 ri->name_list_idx = 0;
6451 if (RExC_recurse_count) {
6452 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6453 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6454 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6457 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6458 /* assume we don't need to swap parens around before we match */
6461 PerlIO_printf(Perl_debug_log,"Final program:\n");
6464 #ifdef RE_TRACK_PATTERN_OFFSETS
6465 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6466 const U32 len = ri->u.offsets[0];
6468 GET_RE_DEBUG_FLAGS_DECL;
6469 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6470 for (i = 1; i <= len; i++) {
6471 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6472 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6473 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6475 PerlIO_printf(Perl_debug_log, "\n");
6483 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6486 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6488 PERL_UNUSED_ARG(value);
6490 if (flags & RXapif_FETCH) {
6491 return reg_named_buff_fetch(rx, key, flags);
6492 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6493 Perl_croak_no_modify();
6495 } else if (flags & RXapif_EXISTS) {
6496 return reg_named_buff_exists(rx, key, flags)
6499 } else if (flags & RXapif_REGNAMES) {
6500 return reg_named_buff_all(rx, flags);
6501 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6502 return reg_named_buff_scalar(rx, flags);
6504 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6510 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6513 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6514 PERL_UNUSED_ARG(lastkey);
6516 if (flags & RXapif_FIRSTKEY)
6517 return reg_named_buff_firstkey(rx, flags);
6518 else if (flags & RXapif_NEXTKEY)
6519 return reg_named_buff_nextkey(rx, flags);
6521 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6527 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6530 AV *retarray = NULL;
6532 struct regexp *const rx = ReANY(r);
6534 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6536 if (flags & RXapif_ALL)
6539 if (rx && RXp_PAREN_NAMES(rx)) {
6540 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6543 SV* sv_dat=HeVAL(he_str);
6544 I32 *nums=(I32*)SvPVX(sv_dat);
6545 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6546 if ((I32)(rx->nparens) >= nums[i]
6547 && rx->offs[nums[i]].start != -1
6548 && rx->offs[nums[i]].end != -1)
6551 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6556 ret = newSVsv(&PL_sv_undef);
6559 av_push(retarray, ret);
6562 return newRV_noinc(MUTABLE_SV(retarray));
6569 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6572 struct regexp *const rx = ReANY(r);
6574 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6576 if (rx && RXp_PAREN_NAMES(rx)) {
6577 if (flags & RXapif_ALL) {
6578 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6580 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6594 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6596 struct regexp *const rx = ReANY(r);
6598 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6600 if ( rx && RXp_PAREN_NAMES(rx) ) {
6601 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6603 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6610 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6612 struct regexp *const rx = ReANY(r);
6613 GET_RE_DEBUG_FLAGS_DECL;
6615 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6617 if (rx && RXp_PAREN_NAMES(rx)) {
6618 HV *hv = RXp_PAREN_NAMES(rx);
6620 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6623 SV* sv_dat = HeVAL(temphe);
6624 I32 *nums = (I32*)SvPVX(sv_dat);
6625 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6626 if ((I32)(rx->lastparen) >= nums[i] &&
6627 rx->offs[nums[i]].start != -1 &&
6628 rx->offs[nums[i]].end != -1)
6634 if (parno || flags & RXapif_ALL) {
6635 return newSVhek(HeKEY_hek(temphe));
6643 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6648 struct regexp *const rx = ReANY(r);
6650 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6652 if (rx && RXp_PAREN_NAMES(rx)) {
6653 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6654 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6655 } else if (flags & RXapif_ONE) {
6656 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6657 av = MUTABLE_AV(SvRV(ret));
6658 length = av_len(av);
6660 return newSViv(length + 1);
6662 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6666 return &PL_sv_undef;
6670 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6672 struct regexp *const rx = ReANY(r);
6675 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6677 if (rx && RXp_PAREN_NAMES(rx)) {
6678 HV *hv= RXp_PAREN_NAMES(rx);
6680 (void)hv_iterinit(hv);
6681 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6684 SV* sv_dat = HeVAL(temphe);
6685 I32 *nums = (I32*)SvPVX(sv_dat);
6686 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6687 if ((I32)(rx->lastparen) >= nums[i] &&
6688 rx->offs[nums[i]].start != -1 &&
6689 rx->offs[nums[i]].end != -1)
6695 if (parno || flags & RXapif_ALL) {
6696 av_push(av, newSVhek(HeKEY_hek(temphe)));
6701 return newRV_noinc(MUTABLE_SV(av));
6705 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6708 struct regexp *const rx = ReANY(r);
6714 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6716 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6717 || n == RX_BUFF_IDX_CARET_FULLMATCH
6718 || n == RX_BUFF_IDX_CARET_POSTMATCH
6720 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6727 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6728 /* no need to distinguish between them any more */
6729 n = RX_BUFF_IDX_FULLMATCH;
6731 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6732 && rx->offs[0].start != -1)
6734 /* $`, ${^PREMATCH} */
6735 i = rx->offs[0].start;
6739 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6740 && rx->offs[0].end != -1)
6742 /* $', ${^POSTMATCH} */
6743 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6744 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6747 if ( 0 <= n && n <= (I32)rx->nparens &&
6748 (s1 = rx->offs[n].start) != -1 &&
6749 (t1 = rx->offs[n].end) != -1)
6751 /* $&, ${^MATCH}, $1 ... */
6753 s = rx->subbeg + s1 - rx->suboffset;
6758 assert(s >= rx->subbeg);
6759 assert(rx->sublen >= (s - rx->subbeg) + i );
6761 #if NO_TAINT_SUPPORT
6762 sv_setpvn(sv, s, i);
6764 const int oldtainted = TAINT_get;
6766 sv_setpvn(sv, s, i);
6767 TAINT_set(oldtainted);
6769 if ( (rx->extflags & RXf_CANY_SEEN)
6770 ? (RXp_MATCH_UTF8(rx)
6771 && (!i || is_utf8_string((U8*)s, i)))
6772 : (RXp_MATCH_UTF8(rx)) )
6779 if (RXp_MATCH_TAINTED(rx)) {
6780 if (SvTYPE(sv) >= SVt_PVMG) {
6781 MAGIC* const mg = SvMAGIC(sv);
6784 SvMAGIC_set(sv, mg->mg_moremagic);
6786 if ((mgt = SvMAGIC(sv))) {
6787 mg->mg_moremagic = mgt;
6788 SvMAGIC_set(sv, mg);
6799 sv_setsv(sv,&PL_sv_undef);
6805 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6806 SV const * const value)
6808 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6810 PERL_UNUSED_ARG(rx);
6811 PERL_UNUSED_ARG(paren);
6812 PERL_UNUSED_ARG(value);
6815 Perl_croak_no_modify();
6819 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6822 struct regexp *const rx = ReANY(r);
6826 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6828 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6830 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6831 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6835 case RX_BUFF_IDX_PREMATCH: /* $` */
6836 if (rx->offs[0].start != -1) {
6837 i = rx->offs[0].start;
6846 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6847 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6849 case RX_BUFF_IDX_POSTMATCH: /* $' */
6850 if (rx->offs[0].end != -1) {
6851 i = rx->sublen - rx->offs[0].end;
6853 s1 = rx->offs[0].end;
6860 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6861 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6865 /* $& / ${^MATCH}, $1, $2, ... */
6867 if (paren <= (I32)rx->nparens &&
6868 (s1 = rx->offs[paren].start) != -1 &&
6869 (t1 = rx->offs[paren].end) != -1)
6875 if (ckWARN(WARN_UNINITIALIZED))
6876 report_uninit((const SV *)sv);
6881 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6882 const char * const s = rx->subbeg - rx->suboffset + s1;
6887 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6894 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6896 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6897 PERL_UNUSED_ARG(rx);
6901 return newSVpvs("Regexp");
6904 /* Scans the name of a named buffer from the pattern.
6905 * If flags is REG_RSN_RETURN_NULL returns null.
6906 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6907 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6908 * to the parsed name as looked up in the RExC_paren_names hash.
6909 * If there is an error throws a vFAIL().. type exception.
6912 #define REG_RSN_RETURN_NULL 0
6913 #define REG_RSN_RETURN_NAME 1
6914 #define REG_RSN_RETURN_DATA 2
6917 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6919 char *name_start = RExC_parse;
6921 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6923 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6924 /* skip IDFIRST by using do...while */
6927 RExC_parse += UTF8SKIP(RExC_parse);
6928 } while (isALNUM_utf8((U8*)RExC_parse));
6932 } while (isALNUM(*RExC_parse));
6934 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6935 vFAIL("Group name must start with a non-digit word character");
6939 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6940 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6941 if ( flags == REG_RSN_RETURN_NAME)
6943 else if (flags==REG_RSN_RETURN_DATA) {
6946 if ( ! sv_name ) /* should not happen*/
6947 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6948 if (RExC_paren_names)
6949 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6951 sv_dat = HeVAL(he_str);
6953 vFAIL("Reference to nonexistent named group");
6957 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6958 (unsigned long) flags);
6960 assert(0); /* NOT REACHED */
6965 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6966 int rem=(int)(RExC_end - RExC_parse); \
6975 if (RExC_lastparse!=RExC_parse) \
6976 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6979 iscut ? "..." : "<" \
6982 PerlIO_printf(Perl_debug_log,"%16s",""); \
6985 num = RExC_size + 1; \
6987 num=REG_NODE_NUM(RExC_emit); \
6988 if (RExC_lastnum!=num) \
6989 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6991 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6992 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6993 (int)((depth*2)), "", \
6997 RExC_lastparse=RExC_parse; \
7002 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7003 DEBUG_PARSE_MSG((funcname)); \
7004 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7006 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7007 DEBUG_PARSE_MSG((funcname)); \
7008 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7011 /* This section of code defines the inversion list object and its methods. The
7012 * interfaces are highly subject to change, so as much as possible is static to
7013 * this file. An inversion list is here implemented as a malloc'd C UV array
7014 * with some added info that is placed as UVs at the beginning in a header
7015 * portion. An inversion list for Unicode is an array of code points, sorted
7016 * by ordinal number. The zeroth element is the first code point in the list.
7017 * The 1th element is the first element beyond that not in the list. In other
7018 * words, the first range is
7019 * invlist[0]..(invlist[1]-1)
7020 * The other ranges follow. Thus every element whose index is divisible by two
7021 * marks the beginning of a range that is in the list, and every element not
7022 * divisible by two marks the beginning of a range not in the list. A single
7023 * element inversion list that contains the single code point N generally
7024 * consists of two elements
7027 * (The exception is when N is the highest representable value on the
7028 * machine, in which case the list containing just it would be a single
7029 * element, itself. By extension, if the last range in the list extends to
7030 * infinity, then the first element of that range will be in the inversion list
7031 * at a position that is divisible by two, and is the final element in the
7033 * Taking the complement (inverting) an inversion list is quite simple, if the
7034 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7035 * This implementation reserves an element at the beginning of each inversion
7036 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
7037 * actual beginning of the list is either that element if 0, or the next one if
7040 * More about inversion lists can be found in "Unicode Demystified"
7041 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7042 * More will be coming when functionality is added later.
7044 * The inversion list data structure is currently implemented as an SV pointing
7045 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7046 * array of UV whose memory management is automatically handled by the existing
7047 * facilities for SV's.
7049 * Some of the methods should always be private to the implementation, and some
7050 * should eventually be made public */
7052 /* The header definitions are in F<inline_invlist.c> */
7054 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7055 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7057 #define INVLIST_INITIAL_LEN 10
7059 PERL_STATIC_INLINE UV*
7060 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7062 /* Returns a pointer to the first element in the inversion list's array.
7063 * This is called upon initialization of an inversion list. Where the
7064 * array begins depends on whether the list has the code point U+0000
7065 * in it or not. The other parameter tells it whether the code that
7066 * follows this call is about to put a 0 in the inversion list or not.
7067 * The first element is either the element with 0, if 0, or the next one,
7070 UV* zero = get_invlist_zero_addr(invlist);
7072 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7075 assert(! *_get_invlist_len_addr(invlist));
7077 /* 1^1 = 0; 1^0 = 1 */
7078 *zero = 1 ^ will_have_0;
7079 return zero + *zero;
7082 PERL_STATIC_INLINE UV*
7083 S_invlist_array(pTHX_ SV* const invlist)
7085 /* Returns the pointer to the inversion list's array. Every time the
7086 * length changes, this needs to be called in case malloc or realloc moved
7089 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7091 /* Must not be empty. If these fail, you probably didn't check for <len>
7092 * being non-zero before trying to get the array */
7093 assert(*_get_invlist_len_addr(invlist));
7094 assert(*get_invlist_zero_addr(invlist) == 0
7095 || *get_invlist_zero_addr(invlist) == 1);
7097 /* The array begins either at the element reserved for zero if the
7098 * list contains 0 (that element will be set to 0), or otherwise the next
7099 * element (in which case the reserved element will be set to 1). */
7100 return (UV *) (get_invlist_zero_addr(invlist)
7101 + *get_invlist_zero_addr(invlist));
7104 PERL_STATIC_INLINE void
7105 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7107 /* Sets the current number of elements stored in the inversion list */
7109 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7111 *_get_invlist_len_addr(invlist) = len;
7113 assert(len <= SvLEN(invlist));
7115 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7116 /* If the list contains U+0000, that element is part of the header,
7117 * and should not be counted as part of the array. It will contain
7118 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7120 * SvCUR_set(invlist,
7121 * TO_INTERNAL_SIZE(len
7122 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7123 * But, this is only valid if len is not 0. The consequences of not doing
7124 * this is that the memory allocation code may think that 1 more UV is
7125 * being used than actually is, and so might do an unnecessary grow. That
7126 * seems worth not bothering to make this the precise amount.
7128 * Note that when inverting, SvCUR shouldn't change */
7131 PERL_STATIC_INLINE IV*
7132 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7134 /* Return the address of the UV that is reserved to hold the cached index
7137 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7139 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7142 PERL_STATIC_INLINE IV
7143 S_invlist_previous_index(pTHX_ SV* const invlist)
7145 /* Returns cached index of previous search */
7147 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7149 return *get_invlist_previous_index_addr(invlist);
7152 PERL_STATIC_INLINE void
7153 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7155 /* Caches <index> for later retrieval */
7157 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7159 assert(index == 0 || index < (int) _invlist_len(invlist));
7161 *get_invlist_previous_index_addr(invlist) = index;
7164 PERL_STATIC_INLINE UV
7165 S_invlist_max(pTHX_ SV* const invlist)
7167 /* Returns the maximum number of elements storable in the inversion list's
7168 * array, without having to realloc() */
7170 PERL_ARGS_ASSERT_INVLIST_MAX;
7172 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7175 PERL_STATIC_INLINE UV*
7176 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7178 /* Return the address of the UV that is reserved to hold 0 if the inversion
7179 * list contains 0. This has to be the last element of the heading, as the
7180 * list proper starts with either it if 0, or the next element if not.
7181 * (But we force it to contain either 0 or 1) */
7183 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7185 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7188 #ifndef PERL_IN_XSUB_RE
7190 Perl__new_invlist(pTHX_ IV initial_size)
7193 /* Return a pointer to a newly constructed inversion list, with enough
7194 * space to store 'initial_size' elements. If that number is negative, a
7195 * system default is used instead */
7199 if (initial_size < 0) {
7200 initial_size = INVLIST_INITIAL_LEN;
7203 /* Allocate the initial space */
7204 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7205 invlist_set_len(new_list, 0);
7207 /* Force iterinit() to be used to get iteration to work */
7208 *get_invlist_iter_addr(new_list) = UV_MAX;
7210 /* This should force a segfault if a method doesn't initialize this
7212 *get_invlist_zero_addr(new_list) = UV_MAX;
7214 *get_invlist_previous_index_addr(new_list) = 0;
7215 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7216 #if HEADER_LENGTH != 5
7217 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7225 S__new_invlist_C_array(pTHX_ UV* list)
7227 /* Return a pointer to a newly constructed inversion list, initialized to
7228 * point to <list>, which has to be in the exact correct inversion list
7229 * form, including internal fields. Thus this is a dangerous routine that
7230 * should not be used in the wrong hands */
7232 SV* invlist = newSV_type(SVt_PV);
7234 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7236 SvPV_set(invlist, (char *) list);
7237 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7238 shouldn't touch it */
7239 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7241 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7242 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7249 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7251 /* Grow the maximum size of an inversion list */
7253 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7255 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7258 PERL_STATIC_INLINE void
7259 S_invlist_trim(pTHX_ SV* const invlist)
7261 PERL_ARGS_ASSERT_INVLIST_TRIM;
7263 /* Change the length of the inversion list to how many entries it currently
7266 SvPV_shrink_to_cur((SV *) invlist);
7269 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7272 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7274 /* Subject to change or removal. Append the range from 'start' to 'end' at
7275 * the end of the inversion list. The range must be above any existing
7279 UV max = invlist_max(invlist);
7280 UV len = _invlist_len(invlist);
7282 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7284 if (len == 0) { /* Empty lists must be initialized */
7285 array = _invlist_array_init(invlist, start == 0);
7288 /* Here, the existing list is non-empty. The current max entry in the
7289 * list is generally the first value not in the set, except when the
7290 * set extends to the end of permissible values, in which case it is
7291 * the first entry in that final set, and so this call is an attempt to
7292 * append out-of-order */
7294 UV final_element = len - 1;
7295 array = invlist_array(invlist);
7296 if (array[final_element] > start
7297 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7299 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7300 array[final_element], start,
7301 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7304 /* Here, it is a legal append. If the new range begins with the first
7305 * value not in the set, it is extending the set, so the new first
7306 * value not in the set is one greater than the newly extended range.
7308 if (array[final_element] == start) {
7309 if (end != UV_MAX) {
7310 array[final_element] = end + 1;
7313 /* But if the end is the maximum representable on the machine,
7314 * just let the range that this would extend to have no end */
7315 invlist_set_len(invlist, len - 1);
7321 /* Here the new range doesn't extend any existing set. Add it */
7323 len += 2; /* Includes an element each for the start and end of range */
7325 /* If overflows the existing space, extend, which may cause the array to be
7328 invlist_extend(invlist, len);
7329 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7330 failure in invlist_array() */
7331 array = invlist_array(invlist);
7334 invlist_set_len(invlist, len);
7337 /* The next item on the list starts the range, the one after that is
7338 * one past the new range. */
7339 array[len - 2] = start;
7340 if (end != UV_MAX) {
7341 array[len - 1] = end + 1;
7344 /* But if the end is the maximum representable on the machine, just let
7345 * the range have no end */
7346 invlist_set_len(invlist, len - 1);
7350 #ifndef PERL_IN_XSUB_RE
7353 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7355 /* Searches the inversion list for the entry that contains the input code
7356 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7357 * return value is the index into the list's array of the range that
7362 IV high = _invlist_len(invlist);
7363 const IV highest_element = high - 1;
7366 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7368 /* If list is empty, return failure. */
7373 /* If the code point is before the first element, return failure. (We
7374 * can't combine this with the test above, because we can't get the array
7375 * unless we know the list is non-empty) */
7376 array = invlist_array(invlist);
7378 mid = invlist_previous_index(invlist);
7379 assert(mid >=0 && mid <= highest_element);
7381 /* <mid> contains the cache of the result of the previous call to this
7382 * function (0 the first time). See if this call is for the same result,
7383 * or if it is for mid-1. This is under the theory that calls to this
7384 * function will often be for related code points that are near each other.
7385 * And benchmarks show that caching gives better results. We also test
7386 * here if the code point is within the bounds of the list. These tests
7387 * replace others that would have had to be made anyway to make sure that
7388 * the array bounds were not exceeded, and these give us extra information
7389 * at the same time */
7390 if (cp >= array[mid]) {
7391 if (cp >= array[highest_element]) {
7392 return highest_element;
7395 /* Here, array[mid] <= cp < array[highest_element]. This means that
7396 * the final element is not the answer, so can exclude it; it also
7397 * means that <mid> is not the final element, so can refer to 'mid + 1'
7399 if (cp < array[mid + 1]) {
7405 else { /* cp < aray[mid] */
7406 if (cp < array[0]) { /* Fail if outside the array */
7410 if (cp >= array[mid - 1]) {
7415 /* Binary search. What we are looking for is <i> such that
7416 * array[i] <= cp < array[i+1]
7417 * The loop below converges on the i+1. Note that there may not be an
7418 * (i+1)th element in the array, and things work nonetheless */
7419 while (low < high) {
7420 mid = (low + high) / 2;
7421 assert(mid <= highest_element);
7422 if (array[mid] <= cp) { /* cp >= array[mid] */
7425 /* We could do this extra test to exit the loop early.
7426 if (cp < array[low]) {
7431 else { /* cp < array[mid] */
7438 invlist_set_previous_index(invlist, high);
7443 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7445 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7446 * but is used when the swash has an inversion list. This makes this much
7447 * faster, as it uses a binary search instead of a linear one. This is
7448 * intimately tied to that function, and perhaps should be in utf8.c,
7449 * except it is intimately tied to inversion lists as well. It assumes
7450 * that <swatch> is all 0's on input */
7453 const IV len = _invlist_len(invlist);
7457 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7459 if (len == 0) { /* Empty inversion list */
7463 array = invlist_array(invlist);
7465 /* Find which element it is */
7466 i = _invlist_search(invlist, start);
7468 /* We populate from <start> to <end> */
7469 while (current < end) {
7472 /* The inversion list gives the results for every possible code point
7473 * after the first one in the list. Only those ranges whose index is
7474 * even are ones that the inversion list matches. For the odd ones,
7475 * and if the initial code point is not in the list, we have to skip
7476 * forward to the next element */
7477 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7479 if (i >= len) { /* Finished if beyond the end of the array */
7483 if (current >= end) { /* Finished if beyond the end of what we
7485 if (LIKELY(end < UV_MAX)) {
7489 /* We get here when the upper bound is the maximum
7490 * representable on the machine, and we are looking for just
7491 * that code point. Have to special case it */
7493 goto join_end_of_list;
7496 assert(current >= start);
7498 /* The current range ends one below the next one, except don't go past
7501 upper = (i < len && array[i] < end) ? array[i] : end;
7503 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7504 * for each code point in it */
7505 for (; current < upper; current++) {
7506 const STRLEN offset = (STRLEN)(current - start);
7507 swatch[offset >> 3] |= 1 << (offset & 7);
7512 /* Quit if at the end of the list */
7515 /* But first, have to deal with the highest possible code point on
7516 * the platform. The previous code assumes that <end> is one
7517 * beyond where we want to populate, but that is impossible at the
7518 * platform's infinity, so have to handle it specially */
7519 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7521 const STRLEN offset = (STRLEN)(end - start);
7522 swatch[offset >> 3] |= 1 << (offset & 7);
7527 /* Advance to the next range, which will be for code points not in the
7536 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7538 /* Take the union of two inversion lists and point <output> to it. *output
7539 * should be defined upon input, and if it points to one of the two lists,
7540 * the reference count to that list will be decremented. The first list,
7541 * <a>, may be NULL, in which case a copy of the second list is returned.
7542 * If <complement_b> is TRUE, the union is taken of the complement
7543 * (inversion) of <b> instead of b itself.
7545 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7546 * Richard Gillam, published by Addison-Wesley, and explained at some
7547 * length there. The preface says to incorporate its examples into your
7548 * code at your own risk.
7550 * The algorithm is like a merge sort.
7552 * XXX A potential performance improvement is to keep track as we go along
7553 * if only one of the inputs contributes to the result, meaning the other
7554 * is a subset of that one. In that case, we can skip the final copy and
7555 * return the larger of the input lists, but then outside code might need
7556 * to keep track of whether to free the input list or not */
7558 UV* array_a; /* a's array */
7560 UV len_a; /* length of a's array */
7563 SV* u; /* the resulting union */
7567 UV i_a = 0; /* current index into a's array */
7571 /* running count, as explained in the algorithm source book; items are
7572 * stopped accumulating and are output when the count changes to/from 0.
7573 * The count is incremented when we start a range that's in the set, and
7574 * decremented when we start a range that's not in the set. So its range
7575 * is 0 to 2. Only when the count is zero is something not in the set.
7579 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7582 /* If either one is empty, the union is the other one */
7583 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7590 *output = invlist_clone(b);
7592 _invlist_invert(*output);
7594 } /* else *output already = b; */
7597 else if ((len_b = _invlist_len(b)) == 0) {
7602 /* The complement of an empty list is a list that has everything in it,
7603 * so the union with <a> includes everything too */
7608 *output = _new_invlist(1);
7609 _append_range_to_invlist(*output, 0, UV_MAX);
7611 else if (*output != a) {
7612 *output = invlist_clone(a);
7614 /* else *output already = a; */
7618 /* Here both lists exist and are non-empty */
7619 array_a = invlist_array(a);
7620 array_b = invlist_array(b);
7622 /* If are to take the union of 'a' with the complement of b, set it
7623 * up so are looking at b's complement. */
7626 /* To complement, we invert: if the first element is 0, remove it. To
7627 * do this, we just pretend the array starts one later, and clear the
7628 * flag as we don't have to do anything else later */
7629 if (array_b[0] == 0) {
7632 complement_b = FALSE;
7636 /* But if the first element is not zero, we unshift a 0 before the
7637 * array. The data structure reserves a space for that 0 (which
7638 * should be a '1' right now), so physical shifting is unneeded,
7639 * but temporarily change that element to 0. Before exiting the
7640 * routine, we must restore the element to '1' */
7647 /* Size the union for the worst case: that the sets are completely
7649 u = _new_invlist(len_a + len_b);
7651 /* Will contain U+0000 if either component does */
7652 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7653 || (len_b > 0 && array_b[0] == 0));
7655 /* Go through each list item by item, stopping when exhausted one of
7657 while (i_a < len_a && i_b < len_b) {
7658 UV cp; /* The element to potentially add to the union's array */
7659 bool cp_in_set; /* is it in the the input list's set or not */
7661 /* We need to take one or the other of the two inputs for the union.
7662 * Since we are merging two sorted lists, we take the smaller of the
7663 * next items. In case of a tie, we take the one that is in its set
7664 * first. If we took one not in the set first, it would decrement the
7665 * count, possibly to 0 which would cause it to be output as ending the
7666 * range, and the next time through we would take the same number, and
7667 * output it again as beginning the next range. By doing it the
7668 * opposite way, there is no possibility that the count will be
7669 * momentarily decremented to 0, and thus the two adjoining ranges will
7670 * be seamlessly merged. (In a tie and both are in the set or both not
7671 * in the set, it doesn't matter which we take first.) */
7672 if (array_a[i_a] < array_b[i_b]
7673 || (array_a[i_a] == array_b[i_b]
7674 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7676 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7680 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7684 /* Here, have chosen which of the two inputs to look at. Only output
7685 * if the running count changes to/from 0, which marks the
7686 * beginning/end of a range in that's in the set */
7689 array_u[i_u++] = cp;
7696 array_u[i_u++] = cp;
7701 /* Here, we are finished going through at least one of the lists, which
7702 * means there is something remaining in at most one. We check if the list
7703 * that hasn't been exhausted is positioned such that we are in the middle
7704 * of a range in its set or not. (i_a and i_b point to the element beyond
7705 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7706 * is potentially more to output.
7707 * There are four cases:
7708 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7709 * in the union is entirely from the non-exhausted set.
7710 * 2) Both were in their sets, count is 2. Nothing further should
7711 * be output, as everything that remains will be in the exhausted
7712 * list's set, hence in the union; decrementing to 1 but not 0 insures
7714 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7715 * Nothing further should be output because the union includes
7716 * everything from the exhausted set. Not decrementing ensures that.
7717 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7718 * decrementing to 0 insures that we look at the remainder of the
7719 * non-exhausted set */
7720 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7721 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7726 /* The final length is what we've output so far, plus what else is about to
7727 * be output. (If 'count' is non-zero, then the input list we exhausted
7728 * has everything remaining up to the machine's limit in its set, and hence
7729 * in the union, so there will be no further output. */
7732 /* At most one of the subexpressions will be non-zero */
7733 len_u += (len_a - i_a) + (len_b - i_b);
7736 /* Set result to final length, which can change the pointer to array_u, so
7738 if (len_u != _invlist_len(u)) {
7739 invlist_set_len(u, len_u);
7741 array_u = invlist_array(u);
7744 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7745 * the other) ended with everything above it not in its set. That means
7746 * that the remaining part of the union is precisely the same as the
7747 * non-exhausted list, so can just copy it unchanged. (If both list were
7748 * exhausted at the same time, then the operations below will be both 0.)
7751 IV copy_count; /* At most one will have a non-zero copy count */
7752 if ((copy_count = len_a - i_a) > 0) {
7753 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7755 else if ((copy_count = len_b - i_b) > 0) {
7756 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7760 /* We may be removing a reference to one of the inputs */
7761 if (a == *output || b == *output) {
7762 SvREFCNT_dec(*output);
7765 /* If we've changed b, restore it */
7775 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7777 /* Take the intersection of two inversion lists and point <i> to it. *i
7778 * should be defined upon input, and if it points to one of the two lists,
7779 * the reference count to that list will be decremented.
7780 * If <complement_b> is TRUE, the result will be the intersection of <a>
7781 * and the complement (or inversion) of <b> instead of <b> directly.
7783 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7784 * Richard Gillam, published by Addison-Wesley, and explained at some
7785 * length there. The preface says to incorporate its examples into your
7786 * code at your own risk. In fact, it had bugs
7788 * The algorithm is like a merge sort, and is essentially the same as the
7792 UV* array_a; /* a's array */
7794 UV len_a; /* length of a's array */
7797 SV* r; /* the resulting intersection */
7801 UV i_a = 0; /* current index into a's array */
7805 /* running count, as explained in the algorithm source book; items are
7806 * stopped accumulating and are output when the count changes to/from 2.
7807 * The count is incremented when we start a range that's in the set, and
7808 * decremented when we start a range that's not in the set. So its range
7809 * is 0 to 2. Only when the count is 2 is something in the intersection.
7813 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7816 /* Special case if either one is empty */
7817 len_a = _invlist_len(a);
7818 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7820 if (len_a != 0 && complement_b) {
7822 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7823 * be empty. Here, also we are using 'b's complement, which hence
7824 * must be every possible code point. Thus the intersection is
7827 *i = invlist_clone(a);
7833 /* else *i is already 'a' */
7837 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7838 * intersection must be empty */
7845 *i = _new_invlist(0);
7849 /* Here both lists exist and are non-empty */
7850 array_a = invlist_array(a);
7851 array_b = invlist_array(b);
7853 /* If are to take the intersection of 'a' with the complement of b, set it
7854 * up so are looking at b's complement. */
7857 /* To complement, we invert: if the first element is 0, remove it. To
7858 * do this, we just pretend the array starts one later, and clear the
7859 * flag as we don't have to do anything else later */
7860 if (array_b[0] == 0) {
7863 complement_b = FALSE;
7867 /* But if the first element is not zero, we unshift a 0 before the
7868 * array. The data structure reserves a space for that 0 (which
7869 * should be a '1' right now), so physical shifting is unneeded,
7870 * but temporarily change that element to 0. Before exiting the
7871 * routine, we must restore the element to '1' */
7878 /* Size the intersection for the worst case: that the intersection ends up
7879 * fragmenting everything to be completely disjoint */
7880 r= _new_invlist(len_a + len_b);
7882 /* Will contain U+0000 iff both components do */
7883 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7884 && len_b > 0 && array_b[0] == 0);
7886 /* Go through each list item by item, stopping when exhausted one of
7888 while (i_a < len_a && i_b < len_b) {
7889 UV cp; /* The element to potentially add to the intersection's
7891 bool cp_in_set; /* Is it in the input list's set or not */
7893 /* We need to take one or the other of the two inputs for the
7894 * intersection. Since we are merging two sorted lists, we take the
7895 * smaller of the next items. In case of a tie, we take the one that
7896 * is not in its set first (a difference from the union algorithm). If
7897 * we took one in the set first, it would increment the count, possibly
7898 * to 2 which would cause it to be output as starting a range in the
7899 * intersection, and the next time through we would take that same
7900 * number, and output it again as ending the set. By doing it the
7901 * opposite of this, there is no possibility that the count will be
7902 * momentarily incremented to 2. (In a tie and both are in the set or
7903 * both not in the set, it doesn't matter which we take first.) */
7904 if (array_a[i_a] < array_b[i_b]
7905 || (array_a[i_a] == array_b[i_b]
7906 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7908 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7912 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7916 /* Here, have chosen which of the two inputs to look at. Only output
7917 * if the running count changes to/from 2, which marks the
7918 * beginning/end of a range that's in the intersection */
7922 array_r[i_r++] = cp;
7927 array_r[i_r++] = cp;
7933 /* Here, we are finished going through at least one of the lists, which
7934 * means there is something remaining in at most one. We check if the list
7935 * that has been exhausted is positioned such that we are in the middle
7936 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7937 * the ones we care about.) There are four cases:
7938 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7939 * nothing left in the intersection.
7940 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7941 * above 2. What should be output is exactly that which is in the
7942 * non-exhausted set, as everything it has is also in the intersection
7943 * set, and everything it doesn't have can't be in the intersection
7944 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7945 * gets incremented to 2. Like the previous case, the intersection is
7946 * everything that remains in the non-exhausted set.
7947 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7948 * remains 1. And the intersection has nothing more. */
7949 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7950 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7955 /* The final length is what we've output so far plus what else is in the
7956 * intersection. At most one of the subexpressions below will be non-zero */
7959 len_r += (len_a - i_a) + (len_b - i_b);
7962 /* Set result to final length, which can change the pointer to array_r, so
7964 if (len_r != _invlist_len(r)) {
7965 invlist_set_len(r, len_r);
7967 array_r = invlist_array(r);
7970 /* Finish outputting any remaining */
7971 if (count >= 2) { /* At most one will have a non-zero copy count */
7973 if ((copy_count = len_a - i_a) > 0) {
7974 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7976 else if ((copy_count = len_b - i_b) > 0) {
7977 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7981 /* We may be removing a reference to one of the inputs */
7982 if (a == *i || b == *i) {
7986 /* If we've changed b, restore it */
7996 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7998 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7999 * set. A pointer to the inversion list is returned. This may actually be
8000 * a new list, in which case the passed in one has been destroyed. The
8001 * passed in inversion list can be NULL, in which case a new one is created
8002 * with just the one range in it */
8007 if (invlist == NULL) {
8008 invlist = _new_invlist(2);
8012 len = _invlist_len(invlist);
8015 /* If comes after the final entry, can just append it to the end */
8017 || start >= invlist_array(invlist)
8018 [_invlist_len(invlist) - 1])
8020 _append_range_to_invlist(invlist, start, end);
8024 /* Here, can't just append things, create and return a new inversion list
8025 * which is the union of this range and the existing inversion list */
8026 range_invlist = _new_invlist(2);
8027 _append_range_to_invlist(range_invlist, start, end);
8029 _invlist_union(invlist, range_invlist, &invlist);
8031 /* The temporary can be freed */
8032 SvREFCNT_dec(range_invlist);
8039 PERL_STATIC_INLINE SV*
8040 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8041 return _add_range_to_invlist(invlist, cp, cp);
8044 #ifndef PERL_IN_XSUB_RE
8046 Perl__invlist_invert(pTHX_ SV* const invlist)
8048 /* Complement the input inversion list. This adds a 0 if the list didn't
8049 * have a zero; removes it otherwise. As described above, the data
8050 * structure is set up so that this is very efficient */
8052 UV* len_pos = _get_invlist_len_addr(invlist);
8054 PERL_ARGS_ASSERT__INVLIST_INVERT;
8056 /* The inverse of matching nothing is matching everything */
8057 if (*len_pos == 0) {
8058 _append_range_to_invlist(invlist, 0, UV_MAX);
8062 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8063 * zero element was a 0, so it is being removed, so the length decrements
8064 * by 1; and vice-versa. SvCUR is unaffected */
8065 if (*get_invlist_zero_addr(invlist) ^= 1) {
8074 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8076 /* Complement the input inversion list (which must be a Unicode property,
8077 * all of which don't match above the Unicode maximum code point.) And
8078 * Perl has chosen to not have the inversion match above that either. This
8079 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8085 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8087 _invlist_invert(invlist);
8089 len = _invlist_len(invlist);
8091 if (len != 0) { /* If empty do nothing */
8092 array = invlist_array(invlist);
8093 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8094 /* Add 0x110000. First, grow if necessary */
8096 if (invlist_max(invlist) < len) {
8097 invlist_extend(invlist, len);
8098 array = invlist_array(invlist);
8100 invlist_set_len(invlist, len);
8101 array[len - 1] = PERL_UNICODE_MAX + 1;
8103 else { /* Remove the 0x110000 */
8104 invlist_set_len(invlist, len - 1);
8112 PERL_STATIC_INLINE SV*
8113 S_invlist_clone(pTHX_ SV* const invlist)
8116 /* Return a new inversion list that is a copy of the input one, which is
8119 /* Need to allocate extra space to accommodate Perl's addition of a
8120 * trailing NUL to SvPV's, since it thinks they are always strings */
8121 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8122 STRLEN length = SvCUR(invlist);
8124 PERL_ARGS_ASSERT_INVLIST_CLONE;
8126 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8127 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8132 PERL_STATIC_INLINE UV*
8133 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8135 /* Return the address of the UV that contains the current iteration
8138 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8140 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8143 PERL_STATIC_INLINE UV*
8144 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8146 /* Return the address of the UV that contains the version id. */
8148 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8150 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8153 PERL_STATIC_INLINE void
8154 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8156 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8158 *get_invlist_iter_addr(invlist) = 0;
8162 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8164 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8165 * This call sets in <*start> and <*end>, the next range in <invlist>.
8166 * Returns <TRUE> if successful and the next call will return the next
8167 * range; <FALSE> if was already at the end of the list. If the latter,
8168 * <*start> and <*end> are unchanged, and the next call to this function
8169 * will start over at the beginning of the list */
8171 UV* pos = get_invlist_iter_addr(invlist);
8172 UV len = _invlist_len(invlist);
8175 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8178 *pos = UV_MAX; /* Force iternit() to be required next time */
8182 array = invlist_array(invlist);
8184 *start = array[(*pos)++];
8190 *end = array[(*pos)++] - 1;
8196 PERL_STATIC_INLINE UV
8197 S_invlist_highest(pTHX_ SV* const invlist)
8199 /* Returns the highest code point that matches an inversion list. This API
8200 * has an ambiguity, as it returns 0 under either the highest is actually
8201 * 0, or if the list is empty. If this distinction matters to you, check
8202 * for emptiness before calling this function */
8204 UV len = _invlist_len(invlist);
8207 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8213 array = invlist_array(invlist);
8215 /* The last element in the array in the inversion list always starts a
8216 * range that goes to infinity. That range may be for code points that are
8217 * matched in the inversion list, or it may be for ones that aren't
8218 * matched. In the latter case, the highest code point in the set is one
8219 * less than the beginning of this range; otherwise it is the final element
8220 * of this range: infinity */
8221 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8223 : array[len - 1] - 1;
8226 #ifndef PERL_IN_XSUB_RE
8228 Perl__invlist_contents(pTHX_ SV* const invlist)
8230 /* Get the contents of an inversion list into a string SV so that they can
8231 * be printed out. It uses the format traditionally done for debug tracing
8235 SV* output = newSVpvs("\n");
8237 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8239 invlist_iterinit(invlist);
8240 while (invlist_iternext(invlist, &start, &end)) {
8241 if (end == UV_MAX) {
8242 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8244 else if (end != start) {
8245 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8249 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8257 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8259 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8261 /* Dumps out the ranges in an inversion list. The string 'header'
8262 * if present is output on a line before the first range */
8266 PERL_ARGS_ASSERT__INVLIST_DUMP;
8268 if (header && strlen(header)) {
8269 PerlIO_printf(Perl_debug_log, "%s\n", header);
8271 invlist_iterinit(invlist);
8272 while (invlist_iternext(invlist, &start, &end)) {
8273 if (end == UV_MAX) {
8274 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8276 else if (end != start) {
8277 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8281 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8289 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8291 /* Return a boolean as to if the two passed in inversion lists are
8292 * identical. The final argument, if TRUE, says to take the complement of
8293 * the second inversion list before doing the comparison */
8295 UV* array_a = invlist_array(a);
8296 UV* array_b = invlist_array(b);
8297 UV len_a = _invlist_len(a);
8298 UV len_b = _invlist_len(b);
8300 UV i = 0; /* current index into the arrays */
8301 bool retval = TRUE; /* Assume are identical until proven otherwise */
8303 PERL_ARGS_ASSERT__INVLISTEQ;
8305 /* If are to compare 'a' with the complement of b, set it
8306 * up so are looking at b's complement. */
8309 /* The complement of nothing is everything, so <a> would have to have
8310 * just one element, starting at zero (ending at infinity) */
8312 return (len_a == 1 && array_a[0] == 0);
8314 else if (array_b[0] == 0) {
8316 /* Otherwise, to complement, we invert. Here, the first element is
8317 * 0, just remove it. To do this, we just pretend the array starts
8318 * one later, and clear the flag as we don't have to do anything
8323 complement_b = FALSE;
8327 /* But if the first element is not zero, we unshift a 0 before the
8328 * array. The data structure reserves a space for that 0 (which
8329 * should be a '1' right now), so physical shifting is unneeded,
8330 * but temporarily change that element to 0. Before exiting the
8331 * routine, we must restore the element to '1' */
8338 /* Make sure that the lengths are the same, as well as the final element
8339 * before looping through the remainder. (Thus we test the length, final,
8340 * and first elements right off the bat) */
8341 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8344 else for (i = 0; i < len_a - 1; i++) {
8345 if (array_a[i] != array_b[i]) {
8358 #undef HEADER_LENGTH
8359 #undef INVLIST_INITIAL_LENGTH
8360 #undef TO_INTERNAL_SIZE
8361 #undef FROM_INTERNAL_SIZE
8362 #undef INVLIST_LEN_OFFSET
8363 #undef INVLIST_ZERO_OFFSET
8364 #undef INVLIST_ITER_OFFSET
8365 #undef INVLIST_VERSION_ID
8367 /* End of inversion list object */
8370 - reg - regular expression, i.e. main body or parenthesized thing
8372 * Caller must absorb opening parenthesis.
8374 * Combining parenthesis handling with the base level of regular expression
8375 * is a trifle forced, but the need to tie the tails of the branches to what
8376 * follows makes it hard to avoid.
8378 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8380 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8382 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8386 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8387 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8390 regnode *ret; /* Will be the head of the group. */
8393 regnode *ender = NULL;
8396 U32 oregflags = RExC_flags;
8397 bool have_branch = 0;
8399 I32 freeze_paren = 0;
8400 I32 after_freeze = 0;
8402 /* for (?g), (?gc), and (?o) warnings; warning
8403 about (?c) will warn about (?g) -- japhy */
8405 #define WASTED_O 0x01
8406 #define WASTED_G 0x02
8407 #define WASTED_C 0x04
8408 #define WASTED_GC (0x02|0x04)
8409 I32 wastedflags = 0x00;
8411 char * parse_start = RExC_parse; /* MJD */
8412 char * const oregcomp_parse = RExC_parse;
8414 GET_RE_DEBUG_FLAGS_DECL;
8416 PERL_ARGS_ASSERT_REG;
8417 DEBUG_PARSE("reg ");
8419 *flagp = 0; /* Tentatively. */
8422 /* Make an OPEN node, if parenthesized. */
8424 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8425 char *start_verb = RExC_parse;
8426 STRLEN verb_len = 0;
8427 char *start_arg = NULL;
8428 unsigned char op = 0;
8430 int internal_argval = 0; /* internal_argval is only useful if !argok */
8431 while ( *RExC_parse && *RExC_parse != ')' ) {
8432 if ( *RExC_parse == ':' ) {
8433 start_arg = RExC_parse + 1;
8439 verb_len = RExC_parse - start_verb;
8442 while ( *RExC_parse && *RExC_parse != ')' )
8444 if ( *RExC_parse != ')' )
8445 vFAIL("Unterminated verb pattern argument");
8446 if ( RExC_parse == start_arg )
8449 if ( *RExC_parse != ')' )
8450 vFAIL("Unterminated verb pattern");
8453 switch ( *start_verb ) {
8454 case 'A': /* (*ACCEPT) */
8455 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8457 internal_argval = RExC_nestroot;
8460 case 'C': /* (*COMMIT) */
8461 if ( memEQs(start_verb,verb_len,"COMMIT") )
8464 case 'F': /* (*FAIL) */
8465 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8470 case ':': /* (*:NAME) */
8471 case 'M': /* (*MARK:NAME) */
8472 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8477 case 'P': /* (*PRUNE) */
8478 if ( memEQs(start_verb,verb_len,"PRUNE") )
8481 case 'S': /* (*SKIP) */
8482 if ( memEQs(start_verb,verb_len,"SKIP") )
8485 case 'T': /* (*THEN) */
8486 /* [19:06] <TimToady> :: is then */
8487 if ( memEQs(start_verb,verb_len,"THEN") ) {
8489 RExC_seen |= REG_SEEN_CUTGROUP;
8495 vFAIL3("Unknown verb pattern '%.*s'",
8496 verb_len, start_verb);
8499 if ( start_arg && internal_argval ) {
8500 vFAIL3("Verb pattern '%.*s' may not have an argument",
8501 verb_len, start_verb);
8502 } else if ( argok < 0 && !start_arg ) {
8503 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8504 verb_len, start_verb);
8506 ret = reganode(pRExC_state, op, internal_argval);
8507 if ( ! internal_argval && ! SIZE_ONLY ) {
8509 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8510 ARG(ret) = add_data( pRExC_state, 1, "S" );
8511 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8518 if (!internal_argval)
8519 RExC_seen |= REG_SEEN_VERBARG;
8520 } else if ( start_arg ) {
8521 vFAIL3("Verb pattern '%.*s' may not have an argument",
8522 verb_len, start_verb);
8524 ret = reg_node(pRExC_state, op);
8526 nextchar(pRExC_state);
8529 if (*RExC_parse == '?') { /* (?...) */
8530 bool is_logical = 0;
8531 const char * const seqstart = RExC_parse;
8532 bool has_use_defaults = FALSE;
8535 paren = *RExC_parse++;
8536 ret = NULL; /* For look-ahead/behind. */
8539 case 'P': /* (?P...) variants for those used to PCRE/Python */
8540 paren = *RExC_parse++;
8541 if ( paren == '<') /* (?P<...>) named capture */
8543 else if (paren == '>') { /* (?P>name) named recursion */
8544 goto named_recursion;
8546 else if (paren == '=') { /* (?P=...) named backref */
8547 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8548 you change this make sure you change that */
8549 char* name_start = RExC_parse;
8551 SV *sv_dat = reg_scan_name(pRExC_state,
8552 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8553 if (RExC_parse == name_start || *RExC_parse != ')')
8554 vFAIL2("Sequence %.3s... not terminated",parse_start);
8557 num = add_data( pRExC_state, 1, "S" );
8558 RExC_rxi->data->data[num]=(void*)sv_dat;
8559 SvREFCNT_inc_simple_void(sv_dat);
8562 ret = reganode(pRExC_state,
8565 : (ASCII_FOLD_RESTRICTED)
8567 : (AT_LEAST_UNI_SEMANTICS)
8575 Set_Node_Offset(ret, parse_start+1);
8576 Set_Node_Cur_Length(ret); /* MJD */
8578 nextchar(pRExC_state);
8582 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8584 case '<': /* (?<...) */
8585 if (*RExC_parse == '!')
8587 else if (*RExC_parse != '=')
8593 case '\'': /* (?'...') */
8594 name_start= RExC_parse;
8595 svname = reg_scan_name(pRExC_state,
8596 SIZE_ONLY ? /* reverse test from the others */
8597 REG_RSN_RETURN_NAME :
8598 REG_RSN_RETURN_NULL);
8599 if (RExC_parse == name_start) {
8601 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8604 if (*RExC_parse != paren)
8605 vFAIL2("Sequence (?%c... not terminated",
8606 paren=='>' ? '<' : paren);
8610 if (!svname) /* shouldn't happen */
8612 "panic: reg_scan_name returned NULL");
8613 if (!RExC_paren_names) {
8614 RExC_paren_names= newHV();
8615 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8617 RExC_paren_name_list= newAV();
8618 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8621 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8623 sv_dat = HeVAL(he_str);
8625 /* croak baby croak */
8627 "panic: paren_name hash element allocation failed");
8628 } else if ( SvPOK(sv_dat) ) {
8629 /* (?|...) can mean we have dupes so scan to check
8630 its already been stored. Maybe a flag indicating
8631 we are inside such a construct would be useful,
8632 but the arrays are likely to be quite small, so
8633 for now we punt -- dmq */
8634 IV count = SvIV(sv_dat);
8635 I32 *pv = (I32*)SvPVX(sv_dat);
8637 for ( i = 0 ; i < count ; i++ ) {
8638 if ( pv[i] == RExC_npar ) {
8644 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8645 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8646 pv[count] = RExC_npar;
8647 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8650 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8651 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8653 SvIV_set(sv_dat, 1);
8656 /* Yes this does cause a memory leak in debugging Perls */
8657 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8658 SvREFCNT_dec(svname);
8661 /*sv_dump(sv_dat);*/
8663 nextchar(pRExC_state);
8665 goto capturing_parens;
8667 RExC_seen |= REG_SEEN_LOOKBEHIND;
8668 RExC_in_lookbehind++;
8670 case '=': /* (?=...) */
8671 RExC_seen_zerolen++;
8673 case '!': /* (?!...) */
8674 RExC_seen_zerolen++;
8675 if (*RExC_parse == ')') {
8676 ret=reg_node(pRExC_state, OPFAIL);
8677 nextchar(pRExC_state);
8681 case '|': /* (?|...) */
8682 /* branch reset, behave like a (?:...) except that
8683 buffers in alternations share the same numbers */
8685 after_freeze = freeze_paren = RExC_npar;
8687 case ':': /* (?:...) */
8688 case '>': /* (?>...) */
8690 case '$': /* (?$...) */
8691 case '@': /* (?@...) */
8692 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8694 case '#': /* (?#...) */
8695 while (*RExC_parse && *RExC_parse != ')')
8697 if (*RExC_parse != ')')
8698 FAIL("Sequence (?#... not terminated");
8699 nextchar(pRExC_state);
8702 case '0' : /* (?0) */
8703 case 'R' : /* (?R) */
8704 if (*RExC_parse != ')')
8705 FAIL("Sequence (?R) not terminated");
8706 ret = reg_node(pRExC_state, GOSTART);
8707 *flagp |= POSTPONED;
8708 nextchar(pRExC_state);
8711 { /* named and numeric backreferences */
8713 case '&': /* (?&NAME) */
8714 parse_start = RExC_parse - 1;
8717 SV *sv_dat = reg_scan_name(pRExC_state,
8718 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8719 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8721 goto gen_recurse_regop;
8722 assert(0); /* NOT REACHED */
8724 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8726 vFAIL("Illegal pattern");
8728 goto parse_recursion;
8730 case '-': /* (?-1) */
8731 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8732 RExC_parse--; /* rewind to let it be handled later */
8736 case '1': case '2': case '3': case '4': /* (?1) */
8737 case '5': case '6': case '7': case '8': case '9':
8740 num = atoi(RExC_parse);
8741 parse_start = RExC_parse - 1; /* MJD */
8742 if (*RExC_parse == '-')
8744 while (isDIGIT(*RExC_parse))
8746 if (*RExC_parse!=')')
8747 vFAIL("Expecting close bracket");
8750 if ( paren == '-' ) {
8752 Diagram of capture buffer numbering.
8753 Top line is the normal capture buffer numbers
8754 Bottom line is the negative indexing as from
8758 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8762 num = RExC_npar + num;
8765 vFAIL("Reference to nonexistent group");
8767 } else if ( paren == '+' ) {
8768 num = RExC_npar + num - 1;
8771 ret = reganode(pRExC_state, GOSUB, num);
8773 if (num > (I32)RExC_rx->nparens) {
8775 vFAIL("Reference to nonexistent group");
8777 ARG2L_SET( ret, RExC_recurse_count++);
8779 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8780 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8784 RExC_seen |= REG_SEEN_RECURSE;
8785 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8786 Set_Node_Offset(ret, parse_start); /* MJD */
8788 *flagp |= POSTPONED;
8789 nextchar(pRExC_state);
8791 } /* named and numeric backreferences */
8792 assert(0); /* NOT REACHED */
8794 case '?': /* (??...) */
8796 if (*RExC_parse != '{') {
8798 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8801 *flagp |= POSTPONED;
8802 paren = *RExC_parse++;
8804 case '{': /* (?{...}) */
8807 struct reg_code_block *cb;
8809 RExC_seen_zerolen++;
8811 if ( !pRExC_state->num_code_blocks
8812 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8813 || pRExC_state->code_blocks[pRExC_state->code_index].start
8814 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8817 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8818 FAIL("panic: Sequence (?{...}): no code block found\n");
8819 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8821 /* this is a pre-compiled code block (?{...}) */
8822 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8823 RExC_parse = RExC_start + cb->end;
8826 if (cb->src_regex) {
8827 n = add_data(pRExC_state, 2, "rl");
8828 RExC_rxi->data->data[n] =
8829 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8830 RExC_rxi->data->data[n+1] = (void*)o;
8833 n = add_data(pRExC_state, 1,
8834 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8835 RExC_rxi->data->data[n] = (void*)o;
8838 pRExC_state->code_index++;
8839 nextchar(pRExC_state);
8843 ret = reg_node(pRExC_state, LOGICAL);
8844 eval = reganode(pRExC_state, EVAL, n);
8847 /* for later propagation into (??{}) return value */
8848 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8850 REGTAIL(pRExC_state, ret, eval);
8851 /* deal with the length of this later - MJD */
8854 ret = reganode(pRExC_state, EVAL, n);
8855 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8856 Set_Node_Offset(ret, parse_start);
8859 case '(': /* (?(?{...})...) and (?(?=...)...) */
8862 if (RExC_parse[0] == '?') { /* (?(?...)) */
8863 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8864 || RExC_parse[1] == '<'
8865 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8868 ret = reg_node(pRExC_state, LOGICAL);
8871 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8875 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8876 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8878 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8879 char *name_start= RExC_parse++;
8881 SV *sv_dat=reg_scan_name(pRExC_state,
8882 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8883 if (RExC_parse == name_start || *RExC_parse != ch)
8884 vFAIL2("Sequence (?(%c... not terminated",
8885 (ch == '>' ? '<' : ch));
8888 num = add_data( pRExC_state, 1, "S" );
8889 RExC_rxi->data->data[num]=(void*)sv_dat;
8890 SvREFCNT_inc_simple_void(sv_dat);
8892 ret = reganode(pRExC_state,NGROUPP,num);
8893 goto insert_if_check_paren;
8895 else if (RExC_parse[0] == 'D' &&
8896 RExC_parse[1] == 'E' &&
8897 RExC_parse[2] == 'F' &&
8898 RExC_parse[3] == 'I' &&
8899 RExC_parse[4] == 'N' &&
8900 RExC_parse[5] == 'E')
8902 ret = reganode(pRExC_state,DEFINEP,0);
8905 goto insert_if_check_paren;
8907 else if (RExC_parse[0] == 'R') {
8910 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8911 parno = atoi(RExC_parse++);
8912 while (isDIGIT(*RExC_parse))
8914 } else if (RExC_parse[0] == '&') {
8917 sv_dat = reg_scan_name(pRExC_state,
8918 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8919 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8921 ret = reganode(pRExC_state,INSUBP,parno);
8922 goto insert_if_check_paren;
8924 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8927 parno = atoi(RExC_parse++);
8929 while (isDIGIT(*RExC_parse))
8931 ret = reganode(pRExC_state, GROUPP, parno);
8933 insert_if_check_paren:
8934 if ((c = *nextchar(pRExC_state)) != ')')
8935 vFAIL("Switch condition not recognized");
8937 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8938 br = regbranch(pRExC_state, &flags, 1,depth+1);
8940 br = reganode(pRExC_state, LONGJMP, 0);
8942 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8943 c = *nextchar(pRExC_state);
8948 vFAIL("(?(DEFINE)....) does not allow branches");
8949 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8950 regbranch(pRExC_state, &flags, 1,depth+1);
8951 REGTAIL(pRExC_state, ret, lastbr);
8954 c = *nextchar(pRExC_state);
8959 vFAIL("Switch (?(condition)... contains too many branches");
8960 ender = reg_node(pRExC_state, TAIL);
8961 REGTAIL(pRExC_state, br, ender);
8963 REGTAIL(pRExC_state, lastbr, ender);
8964 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8967 REGTAIL(pRExC_state, ret, ender);
8968 RExC_size++; /* XXX WHY do we need this?!!
8969 For large programs it seems to be required
8970 but I can't figure out why. -- dmq*/
8974 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8978 RExC_parse--; /* for vFAIL to print correctly */
8979 vFAIL("Sequence (? incomplete");
8981 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8983 has_use_defaults = TRUE;
8984 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8985 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8986 ? REGEX_UNICODE_CHARSET
8987 : REGEX_DEPENDS_CHARSET);
8991 parse_flags: /* (?i) */
8993 U32 posflags = 0, negflags = 0;
8994 U32 *flagsp = &posflags;
8995 char has_charset_modifier = '\0';
8996 regex_charset cs = get_regex_charset(RExC_flags);
8997 if (cs == REGEX_DEPENDS_CHARSET
8998 && (RExC_utf8 || RExC_uni_semantics))
9000 cs = REGEX_UNICODE_CHARSET;
9003 while (*RExC_parse) {
9004 /* && strchr("iogcmsx", *RExC_parse) */
9005 /* (?g), (?gc) and (?o) are useless here
9006 and must be globally applied -- japhy */
9007 switch (*RExC_parse) {
9008 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9009 case LOCALE_PAT_MOD:
9010 if (has_charset_modifier) {
9011 goto excess_modifier;
9013 else if (flagsp == &negflags) {
9016 cs = REGEX_LOCALE_CHARSET;
9017 has_charset_modifier = LOCALE_PAT_MOD;
9018 RExC_contains_locale = 1;
9020 case UNICODE_PAT_MOD:
9021 if (has_charset_modifier) {
9022 goto excess_modifier;
9024 else if (flagsp == &negflags) {
9027 cs = REGEX_UNICODE_CHARSET;
9028 has_charset_modifier = UNICODE_PAT_MOD;
9030 case ASCII_RESTRICT_PAT_MOD:
9031 if (flagsp == &negflags) {
9034 if (has_charset_modifier) {
9035 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9036 goto excess_modifier;
9038 /* Doubled modifier implies more restricted */
9039 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9042 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9044 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9046 case DEPENDS_PAT_MOD:
9047 if (has_use_defaults) {
9048 goto fail_modifiers;
9050 else if (flagsp == &negflags) {
9053 else if (has_charset_modifier) {
9054 goto excess_modifier;
9057 /* The dual charset means unicode semantics if the
9058 * pattern (or target, not known until runtime) are
9059 * utf8, or something in the pattern indicates unicode
9061 cs = (RExC_utf8 || RExC_uni_semantics)
9062 ? REGEX_UNICODE_CHARSET
9063 : REGEX_DEPENDS_CHARSET;
9064 has_charset_modifier = DEPENDS_PAT_MOD;
9068 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9069 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9071 else if (has_charset_modifier == *(RExC_parse - 1)) {
9072 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9075 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9080 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9082 case ONCE_PAT_MOD: /* 'o' */
9083 case GLOBAL_PAT_MOD: /* 'g' */
9084 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9085 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9086 if (! (wastedflags & wflagbit) ) {
9087 wastedflags |= wflagbit;
9090 "Useless (%s%c) - %suse /%c modifier",
9091 flagsp == &negflags ? "?-" : "?",
9093 flagsp == &negflags ? "don't " : "",
9100 case CONTINUE_PAT_MOD: /* 'c' */
9101 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9102 if (! (wastedflags & WASTED_C) ) {
9103 wastedflags |= WASTED_GC;
9106 "Useless (%sc) - %suse /gc modifier",
9107 flagsp == &negflags ? "?-" : "?",
9108 flagsp == &negflags ? "don't " : ""
9113 case KEEPCOPY_PAT_MOD: /* 'p' */
9114 if (flagsp == &negflags) {
9116 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9118 *flagsp |= RXf_PMf_KEEPCOPY;
9122 /* A flag is a default iff it is following a minus, so
9123 * if there is a minus, it means will be trying to
9124 * re-specify a default which is an error */
9125 if (has_use_defaults || flagsp == &negflags) {
9128 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9132 wastedflags = 0; /* reset so (?g-c) warns twice */
9138 RExC_flags |= posflags;
9139 RExC_flags &= ~negflags;
9140 set_regex_charset(&RExC_flags, cs);
9142 oregflags |= posflags;
9143 oregflags &= ~negflags;
9144 set_regex_charset(&oregflags, cs);
9146 nextchar(pRExC_state);
9157 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9162 }} /* one for the default block, one for the switch */
9169 ret = reganode(pRExC_state, OPEN, parno);
9172 RExC_nestroot = parno;
9173 if (RExC_seen & REG_SEEN_RECURSE
9174 && !RExC_open_parens[parno-1])
9176 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9177 "Setting open paren #%"IVdf" to %d\n",
9178 (IV)parno, REG_NODE_NUM(ret)));
9179 RExC_open_parens[parno-1]= ret;
9182 Set_Node_Length(ret, 1); /* MJD */
9183 Set_Node_Offset(ret, RExC_parse); /* MJD */
9191 /* Pick up the branches, linking them together. */
9192 parse_start = RExC_parse; /* MJD */
9193 br = regbranch(pRExC_state, &flags, 1,depth+1);
9195 /* branch_len = (paren != 0); */
9199 if (*RExC_parse == '|') {
9200 if (!SIZE_ONLY && RExC_extralen) {
9201 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9204 reginsert(pRExC_state, BRANCH, br, depth+1);
9205 Set_Node_Length(br, paren != 0);
9206 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9210 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9212 else if (paren == ':') {
9213 *flagp |= flags&SIMPLE;
9215 if (is_open) { /* Starts with OPEN. */
9216 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9218 else if (paren != '?') /* Not Conditional */
9220 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9222 while (*RExC_parse == '|') {
9223 if (!SIZE_ONLY && RExC_extralen) {
9224 ender = reganode(pRExC_state, LONGJMP,0);
9225 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9228 RExC_extralen += 2; /* Account for LONGJMP. */
9229 nextchar(pRExC_state);
9231 if (RExC_npar > after_freeze)
9232 after_freeze = RExC_npar;
9233 RExC_npar = freeze_paren;
9235 br = regbranch(pRExC_state, &flags, 0, depth+1);
9239 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9241 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9244 if (have_branch || paren != ':') {
9245 /* Make a closing node, and hook it on the end. */
9248 ender = reg_node(pRExC_state, TAIL);
9251 ender = reganode(pRExC_state, CLOSE, parno);
9252 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9253 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9254 "Setting close paren #%"IVdf" to %d\n",
9255 (IV)parno, REG_NODE_NUM(ender)));
9256 RExC_close_parens[parno-1]= ender;
9257 if (RExC_nestroot == parno)
9260 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9261 Set_Node_Length(ender,1); /* MJD */
9267 *flagp &= ~HASWIDTH;
9270 ender = reg_node(pRExC_state, SUCCEED);
9273 ender = reg_node(pRExC_state, END);
9275 assert(!RExC_opend); /* there can only be one! */
9280 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9281 SV * const mysv_val1=sv_newmortal();
9282 SV * const mysv_val2=sv_newmortal();
9283 DEBUG_PARSE_MSG("lsbr");
9284 regprop(RExC_rx, mysv_val1, lastbr);
9285 regprop(RExC_rx, mysv_val2, ender);
9286 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9287 SvPV_nolen_const(mysv_val1),
9288 (IV)REG_NODE_NUM(lastbr),
9289 SvPV_nolen_const(mysv_val2),
9290 (IV)REG_NODE_NUM(ender),
9291 (IV)(ender - lastbr)
9294 REGTAIL(pRExC_state, lastbr, ender);
9296 if (have_branch && !SIZE_ONLY) {
9299 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9301 /* Hook the tails of the branches to the closing node. */
9302 for (br = ret; br; br = regnext(br)) {
9303 const U8 op = PL_regkind[OP(br)];
9305 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9306 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9309 else if (op == BRANCHJ) {
9310 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9311 /* for now we always disable this optimisation * /
9312 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9318 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9319 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9320 SV * const mysv_val1=sv_newmortal();
9321 SV * const mysv_val2=sv_newmortal();
9322 DEBUG_PARSE_MSG("NADA");
9323 regprop(RExC_rx, mysv_val1, ret);
9324 regprop(RExC_rx, mysv_val2, ender);
9325 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9326 SvPV_nolen_const(mysv_val1),
9327 (IV)REG_NODE_NUM(ret),
9328 SvPV_nolen_const(mysv_val2),
9329 (IV)REG_NODE_NUM(ender),
9334 if (OP(ender) == TAIL) {
9339 for ( opt= br + 1; opt < ender ; opt++ )
9341 NEXT_OFF(br)= ender - br;
9349 static const char parens[] = "=!<,>";
9351 if (paren && (p = strchr(parens, paren))) {
9352 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9353 int flag = (p - parens) > 1;
9356 node = SUSPEND, flag = 0;
9357 reginsert(pRExC_state, node,ret, depth+1);
9358 Set_Node_Cur_Length(ret);
9359 Set_Node_Offset(ret, parse_start + 1);
9361 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9365 /* Check for proper termination. */
9367 RExC_flags = oregflags;
9368 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9369 RExC_parse = oregcomp_parse;
9370 vFAIL("Unmatched (");
9373 else if (!paren && RExC_parse < RExC_end) {
9374 if (*RExC_parse == ')') {
9376 vFAIL("Unmatched )");
9379 FAIL("Junk on end of regexp"); /* "Can't happen". */
9380 assert(0); /* NOTREACHED */
9383 if (RExC_in_lookbehind) {
9384 RExC_in_lookbehind--;
9386 if (after_freeze > RExC_npar)
9387 RExC_npar = after_freeze;
9392 - regbranch - one alternative of an | operator
9394 * Implements the concatenation operator.
9397 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9401 regnode *chain = NULL;
9403 I32 flags = 0, c = 0;
9404 GET_RE_DEBUG_FLAGS_DECL;
9406 PERL_ARGS_ASSERT_REGBRANCH;
9408 DEBUG_PARSE("brnc");
9413 if (!SIZE_ONLY && RExC_extralen)
9414 ret = reganode(pRExC_state, BRANCHJ,0);
9416 ret = reg_node(pRExC_state, BRANCH);
9417 Set_Node_Length(ret, 1);
9421 if (!first && SIZE_ONLY)
9422 RExC_extralen += 1; /* BRANCHJ */
9424 *flagp = WORST; /* Tentatively. */
9427 nextchar(pRExC_state);
9428 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9430 latest = regpiece(pRExC_state, &flags,depth+1);
9431 if (latest == NULL) {
9432 if (flags & TRYAGAIN)
9436 else if (ret == NULL)
9438 *flagp |= flags&(HASWIDTH|POSTPONED);
9439 if (chain == NULL) /* First piece. */
9440 *flagp |= flags&SPSTART;
9443 REGTAIL(pRExC_state, chain, latest);
9448 if (chain == NULL) { /* Loop ran zero times. */
9449 chain = reg_node(pRExC_state, NOTHING);
9454 *flagp |= flags&SIMPLE;
9461 - regpiece - something followed by possible [*+?]
9463 * Note that the branching code sequences used for ? and the general cases
9464 * of * and + are somewhat optimized: they use the same NOTHING node as
9465 * both the endmarker for their branch list and the body of the last branch.
9466 * It might seem that this node could be dispensed with entirely, but the
9467 * endmarker role is not redundant.
9470 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9477 const char * const origparse = RExC_parse;
9479 I32 max = REG_INFTY;
9480 #ifdef RE_TRACK_PATTERN_OFFSETS
9483 const char *maxpos = NULL;
9485 /* Save the original in case we change the emitted regop to a FAIL. */
9486 regnode * const orig_emit = RExC_emit;
9488 GET_RE_DEBUG_FLAGS_DECL;
9490 PERL_ARGS_ASSERT_REGPIECE;
9492 DEBUG_PARSE("piec");
9494 ret = regatom(pRExC_state, &flags,depth+1);
9496 if (flags & TRYAGAIN)
9503 if (op == '{' && regcurly(RExC_parse)) {
9505 #ifdef RE_TRACK_PATTERN_OFFSETS
9506 parse_start = RExC_parse; /* MJD */
9508 next = RExC_parse + 1;
9509 while (isDIGIT(*next) || *next == ',') {
9518 if (*next == '}') { /* got one */
9522 min = atoi(RExC_parse);
9526 maxpos = RExC_parse;
9528 if (!max && *maxpos != '0')
9529 max = REG_INFTY; /* meaning "infinity" */
9530 else if (max >= REG_INFTY)
9531 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9533 nextchar(pRExC_state);
9534 if (max < min) { /* If can't match, warn and optimize to fail
9537 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9539 /* We can't back off the size because we have to reserve
9540 * enough space for all the things we are about to throw
9541 * away, but we can shrink it by the ammount we are about
9543 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9546 RExC_emit = orig_emit;
9548 ret = reg_node(pRExC_state, OPFAIL);
9553 if ((flags&SIMPLE)) {
9554 RExC_naughty += 2 + RExC_naughty / 2;
9555 reginsert(pRExC_state, CURLY, ret, depth+1);
9556 Set_Node_Offset(ret, parse_start+1); /* MJD */
9557 Set_Node_Cur_Length(ret);
9560 regnode * const w = reg_node(pRExC_state, WHILEM);
9563 REGTAIL(pRExC_state, ret, w);
9564 if (!SIZE_ONLY && RExC_extralen) {
9565 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9566 reginsert(pRExC_state, NOTHING,ret, depth+1);
9567 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9569 reginsert(pRExC_state, CURLYX,ret, depth+1);
9571 Set_Node_Offset(ret, parse_start+1);
9572 Set_Node_Length(ret,
9573 op == '{' ? (RExC_parse - parse_start) : 1);
9575 if (!SIZE_ONLY && RExC_extralen)
9576 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9577 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9579 RExC_whilem_seen++, RExC_extralen += 3;
9580 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9589 ARG1_SET(ret, (U16)min);
9590 ARG2_SET(ret, (U16)max);
9602 #if 0 /* Now runtime fix should be reliable. */
9604 /* if this is reinstated, don't forget to put this back into perldiag:
9606 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9608 (F) The part of the regexp subject to either the * or + quantifier
9609 could match an empty string. The {#} shows in the regular
9610 expression about where the problem was discovered.
9614 if (!(flags&HASWIDTH) && op != '?')
9615 vFAIL("Regexp *+ operand could be empty");
9618 #ifdef RE_TRACK_PATTERN_OFFSETS
9619 parse_start = RExC_parse;
9621 nextchar(pRExC_state);
9623 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9625 if (op == '*' && (flags&SIMPLE)) {
9626 reginsert(pRExC_state, STAR, ret, depth+1);
9630 else if (op == '*') {
9634 else if (op == '+' && (flags&SIMPLE)) {
9635 reginsert(pRExC_state, PLUS, ret, depth+1);
9639 else if (op == '+') {
9643 else if (op == '?') {
9648 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9649 ckWARN3reg(RExC_parse,
9650 "%.*s matches null string many times",
9651 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9655 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9656 nextchar(pRExC_state);
9657 reginsert(pRExC_state, MINMOD, ret, depth+1);
9658 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9660 #ifndef REG_ALLOW_MINMOD_SUSPEND
9663 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9665 nextchar(pRExC_state);
9666 ender = reg_node(pRExC_state, SUCCEED);
9667 REGTAIL(pRExC_state, ret, ender);
9668 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9670 ender = reg_node(pRExC_state, TAIL);
9671 REGTAIL(pRExC_state, ret, ender);
9675 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9677 vFAIL("Nested quantifiers");
9684 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9687 /* This is expected to be called by a parser routine that has recognized '\N'
9688 and needs to handle the rest. RExC_parse is expected to point at the first
9689 char following the N at the time of the call. On successful return,
9690 RExC_parse has been updated to point to just after the sequence identified
9691 by this routine, and <*flagp> has been updated.
9693 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9696 \N may begin either a named sequence, or if outside a character class, mean
9697 to match a non-newline. For non single-quoted regexes, the tokenizer has
9698 attempted to decide which, and in the case of a named sequence, converted it
9699 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9700 where c1... are the characters in the sequence. For single-quoted regexes,
9701 the tokenizer passes the \N sequence through unchanged; this code will not
9702 attempt to determine this nor expand those, instead raising a syntax error.
9703 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9704 or there is no '}', it signals that this \N occurrence means to match a
9707 Only the \N{U+...} form should occur in a character class, for the same
9708 reason that '.' inside a character class means to just match a period: it
9709 just doesn't make sense.
9711 The function raises an error (via vFAIL), and doesn't return for various
9712 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9713 success; it returns FALSE otherwise.
9715 If <valuep> is non-null, it means the caller can accept an input sequence
9716 consisting of a just a single code point; <*valuep> is set to that value
9717 if the input is such.
9719 If <node_p> is non-null it signifies that the caller can accept any other
9720 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9722 1) \N means not-a-NL: points to a newly created REG_ANY node;
9723 2) \N{}: points to a new NOTHING node;
9724 3) otherwise: points to a new EXACT node containing the resolved
9726 Note that FALSE is returned for single code point sequences if <valuep> is
9730 char * endbrace; /* '}' following the name */
9732 char *endchar; /* Points to '.' or '}' ending cur char in the input
9734 bool has_multiple_chars; /* true if the input stream contains a sequence of
9735 more than one character */
9737 GET_RE_DEBUG_FLAGS_DECL;
9739 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9743 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9745 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9746 * modifier. The other meaning does not */
9747 p = (RExC_flags & RXf_PMf_EXTENDED)
9748 ? regwhite( pRExC_state, RExC_parse )
9751 /* Disambiguate between \N meaning a named character versus \N meaning
9752 * [^\n]. The former is assumed when it can't be the latter. */
9753 if (*p != '{' || regcurly(p)) {
9756 /* no bare \N in a charclass */
9757 if (in_char_class) {
9758 vFAIL("\\N in a character class must be a named character: \\N{...}");
9762 nextchar(pRExC_state);
9763 *node_p = reg_node(pRExC_state, REG_ANY);
9764 *flagp |= HASWIDTH|SIMPLE;
9767 Set_Node_Length(*node_p, 1); /* MJD */
9771 /* Here, we have decided it should be a named character or sequence */
9773 /* The test above made sure that the next real character is a '{', but
9774 * under the /x modifier, it could be separated by space (or a comment and
9775 * \n) and this is not allowed (for consistency with \x{...} and the
9776 * tokenizer handling of \N{NAME}). */
9777 if (*RExC_parse != '{') {
9778 vFAIL("Missing braces on \\N{}");
9781 RExC_parse++; /* Skip past the '{' */
9783 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9784 || ! (endbrace == RExC_parse /* nothing between the {} */
9785 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9786 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9788 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9789 vFAIL("\\N{NAME} must be resolved by the lexer");
9792 if (endbrace == RExC_parse) { /* empty: \N{} */
9795 *node_p = reg_node(pRExC_state,NOTHING);
9797 else if (in_char_class) {
9798 if (SIZE_ONLY && in_char_class) {
9799 ckWARNreg(RExC_parse,
9800 "Ignoring zero length \\N{} in character class"
9808 nextchar(pRExC_state);
9812 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9813 RExC_parse += 2; /* Skip past the 'U+' */
9815 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9817 /* Code points are separated by dots. If none, there is only one code
9818 * point, and is terminated by the brace */
9819 has_multiple_chars = (endchar < endbrace);
9821 if (valuep && (! has_multiple_chars || in_char_class)) {
9822 /* We only pay attention to the first char of
9823 multichar strings being returned in char classes. I kinda wonder
9824 if this makes sense as it does change the behaviour
9825 from earlier versions, OTOH that behaviour was broken
9826 as well. XXX Solution is to recharacterize as
9827 [rest-of-class]|multi1|multi2... */
9829 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9830 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9831 | PERL_SCAN_DISALLOW_PREFIX
9832 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9834 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9836 /* The tokenizer should have guaranteed validity, but it's possible to
9837 * bypass it by using single quoting, so check */
9838 if (length_of_hex == 0
9839 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9841 RExC_parse += length_of_hex; /* Includes all the valid */
9842 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9843 ? UTF8SKIP(RExC_parse)
9845 /* Guard against malformed utf8 */
9846 if (RExC_parse >= endchar) {
9847 RExC_parse = endchar;
9849 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9852 if (in_char_class && has_multiple_chars) {
9853 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9856 RExC_parse = endbrace + 1;
9858 else if (! node_p || ! has_multiple_chars) {
9860 /* Here, the input is legal, but not according to the caller's
9861 * options. We fail without advancing the parse, so that the
9862 * caller can try again */
9868 /* What is done here is to convert this to a sub-pattern of the form
9869 * (?:\x{char1}\x{char2}...)
9870 * and then call reg recursively. That way, it retains its atomicness,
9871 * while not having to worry about special handling that some code
9872 * points may have. toke.c has converted the original Unicode values
9873 * to native, so that we can just pass on the hex values unchanged. We
9874 * do have to set a flag to keep recoding from happening in the
9877 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9879 char *orig_end = RExC_end;
9882 while (RExC_parse < endbrace) {
9884 /* Convert to notation the rest of the code understands */
9885 sv_catpv(substitute_parse, "\\x{");
9886 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9887 sv_catpv(substitute_parse, "}");
9889 /* Point to the beginning of the next character in the sequence. */
9890 RExC_parse = endchar + 1;
9891 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9893 sv_catpv(substitute_parse, ")");
9895 RExC_parse = SvPV(substitute_parse, len);
9897 /* Don't allow empty number */
9899 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9901 RExC_end = RExC_parse + len;
9903 /* The values are Unicode, and therefore not subject to recoding */
9904 RExC_override_recoding = 1;
9906 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9907 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9909 RExC_parse = endbrace;
9910 RExC_end = orig_end;
9911 RExC_override_recoding = 0;
9913 nextchar(pRExC_state);
9923 * It returns the code point in utf8 for the value in *encp.
9924 * value: a code value in the source encoding
9925 * encp: a pointer to an Encode object
9927 * If the result from Encode is not a single character,
9928 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9931 S_reg_recode(pTHX_ const char value, SV **encp)
9934 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9935 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9936 const STRLEN newlen = SvCUR(sv);
9937 UV uv = UNICODE_REPLACEMENT;
9939 PERL_ARGS_ASSERT_REG_RECODE;
9943 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9946 if (!newlen || numlen != newlen) {
9947 uv = UNICODE_REPLACEMENT;
9953 PERL_STATIC_INLINE U8
9954 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9958 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9964 op = get_regex_charset(RExC_flags);
9965 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9966 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9967 been, so there is no hole */
9973 PERL_STATIC_INLINE void
9974 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9976 /* This knows the details about sizing an EXACTish node, setting flags for
9977 * it (by setting <*flagp>, and potentially populating it with a single
9980 * If <len> (the length in bytes) is non-zero, this function assumes that
9981 * the node has already been populated, and just does the sizing. In this
9982 * case <code_point> should be the final code point that has already been
9983 * placed into the node. This value will be ignored except that under some
9984 * circumstances <*flagp> is set based on it.
9986 * If <len> is zero, the function assumes that the node is to contain only
9987 * the single character given by <code_point> and calculates what <len>
9988 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9989 * additionally will populate the node's STRING with <code_point>, if <len>
9990 * is 0. In both cases <*flagp> is appropriately set
9992 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9993 * folded (the latter only when the rules indicate it can match 'ss') */
9995 bool len_passed_in = cBOOL(len != 0);
9996 U8 character[UTF8_MAXBYTES_CASE+1];
9998 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10000 if (! len_passed_in) {
10003 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10006 uvchr_to_utf8( character, code_point);
10007 len = UTF8SKIP(character);
10011 || code_point != LATIN_SMALL_LETTER_SHARP_S
10012 || ASCII_FOLD_RESTRICTED
10013 || ! AT_LEAST_UNI_SEMANTICS)
10015 *character = (U8) code_point;
10020 *(character + 1) = 's';
10026 RExC_size += STR_SZ(len);
10029 RExC_emit += STR_SZ(len);
10030 STR_LEN(node) = len;
10031 if (! len_passed_in) {
10032 Copy((char *) character, STRING(node), len, char);
10036 *flagp |= HASWIDTH;
10038 /* A single character node is SIMPLE, except for the special-cased SHARP S
10040 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10041 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10042 || ! FOLD || ! DEPENDS_SEMANTICS))
10049 - regatom - the lowest level
10051 Try to identify anything special at the start of the pattern. If there
10052 is, then handle it as required. This may involve generating a single regop,
10053 such as for an assertion; or it may involve recursing, such as to
10054 handle a () structure.
10056 If the string doesn't start with something special then we gobble up
10057 as much literal text as we can.
10059 Once we have been able to handle whatever type of thing started the
10060 sequence, we return.
10062 Note: we have to be careful with escapes, as they can be both literal
10063 and special, and in the case of \10 and friends, context determines which.
10065 A summary of the code structure is:
10067 switch (first_byte) {
10068 cases for each special:
10069 handle this special;
10072 switch (2nd byte) {
10073 cases for each unambiguous special:
10074 handle this special;
10076 cases for each ambigous special/literal:
10078 if (special) handle here
10080 default: // unambiguously literal:
10083 default: // is a literal char
10086 create EXACTish node for literal;
10087 while (more input and node isn't full) {
10088 switch (input_byte) {
10089 cases for each special;
10090 make sure parse pointer is set so that the next call to
10091 regatom will see this special first
10092 goto loopdone; // EXACTish node terminated by prev. char
10094 append char to EXACTISH node;
10096 get next input byte;
10100 return the generated node;
10102 Specifically there are two separate switches for handling
10103 escape sequences, with the one for handling literal escapes requiring
10104 a dummy entry for all of the special escapes that are actually handled
10109 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10112 regnode *ret = NULL;
10114 char *parse_start = RExC_parse;
10116 GET_RE_DEBUG_FLAGS_DECL;
10117 DEBUG_PARSE("atom");
10118 *flagp = WORST; /* Tentatively. */
10120 PERL_ARGS_ASSERT_REGATOM;
10123 switch ((U8)*RExC_parse) {
10125 RExC_seen_zerolen++;
10126 nextchar(pRExC_state);
10127 if (RExC_flags & RXf_PMf_MULTILINE)
10128 ret = reg_node(pRExC_state, MBOL);
10129 else if (RExC_flags & RXf_PMf_SINGLELINE)
10130 ret = reg_node(pRExC_state, SBOL);
10132 ret = reg_node(pRExC_state, BOL);
10133 Set_Node_Length(ret, 1); /* MJD */
10136 nextchar(pRExC_state);
10138 RExC_seen_zerolen++;
10139 if (RExC_flags & RXf_PMf_MULTILINE)
10140 ret = reg_node(pRExC_state, MEOL);
10141 else if (RExC_flags & RXf_PMf_SINGLELINE)
10142 ret = reg_node(pRExC_state, SEOL);
10144 ret = reg_node(pRExC_state, EOL);
10145 Set_Node_Length(ret, 1); /* MJD */
10148 nextchar(pRExC_state);
10149 if (RExC_flags & RXf_PMf_SINGLELINE)
10150 ret = reg_node(pRExC_state, SANY);
10152 ret = reg_node(pRExC_state, REG_ANY);
10153 *flagp |= HASWIDTH|SIMPLE;
10155 Set_Node_Length(ret, 1); /* MJD */
10159 char * const oregcomp_parse = ++RExC_parse;
10160 ret = regclass(pRExC_state, flagp,depth+1);
10161 if (*RExC_parse != ']') {
10162 RExC_parse = oregcomp_parse;
10163 vFAIL("Unmatched [");
10165 nextchar(pRExC_state);
10166 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10170 nextchar(pRExC_state);
10171 ret = reg(pRExC_state, 1, &flags,depth+1);
10173 if (flags & TRYAGAIN) {
10174 if (RExC_parse == RExC_end) {
10175 /* Make parent create an empty node if needed. */
10176 *flagp |= TRYAGAIN;
10183 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10187 if (flags & TRYAGAIN) {
10188 *flagp |= TRYAGAIN;
10191 vFAIL("Internal urp");
10192 /* Supposed to be caught earlier. */
10198 vFAIL("Quantifier follows nothing");
10203 This switch handles escape sequences that resolve to some kind
10204 of special regop and not to literal text. Escape sequnces that
10205 resolve to literal text are handled below in the switch marked
10208 Every entry in this switch *must* have a corresponding entry
10209 in the literal escape switch. However, the opposite is not
10210 required, as the default for this switch is to jump to the
10211 literal text handling code.
10213 switch ((U8)*++RExC_parse) {
10214 /* Special Escapes */
10216 RExC_seen_zerolen++;
10217 ret = reg_node(pRExC_state, SBOL);
10219 goto finish_meta_pat;
10221 ret = reg_node(pRExC_state, GPOS);
10222 RExC_seen |= REG_SEEN_GPOS;
10224 goto finish_meta_pat;
10226 RExC_seen_zerolen++;
10227 ret = reg_node(pRExC_state, KEEPS);
10229 /* XXX:dmq : disabling in-place substitution seems to
10230 * be necessary here to avoid cases of memory corruption, as
10231 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10233 RExC_seen |= REG_SEEN_LOOKBEHIND;
10234 goto finish_meta_pat;
10236 ret = reg_node(pRExC_state, SEOL);
10238 RExC_seen_zerolen++; /* Do not optimize RE away */
10239 goto finish_meta_pat;
10241 ret = reg_node(pRExC_state, EOS);
10243 RExC_seen_zerolen++; /* Do not optimize RE away */
10244 goto finish_meta_pat;
10246 ret = reg_node(pRExC_state, CANY);
10247 RExC_seen |= REG_SEEN_CANY;
10248 *flagp |= HASWIDTH|SIMPLE;
10249 goto finish_meta_pat;
10251 ret = reg_node(pRExC_state, CLUMP);
10252 *flagp |= HASWIDTH;
10253 goto finish_meta_pat;
10255 op = ALNUM + get_regex_charset(RExC_flags);
10256 if (op > ALNUMA) { /* /aa is same as /a */
10259 ret = reg_node(pRExC_state, op);
10260 *flagp |= HASWIDTH|SIMPLE;
10261 goto finish_meta_pat;
10263 op = NALNUM + get_regex_charset(RExC_flags);
10264 if (op > NALNUMA) { /* /aa is same as /a */
10267 ret = reg_node(pRExC_state, op);
10268 *flagp |= HASWIDTH|SIMPLE;
10269 goto finish_meta_pat;
10271 RExC_seen_zerolen++;
10272 RExC_seen |= REG_SEEN_LOOKBEHIND;
10273 op = BOUND + get_regex_charset(RExC_flags);
10274 if (op > BOUNDA) { /* /aa is same as /a */
10277 ret = reg_node(pRExC_state, op);
10278 FLAGS(ret) = get_regex_charset(RExC_flags);
10280 goto finish_meta_pat;
10282 RExC_seen_zerolen++;
10283 RExC_seen |= REG_SEEN_LOOKBEHIND;
10284 op = NBOUND + get_regex_charset(RExC_flags);
10285 if (op > NBOUNDA) { /* /aa is same as /a */
10288 ret = reg_node(pRExC_state, op);
10289 FLAGS(ret) = get_regex_charset(RExC_flags);
10291 goto finish_meta_pat;
10293 op = SPACE + get_regex_charset(RExC_flags);
10294 if (op > SPACEA) { /* /aa is same as /a */
10297 ret = reg_node(pRExC_state, op);
10298 *flagp |= HASWIDTH|SIMPLE;
10299 goto finish_meta_pat;
10301 op = NSPACE + get_regex_charset(RExC_flags);
10302 if (op > NSPACEA) { /* /aa is same as /a */
10305 ret = reg_node(pRExC_state, op);
10306 *flagp |= HASWIDTH|SIMPLE;
10307 goto finish_meta_pat;
10315 U8 offset = get_regex_charset(RExC_flags);
10316 if (offset == REGEX_UNICODE_CHARSET) {
10317 offset = REGEX_DEPENDS_CHARSET;
10319 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10320 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10324 ret = reg_node(pRExC_state, op);
10325 *flagp |= HASWIDTH|SIMPLE;
10326 goto finish_meta_pat;
10328 ret = reg_node(pRExC_state, LNBREAK);
10329 *flagp |= HASWIDTH|SIMPLE;
10330 goto finish_meta_pat;
10332 ret = reg_node(pRExC_state, HORIZWS);
10333 *flagp |= HASWIDTH|SIMPLE;
10334 goto finish_meta_pat;
10336 ret = reg_node(pRExC_state, NHORIZWS);
10337 *flagp |= HASWIDTH|SIMPLE;
10338 goto finish_meta_pat;
10340 ret = reg_node(pRExC_state, VERTWS);
10341 *flagp |= HASWIDTH|SIMPLE;
10342 goto finish_meta_pat;
10344 ret = reg_node(pRExC_state, NVERTWS);
10345 *flagp |= HASWIDTH|SIMPLE;
10347 nextchar(pRExC_state);
10348 Set_Node_Length(ret, 2); /* MJD */
10353 char* const oldregxend = RExC_end;
10355 char* parse_start = RExC_parse - 2;
10358 if (RExC_parse[1] == '{') {
10359 /* a lovely hack--pretend we saw [\pX] instead */
10360 RExC_end = strchr(RExC_parse, '}');
10362 const U8 c = (U8)*RExC_parse;
10364 RExC_end = oldregxend;
10365 vFAIL2("Missing right brace on \\%c{}", c);
10370 RExC_end = RExC_parse + 2;
10371 if (RExC_end > oldregxend)
10372 RExC_end = oldregxend;
10376 ret = regclass(pRExC_state, flagp,depth+1);
10378 RExC_end = oldregxend;
10381 Set_Node_Offset(ret, parse_start + 2);
10382 Set_Node_Cur_Length(ret);
10383 nextchar(pRExC_state);
10387 /* Handle \N and \N{NAME} with multiple code points here and not
10388 * below because it can be multicharacter. join_exact() will join
10389 * them up later on. Also this makes sure that things like
10390 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10391 * The options to the grok function call causes it to fail if the
10392 * sequence is just a single code point. We then go treat it as
10393 * just another character in the current EXACT node, and hence it
10394 * gets uniform treatment with all the other characters. The
10395 * special treatment for quantifiers is not needed for such single
10396 * character sequences */
10398 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10403 case 'k': /* Handle \k<NAME> and \k'NAME' */
10406 char ch= RExC_parse[1];
10407 if (ch != '<' && ch != '\'' && ch != '{') {
10409 vFAIL2("Sequence %.2s... not terminated",parse_start);
10411 /* this pretty much dupes the code for (?P=...) in reg(), if
10412 you change this make sure you change that */
10413 char* name_start = (RExC_parse += 2);
10415 SV *sv_dat = reg_scan_name(pRExC_state,
10416 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10417 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10418 if (RExC_parse == name_start || *RExC_parse != ch)
10419 vFAIL2("Sequence %.3s... not terminated",parse_start);
10422 num = add_data( pRExC_state, 1, "S" );
10423 RExC_rxi->data->data[num]=(void*)sv_dat;
10424 SvREFCNT_inc_simple_void(sv_dat);
10428 ret = reganode(pRExC_state,
10431 : (ASCII_FOLD_RESTRICTED)
10433 : (AT_LEAST_UNI_SEMANTICS)
10439 *flagp |= HASWIDTH;
10441 /* override incorrect value set in reganode MJD */
10442 Set_Node_Offset(ret, parse_start+1);
10443 Set_Node_Cur_Length(ret); /* MJD */
10444 nextchar(pRExC_state);
10450 case '1': case '2': case '3': case '4':
10451 case '5': case '6': case '7': case '8': case '9':
10454 bool isg = *RExC_parse == 'g';
10459 if (*RExC_parse == '{') {
10463 if (*RExC_parse == '-') {
10467 if (hasbrace && !isDIGIT(*RExC_parse)) {
10468 if (isrel) RExC_parse--;
10470 goto parse_named_seq;
10472 num = atoi(RExC_parse);
10473 if (isg && num == 0)
10474 vFAIL("Reference to invalid group 0");
10476 num = RExC_npar - num;
10478 vFAIL("Reference to nonexistent or unclosed group");
10480 if (!isg && num > 9 && num >= RExC_npar)
10481 /* Probably a character specified in octal, e.g. \35 */
10484 char * const parse_start = RExC_parse - 1; /* MJD */
10485 while (isDIGIT(*RExC_parse))
10487 if (parse_start == RExC_parse - 1)
10488 vFAIL("Unterminated \\g... pattern");
10490 if (*RExC_parse != '}')
10491 vFAIL("Unterminated \\g{...} pattern");
10495 if (num > (I32)RExC_rx->nparens)
10496 vFAIL("Reference to nonexistent group");
10499 ret = reganode(pRExC_state,
10502 : (ASCII_FOLD_RESTRICTED)
10504 : (AT_LEAST_UNI_SEMANTICS)
10510 *flagp |= HASWIDTH;
10512 /* override incorrect value set in reganode MJD */
10513 Set_Node_Offset(ret, parse_start+1);
10514 Set_Node_Cur_Length(ret); /* MJD */
10516 nextchar(pRExC_state);
10521 if (RExC_parse >= RExC_end)
10522 FAIL("Trailing \\");
10525 /* Do not generate "unrecognized" warnings here, we fall
10526 back into the quick-grab loop below */
10533 if (RExC_flags & RXf_PMf_EXTENDED) {
10534 if ( reg_skipcomment( pRExC_state ) )
10541 parse_start = RExC_parse - 1;
10550 #define MAX_NODE_STRING_SIZE 127
10551 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10553 U8 upper_parse = MAX_NODE_STRING_SIZE;
10556 bool next_is_quantifier;
10557 char * oldp = NULL;
10559 /* If a folding node contains only code points that don't
10560 * participate in folds, it can be changed into an EXACT node,
10561 * which allows the optimizer more things to look for */
10565 node_type = compute_EXACTish(pRExC_state);
10566 ret = reg_node(pRExC_state, node_type);
10568 /* In pass1, folded, we use a temporary buffer instead of the
10569 * actual node, as the node doesn't exist yet */
10570 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10576 /* We do the EXACTFish to EXACT node only if folding, and not if in
10577 * locale, as whether a character folds or not isn't known until
10579 maybe_exact = FOLD && ! LOC;
10581 /* XXX The node can hold up to 255 bytes, yet this only goes to
10582 * 127. I (khw) do not know why. Keeping it somewhat less than
10583 * 255 allows us to not have to worry about overflow due to
10584 * converting to utf8 and fold expansion, but that value is
10585 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10586 * split up by this limit into a single one using the real max of
10587 * 255. Even at 127, this breaks under rare circumstances. If
10588 * folding, we do not want to split a node at a character that is a
10589 * non-final in a multi-char fold, as an input string could just
10590 * happen to want to match across the node boundary. The join
10591 * would solve that problem if the join actually happens. But a
10592 * series of more than two nodes in a row each of 127 would cause
10593 * the first join to succeed to get to 254, but then there wouldn't
10594 * be room for the next one, which could at be one of those split
10595 * multi-char folds. I don't know of any fool-proof solution. One
10596 * could back off to end with only a code point that isn't such a
10597 * non-final, but it is possible for there not to be any in the
10599 for (p = RExC_parse - 1;
10600 len < upper_parse && p < RExC_end;
10605 if (RExC_flags & RXf_PMf_EXTENDED)
10606 p = regwhite( pRExC_state, p );
10617 /* Literal Escapes Switch
10619 This switch is meant to handle escape sequences that
10620 resolve to a literal character.
10622 Every escape sequence that represents something
10623 else, like an assertion or a char class, is handled
10624 in the switch marked 'Special Escapes' above in this
10625 routine, but also has an entry here as anything that
10626 isn't explicitly mentioned here will be treated as
10627 an unescaped equivalent literal.
10630 switch ((U8)*++p) {
10631 /* These are all the special escapes. */
10632 case 'A': /* Start assertion */
10633 case 'b': case 'B': /* Word-boundary assertion*/
10634 case 'C': /* Single char !DANGEROUS! */
10635 case 'd': case 'D': /* digit class */
10636 case 'g': case 'G': /* generic-backref, pos assertion */
10637 case 'h': case 'H': /* HORIZWS */
10638 case 'k': case 'K': /* named backref, keep marker */
10639 case 'p': case 'P': /* Unicode property */
10640 case 'R': /* LNBREAK */
10641 case 's': case 'S': /* space class */
10642 case 'v': case 'V': /* VERTWS */
10643 case 'w': case 'W': /* word class */
10644 case 'X': /* eXtended Unicode "combining character sequence" */
10645 case 'z': case 'Z': /* End of line/string assertion */
10649 /* Anything after here is an escape that resolves to a
10650 literal. (Except digits, which may or may not)
10656 case 'N': /* Handle a single-code point named character. */
10657 /* The options cause it to fail if a multiple code
10658 * point sequence. Handle those in the switch() above
10660 RExC_parse = p + 1;
10661 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10662 flagp, depth, FALSE))
10664 RExC_parse = p = oldp;
10668 if (ender > 0xff) {
10685 ender = ASCII_TO_NATIVE('\033');
10689 ender = ASCII_TO_NATIVE('\007');
10694 STRLEN brace_len = len;
10696 const char* error_msg;
10698 bool valid = grok_bslash_o(p,
10705 RExC_parse = p; /* going to die anyway; point
10706 to exact spot of failure */
10713 if (PL_encoding && ender < 0x100) {
10714 goto recode_encoding;
10716 if (ender > 0xff) {
10723 STRLEN brace_len = len;
10725 const char* error_msg;
10727 bool valid = grok_bslash_x(p,
10734 RExC_parse = p; /* going to die anyway; point
10735 to exact spot of failure */
10741 if (PL_encoding && ender < 0x100) {
10742 goto recode_encoding;
10744 if (ender > 0xff) {
10751 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10753 case '0': case '1': case '2': case '3':case '4':
10754 case '5': case '6': case '7':
10756 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10758 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10760 ender = grok_oct(p, &numlen, &flags, NULL);
10761 if (ender > 0xff) {
10770 if (PL_encoding && ender < 0x100)
10771 goto recode_encoding;
10774 if (! RExC_override_recoding) {
10775 SV* enc = PL_encoding;
10776 ender = reg_recode((const char)(U8)ender, &enc);
10777 if (!enc && SIZE_ONLY)
10778 ckWARNreg(p, "Invalid escape in the specified encoding");
10784 FAIL("Trailing \\");
10787 if (!SIZE_ONLY&& isALNUMC(*p)) {
10788 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10790 goto normal_default;
10794 /* Currently we don't warn when the lbrace is at the start
10795 * of a construct. This catches it in the middle of a
10796 * literal string, or when its the first thing after
10797 * something like "\b" */
10799 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10801 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10806 if (UTF8_IS_START(*p) && UTF) {
10808 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10809 &numlen, UTF8_ALLOW_DEFAULT);
10815 } /* End of switch on the literal */
10817 /* Here, have looked at the literal character and <ender>
10818 * contains its ordinal, <p> points to the character after it
10821 if ( RExC_flags & RXf_PMf_EXTENDED)
10822 p = regwhite( pRExC_state, p );
10824 /* If the next thing is a quantifier, it applies to this
10825 * character only, which means that this character has to be in
10826 * its own node and can't just be appended to the string in an
10827 * existing node, so if there are already other characters in
10828 * the node, close the node with just them, and set up to do
10829 * this character again next time through, when it will be the
10830 * only thing in its new node */
10831 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10839 /* See comments for join_exact() as to why we fold
10840 * this non-UTF at compile time */
10841 || (node_type == EXACTFU
10842 && ender == LATIN_SMALL_LETTER_SHARP_S))
10846 /* Prime the casefolded buffer. Locale rules, which
10847 * apply only to code points < 256, aren't known until
10848 * execution, so for them, just output the original
10849 * character using utf8. If we start to fold non-UTF
10850 * patterns, be sure to update join_exact() */
10851 if (LOC && ender < 256) {
10852 if (UNI_IS_INVARIANT(ender)) {
10856 *s = UTF8_TWO_BYTE_HI(ender);
10857 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10862 UV folded = _to_uni_fold_flags(
10867 | ((LOC) ? FOLD_FLAGS_LOCALE
10868 : (ASCII_FOLD_RESTRICTED)
10869 ? FOLD_FLAGS_NOMIX_ASCII
10873 /* If this node only contains non-folding code
10874 * points so far, see if this new one is also
10877 if (folded != ender) {
10878 maybe_exact = FALSE;
10881 /* Here the fold is the original; we have
10882 * to check further to see if anything
10884 if (! PL_utf8_foldable) {
10885 SV* swash = swash_init("utf8",
10887 &PL_sv_undef, 1, 0);
10889 _get_swash_invlist(swash);
10890 SvREFCNT_dec(swash);
10892 if (_invlist_contains_cp(PL_utf8_foldable,
10895 maybe_exact = FALSE;
10903 /* The loop increments <len> each time, as all but this
10904 * path (and the one just below for UTF) through it add
10905 * a single byte to the EXACTish node. But this one
10906 * has changed len to be the correct final value, so
10907 * subtract one to cancel out the increment that
10909 len += foldlen - 1;
10913 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10917 const STRLEN unilen = reguni(pRExC_state, ender, s);
10923 /* See comment just above for - 1 */
10927 REGC((char)ender, s++);
10930 if (next_is_quantifier) {
10932 /* Here, the next input is a quantifier, and to get here,
10933 * the current character is the only one in the node.
10934 * Also, here <len> doesn't include the final byte for this
10940 } /* End of loop through literal characters */
10942 /* Here we have either exhausted the input or ran out of room in
10943 * the node. (If we encountered a character that can't be in the
10944 * node, transfer is made directly to <loopdone>, and so we
10945 * wouldn't have fallen off the end of the loop.) In the latter
10946 * case, we artificially have to split the node into two, because
10947 * we just don't have enough space to hold everything. This
10948 * creates a problem if the final character participates in a
10949 * multi-character fold in the non-final position, as a match that
10950 * should have occurred won't, due to the way nodes are matched,
10951 * and our artificial boundary. So back off until we find a non-
10952 * problematic character -- one that isn't at the beginning or
10953 * middle of such a fold. (Either it doesn't participate in any
10954 * folds, or appears only in the final position of all the folds it
10955 * does participate in.) A better solution with far fewer false
10956 * positives, and that would fill the nodes more completely, would
10957 * be to actually have available all the multi-character folds to
10958 * test against, and to back-off only far enough to be sure that
10959 * this node isn't ending with a partial one. <upper_parse> is set
10960 * further below (if we need to reparse the node) to include just
10961 * up through that final non-problematic character that this code
10962 * identifies, so when it is set to less than the full node, we can
10963 * skip the rest of this */
10964 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10966 const STRLEN full_len = len;
10968 assert(len >= MAX_NODE_STRING_SIZE);
10970 /* Here, <s> points to the final byte of the final character.
10971 * Look backwards through the string until find a non-
10972 * problematic character */
10976 /* These two have no multi-char folds to non-UTF characters
10978 if (ASCII_FOLD_RESTRICTED || LOC) {
10982 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10986 if (! PL_NonL1NonFinalFold) {
10987 PL_NonL1NonFinalFold = _new_invlist_C_array(
10988 NonL1_Perl_Non_Final_Folds_invlist);
10991 /* Point to the first byte of the final character */
10992 s = (char *) utf8_hop((U8 *) s, -1);
10994 while (s >= s0) { /* Search backwards until find
10995 non-problematic char */
10996 if (UTF8_IS_INVARIANT(*s)) {
10998 /* There are no ascii characters that participate
10999 * in multi-char folds under /aa. In EBCDIC, the
11000 * non-ascii invariants are all control characters,
11001 * so don't ever participate in any folds. */
11002 if (ASCII_FOLD_RESTRICTED
11003 || ! IS_NON_FINAL_FOLD(*s))
11008 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11010 /* No Latin1 characters participate in multi-char
11011 * folds under /l */
11013 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11019 else if (! _invlist_contains_cp(
11020 PL_NonL1NonFinalFold,
11021 valid_utf8_to_uvchr((U8 *) s, NULL)))
11026 /* Here, the current character is problematic in that
11027 * it does occur in the non-final position of some
11028 * fold, so try the character before it, but have to
11029 * special case the very first byte in the string, so
11030 * we don't read outside the string */
11031 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11032 } /* End of loop backwards through the string */
11034 /* If there were only problematic characters in the string,
11035 * <s> will point to before s0, in which case the length
11036 * should be 0, otherwise include the length of the
11037 * non-problematic character just found */
11038 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11041 /* Here, have found the final character, if any, that is
11042 * non-problematic as far as ending the node without splitting
11043 * it across a potential multi-char fold. <len> contains the
11044 * number of bytes in the node up-to and including that
11045 * character, or is 0 if there is no such character, meaning
11046 * the whole node contains only problematic characters. In
11047 * this case, give up and just take the node as-is. We can't
11053 /* Here, the node does contain some characters that aren't
11054 * problematic. If one such is the final character in the
11055 * node, we are done */
11056 if (len == full_len) {
11059 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11061 /* If the final character is problematic, but the
11062 * penultimate is not, back-off that last character to
11063 * later start a new node with it */
11068 /* Here, the final non-problematic character is earlier
11069 * in the input than the penultimate character. What we do
11070 * is reparse from the beginning, going up only as far as
11071 * this final ok one, thus guaranteeing that the node ends
11072 * in an acceptable character. The reason we reparse is
11073 * that we know how far in the character is, but we don't
11074 * know how to correlate its position with the input parse.
11075 * An alternate implementation would be to build that
11076 * correlation as we go along during the original parse,
11077 * but that would entail extra work for every node, whereas
11078 * this code gets executed only when the string is too
11079 * large for the node, and the final two characters are
11080 * problematic, an infrequent occurrence. Yet another
11081 * possible strategy would be to save the tail of the
11082 * string, and the next time regatom is called, initialize
11083 * with that. The problem with this is that unless you
11084 * back off one more character, you won't be guaranteed
11085 * regatom will get called again, unless regbranch,
11086 * regpiece ... are also changed. If you do back off that
11087 * extra character, so that there is input guaranteed to
11088 * force calling regatom, you can't handle the case where
11089 * just the first character in the node is acceptable. I
11090 * (khw) decided to try this method which doesn't have that
11091 * pitfall; if performance issues are found, we can do a
11092 * combination of the current approach plus that one */
11098 } /* End of verifying node ends with an appropriate char */
11100 loopdone: /* Jumped to when encounters something that shouldn't be in
11103 /* If 'maybe_exact' is still set here, means there are no
11104 * code points in the node that participate in folds */
11105 if (FOLD && maybe_exact) {
11109 /* I (khw) don't know if you can get here with zero length, but the
11110 * old code handled this situation by creating a zero-length EXACT
11111 * node. Might as well be NOTHING instead */
11116 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11119 RExC_parse = p - 1;
11120 Set_Node_Cur_Length(ret); /* MJD */
11121 nextchar(pRExC_state);
11123 /* len is STRLEN which is unsigned, need to copy to signed */
11126 vFAIL("Internal disaster");
11129 } /* End of label 'defchar:' */
11131 } /* End of giant switch on input character */
11137 S_regwhite( RExC_state_t *pRExC_state, char *p )
11139 const char *e = RExC_end;
11141 PERL_ARGS_ASSERT_REGWHITE;
11146 else if (*p == '#') {
11149 if (*p++ == '\n') {
11155 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11163 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11164 Character classes ([:foo:]) can also be negated ([:^foo:]).
11165 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11166 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11167 but trigger failures because they are currently unimplemented. */
11169 #define POSIXCC_DONE(c) ((c) == ':')
11170 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11171 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11173 PERL_STATIC_INLINE I32
11174 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
11177 I32 namedclass = OOB_NAMEDCLASS;
11179 PERL_ARGS_ASSERT_REGPPOSIXCC;
11181 if (value == '[' && RExC_parse + 1 < RExC_end &&
11182 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11183 POSIXCC(UCHARAT(RExC_parse))) {
11184 const char c = UCHARAT(RExC_parse);
11185 char* const s = RExC_parse++;
11187 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11189 if (RExC_parse == RExC_end)
11190 /* Grandfather lone [:, [=, [. */
11193 const char* const t = RExC_parse++; /* skip over the c */
11196 if (UCHARAT(RExC_parse) == ']') {
11197 const char *posixcc = s + 1;
11198 RExC_parse++; /* skip over the ending ] */
11201 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11202 const I32 skip = t - posixcc;
11204 /* Initially switch on the length of the name. */
11207 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11208 namedclass = ANYOF_WORDCHAR;
11211 /* Names all of length 5. */
11212 /* alnum alpha ascii blank cntrl digit graph lower
11213 print punct space upper */
11214 /* Offset 4 gives the best switch position. */
11215 switch (posixcc[4]) {
11217 if (memEQ(posixcc, "alph", 4)) /* alpha */
11218 namedclass = ANYOF_ALPHA;
11221 if (memEQ(posixcc, "spac", 4)) /* space */
11222 namedclass = ANYOF_PSXSPC;
11225 if (memEQ(posixcc, "grap", 4)) /* graph */
11226 namedclass = ANYOF_GRAPH;
11229 if (memEQ(posixcc, "asci", 4)) /* ascii */
11230 namedclass = ANYOF_ASCII;
11233 if (memEQ(posixcc, "blan", 4)) /* blank */
11234 namedclass = ANYOF_BLANK;
11237 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11238 namedclass = ANYOF_CNTRL;
11241 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11242 namedclass = ANYOF_ALNUMC;
11245 if (memEQ(posixcc, "lowe", 4)) /* lower */
11246 namedclass = ANYOF_LOWER;
11247 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11248 namedclass = ANYOF_UPPER;
11251 if (memEQ(posixcc, "digi", 4)) /* digit */
11252 namedclass = ANYOF_DIGIT;
11253 else if (memEQ(posixcc, "prin", 4)) /* print */
11254 namedclass = ANYOF_PRINT;
11255 else if (memEQ(posixcc, "punc", 4)) /* punct */
11256 namedclass = ANYOF_PUNCT;
11261 if (memEQ(posixcc, "xdigit", 6))
11262 namedclass = ANYOF_XDIGIT;
11266 if (namedclass == OOB_NAMEDCLASS)
11267 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11270 /* The #defines are structured so each complement is +1 to
11271 * the normal one */
11275 assert (posixcc[skip] == ':');
11276 assert (posixcc[skip+1] == ']');
11277 } else if (!SIZE_ONLY) {
11278 /* [[=foo=]] and [[.foo.]] are still future. */
11280 /* adjust RExC_parse so the warning shows after
11281 the class closes */
11282 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11284 SvREFCNT_dec(free_me);
11285 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11288 /* Maternal grandfather:
11289 * "[:" ending in ":" but not in ":]" */
11298 /* Generate the code to add a full posix character <class> to the bracketed
11299 * character class given by <node>. (<node> is needed only under locale rules)
11300 * destlist is the inversion list for non-locale rules that this class is
11302 * sourcelist is the ASCII-range inversion list to add under /a rules
11303 * Xsourcelist is the full Unicode range list to use otherwise. */
11304 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11306 SV* scratch_list = NULL; \
11308 /* Set this class in the node for runtime matching */ \
11309 ANYOF_CLASS_SET(node, class); \
11311 /* For above Latin1 code points, we use the full Unicode range */ \
11312 _invlist_intersection(PL_AboveLatin1, \
11315 /* And set the output to it, adding instead if there already is an \
11316 * output. Checking if <destlist> is NULL first saves an extra \
11317 * clone. Its reference count will be decremented at the next \
11318 * union, etc, or if this is the only instance, at the end of the \
11320 if (! destlist) { \
11321 destlist = scratch_list; \
11324 _invlist_union(destlist, scratch_list, &destlist); \
11325 SvREFCNT_dec(scratch_list); \
11329 /* For non-locale, just add it to any existing list */ \
11330 _invlist_union(destlist, \
11331 (AT_LEAST_ASCII_RESTRICTED) \
11337 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11339 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11341 SV* scratch_list = NULL; \
11342 ANYOF_CLASS_SET(node, class); \
11343 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11344 if (! destlist) { \
11345 destlist = scratch_list; \
11348 _invlist_union(destlist, scratch_list, &destlist); \
11349 SvREFCNT_dec(scratch_list); \
11353 _invlist_union_complement_2nd(destlist, \
11354 (AT_LEAST_ASCII_RESTRICTED) \
11358 /* Under /d, everything in the upper half of the Latin1 range \
11359 * matches this complement */ \
11360 if (DEPENDS_SEMANTICS) { \
11361 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11365 /* Generate the code to add a posix character <class> to the bracketed
11366 * character class given by <node>. (<node> is needed only under locale rules)
11367 * destlist is the inversion list for non-locale rules that this class is
11369 * sourcelist is the ASCII-range inversion list to add under /a rules
11370 * l1_sourcelist is the Latin1 range list to use otherwise.
11371 * Xpropertyname is the name to add to <run_time_list> of the property to
11372 * specify the code points above Latin1 that will have to be
11373 * determined at run-time
11374 * run_time_list is a SV* that contains text names of properties that are to
11375 * be computed at run time. This concatenates <Xpropertyname>
11376 * to it, appropriately
11377 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11379 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11380 l1_sourcelist, Xpropertyname, run_time_list) \
11381 /* First, resolve whether to use the ASCII-only list or the L1 \
11383 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11384 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11385 Xpropertyname, run_time_list)
11387 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11388 Xpropertyname, run_time_list) \
11389 /* If not /a matching, there are going to be code points we will have \
11390 * to defer to runtime to look-up */ \
11391 if (! AT_LEAST_ASCII_RESTRICTED) { \
11392 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11395 ANYOF_CLASS_SET(node, class); \
11398 _invlist_union(destlist, sourcelist, &destlist); \
11401 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11402 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11404 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11405 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11406 if (AT_LEAST_ASCII_RESTRICTED) { \
11407 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11410 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11411 matches_above_unicode = TRUE; \
11413 ANYOF_CLASS_SET(node, namedclass); \
11416 SV* scratch_list = NULL; \
11417 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11418 if (! destlist) { \
11419 destlist = scratch_list; \
11422 _invlist_union(destlist, scratch_list, &destlist); \
11423 SvREFCNT_dec(scratch_list); \
11425 if (DEPENDS_SEMANTICS) { \
11426 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11431 /* The names of properties whose definitions are not known at compile time are
11432 * stored in this SV, after a constant heading. So if the length has been
11433 * changed since initialization, then there is a run-time definition. */
11434 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11436 /* This converts the named class defined in regcomp.h to its equivalent class
11437 * number defined in handy.h. */
11438 #define namedclass_to_classnum(class) ((class) / 2)
11441 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11443 /* parse a bracketed class specification. Most of these will produce an ANYOF node;
11444 * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11445 * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
11446 * multi-character folds: it will be rewritten following the paradigm of
11447 * this example, where the <multi-fold>s are characters which fold to
11448 * multiple character sequences:
11449 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11450 * gets effectively rewritten as:
11451 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11452 * reg() gets called (recursively) on the rewritten version, and this
11453 * function will return what it constructs. (Actually the <multi-fold>s
11454 * aren't physically removed from the [abcdefghi], it's just that they are
11455 * ignored in the recursion by means of a flag:
11456 * <RExC_in_multi_char_class>.)
11458 * ANYOF nodes contain a bit map for the first 256 characters, with the
11459 * corresponding bit set if that character is in the list. For characters
11460 * above 255, a range list or swash is used. There are extra bits for \w,
11461 * etc. in locale ANYOFs, as what these match is not determinable at
11466 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11468 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11471 IV namedclass = OOB_NAMEDCLASS;
11472 char *rangebegin = NULL;
11473 bool need_class = 0;
11475 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11476 than just initialized. */
11477 SV* properties = NULL; /* Code points that match \p{} \P{} */
11478 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11479 extended beyond the Latin1 range */
11480 UV element_count = 0; /* Number of distinct elements in the class.
11481 Optimizations may be possible if this is tiny */
11482 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11483 character; used under /i */
11486 /* Unicode properties are stored in a swash; this holds the current one
11487 * being parsed. If this swash is the only above-latin1 component of the
11488 * character class, an optimization is to pass it directly on to the
11489 * execution engine. Otherwise, it is set to NULL to indicate that there
11490 * are other things in the class that have to be dealt with at execution
11492 SV* swash = NULL; /* Code points that match \p{} \P{} */
11494 /* Set if a component of this character class is user-defined; just passed
11495 * on to the engine */
11496 bool has_user_defined_property = FALSE;
11498 /* inversion list of code points this node matches only when the target
11499 * string is in UTF-8. (Because is under /d) */
11500 SV* depends_list = NULL;
11502 /* inversion list of code points this node matches. For much of the
11503 * function, it includes only those that match regardless of the utf8ness
11504 * of the target string */
11505 SV* cp_list = NULL;
11508 /* In a range, counts how many 0-2 of the ends of it came from literals,
11509 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11510 UV literal_endpoint = 0;
11512 bool invert = FALSE; /* Is this class to be complemented */
11514 /* Is there any thing like \W or [:^digit:] that matches above the legal
11515 * Unicode range? */
11516 bool runtime_posix_matches_above_Unicode = FALSE;
11518 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11519 case we need to change the emitted regop to an EXACT. */
11520 const char * orig_parse = RExC_parse;
11521 const I32 orig_size = RExC_size;
11522 GET_RE_DEBUG_FLAGS_DECL;
11524 PERL_ARGS_ASSERT_REGCLASS;
11526 PERL_UNUSED_ARG(depth);
11529 DEBUG_PARSE("clas");
11531 /* Assume we are going to generate an ANYOF node. */
11532 ret = reganode(pRExC_state, ANYOF, 0);
11535 ANYOF_FLAGS(ret) = 0;
11538 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11545 RExC_size += ANYOF_SKIP;
11546 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11549 RExC_emit += ANYOF_SKIP;
11551 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11553 listsv = newSVpvs("# comment\n");
11554 initial_listsv_len = SvCUR(listsv);
11557 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11559 if (!SIZE_ONLY && POSIXCC(nextvalue))
11561 const char *s = RExC_parse;
11562 const char c = *s++;
11564 while (isALNUM(*s))
11566 if (*s && c == *s && s[1] == ']') {
11568 "POSIX syntax [%c %c] belongs inside character classes",
11571 /* [[=foo=]] and [[.foo.]] are still future. */
11572 if (POSIXCC_NOTYET(c)) {
11573 /* adjust RExC_parse so the error shows after
11574 the class closes */
11575 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11577 SvREFCNT_dec(listsv);
11578 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11583 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11584 if (UCHARAT(RExC_parse) == ']')
11585 goto charclassloop;
11588 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11592 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11593 save_value = value;
11594 save_prevvalue = prevvalue;
11597 rangebegin = RExC_parse;
11601 value = utf8n_to_uvchr((U8*)RExC_parse,
11602 RExC_end - RExC_parse,
11603 &numlen, UTF8_ALLOW_DEFAULT);
11604 RExC_parse += numlen;
11607 value = UCHARAT(RExC_parse++);
11609 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11610 if (value == '[' && POSIXCC(nextvalue))
11611 namedclass = regpposixcc(pRExC_state, value, listsv);
11612 else if (value == '\\') {
11614 value = utf8n_to_uvchr((U8*)RExC_parse,
11615 RExC_end - RExC_parse,
11616 &numlen, UTF8_ALLOW_DEFAULT);
11617 RExC_parse += numlen;
11620 value = UCHARAT(RExC_parse++);
11621 /* Some compilers cannot handle switching on 64-bit integer
11622 * values, therefore value cannot be an UV. Yes, this will
11623 * be a problem later if we want switch on Unicode.
11624 * A similar issue a little bit later when switching on
11625 * namedclass. --jhi */
11626 switch ((I32)value) {
11627 case 'w': namedclass = ANYOF_WORDCHAR; break;
11628 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11629 case 's': namedclass = ANYOF_SPACE; break;
11630 case 'S': namedclass = ANYOF_NSPACE; break;
11631 case 'd': namedclass = ANYOF_DIGIT; break;
11632 case 'D': namedclass = ANYOF_NDIGIT; break;
11633 case 'v': namedclass = ANYOF_VERTWS; break;
11634 case 'V': namedclass = ANYOF_NVERTWS; break;
11635 case 'h': namedclass = ANYOF_HORIZWS; break;
11636 case 'H': namedclass = ANYOF_NHORIZWS; break;
11637 case 'N': /* Handle \N{NAME} in class */
11639 /* We only pay attention to the first char of
11640 multichar strings being returned. I kinda wonder
11641 if this makes sense as it does change the behaviour
11642 from earlier versions, OTOH that behaviour was broken
11644 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11645 TRUE /* => charclass */))
11656 /* This routine will handle any undefined properties */
11657 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11659 if (RExC_parse >= RExC_end)
11660 vFAIL2("Empty \\%c{}", (U8)value);
11661 if (*RExC_parse == '{') {
11662 const U8 c = (U8)value;
11663 e = strchr(RExC_parse++, '}');
11665 vFAIL2("Missing right brace on \\%c{}", c);
11666 while (isSPACE(UCHARAT(RExC_parse)))
11668 if (e == RExC_parse)
11669 vFAIL2("Empty \\%c{}", c);
11670 n = e - RExC_parse;
11671 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11682 if (UCHARAT(RExC_parse) == '^') {
11685 value = value == 'p' ? 'P' : 'p'; /* toggle */
11686 while (isSPACE(UCHARAT(RExC_parse))) {
11691 /* Try to get the definition of the property into
11692 * <invlist>. If /i is in effect, the effective property
11693 * will have its name be <__NAME_i>. The design is
11694 * discussed in commit
11695 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11696 Newx(name, n + sizeof("_i__\n"), char);
11698 sprintf(name, "%s%.*s%s\n",
11699 (FOLD) ? "__" : "",
11705 /* Look up the property name, and get its swash and
11706 * inversion list, if the property is found */
11708 SvREFCNT_dec(swash);
11710 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11713 NULL, /* No inversion list */
11716 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11718 SvREFCNT_dec(swash);
11722 /* Here didn't find it. It could be a user-defined
11723 * property that will be available at run-time. Add it
11724 * to the list to look up then */
11725 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11726 (value == 'p' ? '+' : '!'),
11728 has_user_defined_property = TRUE;
11730 /* We don't know yet, so have to assume that the
11731 * property could match something in the Latin1 range,
11732 * hence something that isn't utf8. Note that this
11733 * would cause things in <depends_list> to match
11734 * inappropriately, except that any \p{}, including
11735 * this one forces Unicode semantics, which means there
11736 * is <no depends_list> */
11737 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11741 /* Here, did get the swash and its inversion list. If
11742 * the swash is from a user-defined property, then this
11743 * whole character class should be regarded as such */
11744 has_user_defined_property =
11746 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11748 /* Invert if asking for the complement */
11749 if (value == 'P') {
11750 _invlist_union_complement_2nd(properties,
11754 /* The swash can't be used as-is, because we've
11755 * inverted things; delay removing it to here after
11756 * have copied its invlist above */
11757 SvREFCNT_dec(swash);
11761 _invlist_union(properties, invlist, &properties);
11766 RExC_parse = e + 1;
11767 namedclass = ANYOF_UNIPROP; /* no official name, but it's named */
11769 /* \p means they want Unicode semantics */
11770 RExC_uni_semantics = 1;
11773 case 'n': value = '\n'; break;
11774 case 'r': value = '\r'; break;
11775 case 't': value = '\t'; break;
11776 case 'f': value = '\f'; break;
11777 case 'b': value = '\b'; break;
11778 case 'e': value = ASCII_TO_NATIVE('\033');break;
11779 case 'a': value = ASCII_TO_NATIVE('\007');break;
11781 RExC_parse--; /* function expects to be pointed at the 'o' */
11783 const char* error_msg;
11784 bool valid = grok_bslash_o(RExC_parse,
11789 RExC_parse += numlen;
11794 if (PL_encoding && value < 0x100) {
11795 goto recode_encoding;
11799 RExC_parse--; /* function expects to be pointed at the 'x' */
11801 const char* error_msg;
11802 bool valid = grok_bslash_x(RExC_parse,
11807 RExC_parse += numlen;
11812 if (PL_encoding && value < 0x100)
11813 goto recode_encoding;
11816 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11818 case '0': case '1': case '2': case '3': case '4':
11819 case '5': case '6': case '7':
11821 /* Take 1-3 octal digits */
11822 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11824 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11825 RExC_parse += numlen;
11826 if (PL_encoding && value < 0x100)
11827 goto recode_encoding;
11831 if (! RExC_override_recoding) {
11832 SV* enc = PL_encoding;
11833 value = reg_recode((const char)(U8)value, &enc);
11834 if (!enc && SIZE_ONLY)
11835 ckWARNreg(RExC_parse,
11836 "Invalid escape in the specified encoding");
11840 /* Allow \_ to not give an error */
11841 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11842 ckWARN2reg(RExC_parse,
11843 "Unrecognized escape \\%c in character class passed through",
11848 } /* end of \blah */
11851 literal_endpoint++;
11854 /* What matches in a locale is not known until runtime. This
11855 * includes what the Posix classes (like \w, [:space:]) match.
11856 * Room must be reserved (one time per class) to store such
11857 * classes, either if Perl is compiled so that locale nodes always
11858 * should have this space, or if there is such class info to be
11859 * stored. The space will contain a bit for each named class that
11860 * is to be matched against. This isn't needed for \p{} and
11861 * pseudo-classes, as they are not affected by locale, and hence
11862 * are dealt with separately */
11865 && (ANYOF_LOCALE == ANYOF_CLASS
11866 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11870 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11873 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11874 ANYOF_CLASS_ZERO(ret);
11876 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11879 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11881 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11882 * literal, as is the character that began the false range, i.e.
11883 * the 'a' in the examples */
11887 RExC_parse >= rangebegin ?
11888 RExC_parse - rangebegin : 0;
11889 ckWARN4reg(RExC_parse,
11890 "False [] range \"%*.*s\"",
11892 cp_list = add_cp_to_invlist(cp_list, '-');
11893 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11896 range = 0; /* this was not a true range */
11897 element_count += 2; /* So counts for three values */
11901 switch ((I32)namedclass) {
11903 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11904 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11905 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11907 case ANYOF_NALNUMC:
11908 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11909 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11910 runtime_posix_matches_above_Unicode);
11913 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11914 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11917 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11918 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11919 runtime_posix_matches_above_Unicode);
11924 ANYOF_CLASS_SET(ret, namedclass);
11927 #endif /* Not isascii(); just use the hard-coded definition for it */
11928 _invlist_union(posixes, PL_ASCII, &posixes);
11933 ANYOF_CLASS_SET(ret, namedclass);
11937 _invlist_union_complement_2nd(posixes,
11938 PL_ASCII, &posixes);
11939 if (DEPENDS_SEMANTICS) {
11940 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11947 if (hasISBLANK || ! LOC) {
11948 DO_POSIX(ret, namedclass, posixes,
11949 PL_PosixBlank, PL_XPosixBlank);
11951 else { /* There is no isblank() and we are in locale: We
11952 use the ASCII range and the above-Latin1 range
11954 SV* scratch_list = NULL;
11956 /* Include all above-Latin1 blanks */
11957 _invlist_intersection(PL_AboveLatin1,
11960 /* Add it to the running total of posix classes */
11962 posixes = scratch_list;
11965 _invlist_union(posixes, scratch_list, &posixes);
11966 SvREFCNT_dec(scratch_list);
11968 /* Add the ASCII-range blanks to the running total. */
11969 _invlist_union(posixes, PL_PosixBlank, &posixes);
11973 if (hasISBLANK || ! LOC) {
11974 DO_N_POSIX(ret, namedclass, posixes,
11975 PL_PosixBlank, PL_XPosixBlank);
11977 else { /* There is no isblank() and we are in locale */
11978 SV* scratch_list = NULL;
11980 /* Include all above-Latin1 non-blanks */
11981 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11984 /* Add them to the running total of posix classes */
11985 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11988 posixes = scratch_list;
11991 _invlist_union(posixes, scratch_list, &posixes);
11992 SvREFCNT_dec(scratch_list);
11995 /* Get the list of all non-ASCII-blanks in Latin 1, and
11996 * add them to the running total */
11997 _invlist_subtract(PL_Latin1, PL_PosixBlank,
11999 _invlist_union(posixes, scratch_list, &posixes);
12000 SvREFCNT_dec(scratch_list);
12004 DO_POSIX(ret, namedclass, posixes,
12005 PL_PosixCntrl, PL_XPosixCntrl);
12008 DO_N_POSIX(ret, namedclass, posixes,
12009 PL_PosixCntrl, PL_XPosixCntrl);
12012 /* There are no digits in the Latin1 range outside of
12013 * ASCII, so call the macro that doesn't have to resolve
12015 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
12016 PL_PosixDigit, "XPosixDigit", listsv);
12019 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12020 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
12021 runtime_posix_matches_above_Unicode);
12024 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12025 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
12028 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12029 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
12030 runtime_posix_matches_above_Unicode);
12032 case ANYOF_HORIZWS:
12033 /* For these, we use the cp_list, as /d doesn't make a
12034 * difference in what these match. There would be problems
12035 * if these characters had folds other than themselves, as
12036 * cp_list is subject to folding. It turns out that \h
12037 * is just a synonym for XPosixBlank */
12038 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12040 case ANYOF_NHORIZWS:
12041 _invlist_union_complement_2nd(cp_list,
12042 PL_XPosixBlank, &cp_list);
12046 { /* These require special handling, as they differ under
12047 folding, matching Cased there (which in the ASCII range
12048 is the same as Alpha */
12054 if (FOLD && ! LOC) {
12055 ascii_source = PL_PosixAlpha;
12056 l1_source = PL_L1Cased;
12060 ascii_source = PL_PosixLower;
12061 l1_source = PL_L1PosixLower;
12062 Xname = "XPosixLower";
12064 if (namedclass == ANYOF_LOWER) {
12065 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12066 ascii_source, l1_source, Xname, listsv);
12069 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12070 posixes, ascii_source, l1_source, Xname, listsv,
12071 runtime_posix_matches_above_Unicode);
12076 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12077 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12080 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12081 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12082 runtime_posix_matches_above_Unicode);
12085 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12086 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12089 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12090 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12091 runtime_posix_matches_above_Unicode);
12094 DO_POSIX(ret, namedclass, posixes,
12095 PL_PosixSpace, PL_XPosixSpace);
12097 case ANYOF_NPSXSPC:
12098 DO_N_POSIX(ret, namedclass, posixes,
12099 PL_PosixSpace, PL_XPosixSpace);
12102 DO_POSIX(ret, namedclass, posixes,
12103 PL_PerlSpace, PL_XPerlSpace);
12106 DO_N_POSIX(ret, namedclass, posixes,
12107 PL_PerlSpace, PL_XPerlSpace);
12109 case ANYOF_UPPER: /* Same as LOWER, above */
12116 if (FOLD && ! LOC) {
12117 ascii_source = PL_PosixAlpha;
12118 l1_source = PL_L1Cased;
12122 ascii_source = PL_PosixUpper;
12123 l1_source = PL_L1PosixUpper;
12124 Xname = "XPosixUpper";
12126 if (namedclass == ANYOF_UPPER) {
12127 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12128 ascii_source, l1_source, Xname, listsv);
12131 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12132 posixes, ascii_source, l1_source, Xname, listsv,
12133 runtime_posix_matches_above_Unicode);
12137 case ANYOF_WORDCHAR:
12138 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12139 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12141 case ANYOF_NWORDCHAR:
12142 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12143 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12144 runtime_posix_matches_above_Unicode);
12147 /* For these, we use the cp_list, as /d doesn't make a
12148 * difference in what these match. There would be problems
12149 * if these characters had folds other than themselves, as
12150 * cp_list is subject to folding */
12151 _invlist_union(cp_list, PL_VertSpace, &cp_list);
12153 case ANYOF_NVERTWS:
12154 _invlist_union_complement_2nd(cp_list,
12155 PL_VertSpace, &cp_list);
12158 DO_POSIX(ret, namedclass, posixes,
12159 PL_PosixXDigit, PL_XPosixXDigit);
12161 case ANYOF_NXDIGIT:
12162 DO_N_POSIX(ret, namedclass, posixes,
12163 PL_PosixXDigit, PL_XPosixXDigit);
12165 case ANYOF_UNIPROP: /* this is to handle \p and \P */
12168 vFAIL("Invalid [::] class");
12172 continue; /* Go get next character */
12174 } /* end of namedclass \blah */
12177 if (prevvalue > value) /* b-a */ {
12178 const int w = RExC_parse - rangebegin;
12179 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12180 range = 0; /* not a valid range */
12184 prevvalue = value; /* save the beginning of the potential range */
12185 if (RExC_parse+1 < RExC_end
12186 && *RExC_parse == '-'
12187 && RExC_parse[1] != ']')
12191 /* a bad range like \w-, [:word:]- ? */
12192 if (namedclass > OOB_NAMEDCLASS) {
12193 if (ckWARN(WARN_REGEXP)) {
12195 RExC_parse >= rangebegin ?
12196 RExC_parse - rangebegin : 0;
12198 "False [] range \"%*.*s\"",
12202 cp_list = add_cp_to_invlist(cp_list, '-');
12206 range = 1; /* yeah, it's a range! */
12207 continue; /* but do it the next time */
12211 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12214 /* non-Latin1 code point implies unicode semantics. Must be set in
12215 * pass1 so is there for the whole of pass 2 */
12217 RExC_uni_semantics = 1;
12220 /* Ready to process either the single value, or the completed range.
12221 * For single-valued non-inverted ranges, we consider the possibility
12222 * of multi-char folds. (We made a conscious decision to not do this
12223 * for the other cases because it can often lead to non-intuitive
12224 * results. For example, you have the peculiar case that:
12225 * "s s" =~ /^[^\xDF]+$/i => Y
12226 * "ss" =~ /^[^\xDF]+$/i => N
12228 * See [perl #89750] */
12229 if (FOLD && ! invert && value == prevvalue) {
12230 if (value == LATIN_SMALL_LETTER_SHARP_S
12231 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12234 /* Here <value> is indeed a multi-char fold. Get what it is */
12236 U8 foldbuf[UTF8_MAXBYTES_CASE];
12239 UV folded = _to_uni_fold_flags(
12244 | ((LOC) ? FOLD_FLAGS_LOCALE
12245 : (ASCII_FOLD_RESTRICTED)
12246 ? FOLD_FLAGS_NOMIX_ASCII
12250 /* Here, <folded> should be the first character of the
12251 * multi-char fold of <value>, with <foldbuf> containing the
12252 * whole thing. But, if this fold is not allowed (because of
12253 * the flags), <fold> will be the same as <value>, and should
12254 * be processed like any other character, so skip the special
12256 if (folded != value) {
12258 /* Skip if we are recursed, currently parsing the class
12259 * again. Otherwise add this character to the list of
12260 * multi-char folds. */
12261 if (! RExC_in_multi_char_class) {
12262 AV** this_array_ptr;
12264 STRLEN cp_count = utf8_length(foldbuf,
12265 foldbuf + foldlen);
12266 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12268 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12271 if (! multi_char_matches) {
12272 multi_char_matches = newAV();
12275 /* <multi_char_matches> is actually an array of arrays.
12276 * There will be one or two top-level elements: [2],
12277 * and/or [3]. The [2] element is an array, each
12278 * element thereof is a character which folds to two
12279 * characters; likewise for [3]. (Unicode guarantees a
12280 * maximum of 3 characters in any fold.) When we
12281 * rewrite the character class below, we will do so
12282 * such that the longest folds are written first, so
12283 * that it prefers the longest matching strings first.
12284 * This is done even if it turns out that any
12285 * quantifier is non-greedy, out of programmer
12286 * laziness. Tom Christiansen has agreed that this is
12287 * ok. This makes the test for the ligature 'ffi' come
12288 * before the test for 'ff' */
12289 if (av_exists(multi_char_matches, cp_count)) {
12290 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12292 this_array = *this_array_ptr;
12295 this_array = newAV();
12296 av_store(multi_char_matches, cp_count,
12299 av_push(this_array, multi_fold);
12302 /* This element should not be processed further in this
12305 value = save_value;
12306 prevvalue = save_prevvalue;
12312 /* Deal with this element of the class */
12315 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12317 UV* this_range = _new_invlist(1);
12318 _append_range_to_invlist(this_range, prevvalue, value);
12320 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12321 * If this range was specified using something like 'i-j', we want
12322 * to include only the 'i' and the 'j', and not anything in
12323 * between, so exclude non-ASCII, non-alphabetics from it.
12324 * However, if the range was specified with something like
12325 * [\x89-\x91] or [\x89-j], all code points within it should be
12326 * included. literal_endpoint==2 means both ends of the range used
12327 * a literal character, not \x{foo} */
12328 if (literal_endpoint == 2
12329 && (prevvalue >= 'a' && value <= 'z')
12330 || (prevvalue >= 'A' && value <= 'Z'))
12332 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12333 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12335 _invlist_union(cp_list, this_range, &cp_list);
12336 literal_endpoint = 0;
12340 range = 0; /* this range (if it was one) is done now */
12341 } /* End of loop through all the text within the brackets */
12343 /* If anything in the class expands to more than one character, we have to
12344 * deal with them by building up a substitute parse string, and recursively
12345 * calling reg() on it, instead of proceeding */
12346 if (multi_char_matches) {
12347 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12350 char *save_end = RExC_end;
12351 char *save_parse = RExC_parse;
12352 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12357 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12358 because too confusing */
12360 sv_catpv(substitute_parse, "(?:");
12364 /* Look at the longest folds first */
12365 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12367 if (av_exists(multi_char_matches, cp_count)) {
12368 AV** this_array_ptr;
12371 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12373 while ((this_sequence = av_pop(*this_array_ptr)) !=
12376 if (! first_time) {
12377 sv_catpv(substitute_parse, "|");
12379 first_time = FALSE;
12381 sv_catpv(substitute_parse, SvPVX(this_sequence));
12386 /* If the character class contains anything else besides these
12387 * multi-character folds, have to include it in recursive parsing */
12388 if (element_count) {
12389 sv_catpv(substitute_parse, "|[");
12390 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12391 sv_catpv(substitute_parse, "]");
12394 sv_catpv(substitute_parse, ")");
12397 /* This is a way to get the parse to skip forward a whole named
12398 * sequence instead of matching the 2nd character when it fails the
12400 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12404 RExC_parse = SvPV(substitute_parse, len);
12405 RExC_end = RExC_parse + len;
12406 RExC_in_multi_char_class = 1;
12407 RExC_emit = (regnode *)orig_emit;
12409 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12411 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12413 RExC_parse = save_parse;
12414 RExC_end = save_end;
12415 RExC_in_multi_char_class = 0;
12416 SvREFCNT_dec(multi_char_matches);
12417 SvREFCNT_dec(listsv);
12421 /* If the character class contains only a single element, it may be
12422 * optimizable into another node type which is smaller and runs faster.
12423 * Check if this is the case for this class */
12424 if (element_count == 1) {
12428 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12429 [:digit:] or \p{foo} */
12431 /* Certain named classes have equivalents that can appear outside a
12432 * character class, e.g. \w, \H. We use these instead of a
12433 * character class. */
12434 switch ((I32)namedclass) {
12437 /* The first group is for node types that depend on the charset
12438 * modifier to the regex. We first calculate the base node
12439 * type, and if it should be inverted */
12441 case ANYOF_NWORDCHAR:
12444 case ANYOF_WORDCHAR:
12446 goto join_charset_classes;
12453 goto join_charset_classes;
12461 join_charset_classes:
12463 /* Now that we have the base node type, we take advantage
12464 * of the enum ordering of the charset modifiers to get the
12465 * exact node type, For example the base SPACE also has
12466 * SPACEL, SPACEU, and SPACEA */
12468 offset = get_regex_charset(RExC_flags);
12470 /* /aa is the same as /a for these */
12471 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12472 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12474 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12475 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12480 /* The number of varieties of each of these is the same,
12481 * hence, so is the delta between the normal and
12482 * complemented nodes */
12484 op += NALNUM - ALNUM;
12486 *flagp |= HASWIDTH|SIMPLE;
12489 /* The second group doesn't depend of the charset modifiers.
12490 * We just have normal and complemented */
12491 case ANYOF_NHORIZWS:
12494 case ANYOF_HORIZWS:
12496 op = (invert) ? NHORIZWS : HORIZWS;
12497 *flagp |= HASWIDTH|SIMPLE;
12500 case ANYOF_NVERTWS:
12504 op = (invert) ? NVERTWS : VERTWS;
12505 *flagp |= HASWIDTH|SIMPLE;
12508 case ANYOF_UNIPROP:
12515 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12520 /* A generic posix class. All the /a ones can be handled
12521 * by the POSIXA opcode. And all are closed under folding
12522 * in the ASCII range, so FOLD doesn't matter */
12523 if (AT_LEAST_ASCII_RESTRICTED
12524 || (! LOC && namedclass == ANYOF_ASCII))
12526 /* The odd numbered ones are the complements of the
12527 * next-lower even number one */
12528 if (namedclass % 2 == 1) {
12532 arg = namedclass_to_classnum(namedclass);
12533 op = (invert) ? NPOSIXA : POSIXA;
12538 else if (value == prevvalue) {
12540 /* Here, the class consists of just a single code point */
12543 if (! LOC && value == '\n') {
12544 op = REG_ANY; /* Optimize [^\n] */
12545 *flagp |= HASWIDTH|SIMPLE;
12549 else if (value < 256 || UTF) {
12551 /* Optimize a single value into an EXACTish node, but not if it
12552 * would require converting the pattern to UTF-8. */
12553 op = compute_EXACTish(pRExC_state);
12555 } /* Otherwise is a range */
12556 else if (! LOC) { /* locale could vary these */
12557 if (prevvalue == '0') {
12558 if (value == '9') {
12559 op = (invert) ? NDIGITA : DIGITA;
12560 *flagp |= HASWIDTH|SIMPLE;
12565 /* Here, we have changed <op> away from its initial value iff we found
12566 * an optimization */
12569 /* Throw away this ANYOF regnode, and emit the calculated one,
12570 * which should correspond to the beginning, not current, state of
12572 const char * cur_parse = RExC_parse;
12573 RExC_parse = (char *)orig_parse;
12577 /* To get locale nodes to not use the full ANYOF size would
12578 * require moving the code above that writes the portions
12579 * of it that aren't in other nodes to after this point.
12580 * e.g. ANYOF_CLASS_SET */
12581 RExC_size = orig_size;
12585 RExC_emit = (regnode *)orig_emit;
12588 ret = reg_node(pRExC_state, op);
12590 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
12594 *flagp |= HASWIDTH|SIMPLE;
12596 else if (PL_regkind[op] == EXACT) {
12597 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12600 RExC_parse = (char *) cur_parse;
12602 SvREFCNT_dec(posixes);
12603 SvREFCNT_dec(listsv);
12604 SvREFCNT_dec(cp_list);
12611 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12613 /* If folding, we calculate all characters that could fold to or from the
12614 * ones already on the list */
12615 if (FOLD && cp_list) {
12616 UV start, end; /* End points of code point ranges */
12618 SV* fold_intersection = NULL;
12620 /* If the highest code point is within Latin1, we can use the
12621 * compiled-in Alphas list, and not have to go out to disk. This
12622 * yields two false positives, the masculine and feminine oridinal
12623 * indicators, which are weeded out below using the
12624 * IS_IN_SOME_FOLD_L1() macro */
12625 if (invlist_highest(cp_list) < 256) {
12626 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12630 /* Here, there are non-Latin1 code points, so we will have to go
12631 * fetch the list of all the characters that participate in folds
12633 if (! PL_utf8_foldable) {
12634 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12635 &PL_sv_undef, 1, 0);
12636 PL_utf8_foldable = _get_swash_invlist(swash);
12637 SvREFCNT_dec(swash);
12640 /* This is a hash that for a particular fold gives all characters
12641 * that are involved in it */
12642 if (! PL_utf8_foldclosures) {
12644 /* If we were unable to find any folds, then we likely won't be
12645 * able to find the closures. So just create an empty list.
12646 * Folding will effectively be restricted to the non-Unicode
12647 * rules hard-coded into Perl. (This case happens legitimately
12648 * during compilation of Perl itself before the Unicode tables
12649 * are generated) */
12650 if (_invlist_len(PL_utf8_foldable) == 0) {
12651 PL_utf8_foldclosures = newHV();
12654 /* If the folds haven't been read in, call a fold function
12656 if (! PL_utf8_tofold) {
12657 U8 dummy[UTF8_MAXBYTES+1];
12659 /* This string is just a short named one above \xff */
12660 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12661 assert(PL_utf8_tofold); /* Verify that worked */
12663 PL_utf8_foldclosures =
12664 _swash_inversion_hash(PL_utf8_tofold);
12668 /* Only the characters in this class that participate in folds need
12669 * be checked. Get the intersection of this class and all the
12670 * possible characters that are foldable. This can quickly narrow
12671 * down a large class */
12672 _invlist_intersection(PL_utf8_foldable, cp_list,
12673 &fold_intersection);
12676 /* Now look at the foldable characters in this class individually */
12677 invlist_iterinit(fold_intersection);
12678 while (invlist_iternext(fold_intersection, &start, &end)) {
12681 /* Locale folding for Latin1 characters is deferred until runtime */
12682 if (LOC && start < 256) {
12686 /* Look at every character in the range */
12687 for (j = start; j <= end; j++) {
12689 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12695 /* We have the latin1 folding rules hard-coded here so that
12696 * an innocent-looking character class, like /[ks]/i won't
12697 * have to go out to disk to find the possible matches.
12698 * XXX It would be better to generate these via regen, in
12699 * case a new version of the Unicode standard adds new
12700 * mappings, though that is not really likely, and may be
12701 * caught by the default: case of the switch below. */
12703 if (IS_IN_SOME_FOLD_L1(j)) {
12705 /* ASCII is always matched; non-ASCII is matched only
12706 * under Unicode rules */
12707 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12709 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12713 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12717 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12718 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12720 /* Certain Latin1 characters have matches outside
12721 * Latin1. To get here, <j> is one of those
12722 * characters. None of these matches is valid for
12723 * ASCII characters under /aa, which is why the 'if'
12724 * just above excludes those. These matches only
12725 * happen when the target string is utf8. The code
12726 * below adds the single fold closures for <j> to the
12727 * inversion list. */
12732 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12736 cp_list = add_cp_to_invlist(cp_list,
12737 LATIN_SMALL_LETTER_LONG_S);
12740 cp_list = add_cp_to_invlist(cp_list,
12741 GREEK_CAPITAL_LETTER_MU);
12742 cp_list = add_cp_to_invlist(cp_list,
12743 GREEK_SMALL_LETTER_MU);
12745 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12746 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12748 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12750 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12751 cp_list = add_cp_to_invlist(cp_list,
12752 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12754 case LATIN_SMALL_LETTER_SHARP_S:
12755 cp_list = add_cp_to_invlist(cp_list,
12756 LATIN_CAPITAL_LETTER_SHARP_S);
12758 case 'F': case 'f':
12759 case 'I': case 'i':
12760 case 'L': case 'l':
12761 case 'T': case 't':
12762 case 'A': case 'a':
12763 case 'H': case 'h':
12764 case 'J': case 'j':
12765 case 'N': case 'n':
12766 case 'W': case 'w':
12767 case 'Y': case 'y':
12768 /* These all are targets of multi-character
12769 * folds from code points that require UTF8 to
12770 * express, so they can't match unless the
12771 * target string is in UTF-8, so no action here
12772 * is necessary, as regexec.c properly handles
12773 * the general case for UTF-8 matching and
12774 * multi-char folds */
12777 /* Use deprecated warning to increase the
12778 * chances of this being output */
12779 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12786 /* Here is an above Latin1 character. We don't have the rules
12787 * hard-coded for it. First, get its fold. This is the simple
12788 * fold, as the multi-character folds have been handled earlier
12789 * and separated out */
12790 _to_uni_fold_flags(j, foldbuf, &foldlen,
12792 ? FOLD_FLAGS_LOCALE
12793 : (ASCII_FOLD_RESTRICTED)
12794 ? FOLD_FLAGS_NOMIX_ASCII
12797 /* Single character fold of above Latin1. Add everything in
12798 * its fold closure to the list that this node should match.
12799 * The fold closures data structure is a hash with the keys
12800 * being the UTF-8 of every character that is folded to, like
12801 * 'k', and the values each an array of all code points that
12802 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
12803 * Multi-character folds are not included */
12804 if ((listp = hv_fetch(PL_utf8_foldclosures,
12805 (char *) foldbuf, foldlen, FALSE)))
12807 AV* list = (AV*) *listp;
12809 for (k = 0; k <= av_len(list); k++) {
12810 SV** c_p = av_fetch(list, k, FALSE);
12813 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12817 /* /aa doesn't allow folds between ASCII and non-; /l
12818 * doesn't allow them between above and below 256 */
12819 if ((ASCII_FOLD_RESTRICTED
12820 && (isASCII(c) != isASCII(j)))
12821 || (LOC && ((c < 256) != (j < 256))))
12826 /* Folds involving non-ascii Latin1 characters
12827 * under /d are added to a separate list */
12828 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12830 cp_list = add_cp_to_invlist(cp_list, c);
12833 depends_list = add_cp_to_invlist(depends_list, c);
12839 SvREFCNT_dec(fold_intersection);
12842 /* And combine the result (if any) with any inversion list from posix
12843 * classes. The lists are kept separate up to now because we don't want to
12844 * fold the classes (folding of those is automatically handled by the swash
12845 * fetching code) */
12847 if (! DEPENDS_SEMANTICS) {
12849 _invlist_union(cp_list, posixes, &cp_list);
12850 SvREFCNT_dec(posixes);
12857 /* Under /d, we put into a separate list the Latin1 things that
12858 * match only when the target string is utf8 */
12859 SV* nonascii_but_latin1_properties = NULL;
12860 _invlist_intersection(posixes, PL_Latin1,
12861 &nonascii_but_latin1_properties);
12862 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12863 &nonascii_but_latin1_properties);
12864 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12867 _invlist_union(cp_list, posixes, &cp_list);
12868 SvREFCNT_dec(posixes);
12874 if (depends_list) {
12875 _invlist_union(depends_list, nonascii_but_latin1_properties,
12877 SvREFCNT_dec(nonascii_but_latin1_properties);
12880 depends_list = nonascii_but_latin1_properties;
12885 /* And combine the result (if any) with any inversion list from properties.
12886 * The lists are kept separate up to now so that we can distinguish the two
12887 * in regards to matching above-Unicode. A run-time warning is generated
12888 * if a Unicode property is matched against a non-Unicode code point. But,
12889 * we allow user-defined properties to match anything, without any warning,
12890 * and we also suppress the warning if there is a portion of the character
12891 * class that isn't a Unicode property, and which matches above Unicode, \W
12892 * or [\x{110000}] for example.
12893 * (Note that in this case, unlike the Posix one above, there is no
12894 * <depends_list>, because having a Unicode property forces Unicode
12897 bool warn_super = ! has_user_defined_property;
12900 /* If it matters to the final outcome, see if a non-property
12901 * component of the class matches above Unicode. If so, the
12902 * warning gets suppressed. This is true even if just a single
12903 * such code point is specified, as though not strictly correct if
12904 * another such code point is matched against, the fact that they
12905 * are using above-Unicode code points indicates they should know
12906 * the issues involved */
12908 bool non_prop_matches_above_Unicode =
12909 runtime_posix_matches_above_Unicode
12910 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12912 non_prop_matches_above_Unicode =
12913 ! non_prop_matches_above_Unicode;
12915 warn_super = ! non_prop_matches_above_Unicode;
12918 _invlist_union(properties, cp_list, &cp_list);
12919 SvREFCNT_dec(properties);
12922 cp_list = properties;
12926 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12930 /* Here, we have calculated what code points should be in the character
12933 * Now we can see about various optimizations. Fold calculation (which we
12934 * did above) needs to take place before inversion. Otherwise /[^k]/i
12935 * would invert to include K, which under /i would match k, which it
12936 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12937 * folded until runtime */
12939 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12940 * at compile time. Besides not inverting folded locale now, we can't
12941 * invert if there are things such as \w, which aren't known until runtime
12944 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12946 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12948 _invlist_invert(cp_list);
12950 /* Any swash can't be used as-is, because we've inverted things */
12952 SvREFCNT_dec(swash);
12956 /* Clear the invert flag since have just done it here */
12960 /* If we didn't do folding, it's because some information isn't available
12961 * until runtime; set the run-time fold flag for these. (We don't have to
12962 * worry about properties folding, as that is taken care of by the swash
12966 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12969 /* Some character classes are equivalent to other nodes. Such nodes take
12970 * up less room and generally fewer operations to execute than ANYOF nodes.
12971 * Above, we checked for and optimized into some such equivalents for
12972 * certain common classes that are easy to test. Getting to this point in
12973 * the code means that the class didn't get optimized there. Since this
12974 * code is only executed in Pass 2, it is too late to save space--it has
12975 * been allocated in Pass 1, and currently isn't given back. But turning
12976 * things into an EXACTish node can allow the optimizer to join it to any
12977 * adjacent such nodes. And if the class is equivalent to things like /./,
12978 * expensive run-time swashes can be avoided. Now that we have more
12979 * complete information, we can find things necessarily missed by the
12980 * earlier code. I (khw) am not sure how much to look for here. It would
12981 * be easy, but perhaps too slow, to check any candidates against all the
12982 * node types they could possibly match using _invlistEQ(). */
12987 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12988 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12991 U8 op = END; /* The optimzation node-type */
12992 const char * cur_parse= RExC_parse;
12994 invlist_iterinit(cp_list);
12995 if (! invlist_iternext(cp_list, &start, &end)) {
12997 /* Here, the list is empty. This happens, for example, when a
12998 * Unicode property is the only thing in the character class, and
12999 * it doesn't match anything. (perluniprops.pod notes such
13002 *flagp |= HASWIDTH|SIMPLE;
13004 else if (start == end) { /* The range is a single code point */
13005 if (! invlist_iternext(cp_list, &start, &end)
13007 /* Don't do this optimization if it would require changing
13008 * the pattern to UTF-8 */
13009 && (start < 256 || UTF))
13011 /* Here, the list contains a single code point. Can optimize
13012 * into an EXACT node */
13021 /* A locale node under folding with one code point can be
13022 * an EXACTFL, as its fold won't be calculated until
13028 /* Here, we are generally folding, but there is only one
13029 * code point to match. If we have to, we use an EXACT
13030 * node, but it would be better for joining with adjacent
13031 * nodes in the optimization pass if we used the same
13032 * EXACTFish node that any such are likely to be. We can
13033 * do this iff the code point doesn't participate in any
13034 * folds. For example, an EXACTF of a colon is the same as
13035 * an EXACT one, since nothing folds to or from a colon. */
13037 if (IS_IN_SOME_FOLD_L1(value)) {
13042 if (! PL_utf8_foldable) {
13043 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13044 &PL_sv_undef, 1, 0);
13045 PL_utf8_foldable = _get_swash_invlist(swash);
13046 SvREFCNT_dec(swash);
13048 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13053 /* If we haven't found the node type, above, it means we
13054 * can use the prevailing one */
13056 op = compute_EXACTish(pRExC_state);
13061 else if (start == 0) {
13062 if (end == UV_MAX) {
13064 *flagp |= HASWIDTH|SIMPLE;
13067 else if (end == '\n' - 1
13068 && invlist_iternext(cp_list, &start, &end)
13069 && start == '\n' + 1 && end == UV_MAX)
13072 *flagp |= HASWIDTH|SIMPLE;
13078 RExC_parse = (char *)orig_parse;
13079 RExC_emit = (regnode *)orig_emit;
13081 ret = reg_node(pRExC_state, op);
13083 RExC_parse = (char *)cur_parse;
13085 if (PL_regkind[op] == EXACT) {
13086 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13089 SvREFCNT_dec(cp_list);
13090 SvREFCNT_dec(listsv);
13095 /* Here, <cp_list> contains all the code points we can determine at
13096 * compile time that match under all conditions. Go through it, and
13097 * for things that belong in the bitmap, put them there, and delete from
13098 * <cp_list>. While we are at it, see if everything above 255 is in the
13099 * list, and if so, set a flag to speed up execution */
13100 ANYOF_BITMAP_ZERO(ret);
13103 /* This gets set if we actually need to modify things */
13104 bool change_invlist = FALSE;
13108 /* Start looking through <cp_list> */
13109 invlist_iterinit(cp_list);
13110 while (invlist_iternext(cp_list, &start, &end)) {
13114 if (end == UV_MAX && start <= 256) {
13115 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13118 /* Quit if are above what we should change */
13123 change_invlist = TRUE;
13125 /* Set all the bits in the range, up to the max that we are doing */
13126 high = (end < 255) ? end : 255;
13127 for (i = start; i <= (int) high; i++) {
13128 if (! ANYOF_BITMAP_TEST(ret, i)) {
13129 ANYOF_BITMAP_SET(ret, i);
13136 /* Done with loop; remove any code points that are in the bitmap from
13138 if (change_invlist) {
13139 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13142 /* If have completely emptied it, remove it completely */
13143 if (_invlist_len(cp_list) == 0) {
13144 SvREFCNT_dec(cp_list);
13150 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13153 /* Here, the bitmap has been populated with all the Latin1 code points that
13154 * always match. Can now add to the overall list those that match only
13155 * when the target string is UTF-8 (<depends_list>). */
13156 if (depends_list) {
13158 _invlist_union(cp_list, depends_list, &cp_list);
13159 SvREFCNT_dec(depends_list);
13162 cp_list = depends_list;
13166 /* If there is a swash and more than one element, we can't use the swash in
13167 * the optimization below. */
13168 if (swash && element_count > 1) {
13169 SvREFCNT_dec(swash);
13174 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13176 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13177 SvREFCNT_dec(listsv);
13180 /* av[0] stores the character class description in its textual form:
13181 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13182 * appropriate swash, and is also useful for dumping the regnode.
13183 * av[1] if NULL, is a placeholder to later contain the swash computed
13184 * from av[0]. But if no further computation need be done, the
13185 * swash is stored there now.
13186 * av[2] stores the cp_list inversion list for use in addition or
13187 * instead of av[0]; used only if av[1] is NULL
13188 * av[3] is set if any component of the class is from a user-defined
13189 * property; used only if av[1] is NULL */
13190 AV * const av = newAV();
13193 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13195 : (SvREFCNT_dec(listsv), &PL_sv_undef));
13197 av_store(av, 1, swash);
13198 SvREFCNT_dec(cp_list);
13201 av_store(av, 1, NULL);
13203 av_store(av, 2, cp_list);
13204 av_store(av, 3, newSVuv(has_user_defined_property));
13208 rv = newRV_noinc(MUTABLE_SV(av));
13209 n = add_data(pRExC_state, 1, "s");
13210 RExC_rxi->data->data[n] = (void*)rv;
13214 *flagp |= HASWIDTH|SIMPLE;
13217 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13220 /* reg_skipcomment()
13222 Absorbs an /x style # comments from the input stream.
13223 Returns true if there is more text remaining in the stream.
13224 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13225 terminates the pattern without including a newline.
13227 Note its the callers responsibility to ensure that we are
13228 actually in /x mode
13233 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13237 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13239 while (RExC_parse < RExC_end)
13240 if (*RExC_parse++ == '\n') {
13245 /* we ran off the end of the pattern without ending
13246 the comment, so we have to add an \n when wrapping */
13247 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13255 Advances the parse position, and optionally absorbs
13256 "whitespace" from the inputstream.
13258 Without /x "whitespace" means (?#...) style comments only,
13259 with /x this means (?#...) and # comments and whitespace proper.
13261 Returns the RExC_parse point from BEFORE the scan occurs.
13263 This is the /x friendly way of saying RExC_parse++.
13267 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13269 char* const retval = RExC_parse++;
13271 PERL_ARGS_ASSERT_NEXTCHAR;
13274 if (RExC_end - RExC_parse >= 3
13275 && *RExC_parse == '('
13276 && RExC_parse[1] == '?'
13277 && RExC_parse[2] == '#')
13279 while (*RExC_parse != ')') {
13280 if (RExC_parse == RExC_end)
13281 FAIL("Sequence (?#... not terminated");
13287 if (RExC_flags & RXf_PMf_EXTENDED) {
13288 if (isSPACE(*RExC_parse)) {
13292 else if (*RExC_parse == '#') {
13293 if ( reg_skipcomment( pRExC_state ) )
13302 - reg_node - emit a node
13304 STATIC regnode * /* Location. */
13305 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13309 regnode * const ret = RExC_emit;
13310 GET_RE_DEBUG_FLAGS_DECL;
13312 PERL_ARGS_ASSERT_REG_NODE;
13315 SIZE_ALIGN(RExC_size);
13319 if (RExC_emit >= RExC_emit_bound)
13320 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13321 op, RExC_emit, RExC_emit_bound);
13323 NODE_ALIGN_FILL(ret);
13325 FILL_ADVANCE_NODE(ptr, op);
13326 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13327 #ifdef RE_TRACK_PATTERN_OFFSETS
13328 if (RExC_offsets) { /* MJD */
13329 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13330 "reg_node", __LINE__,
13332 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13333 ? "Overwriting end of array!\n" : "OK",
13334 (UV)(RExC_emit - RExC_emit_start),
13335 (UV)(RExC_parse - RExC_start),
13336 (UV)RExC_offsets[0]));
13337 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13345 - reganode - emit a node with an argument
13347 STATIC regnode * /* Location. */
13348 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13352 regnode * const ret = RExC_emit;
13353 GET_RE_DEBUG_FLAGS_DECL;
13355 PERL_ARGS_ASSERT_REGANODE;
13358 SIZE_ALIGN(RExC_size);
13363 assert(2==regarglen[op]+1);
13365 Anything larger than this has to allocate the extra amount.
13366 If we changed this to be:
13368 RExC_size += (1 + regarglen[op]);
13370 then it wouldn't matter. Its not clear what side effect
13371 might come from that so its not done so far.
13376 if (RExC_emit >= RExC_emit_bound)
13377 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13378 op, RExC_emit, RExC_emit_bound);
13380 NODE_ALIGN_FILL(ret);
13382 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13383 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13384 #ifdef RE_TRACK_PATTERN_OFFSETS
13385 if (RExC_offsets) { /* MJD */
13386 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13390 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13391 "Overwriting end of array!\n" : "OK",
13392 (UV)(RExC_emit - RExC_emit_start),
13393 (UV)(RExC_parse - RExC_start),
13394 (UV)RExC_offsets[0]));
13395 Set_Cur_Node_Offset;
13403 - reguni - emit (if appropriate) a Unicode character
13406 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13410 PERL_ARGS_ASSERT_REGUNI;
13412 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13416 - reginsert - insert an operator in front of already-emitted operand
13418 * Means relocating the operand.
13421 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13427 const int offset = regarglen[(U8)op];
13428 const int size = NODE_STEP_REGNODE + offset;
13429 GET_RE_DEBUG_FLAGS_DECL;
13431 PERL_ARGS_ASSERT_REGINSERT;
13432 PERL_UNUSED_ARG(depth);
13433 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13434 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13443 if (RExC_open_parens) {
13445 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13446 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13447 if ( RExC_open_parens[paren] >= opnd ) {
13448 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13449 RExC_open_parens[paren] += size;
13451 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13453 if ( RExC_close_parens[paren] >= opnd ) {
13454 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13455 RExC_close_parens[paren] += size;
13457 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13462 while (src > opnd) {
13463 StructCopy(--src, --dst, regnode);
13464 #ifdef RE_TRACK_PATTERN_OFFSETS
13465 if (RExC_offsets) { /* MJD 20010112 */
13466 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13470 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13471 ? "Overwriting end of array!\n" : "OK",
13472 (UV)(src - RExC_emit_start),
13473 (UV)(dst - RExC_emit_start),
13474 (UV)RExC_offsets[0]));
13475 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13476 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13482 place = opnd; /* Op node, where operand used to be. */
13483 #ifdef RE_TRACK_PATTERN_OFFSETS
13484 if (RExC_offsets) { /* MJD */
13485 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13489 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13490 ? "Overwriting end of array!\n" : "OK",
13491 (UV)(place - RExC_emit_start),
13492 (UV)(RExC_parse - RExC_start),
13493 (UV)RExC_offsets[0]));
13494 Set_Node_Offset(place, RExC_parse);
13495 Set_Node_Length(place, 1);
13498 src = NEXTOPER(place);
13499 FILL_ADVANCE_NODE(place, op);
13500 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13501 Zero(src, offset, regnode);
13505 - regtail - set the next-pointer at the end of a node chain of p to val.
13506 - SEE ALSO: regtail_study
13508 /* TODO: All three parms should be const */
13510 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13514 GET_RE_DEBUG_FLAGS_DECL;
13516 PERL_ARGS_ASSERT_REGTAIL;
13518 PERL_UNUSED_ARG(depth);
13524 /* Find last node. */
13527 regnode * const temp = regnext(scan);
13529 SV * const mysv=sv_newmortal();
13530 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13531 regprop(RExC_rx, mysv, scan);
13532 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13533 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13534 (temp == NULL ? "->" : ""),
13535 (temp == NULL ? PL_reg_name[OP(val)] : "")
13543 if (reg_off_by_arg[OP(scan)]) {
13544 ARG_SET(scan, val - scan);
13547 NEXT_OFF(scan) = val - scan;
13553 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13554 - Look for optimizable sequences at the same time.
13555 - currently only looks for EXACT chains.
13557 This is experimental code. The idea is to use this routine to perform
13558 in place optimizations on branches and groups as they are constructed,
13559 with the long term intention of removing optimization from study_chunk so
13560 that it is purely analytical.
13562 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13563 to control which is which.
13566 /* TODO: All four parms should be const */
13569 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13574 #ifdef EXPERIMENTAL_INPLACESCAN
13577 GET_RE_DEBUG_FLAGS_DECL;
13579 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13585 /* Find last node. */
13589 regnode * const temp = regnext(scan);
13590 #ifdef EXPERIMENTAL_INPLACESCAN
13591 if (PL_regkind[OP(scan)] == EXACT) {
13592 bool has_exactf_sharp_s; /* Unexamined in this routine */
13593 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13598 switch (OP(scan)) {
13604 case EXACTFU_TRICKYFOLD:
13606 if( exact == PSEUDO )
13608 else if ( exact != OP(scan) )
13617 SV * const mysv=sv_newmortal();
13618 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13619 regprop(RExC_rx, mysv, scan);
13620 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13621 SvPV_nolen_const(mysv),
13622 REG_NODE_NUM(scan),
13623 PL_reg_name[exact]);
13630 SV * const mysv_val=sv_newmortal();
13631 DEBUG_PARSE_MSG("");
13632 regprop(RExC_rx, mysv_val, val);
13633 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13634 SvPV_nolen_const(mysv_val),
13635 (IV)REG_NODE_NUM(val),
13639 if (reg_off_by_arg[OP(scan)]) {
13640 ARG_SET(scan, val - scan);
13643 NEXT_OFF(scan) = val - scan;
13651 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13655 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13661 for (bit=0; bit<32; bit++) {
13662 if (flags & (1<<bit)) {
13663 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13666 if (!set++ && lead)
13667 PerlIO_printf(Perl_debug_log, "%s",lead);
13668 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13671 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13672 if (!set++ && lead) {
13673 PerlIO_printf(Perl_debug_log, "%s",lead);
13676 case REGEX_UNICODE_CHARSET:
13677 PerlIO_printf(Perl_debug_log, "UNICODE");
13679 case REGEX_LOCALE_CHARSET:
13680 PerlIO_printf(Perl_debug_log, "LOCALE");
13682 case REGEX_ASCII_RESTRICTED_CHARSET:
13683 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13685 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13686 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13689 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13695 PerlIO_printf(Perl_debug_log, "\n");
13697 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13703 Perl_regdump(pTHX_ const regexp *r)
13707 SV * const sv = sv_newmortal();
13708 SV *dsv= sv_newmortal();
13709 RXi_GET_DECL(r,ri);
13710 GET_RE_DEBUG_FLAGS_DECL;
13712 PERL_ARGS_ASSERT_REGDUMP;
13714 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13716 /* Header fields of interest. */
13717 if (r->anchored_substr) {
13718 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13719 RE_SV_DUMPLEN(r->anchored_substr), 30);
13720 PerlIO_printf(Perl_debug_log,
13721 "anchored %s%s at %"IVdf" ",
13722 s, RE_SV_TAIL(r->anchored_substr),
13723 (IV)r->anchored_offset);
13724 } else if (r->anchored_utf8) {
13725 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13726 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13727 PerlIO_printf(Perl_debug_log,
13728 "anchored utf8 %s%s at %"IVdf" ",
13729 s, RE_SV_TAIL(r->anchored_utf8),
13730 (IV)r->anchored_offset);
13732 if (r->float_substr) {
13733 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13734 RE_SV_DUMPLEN(r->float_substr), 30);
13735 PerlIO_printf(Perl_debug_log,
13736 "floating %s%s at %"IVdf"..%"UVuf" ",
13737 s, RE_SV_TAIL(r->float_substr),
13738 (IV)r->float_min_offset, (UV)r->float_max_offset);
13739 } else if (r->float_utf8) {
13740 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13741 RE_SV_DUMPLEN(r->float_utf8), 30);
13742 PerlIO_printf(Perl_debug_log,
13743 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13744 s, RE_SV_TAIL(r->float_utf8),
13745 (IV)r->float_min_offset, (UV)r->float_max_offset);
13747 if (r->check_substr || r->check_utf8)
13748 PerlIO_printf(Perl_debug_log,
13750 (r->check_substr == r->float_substr
13751 && r->check_utf8 == r->float_utf8
13752 ? "(checking floating" : "(checking anchored"));
13753 if (r->extflags & RXf_NOSCAN)
13754 PerlIO_printf(Perl_debug_log, " noscan");
13755 if (r->extflags & RXf_CHECK_ALL)
13756 PerlIO_printf(Perl_debug_log, " isall");
13757 if (r->check_substr || r->check_utf8)
13758 PerlIO_printf(Perl_debug_log, ") ");
13760 if (ri->regstclass) {
13761 regprop(r, sv, ri->regstclass);
13762 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13764 if (r->extflags & RXf_ANCH) {
13765 PerlIO_printf(Perl_debug_log, "anchored");
13766 if (r->extflags & RXf_ANCH_BOL)
13767 PerlIO_printf(Perl_debug_log, "(BOL)");
13768 if (r->extflags & RXf_ANCH_MBOL)
13769 PerlIO_printf(Perl_debug_log, "(MBOL)");
13770 if (r->extflags & RXf_ANCH_SBOL)
13771 PerlIO_printf(Perl_debug_log, "(SBOL)");
13772 if (r->extflags & RXf_ANCH_GPOS)
13773 PerlIO_printf(Perl_debug_log, "(GPOS)");
13774 PerlIO_putc(Perl_debug_log, ' ');
13776 if (r->extflags & RXf_GPOS_SEEN)
13777 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13778 if (r->intflags & PREGf_SKIP)
13779 PerlIO_printf(Perl_debug_log, "plus ");
13780 if (r->intflags & PREGf_IMPLICIT)
13781 PerlIO_printf(Perl_debug_log, "implicit ");
13782 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13783 if (r->extflags & RXf_EVAL_SEEN)
13784 PerlIO_printf(Perl_debug_log, "with eval ");
13785 PerlIO_printf(Perl_debug_log, "\n");
13786 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13788 PERL_ARGS_ASSERT_REGDUMP;
13789 PERL_UNUSED_CONTEXT;
13790 PERL_UNUSED_ARG(r);
13791 #endif /* DEBUGGING */
13795 - regprop - printable representation of opcode
13797 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13800 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13801 if (flags & ANYOF_INVERT) \
13802 /*make sure the invert info is in each */ \
13803 sv_catpvs(sv, "^"); \
13809 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13815 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13816 static const char * const anyofs[] = {
13848 RXi_GET_DECL(prog,progi);
13849 GET_RE_DEBUG_FLAGS_DECL;
13851 PERL_ARGS_ASSERT_REGPROP;
13855 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13856 /* It would be nice to FAIL() here, but this may be called from
13857 regexec.c, and it would be hard to supply pRExC_state. */
13858 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13859 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13861 k = PL_regkind[OP(o)];
13864 sv_catpvs(sv, " ");
13865 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13866 * is a crude hack but it may be the best for now since
13867 * we have no flag "this EXACTish node was UTF-8"
13869 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13870 PERL_PV_ESCAPE_UNI_DETECT |
13871 PERL_PV_ESCAPE_NONASCII |
13872 PERL_PV_PRETTY_ELLIPSES |
13873 PERL_PV_PRETTY_LTGT |
13874 PERL_PV_PRETTY_NOCLEAR
13876 } else if (k == TRIE) {
13877 /* print the details of the trie in dumpuntil instead, as
13878 * progi->data isn't available here */
13879 const char op = OP(o);
13880 const U32 n = ARG(o);
13881 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13882 (reg_ac_data *)progi->data->data[n] :
13884 const reg_trie_data * const trie
13885 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13887 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13888 DEBUG_TRIE_COMPILE_r(
13889 Perl_sv_catpvf(aTHX_ sv,
13890 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13891 (UV)trie->startstate,
13892 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13893 (UV)trie->wordcount,
13896 (UV)TRIE_CHARCOUNT(trie),
13897 (UV)trie->uniquecharcount
13900 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13902 int rangestart = -1;
13903 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13904 sv_catpvs(sv, "[");
13905 for (i = 0; i <= 256; i++) {
13906 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13907 if (rangestart == -1)
13909 } else if (rangestart != -1) {
13910 if (i <= rangestart + 3)
13911 for (; rangestart < i; rangestart++)
13912 put_byte(sv, rangestart);
13914 put_byte(sv, rangestart);
13915 sv_catpvs(sv, "-");
13916 put_byte(sv, i - 1);
13921 sv_catpvs(sv, "]");
13924 } else if (k == CURLY) {
13925 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13926 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13927 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13929 else if (k == WHILEM && o->flags) /* Ordinal/of */
13930 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13931 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13932 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13933 if ( RXp_PAREN_NAMES(prog) ) {
13934 if ( k != REF || (OP(o) < NREF)) {
13935 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13936 SV **name= av_fetch(list, ARG(o), 0 );
13938 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13941 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13942 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13943 I32 *nums=(I32*)SvPVX(sv_dat);
13944 SV **name= av_fetch(list, nums[0], 0 );
13947 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13948 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13949 (n ? "," : ""), (IV)nums[n]);
13951 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13955 } else if (k == GOSUB)
13956 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13957 else if (k == VERB) {
13959 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13960 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13961 } else if (k == LOGICAL)
13962 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13963 else if (k == ANYOF) {
13964 int i, rangestart = -1;
13965 const U8 flags = ANYOF_FLAGS(o);
13969 if (flags & ANYOF_LOCALE)
13970 sv_catpvs(sv, "{loc}");
13971 if (flags & ANYOF_LOC_FOLD)
13972 sv_catpvs(sv, "{i}");
13973 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13974 if (flags & ANYOF_INVERT)
13975 sv_catpvs(sv, "^");
13977 /* output what the standard cp 0-255 bitmap matches */
13978 for (i = 0; i <= 256; i++) {
13979 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13980 if (rangestart == -1)
13982 } else if (rangestart != -1) {
13983 if (i <= rangestart + 3)
13984 for (; rangestart < i; rangestart++)
13985 put_byte(sv, rangestart);
13987 put_byte(sv, rangestart);
13988 sv_catpvs(sv, "-");
13989 put_byte(sv, i - 1);
13996 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13997 /* output any special charclass tests (used entirely under use locale) */
13998 if (ANYOF_CLASS_TEST_ANY_SET(o))
13999 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14000 if (ANYOF_CLASS_TEST(o,i)) {
14001 sv_catpv(sv, anyofs[i]);
14005 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14007 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14008 sv_catpvs(sv, "{non-utf8-latin1-all}");
14011 /* output information about the unicode matching */
14012 if (flags & ANYOF_UNICODE_ALL)
14013 sv_catpvs(sv, "{unicode_all}");
14014 else if (ANYOF_NONBITMAP(o))
14015 sv_catpvs(sv, "{unicode}");
14016 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14017 sv_catpvs(sv, "{outside bitmap}");
14019 if (ANYOF_NONBITMAP(o)) {
14020 SV *lv; /* Set if there is something outside the bit map */
14021 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14022 bool byte_output = FALSE; /* If something in the bitmap has been
14025 if (lv && lv != &PL_sv_undef) {
14027 U8 s[UTF8_MAXBYTES_CASE+1];
14029 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14030 uvchr_to_utf8(s, i);
14033 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14037 && swash_fetch(sw, s, TRUE))
14039 if (rangestart == -1)
14041 } else if (rangestart != -1) {
14042 byte_output = TRUE;
14043 if (i <= rangestart + 3)
14044 for (; rangestart < i; rangestart++) {
14045 put_byte(sv, rangestart);
14048 put_byte(sv, rangestart);
14049 sv_catpvs(sv, "-");
14058 char *s = savesvpv(lv);
14059 char * const origs = s;
14061 while (*s && *s != '\n')
14065 const char * const t = ++s;
14068 sv_catpvs(sv, " ");
14074 /* Truncate very long output */
14075 if (s - origs > 256) {
14076 Perl_sv_catpvf(aTHX_ sv,
14078 (int) (s - origs - 1),
14084 else if (*s == '\t') {
14103 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14105 else if (k == POSIXD || k == NPOSIXD) {
14106 U8 index = FLAGS(o) * 2;
14107 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14108 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14111 sv_catpv(sv, anyofs[index]);
14114 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14115 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14117 PERL_UNUSED_CONTEXT;
14118 PERL_UNUSED_ARG(sv);
14119 PERL_UNUSED_ARG(o);
14120 PERL_UNUSED_ARG(prog);
14121 #endif /* DEBUGGING */
14125 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14126 { /* Assume that RE_INTUIT is set */
14128 struct regexp *const prog = ReANY(r);
14129 GET_RE_DEBUG_FLAGS_DECL;
14131 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14132 PERL_UNUSED_CONTEXT;
14136 const char * const s = SvPV_nolen_const(prog->check_substr
14137 ? prog->check_substr : prog->check_utf8);
14139 if (!PL_colorset) reginitcolors();
14140 PerlIO_printf(Perl_debug_log,
14141 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14143 prog->check_substr ? "" : "utf8 ",
14144 PL_colors[5],PL_colors[0],
14147 (strlen(s) > 60 ? "..." : ""));
14150 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14156 handles refcounting and freeing the perl core regexp structure. When
14157 it is necessary to actually free the structure the first thing it
14158 does is call the 'free' method of the regexp_engine associated to
14159 the regexp, allowing the handling of the void *pprivate; member
14160 first. (This routine is not overridable by extensions, which is why
14161 the extensions free is called first.)
14163 See regdupe and regdupe_internal if you change anything here.
14165 #ifndef PERL_IN_XSUB_RE
14167 Perl_pregfree(pTHX_ REGEXP *r)
14173 Perl_pregfree2(pTHX_ REGEXP *rx)
14176 struct regexp *const r = ReANY(rx);
14177 GET_RE_DEBUG_FLAGS_DECL;
14179 PERL_ARGS_ASSERT_PREGFREE2;
14181 if (r->mother_re) {
14182 ReREFCNT_dec(r->mother_re);
14184 CALLREGFREE_PVT(rx); /* free the private data */
14185 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14186 Safefree(r->xpv_len_u.xpvlenu_pv);
14189 SvREFCNT_dec(r->anchored_substr);
14190 SvREFCNT_dec(r->anchored_utf8);
14191 SvREFCNT_dec(r->float_substr);
14192 SvREFCNT_dec(r->float_utf8);
14193 Safefree(r->substrs);
14195 RX_MATCH_COPY_FREE(rx);
14196 #ifdef PERL_OLD_COPY_ON_WRITE
14197 SvREFCNT_dec(r->saved_copy);
14200 SvREFCNT_dec(r->qr_anoncv);
14201 rx->sv_u.svu_rx = 0;
14206 This is a hacky workaround to the structural issue of match results
14207 being stored in the regexp structure which is in turn stored in
14208 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14209 could be PL_curpm in multiple contexts, and could require multiple
14210 result sets being associated with the pattern simultaneously, such
14211 as when doing a recursive match with (??{$qr})
14213 The solution is to make a lightweight copy of the regexp structure
14214 when a qr// is returned from the code executed by (??{$qr}) this
14215 lightweight copy doesn't actually own any of its data except for
14216 the starp/end and the actual regexp structure itself.
14222 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14224 struct regexp *ret;
14225 struct regexp *const r = ReANY(rx);
14226 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14228 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14231 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14233 SvOK_off((SV *)ret_x);
14235 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14236 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14237 made both spots point to the same regexp body.) */
14238 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14239 assert(!SvPVX(ret_x));
14240 ret_x->sv_u.svu_rx = temp->sv_any;
14241 temp->sv_any = NULL;
14242 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14243 SvREFCNT_dec(temp);
14244 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14245 ing below will not set it. */
14246 SvCUR_set(ret_x, SvCUR(rx));
14249 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14250 sv_force_normal(sv) is called. */
14252 ret = ReANY(ret_x);
14254 SvFLAGS(ret_x) |= SvUTF8(rx);
14255 /* We share the same string buffer as the original regexp, on which we
14256 hold a reference count, incremented when mother_re is set below.
14257 The string pointer is copied here, being part of the regexp struct.
14259 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14260 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14262 const I32 npar = r->nparens+1;
14263 Newx(ret->offs, npar, regexp_paren_pair);
14264 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14267 Newx(ret->substrs, 1, struct reg_substr_data);
14268 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14270 SvREFCNT_inc_void(ret->anchored_substr);
14271 SvREFCNT_inc_void(ret->anchored_utf8);
14272 SvREFCNT_inc_void(ret->float_substr);
14273 SvREFCNT_inc_void(ret->float_utf8);
14275 /* check_substr and check_utf8, if non-NULL, point to either their
14276 anchored or float namesakes, and don't hold a second reference. */
14278 RX_MATCH_COPIED_off(ret_x);
14279 #ifdef PERL_OLD_COPY_ON_WRITE
14280 ret->saved_copy = NULL;
14282 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14283 SvREFCNT_inc_void(ret->qr_anoncv);
14289 /* regfree_internal()
14291 Free the private data in a regexp. This is overloadable by
14292 extensions. Perl takes care of the regexp structure in pregfree(),
14293 this covers the *pprivate pointer which technically perl doesn't
14294 know about, however of course we have to handle the
14295 regexp_internal structure when no extension is in use.
14297 Note this is called before freeing anything in the regexp
14302 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14305 struct regexp *const r = ReANY(rx);
14306 RXi_GET_DECL(r,ri);
14307 GET_RE_DEBUG_FLAGS_DECL;
14309 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14315 SV *dsv= sv_newmortal();
14316 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14317 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14318 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14319 PL_colors[4],PL_colors[5],s);
14322 #ifdef RE_TRACK_PATTERN_OFFSETS
14324 Safefree(ri->u.offsets); /* 20010421 MJD */
14326 if (ri->code_blocks) {
14328 for (n = 0; n < ri->num_code_blocks; n++)
14329 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14330 Safefree(ri->code_blocks);
14334 int n = ri->data->count;
14337 /* If you add a ->what type here, update the comment in regcomp.h */
14338 switch (ri->data->what[n]) {
14344 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14347 Safefree(ri->data->data[n]);
14353 { /* Aho Corasick add-on structure for a trie node.
14354 Used in stclass optimization only */
14356 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14358 refcount = --aho->refcount;
14361 PerlMemShared_free(aho->states);
14362 PerlMemShared_free(aho->fail);
14363 /* do this last!!!! */
14364 PerlMemShared_free(ri->data->data[n]);
14365 PerlMemShared_free(ri->regstclass);
14371 /* trie structure. */
14373 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14375 refcount = --trie->refcount;
14378 PerlMemShared_free(trie->charmap);
14379 PerlMemShared_free(trie->states);
14380 PerlMemShared_free(trie->trans);
14382 PerlMemShared_free(trie->bitmap);
14384 PerlMemShared_free(trie->jump);
14385 PerlMemShared_free(trie->wordinfo);
14386 /* do this last!!!! */
14387 PerlMemShared_free(ri->data->data[n]);
14392 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14395 Safefree(ri->data->what);
14396 Safefree(ri->data);
14402 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14403 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14404 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14407 re_dup - duplicate a regexp.
14409 This routine is expected to clone a given regexp structure. It is only
14410 compiled under USE_ITHREADS.
14412 After all of the core data stored in struct regexp is duplicated
14413 the regexp_engine.dupe method is used to copy any private data
14414 stored in the *pprivate pointer. This allows extensions to handle
14415 any duplication it needs to do.
14417 See pregfree() and regfree_internal() if you change anything here.
14419 #if defined(USE_ITHREADS)
14420 #ifndef PERL_IN_XSUB_RE
14422 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14426 const struct regexp *r = ReANY(sstr);
14427 struct regexp *ret = ReANY(dstr);
14429 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14431 npar = r->nparens+1;
14432 Newx(ret->offs, npar, regexp_paren_pair);
14433 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14435 /* no need to copy these */
14436 Newx(ret->swap, npar, regexp_paren_pair);
14439 if (ret->substrs) {
14440 /* Do it this way to avoid reading from *r after the StructCopy().
14441 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14442 cache, it doesn't matter. */
14443 const bool anchored = r->check_substr
14444 ? r->check_substr == r->anchored_substr
14445 : r->check_utf8 == r->anchored_utf8;
14446 Newx(ret->substrs, 1, struct reg_substr_data);
14447 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14449 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14450 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14451 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14452 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14454 /* check_substr and check_utf8, if non-NULL, point to either their
14455 anchored or float namesakes, and don't hold a second reference. */
14457 if (ret->check_substr) {
14459 assert(r->check_utf8 == r->anchored_utf8);
14460 ret->check_substr = ret->anchored_substr;
14461 ret->check_utf8 = ret->anchored_utf8;
14463 assert(r->check_substr == r->float_substr);
14464 assert(r->check_utf8 == r->float_utf8);
14465 ret->check_substr = ret->float_substr;
14466 ret->check_utf8 = ret->float_utf8;
14468 } else if (ret->check_utf8) {
14470 ret->check_utf8 = ret->anchored_utf8;
14472 ret->check_utf8 = ret->float_utf8;
14477 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14478 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14481 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14483 if (RX_MATCH_COPIED(dstr))
14484 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14486 ret->subbeg = NULL;
14487 #ifdef PERL_OLD_COPY_ON_WRITE
14488 ret->saved_copy = NULL;
14491 /* Whether mother_re be set or no, we need to copy the string. We
14492 cannot refrain from copying it when the storage points directly to
14493 our mother regexp, because that's
14494 1: a buffer in a different thread
14495 2: something we no longer hold a reference on
14496 so we need to copy it locally. */
14497 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14498 ret->mother_re = NULL;
14501 #endif /* PERL_IN_XSUB_RE */
14506 This is the internal complement to regdupe() which is used to copy
14507 the structure pointed to by the *pprivate pointer in the regexp.
14508 This is the core version of the extension overridable cloning hook.
14509 The regexp structure being duplicated will be copied by perl prior
14510 to this and will be provided as the regexp *r argument, however
14511 with the /old/ structures pprivate pointer value. Thus this routine
14512 may override any copying normally done by perl.
14514 It returns a pointer to the new regexp_internal structure.
14518 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14521 struct regexp *const r = ReANY(rx);
14522 regexp_internal *reti;
14524 RXi_GET_DECL(r,ri);
14526 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14530 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14531 Copy(ri->program, reti->program, len+1, regnode);
14533 reti->num_code_blocks = ri->num_code_blocks;
14534 if (ri->code_blocks) {
14536 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14537 struct reg_code_block);
14538 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14539 struct reg_code_block);
14540 for (n = 0; n < ri->num_code_blocks; n++)
14541 reti->code_blocks[n].src_regex = (REGEXP*)
14542 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14545 reti->code_blocks = NULL;
14547 reti->regstclass = NULL;
14550 struct reg_data *d;
14551 const int count = ri->data->count;
14554 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14555 char, struct reg_data);
14556 Newx(d->what, count, U8);
14559 for (i = 0; i < count; i++) {
14560 d->what[i] = ri->data->what[i];
14561 switch (d->what[i]) {
14562 /* see also regcomp.h and regfree_internal() */
14563 case 'a': /* actually an AV, but the dup function is identical. */
14567 case 'u': /* actually an HV, but the dup function is identical. */
14568 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14571 /* This is cheating. */
14572 Newx(d->data[i], 1, struct regnode_charclass_class);
14573 StructCopy(ri->data->data[i], d->data[i],
14574 struct regnode_charclass_class);
14575 reti->regstclass = (regnode*)d->data[i];
14578 /* Trie stclasses are readonly and can thus be shared
14579 * without duplication. We free the stclass in pregfree
14580 * when the corresponding reg_ac_data struct is freed.
14582 reti->regstclass= ri->regstclass;
14586 ((reg_trie_data*)ri->data->data[i])->refcount++;
14591 d->data[i] = ri->data->data[i];
14594 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14603 reti->name_list_idx = ri->name_list_idx;
14605 #ifdef RE_TRACK_PATTERN_OFFSETS
14606 if (ri->u.offsets) {
14607 Newx(reti->u.offsets, 2*len+1, U32);
14608 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14611 SetProgLen(reti,len);
14614 return (void*)reti;
14617 #endif /* USE_ITHREADS */
14619 #ifndef PERL_IN_XSUB_RE
14622 - regnext - dig the "next" pointer out of a node
14625 Perl_regnext(pTHX_ register regnode *p)
14633 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14634 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14637 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14646 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14649 STRLEN l1 = strlen(pat1);
14650 STRLEN l2 = strlen(pat2);
14653 const char *message;
14655 PERL_ARGS_ASSERT_RE_CROAK2;
14661 Copy(pat1, buf, l1 , char);
14662 Copy(pat2, buf + l1, l2 , char);
14663 buf[l1 + l2] = '\n';
14664 buf[l1 + l2 + 1] = '\0';
14666 /* ANSI variant takes additional second argument */
14667 va_start(args, pat2);
14671 msv = vmess(buf, &args);
14673 message = SvPV_const(msv,l1);
14676 Copy(message, buf, l1 , char);
14677 buf[l1-1] = '\0'; /* Overwrite \n */
14678 Perl_croak(aTHX_ "%s", buf);
14681 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14683 #ifndef PERL_IN_XSUB_RE
14685 Perl_save_re_context(pTHX)
14689 struct re_save_state *state;
14691 SAVEVPTR(PL_curcop);
14692 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14694 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14695 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14696 SSPUSHUV(SAVEt_RE_STATE);
14698 Copy(&PL_reg_state, state, 1, struct re_save_state);
14700 PL_reg_oldsaved = NULL;
14701 PL_reg_oldsavedlen = 0;
14702 PL_reg_oldsavedoffset = 0;
14703 PL_reg_oldsavedcoffset = 0;
14704 PL_reg_maxiter = 0;
14705 PL_reg_leftiter = 0;
14706 PL_reg_poscache = NULL;
14707 PL_reg_poscache_size = 0;
14708 #ifdef PERL_OLD_COPY_ON_WRITE
14712 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14714 const REGEXP * const rx = PM_GETRE(PL_curpm);
14717 for (i = 1; i <= RX_NPARENS(rx); i++) {
14718 char digits[TYPE_CHARS(long)];
14719 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14720 GV *const *const gvp
14721 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14724 GV * const gv = *gvp;
14725 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14735 clear_re(pTHX_ void *r)
14738 ReREFCNT_dec((REGEXP *)r);
14744 S_put_byte(pTHX_ SV *sv, int c)
14746 PERL_ARGS_ASSERT_PUT_BYTE;
14748 /* Our definition of isPRINT() ignores locales, so only bytes that are
14749 not part of UTF-8 are considered printable. I assume that the same
14750 holds for UTF-EBCDIC.
14751 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14752 which Wikipedia says:
14754 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14755 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14756 identical, to the ASCII delete (DEL) or rubout control character.
14757 ) So the old condition can be simplified to !isPRINT(c) */
14760 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14763 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14767 const char string = c;
14768 if (c == '-' || c == ']' || c == '\\' || c == '^')
14769 sv_catpvs(sv, "\\");
14770 sv_catpvn(sv, &string, 1);
14775 #define CLEAR_OPTSTART \
14776 if (optstart) STMT_START { \
14777 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14781 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14783 STATIC const regnode *
14784 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14785 const regnode *last, const regnode *plast,
14786 SV* sv, I32 indent, U32 depth)
14789 U8 op = PSEUDO; /* Arbitrary non-END op. */
14790 const regnode *next;
14791 const regnode *optstart= NULL;
14793 RXi_GET_DECL(r,ri);
14794 GET_RE_DEBUG_FLAGS_DECL;
14796 PERL_ARGS_ASSERT_DUMPUNTIL;
14798 #ifdef DEBUG_DUMPUNTIL
14799 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14800 last ? last-start : 0,plast ? plast-start : 0);
14803 if (plast && plast < last)
14806 while (PL_regkind[op] != END && (!last || node < last)) {
14807 /* While that wasn't END last time... */
14810 if (op == CLOSE || op == WHILEM)
14812 next = regnext((regnode *)node);
14815 if (OP(node) == OPTIMIZED) {
14816 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14823 regprop(r, sv, node);
14824 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14825 (int)(2*indent + 1), "", SvPVX_const(sv));
14827 if (OP(node) != OPTIMIZED) {
14828 if (next == NULL) /* Next ptr. */
14829 PerlIO_printf(Perl_debug_log, " (0)");
14830 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14831 PerlIO_printf(Perl_debug_log, " (FAIL)");
14833 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14834 (void)PerlIO_putc(Perl_debug_log, '\n');
14838 if (PL_regkind[(U8)op] == BRANCHJ) {
14841 const regnode *nnode = (OP(next) == LONGJMP
14842 ? regnext((regnode *)next)
14844 if (last && nnode > last)
14846 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14849 else if (PL_regkind[(U8)op] == BRANCH) {
14851 DUMPUNTIL(NEXTOPER(node), next);
14853 else if ( PL_regkind[(U8)op] == TRIE ) {
14854 const regnode *this_trie = node;
14855 const char op = OP(node);
14856 const U32 n = ARG(node);
14857 const reg_ac_data * const ac = op>=AHOCORASICK ?
14858 (reg_ac_data *)ri->data->data[n] :
14860 const reg_trie_data * const trie =
14861 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14863 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14865 const regnode *nextbranch= NULL;
14868 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14869 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14871 PerlIO_printf(Perl_debug_log, "%*s%s ",
14872 (int)(2*(indent+3)), "",
14873 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14874 PL_colors[0], PL_colors[1],
14875 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14876 PERL_PV_PRETTY_ELLIPSES |
14877 PERL_PV_PRETTY_LTGT
14882 U16 dist= trie->jump[word_idx+1];
14883 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14884 (UV)((dist ? this_trie + dist : next) - start));
14887 nextbranch= this_trie + trie->jump[0];
14888 DUMPUNTIL(this_trie + dist, nextbranch);
14890 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14891 nextbranch= regnext((regnode *)nextbranch);
14893 PerlIO_printf(Perl_debug_log, "\n");
14896 if (last && next > last)
14901 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14902 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14903 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14905 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14907 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14909 else if ( op == PLUS || op == STAR) {
14910 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14912 else if (PL_regkind[(U8)op] == ANYOF) {
14913 /* arglen 1 + class block */
14914 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14915 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14916 node = NEXTOPER(node);
14918 else if (PL_regkind[(U8)op] == EXACT) {
14919 /* Literal string, where present. */
14920 node += NODE_SZ_STR(node) - 1;
14921 node = NEXTOPER(node);
14924 node = NEXTOPER(node);
14925 node += regarglen[(U8)op];
14927 if (op == CURLYX || op == OPEN)
14931 #ifdef DEBUG_DUMPUNTIL
14932 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14937 #endif /* DEBUGGING */
14941 * c-indentation-style: bsd
14942 * c-basic-offset: 4
14943 * indent-tabs-mode: nil
14946 * ex: set ts=8 sts=4 sw=4 et: