5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
122 typedef struct RExC_state_t {
123 U32 flags; /* RXf_* are we folding, multilining? */
124 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
125 char *precomp; /* uncompiled string. */
126 REGEXP *rx_sv; /* The SV that is the regexp. */
127 regexp *rx; /* perl core regexp structure */
128 regexp_internal *rxi; /* internal data for regexp object pprivate field */
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 I32 whilem_seen; /* number of WHILEM in this expr */
133 regnode *emit_start; /* Start of emitted-code area */
134 regnode *emit_bound; /* First regnode outside of the allocated space */
135 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
136 I32 naughty; /* How bad is this pattern? */
137 I32 sawback; /* Did we see \1, ...? */
139 I32 size; /* Code size. */
140 I32 npar; /* Capture buffer count, (OPEN). */
141 I32 cpar; /* Capture buffer count, (CLOSE). */
142 I32 nestroot; /* root parens we are in - used by accept */
145 regnode **open_parens; /* pointers to open parens */
146 regnode **close_parens; /* pointers to close parens */
147 regnode *opend; /* END node in program */
148 I32 utf8; /* whether the pattern is utf8 or not */
149 I32 orig_utf8; /* whether the pattern was originally in utf8 */
150 /* XXX use this for future optimisation of case
151 * where pattern must be upgraded to utf8. */
152 I32 uni_semantics; /* If a d charset modifier should use unicode
153 rules, even if the pattern is not in
155 HV *paren_names; /* Paren names */
157 regnode **recurse; /* Recurse regops */
158 I32 recurse_count; /* Number of recurse regops */
161 I32 override_recoding;
162 I32 in_multi_char_class;
163 struct reg_code_block *code_blocks; /* positions of literal (?{})
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
168 char *starttry; /* -Dr: where regtry was called. */
169 #define RExC_starttry (pRExC_state->starttry)
171 SV *runtime_code_qr; /* qr with the runtime code blocks */
173 const char *lastparse;
175 AV *paren_name_list; /* idx -> name */
176 #define RExC_lastparse (pRExC_state->lastparse)
177 #define RExC_lastnum (pRExC_state->lastnum)
178 #define RExC_paren_name_list (pRExC_state->paren_name_list)
182 #define RExC_flags (pRExC_state->flags)
183 #define RExC_pm_flags (pRExC_state->pm_flags)
184 #define RExC_precomp (pRExC_state->precomp)
185 #define RExC_rx_sv (pRExC_state->rx_sv)
186 #define RExC_rx (pRExC_state->rx)
187 #define RExC_rxi (pRExC_state->rxi)
188 #define RExC_start (pRExC_state->start)
189 #define RExC_end (pRExC_state->end)
190 #define RExC_parse (pRExC_state->parse)
191 #define RExC_whilem_seen (pRExC_state->whilem_seen)
192 #ifdef RE_TRACK_PATTERN_OFFSETS
193 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
195 #define RExC_emit (pRExC_state->emit)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_npar (pRExC_state->npar)
203 #define RExC_nestroot (pRExC_state->nestroot)
204 #define RExC_extralen (pRExC_state->extralen)
205 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
206 #define RExC_utf8 (pRExC_state->utf8)
207 #define RExC_uni_semantics (pRExC_state->uni_semantics)
208 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
209 #define RExC_open_parens (pRExC_state->open_parens)
210 #define RExC_close_parens (pRExC_state->close_parens)
211 #define RExC_opend (pRExC_state->opend)
212 #define RExC_paren_names (pRExC_state->paren_names)
213 #define RExC_recurse (pRExC_state->recurse)
214 #define RExC_recurse_count (pRExC_state->recurse_count)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_override_recoding (pRExC_state->override_recoding)
218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
221 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
222 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
223 ((*s) == '{' && regcurly(s)))
226 #undef SPSTART /* dratted cpp namespace... */
229 * Flags to be passed up and down.
231 #define WORST 0 /* Worst case. */
232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
234 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
235 * character. Note that this is not the same thing as REGNODE_SIMPLE */
237 #define SPSTART 0x04 /* Starts with * or +. */
238 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
239 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
243 /* whether trie related optimizations are enabled */
244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
245 #define TRIE_STUDY_OPT
246 #define FULL_TRIE_STUDY
252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
253 #define PBITVAL(paren) (1 << ((paren) & 7))
254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
258 /* If not already in utf8, do a longjmp back to the beginning */
259 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
260 #define REQUIRE_UTF8 STMT_START { \
261 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
264 /* About scan_data_t.
266 During optimisation we recurse through the regexp program performing
267 various inplace (keyhole style) optimisations. In addition study_chunk
268 and scan_commit populate this data structure with information about
269 what strings MUST appear in the pattern. We look for the longest
270 string that must appear at a fixed location, and we look for the
271 longest string that may appear at a floating location. So for instance
276 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
277 strings (because they follow a .* construct). study_chunk will identify
278 both FOO and BAR as being the longest fixed and floating strings respectively.
280 The strings can be composites, for instance
284 will result in a composite fixed substring 'foo'.
286 For each string some basic information is maintained:
288 - offset or min_offset
289 This is the position the string must appear at, or not before.
290 It also implicitly (when combined with minlenp) tells us how many
291 characters must match before the string we are searching for.
292 Likewise when combined with minlenp and the length of the string it
293 tells us how many characters must appear after the string we have
297 Only used for floating strings. This is the rightmost point that
298 the string can appear at. If set to I32 max it indicates that the
299 string can occur infinitely far to the right.
302 A pointer to the minimum number of characters of the pattern that the
303 string was found inside. This is important as in the case of positive
304 lookahead or positive lookbehind we can have multiple patterns
309 The minimum length of the pattern overall is 3, the minimum length
310 of the lookahead part is 3, but the minimum length of the part that
311 will actually match is 1. So 'FOO's minimum length is 3, but the
312 minimum length for the F is 1. This is important as the minimum length
313 is used to determine offsets in front of and behind the string being
314 looked for. Since strings can be composites this is the length of the
315 pattern at the time it was committed with a scan_commit. Note that
316 the length is calculated by study_chunk, so that the minimum lengths
317 are not known until the full pattern has been compiled, thus the
318 pointer to the value.
322 In the case of lookbehind the string being searched for can be
323 offset past the start point of the final matching string.
324 If this value was just blithely removed from the min_offset it would
325 invalidate some of the calculations for how many chars must match
326 before or after (as they are derived from min_offset and minlen and
327 the length of the string being searched for).
328 When the final pattern is compiled and the data is moved from the
329 scan_data_t structure into the regexp structure the information
330 about lookbehind is factored in, with the information that would
331 have been lost precalculated in the end_shift field for the
334 The fields pos_min and pos_delta are used to store the minimum offset
335 and the delta to the maximum offset at the current point in the pattern.
339 typedef struct scan_data_t {
340 /*I32 len_min; unused */
341 /*I32 len_delta; unused */
345 I32 last_end; /* min value, <0 unless valid. */
348 SV **longest; /* Either &l_fixed, or &l_float. */
349 SV *longest_fixed; /* longest fixed string found in pattern */
350 I32 offset_fixed; /* offset where it starts */
351 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
352 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
353 SV *longest_float; /* longest floating string found in pattern */
354 I32 offset_float_min; /* earliest point in string it can appear */
355 I32 offset_float_max; /* latest point in string it can appear */
356 I32 *minlen_float; /* pointer to the minlen relevant to the string */
357 I32 lookbehind_float; /* is the position of the string modified by LB */
361 struct regnode_charclass_class *start_class;
365 * Forward declarations for pregcomp()'s friends.
368 static const scan_data_t zero_scan_data =
369 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
371 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
372 #define SF_BEFORE_SEOL 0x0001
373 #define SF_BEFORE_MEOL 0x0002
374 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
375 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
378 # define SF_FIX_SHIFT_EOL (0+2)
379 # define SF_FL_SHIFT_EOL (0+4)
381 # define SF_FIX_SHIFT_EOL (+2)
382 # define SF_FL_SHIFT_EOL (+4)
385 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
386 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
388 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
389 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
390 #define SF_IS_INF 0x0040
391 #define SF_HAS_PAR 0x0080
392 #define SF_IN_PAR 0x0100
393 #define SF_HAS_EVAL 0x0200
394 #define SCF_DO_SUBSTR 0x0400
395 #define SCF_DO_STCLASS_AND 0x0800
396 #define SCF_DO_STCLASS_OR 0x1000
397 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
398 #define SCF_WHILEM_VISITED_POS 0x2000
400 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
401 #define SCF_SEEN_ACCEPT 0x8000
403 #define UTF cBOOL(RExC_utf8)
405 /* The enums for all these are ordered so things work out correctly */
406 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
407 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
408 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
409 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
410 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
411 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
412 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
414 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
416 #define OOB_NAMEDCLASS -1
418 /* There is no code point that is out-of-bounds, so this is problematic. But
419 * its only current use is to initialize a variable that is always set before
421 #define OOB_UNICODE 0xDEADBEEF
423 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
424 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
427 /* length of regex to show in messages that don't mark a position within */
428 #define RegexLengthToShowInErrorMessages 127
431 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
432 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
433 * op/pragma/warn/regcomp.
435 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
436 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
438 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
441 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
442 * arg. Show regex, up to a maximum length. If it's too long, chop and add
445 #define _FAIL(code) STMT_START { \
446 const char *ellipses = ""; \
447 IV len = RExC_end - RExC_precomp; \
450 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
451 if (len > RegexLengthToShowInErrorMessages) { \
452 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
453 len = RegexLengthToShowInErrorMessages - 10; \
459 #define FAIL(msg) _FAIL( \
460 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
461 msg, (int)len, RExC_precomp, ellipses))
463 #define FAIL2(msg,arg) _FAIL( \
464 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
465 arg, (int)len, RExC_precomp, ellipses))
468 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
470 #define Simple_vFAIL(m) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
473 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
477 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
479 #define vFAIL(m) STMT_START { \
481 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
486 * Like Simple_vFAIL(), but accepts two arguments.
488 #define Simple_vFAIL2(m,a1) STMT_START { \
489 const IV offset = RExC_parse - RExC_precomp; \
490 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
491 (int)offset, RExC_precomp, RExC_precomp + offset); \
495 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
497 #define vFAIL2(m,a1) STMT_START { \
499 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
500 Simple_vFAIL2(m, a1); \
505 * Like Simple_vFAIL(), but accepts three arguments.
507 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
508 const IV offset = RExC_parse - RExC_precomp; \
509 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
510 (int)offset, RExC_precomp, RExC_precomp + offset); \
514 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
516 #define vFAIL3(m,a1,a2) STMT_START { \
518 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
519 Simple_vFAIL3(m, a1, a2); \
523 * Like Simple_vFAIL(), but accepts four arguments.
525 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
526 const IV offset = RExC_parse - RExC_precomp; \
527 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
528 (int)offset, RExC_precomp, RExC_precomp + offset); \
531 #define ckWARNreg(loc,m) STMT_START { \
532 const IV offset = loc - RExC_precomp; \
533 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
534 (int)offset, RExC_precomp, RExC_precomp + offset); \
537 #define ckWARNregdep(loc,m) STMT_START { \
538 const IV offset = loc - RExC_precomp; \
539 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
541 (int)offset, RExC_precomp, RExC_precomp + offset); \
544 #define ckWARN2regdep(loc,m, a1) STMT_START { \
545 const IV offset = loc - RExC_precomp; \
546 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
548 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
551 #define ckWARN2reg(loc, m, a1) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
554 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
557 #define vWARN3(loc, m, a1, a2) STMT_START { \
558 const IV offset = loc - RExC_precomp; \
559 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
560 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
563 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
564 const IV offset = loc - RExC_precomp; \
565 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
566 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
569 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
570 const IV offset = loc - RExC_precomp; \
571 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
572 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
575 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
578 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
581 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
582 const IV offset = loc - RExC_precomp; \
583 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
584 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
588 /* Allow for side effects in s */
589 #define REGC(c,s) STMT_START { \
590 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
593 /* Macros for recording node offsets. 20001227 mjd@plover.com
594 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
595 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
596 * Element 0 holds the number n.
597 * Position is 1 indexed.
599 #ifndef RE_TRACK_PATTERN_OFFSETS
600 #define Set_Node_Offset_To_R(node,byte)
601 #define Set_Node_Offset(node,byte)
602 #define Set_Cur_Node_Offset
603 #define Set_Node_Length_To_R(node,len)
604 #define Set_Node_Length(node,len)
605 #define Set_Node_Cur_Length(node)
606 #define Node_Offset(n)
607 #define Node_Length(n)
608 #define Set_Node_Offset_Length(node,offset,len)
609 #define ProgLen(ri) ri->u.proglen
610 #define SetProgLen(ri,x) ri->u.proglen = x
612 #define ProgLen(ri) ri->u.offsets[0]
613 #define SetProgLen(ri,x) ri->u.offsets[0] = x
614 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
616 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
617 __LINE__, (int)(node), (int)(byte))); \
619 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
621 RExC_offsets[2*(node)-1] = (byte); \
626 #define Set_Node_Offset(node,byte) \
627 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
628 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
630 #define Set_Node_Length_To_R(node,len) STMT_START { \
632 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
633 __LINE__, (int)(node), (int)(len))); \
635 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
637 RExC_offsets[2*(node)] = (len); \
642 #define Set_Node_Length(node,len) \
643 Set_Node_Length_To_R((node)-RExC_emit_start, len)
644 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
645 #define Set_Node_Cur_Length(node) \
646 Set_Node_Length(node, RExC_parse - parse_start)
648 /* Get offsets and lengths */
649 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
650 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
652 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
653 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
654 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
658 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
659 #define EXPERIMENTAL_INPLACESCAN
660 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
662 #define DEBUG_STUDYDATA(str,data,depth) \
663 DEBUG_OPTIMISE_MORE_r(if(data){ \
664 PerlIO_printf(Perl_debug_log, \
665 "%*s" str "Pos:%"IVdf"/%"IVdf \
666 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
667 (int)(depth)*2, "", \
668 (IV)((data)->pos_min), \
669 (IV)((data)->pos_delta), \
670 (UV)((data)->flags), \
671 (IV)((data)->whilem_c), \
672 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
673 is_inf ? "INF " : "" \
675 if ((data)->last_found) \
676 PerlIO_printf(Perl_debug_log, \
677 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
678 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
679 SvPVX_const((data)->last_found), \
680 (IV)((data)->last_end), \
681 (IV)((data)->last_start_min), \
682 (IV)((data)->last_start_max), \
683 ((data)->longest && \
684 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
685 SvPVX_const((data)->longest_fixed), \
686 (IV)((data)->offset_fixed), \
687 ((data)->longest && \
688 (data)->longest==&((data)->longest_float)) ? "*" : "", \
689 SvPVX_const((data)->longest_float), \
690 (IV)((data)->offset_float_min), \
691 (IV)((data)->offset_float_max) \
693 PerlIO_printf(Perl_debug_log,"\n"); \
696 static void clear_re(pTHX_ void *r);
698 /* Mark that we cannot extend a found fixed substring at this point.
699 Update the longest found anchored substring and the longest found
700 floating substrings if needed. */
703 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
705 const STRLEN l = CHR_SVLEN(data->last_found);
706 const STRLEN old_l = CHR_SVLEN(*data->longest);
707 GET_RE_DEBUG_FLAGS_DECL;
709 PERL_ARGS_ASSERT_SCAN_COMMIT;
711 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
712 SvSetMagicSV(*data->longest, data->last_found);
713 if (*data->longest == data->longest_fixed) {
714 data->offset_fixed = l ? data->last_start_min : data->pos_min;
715 if (data->flags & SF_BEFORE_EOL)
717 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
719 data->flags &= ~SF_FIX_BEFORE_EOL;
720 data->minlen_fixed=minlenp;
721 data->lookbehind_fixed=0;
723 else { /* *data->longest == data->longest_float */
724 data->offset_float_min = l ? data->last_start_min : data->pos_min;
725 data->offset_float_max = (l
726 ? data->last_start_max
727 : data->pos_min + data->pos_delta);
728 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
729 data->offset_float_max = I32_MAX;
730 if (data->flags & SF_BEFORE_EOL)
732 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
734 data->flags &= ~SF_FL_BEFORE_EOL;
735 data->minlen_float=minlenp;
736 data->lookbehind_float=0;
739 SvCUR_set(data->last_found, 0);
741 SV * const sv = data->last_found;
742 if (SvUTF8(sv) && SvMAGICAL(sv)) {
743 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
749 data->flags &= ~SF_BEFORE_EOL;
750 DEBUG_STUDYDATA("commit: ",data,0);
753 /* Can match anything (initialization) */
755 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
757 PERL_ARGS_ASSERT_CL_ANYTHING;
759 ANYOF_BITMAP_SETALL(cl);
760 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
761 |ANYOF_NON_UTF8_LATIN1_ALL;
763 /* If any portion of the regex is to operate under locale rules,
764 * initialization includes it. The reason this isn't done for all regexes
765 * is that the optimizer was written under the assumption that locale was
766 * all-or-nothing. Given the complexity and lack of documentation in the
767 * optimizer, and that there are inadequate test cases for locale, so many
768 * parts of it may not work properly, it is safest to avoid locale unless
770 if (RExC_contains_locale) {
771 ANYOF_CLASS_SETALL(cl); /* /l uses class */
772 cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
775 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
779 /* Can match anything (initialization) */
781 S_cl_is_anything(const struct regnode_charclass_class *cl)
785 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
787 for (value = 0; value <= ANYOF_MAX; value += 2)
788 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
790 if (!(cl->flags & ANYOF_UNICODE_ALL))
792 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
797 /* Can match anything (initialization) */
799 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
801 PERL_ARGS_ASSERT_CL_INIT;
803 Zero(cl, 1, struct regnode_charclass_class);
805 cl_anything(pRExC_state, cl);
806 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
809 /* These two functions currently do the exact same thing */
810 #define cl_init_zero S_cl_init
812 /* 'AND' a given class with another one. Can create false positives. 'cl'
813 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
814 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
816 S_cl_and(struct regnode_charclass_class *cl,
817 const struct regnode_charclass_class *and_with)
819 PERL_ARGS_ASSERT_CL_AND;
821 assert(and_with->type == ANYOF);
823 /* I (khw) am not sure all these restrictions are necessary XXX */
824 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
825 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
826 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
827 && !(and_with->flags & ANYOF_LOC_FOLD)
828 && !(cl->flags & ANYOF_LOC_FOLD)) {
831 if (and_with->flags & ANYOF_INVERT)
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] &= ~and_with->bitmap[i];
835 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
836 cl->bitmap[i] &= and_with->bitmap[i];
837 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
839 if (and_with->flags & ANYOF_INVERT) {
841 /* Here, the and'ed node is inverted. Get the AND of the flags that
842 * aren't affected by the inversion. Those that are affected are
843 * handled individually below */
844 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
845 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
846 cl->flags |= affected_flags;
848 /* We currently don't know how to deal with things that aren't in the
849 * bitmap, but we know that the intersection is no greater than what
850 * is already in cl, so let there be false positives that get sorted
851 * out after the synthetic start class succeeds, and the node is
852 * matched for real. */
854 /* The inversion of these two flags indicate that the resulting
855 * intersection doesn't have them */
856 if (and_with->flags & ANYOF_UNICODE_ALL) {
857 cl->flags &= ~ANYOF_UNICODE_ALL;
859 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
860 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
863 else { /* and'd node is not inverted */
864 U8 outside_bitmap_but_not_utf8; /* Temp variable */
866 if (! ANYOF_NONBITMAP(and_with)) {
868 /* Here 'and_with' doesn't match anything outside the bitmap
869 * (except possibly ANYOF_UNICODE_ALL), which means the
870 * intersection can't either, except for ANYOF_UNICODE_ALL, in
871 * which case we don't know what the intersection is, but it's no
872 * greater than what cl already has, so can just leave it alone,
873 * with possible false positives */
874 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
875 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
876 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
879 else if (! ANYOF_NONBITMAP(cl)) {
881 /* Here, 'and_with' does match something outside the bitmap, and cl
882 * doesn't have a list of things to match outside the bitmap. If
883 * cl can match all code points above 255, the intersection will
884 * be those above-255 code points that 'and_with' matches. If cl
885 * can't match all Unicode code points, it means that it can't
886 * match anything outside the bitmap (since the 'if' that got us
887 * into this block tested for that), so we leave the bitmap empty.
889 if (cl->flags & ANYOF_UNICODE_ALL) {
890 ARG_SET(cl, ARG(and_with));
892 /* and_with's ARG may match things that don't require UTF8.
893 * And now cl's will too, in spite of this being an 'and'. See
894 * the comments below about the kludge */
895 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
899 /* Here, both 'and_with' and cl match something outside the
900 * bitmap. Currently we do not do the intersection, so just match
901 * whatever cl had at the beginning. */
905 /* Take the intersection of the two sets of flags. However, the
906 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
907 * kludge around the fact that this flag is not treated like the others
908 * which are initialized in cl_anything(). The way the optimizer works
909 * is that the synthetic start class (SSC) is initialized to match
910 * anything, and then the first time a real node is encountered, its
911 * values are AND'd with the SSC's with the result being the values of
912 * the real node. However, there are paths through the optimizer where
913 * the AND never gets called, so those initialized bits are set
914 * inappropriately, which is not usually a big deal, as they just cause
915 * false positives in the SSC, which will just mean a probably
916 * imperceptible slow down in execution. However this bit has a
917 * higher false positive consequence in that it can cause utf8.pm,
918 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
919 * bigger slowdown and also causes significant extra memory to be used.
920 * In order to prevent this, the code now takes a different tack. The
921 * bit isn't set unless some part of the regular expression needs it,
922 * but once set it won't get cleared. This means that these extra
923 * modules won't get loaded unless there was some path through the
924 * pattern that would have required them anyway, and so any false
925 * positives that occur by not ANDing them out when they could be
926 * aren't as severe as they would be if we treated this bit like all
928 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
929 & ANYOF_NONBITMAP_NON_UTF8;
930 cl->flags &= and_with->flags;
931 cl->flags |= outside_bitmap_but_not_utf8;
935 /* 'OR' a given class with another one. Can create false positives. 'cl'
936 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
937 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
939 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
941 PERL_ARGS_ASSERT_CL_OR;
943 if (or_with->flags & ANYOF_INVERT) {
945 /* Here, the or'd node is to be inverted. This means we take the
946 * complement of everything not in the bitmap, but currently we don't
947 * know what that is, so give up and match anything */
948 if (ANYOF_NONBITMAP(or_with)) {
949 cl_anything(pRExC_state, cl);
952 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
953 * <= (B1 | !B2) | (CL1 | !CL2)
954 * which is wasteful if CL2 is small, but we ignore CL2:
955 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
956 * XXXX Can we handle case-fold? Unclear:
957 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
958 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
960 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
961 && !(or_with->flags & ANYOF_LOC_FOLD)
962 && !(cl->flags & ANYOF_LOC_FOLD) ) {
965 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
966 cl->bitmap[i] |= ~or_with->bitmap[i];
967 } /* XXXX: logic is complicated otherwise */
969 cl_anything(pRExC_state, cl);
972 /* And, we can just take the union of the flags that aren't affected
973 * by the inversion */
974 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
976 /* For the remaining flags:
977 ANYOF_UNICODE_ALL and inverted means to not match anything above
978 255, which means that the union with cl should just be
979 what cl has in it, so can ignore this flag
980 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
981 is 127-255 to match them, but then invert that, so the
982 union with cl should just be what cl has in it, so can
985 } else { /* 'or_with' is not inverted */
986 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
987 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
988 && (!(or_with->flags & ANYOF_LOC_FOLD)
989 || (cl->flags & ANYOF_LOC_FOLD)) ) {
992 /* OR char bitmap and class bitmap separately */
993 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
994 cl->bitmap[i] |= or_with->bitmap[i];
995 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
996 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
997 cl->classflags[i] |= or_with->classflags[i];
998 cl->flags |= ANYOF_CLASS;
1001 else { /* XXXX: logic is complicated, leave it along for a moment. */
1002 cl_anything(pRExC_state, cl);
1005 if (ANYOF_NONBITMAP(or_with)) {
1007 /* Use the added node's outside-the-bit-map match if there isn't a
1008 * conflict. If there is a conflict (both nodes match something
1009 * outside the bitmap, but what they match outside is not the same
1010 * pointer, and hence not easily compared until XXX we extend
1011 * inversion lists this far), give up and allow the start class to
1012 * match everything outside the bitmap. If that stuff is all above
1013 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1014 if (! ANYOF_NONBITMAP(cl)) {
1015 ARG_SET(cl, ARG(or_with));
1017 else if (ARG(cl) != ARG(or_with)) {
1019 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1020 cl_anything(pRExC_state, cl);
1023 cl->flags |= ANYOF_UNICODE_ALL;
1028 /* Take the union */
1029 cl->flags |= or_with->flags;
1033 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1034 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1035 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1036 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1041 dump_trie(trie,widecharmap,revcharmap)
1042 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1043 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1045 These routines dump out a trie in a somewhat readable format.
1046 The _interim_ variants are used for debugging the interim
1047 tables that are used to generate the final compressed
1048 representation which is what dump_trie expects.
1050 Part of the reason for their existence is to provide a form
1051 of documentation as to how the different representations function.
1056 Dumps the final compressed table form of the trie to Perl_debug_log.
1057 Used for debugging make_trie().
1061 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1062 AV *revcharmap, U32 depth)
1065 SV *sv=sv_newmortal();
1066 int colwidth= widecharmap ? 6 : 4;
1068 GET_RE_DEBUG_FLAGS_DECL;
1070 PERL_ARGS_ASSERT_DUMP_TRIE;
1072 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1073 (int)depth * 2 + 2,"",
1074 "Match","Base","Ofs" );
1076 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1077 SV ** const tmp = av_fetch( revcharmap, state, 0);
1079 PerlIO_printf( Perl_debug_log, "%*s",
1081 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1082 PL_colors[0], PL_colors[1],
1083 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1084 PERL_PV_ESCAPE_FIRSTCHAR
1089 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1090 (int)depth * 2 + 2,"");
1092 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1093 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1094 PerlIO_printf( Perl_debug_log, "\n");
1096 for( state = 1 ; state < trie->statecount ; state++ ) {
1097 const U32 base = trie->states[ state ].trans.base;
1099 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1101 if ( trie->states[ state ].wordnum ) {
1102 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1104 PerlIO_printf( Perl_debug_log, "%6s", "" );
1107 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1112 while( ( base + ofs < trie->uniquecharcount ) ||
1113 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1114 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1117 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1119 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1120 if ( ( base + ofs >= trie->uniquecharcount ) &&
1121 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1122 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1124 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1126 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1128 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1132 PerlIO_printf( Perl_debug_log, "]");
1135 PerlIO_printf( Perl_debug_log, "\n" );
1137 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1138 for (word=1; word <= trie->wordcount; word++) {
1139 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1140 (int)word, (int)(trie->wordinfo[word].prev),
1141 (int)(trie->wordinfo[word].len));
1143 PerlIO_printf(Perl_debug_log, "\n" );
1146 Dumps a fully constructed but uncompressed trie in list form.
1147 List tries normally only are used for construction when the number of
1148 possible chars (trie->uniquecharcount) is very high.
1149 Used for debugging make_trie().
1152 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1153 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1157 SV *sv=sv_newmortal();
1158 int colwidth= widecharmap ? 6 : 4;
1159 GET_RE_DEBUG_FLAGS_DECL;
1161 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1163 /* print out the table precompression. */
1164 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1165 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1166 "------:-----+-----------------\n" );
1168 for( state=1 ; state < next_alloc ; state ++ ) {
1171 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1172 (int)depth * 2 + 2,"", (UV)state );
1173 if ( ! trie->states[ state ].wordnum ) {
1174 PerlIO_printf( Perl_debug_log, "%5s| ","");
1176 PerlIO_printf( Perl_debug_log, "W%4x| ",
1177 trie->states[ state ].wordnum
1180 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1181 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1183 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1185 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1186 PL_colors[0], PL_colors[1],
1187 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1188 PERL_PV_ESCAPE_FIRSTCHAR
1190 TRIE_LIST_ITEM(state,charid).forid,
1191 (UV)TRIE_LIST_ITEM(state,charid).newstate
1194 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1195 (int)((depth * 2) + 14), "");
1198 PerlIO_printf( Perl_debug_log, "\n");
1203 Dumps a fully constructed but uncompressed trie in table form.
1204 This is the normal DFA style state transition table, with a few
1205 twists to facilitate compression later.
1206 Used for debugging make_trie().
1209 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1210 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1215 SV *sv=sv_newmortal();
1216 int colwidth= widecharmap ? 6 : 4;
1217 GET_RE_DEBUG_FLAGS_DECL;
1219 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1222 print out the table precompression so that we can do a visual check
1223 that they are identical.
1226 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1228 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1229 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1231 PerlIO_printf( Perl_debug_log, "%*s",
1233 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1234 PL_colors[0], PL_colors[1],
1235 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1236 PERL_PV_ESCAPE_FIRSTCHAR
1242 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1244 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1245 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1248 PerlIO_printf( Perl_debug_log, "\n" );
1250 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1252 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1253 (int)depth * 2 + 2,"",
1254 (UV)TRIE_NODENUM( state ) );
1256 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1257 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1259 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1261 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1263 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1264 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1266 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1267 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1275 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1276 startbranch: the first branch in the whole branch sequence
1277 first : start branch of sequence of branch-exact nodes.
1278 May be the same as startbranch
1279 last : Thing following the last branch.
1280 May be the same as tail.
1281 tail : item following the branch sequence
1282 count : words in the sequence
1283 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1284 depth : indent depth
1286 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1288 A trie is an N'ary tree where the branches are determined by digital
1289 decomposition of the key. IE, at the root node you look up the 1st character and
1290 follow that branch repeat until you find the end of the branches. Nodes can be
1291 marked as "accepting" meaning they represent a complete word. Eg:
1295 would convert into the following structure. Numbers represent states, letters
1296 following numbers represent valid transitions on the letter from that state, if
1297 the number is in square brackets it represents an accepting state, otherwise it
1298 will be in parenthesis.
1300 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1304 (1) +-i->(6)-+-s->[7]
1306 +-s->(3)-+-h->(4)-+-e->[5]
1308 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1310 This shows that when matching against the string 'hers' we will begin at state 1
1311 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1312 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1313 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1314 single traverse. We store a mapping from accepting to state to which word was
1315 matched, and then when we have multiple possibilities we try to complete the
1316 rest of the regex in the order in which they occured in the alternation.
1318 The only prior NFA like behaviour that would be changed by the TRIE support is
1319 the silent ignoring of duplicate alternations which are of the form:
1321 / (DUPE|DUPE) X? (?{ ... }) Y /x
1323 Thus EVAL blocks following a trie may be called a different number of times with
1324 and without the optimisation. With the optimisations dupes will be silently
1325 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1326 the following demonstrates:
1328 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1330 which prints out 'word' three times, but
1332 'words'=~/(word|word|word)(?{ print $1 })S/
1334 which doesnt print it out at all. This is due to other optimisations kicking in.
1336 Example of what happens on a structural level:
1338 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1340 1: CURLYM[1] {1,32767}(18)
1351 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1352 and should turn into:
1354 1: CURLYM[1] {1,32767}(18)
1356 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1364 Cases where tail != last would be like /(?foo|bar)baz/:
1374 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1375 and would end up looking like:
1378 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1385 d = uvuni_to_utf8_flags(d, uv, 0);
1387 is the recommended Unicode-aware way of saying
1392 #define TRIE_STORE_REVCHAR(val) \
1395 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1396 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1397 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1398 SvCUR_set(zlopp, kapow - flrbbbbb); \
1401 av_push(revcharmap, zlopp); \
1403 char ooooff = (char)val; \
1404 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1408 #define TRIE_READ_CHAR STMT_START { \
1411 /* if it is UTF then it is either already folded, or does not need folding */ \
1412 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1414 else if (folder == PL_fold_latin1) { \
1415 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1416 if ( foldlen > 0 ) { \
1417 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1423 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1424 skiplen = UNISKIP(uvc); \
1425 foldlen -= skiplen; \
1426 scan = foldbuf + skiplen; \
1429 /* raw data, will be folded later if needed */ \
1437 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1438 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1439 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1440 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1442 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1443 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1444 TRIE_LIST_CUR( state )++; \
1447 #define TRIE_LIST_NEW(state) STMT_START { \
1448 Newxz( trie->states[ state ].trans.list, \
1449 4, reg_trie_trans_le ); \
1450 TRIE_LIST_CUR( state ) = 1; \
1451 TRIE_LIST_LEN( state ) = 4; \
1454 #define TRIE_HANDLE_WORD(state) STMT_START { \
1455 U16 dupe= trie->states[ state ].wordnum; \
1456 regnode * const noper_next = regnext( noper ); \
1459 /* store the word for dumping */ \
1461 if (OP(noper) != NOTHING) \
1462 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1464 tmp = newSVpvn_utf8( "", 0, UTF ); \
1465 av_push( trie_words, tmp ); \
1469 trie->wordinfo[curword].prev = 0; \
1470 trie->wordinfo[curword].len = wordlen; \
1471 trie->wordinfo[curword].accept = state; \
1473 if ( noper_next < tail ) { \
1475 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1476 trie->jump[curword] = (U16)(noper_next - convert); \
1478 jumper = noper_next; \
1480 nextbranch= regnext(cur); \
1484 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1485 /* chain, so that when the bits of chain are later */\
1486 /* linked together, the dups appear in the chain */\
1487 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1488 trie->wordinfo[dupe].prev = curword; \
1490 /* we haven't inserted this word yet. */ \
1491 trie->states[ state ].wordnum = curword; \
1496 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1497 ( ( base + charid >= ucharcount \
1498 && base + charid < ubound \
1499 && state == trie->trans[ base - ucharcount + charid ].check \
1500 && trie->trans[ base - ucharcount + charid ].next ) \
1501 ? trie->trans[ base - ucharcount + charid ].next \
1502 : ( state==1 ? special : 0 ) \
1506 #define MADE_JUMP_TRIE 2
1507 #define MADE_EXACT_TRIE 4
1510 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1513 /* first pass, loop through and scan words */
1514 reg_trie_data *trie;
1515 HV *widecharmap = NULL;
1516 AV *revcharmap = newAV();
1518 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1523 regnode *jumper = NULL;
1524 regnode *nextbranch = NULL;
1525 regnode *convert = NULL;
1526 U32 *prev_states; /* temp array mapping each state to previous one */
1527 /* we just use folder as a flag in utf8 */
1528 const U8 * folder = NULL;
1531 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1532 AV *trie_words = NULL;
1533 /* along with revcharmap, this only used during construction but both are
1534 * useful during debugging so we store them in the struct when debugging.
1537 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1538 STRLEN trie_charcount=0;
1540 SV *re_trie_maxbuff;
1541 GET_RE_DEBUG_FLAGS_DECL;
1543 PERL_ARGS_ASSERT_MAKE_TRIE;
1545 PERL_UNUSED_ARG(depth);
1552 case EXACTFU_TRICKYFOLD:
1553 case EXACTFU: folder = PL_fold_latin1; break;
1554 case EXACTF: folder = PL_fold; break;
1555 case EXACTFL: folder = PL_fold_locale; break;
1556 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1559 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1561 trie->startstate = 1;
1562 trie->wordcount = word_count;
1563 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1564 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1566 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1567 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1568 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1571 trie_words = newAV();
1574 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1575 if (!SvIOK(re_trie_maxbuff)) {
1576 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1578 DEBUG_TRIE_COMPILE_r({
1579 PerlIO_printf( Perl_debug_log,
1580 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1581 (int)depth * 2 + 2, "",
1582 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1583 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1587 /* Find the node we are going to overwrite */
1588 if ( first == startbranch && OP( last ) != BRANCH ) {
1589 /* whole branch chain */
1592 /* branch sub-chain */
1593 convert = NEXTOPER( first );
1596 /* -- First loop and Setup --
1598 We first traverse the branches and scan each word to determine if it
1599 contains widechars, and how many unique chars there are, this is
1600 important as we have to build a table with at least as many columns as we
1603 We use an array of integers to represent the character codes 0..255
1604 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1605 native representation of the character value as the key and IV's for the
1608 *TODO* If we keep track of how many times each character is used we can
1609 remap the columns so that the table compression later on is more
1610 efficient in terms of memory by ensuring the most common value is in the
1611 middle and the least common are on the outside. IMO this would be better
1612 than a most to least common mapping as theres a decent chance the most
1613 common letter will share a node with the least common, meaning the node
1614 will not be compressible. With a middle is most common approach the worst
1615 case is when we have the least common nodes twice.
1619 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1620 regnode *noper = NEXTOPER( cur );
1621 const U8 *uc = (U8*)STRING( noper );
1622 const U8 *e = uc + STR_LEN( noper );
1624 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1626 const U8 *scan = (U8*)NULL;
1627 U32 wordlen = 0; /* required init */
1629 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1631 if (OP(noper) == NOTHING) {
1632 regnode *noper_next= regnext(noper);
1633 if (noper_next != tail && OP(noper_next) == flags) {
1635 uc= (U8*)STRING(noper);
1636 e= uc + STR_LEN(noper);
1637 trie->minlen= STR_LEN(noper);
1644 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1645 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1646 regardless of encoding */
1647 if (OP( noper ) == EXACTFU_SS) {
1648 /* false positives are ok, so just set this */
1649 TRIE_BITMAP_SET(trie,0xDF);
1652 for ( ; uc < e ; uc += len ) {
1653 TRIE_CHARCOUNT(trie)++;
1658 U8 folded= folder[ (U8) uvc ];
1659 if ( !trie->charmap[ folded ] ) {
1660 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1661 TRIE_STORE_REVCHAR( folded );
1664 if ( !trie->charmap[ uvc ] ) {
1665 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1666 TRIE_STORE_REVCHAR( uvc );
1669 /* store the codepoint in the bitmap, and its folded
1671 TRIE_BITMAP_SET(trie, uvc);
1673 /* store the folded codepoint */
1674 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1677 /* store first byte of utf8 representation of
1678 variant codepoints */
1679 if (! UNI_IS_INVARIANT(uvc)) {
1680 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1683 set_bit = 0; /* We've done our bit :-) */
1688 widecharmap = newHV();
1690 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1693 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1695 if ( !SvTRUE( *svpp ) ) {
1696 sv_setiv( *svpp, ++trie->uniquecharcount );
1697 TRIE_STORE_REVCHAR(uvc);
1701 if( cur == first ) {
1702 trie->minlen = chars;
1703 trie->maxlen = chars;
1704 } else if (chars < trie->minlen) {
1705 trie->minlen = chars;
1706 } else if (chars > trie->maxlen) {
1707 trie->maxlen = chars;
1709 if (OP( noper ) == EXACTFU_SS) {
1710 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1711 if (trie->minlen > 1)
1714 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1715 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1716 * - We assume that any such sequence might match a 2 byte string */
1717 if (trie->minlen > 2 )
1721 } /* end first pass */
1722 DEBUG_TRIE_COMPILE_r(
1723 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1724 (int)depth * 2 + 2,"",
1725 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1726 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1727 (int)trie->minlen, (int)trie->maxlen )
1731 We now know what we are dealing with in terms of unique chars and
1732 string sizes so we can calculate how much memory a naive
1733 representation using a flat table will take. If it's over a reasonable
1734 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1735 conservative but potentially much slower representation using an array
1738 At the end we convert both representations into the same compressed
1739 form that will be used in regexec.c for matching with. The latter
1740 is a form that cannot be used to construct with but has memory
1741 properties similar to the list form and access properties similar
1742 to the table form making it both suitable for fast searches and
1743 small enough that its feasable to store for the duration of a program.
1745 See the comment in the code where the compressed table is produced
1746 inplace from the flat tabe representation for an explanation of how
1747 the compression works.
1752 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1755 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1757 Second Pass -- Array Of Lists Representation
1759 Each state will be represented by a list of charid:state records
1760 (reg_trie_trans_le) the first such element holds the CUR and LEN
1761 points of the allocated array. (See defines above).
1763 We build the initial structure using the lists, and then convert
1764 it into the compressed table form which allows faster lookups
1765 (but cant be modified once converted).
1768 STRLEN transcount = 1;
1770 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1771 "%*sCompiling trie using list compiler\n",
1772 (int)depth * 2 + 2, ""));
1774 trie->states = (reg_trie_state *)
1775 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1776 sizeof(reg_trie_state) );
1780 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1782 regnode *noper = NEXTOPER( cur );
1783 U8 *uc = (U8*)STRING( noper );
1784 const U8 *e = uc + STR_LEN( noper );
1785 U32 state = 1; /* required init */
1786 U16 charid = 0; /* sanity init */
1787 U8 *scan = (U8*)NULL; /* sanity init */
1788 STRLEN foldlen = 0; /* required init */
1789 U32 wordlen = 0; /* required init */
1790 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1793 if (OP(noper) == NOTHING) {
1794 regnode *noper_next= regnext(noper);
1795 if (noper_next != tail && OP(noper_next) == flags) {
1797 uc= (U8*)STRING(noper);
1798 e= uc + STR_LEN(noper);
1802 if (OP(noper) != NOTHING) {
1803 for ( ; uc < e ; uc += len ) {
1808 charid = trie->charmap[ uvc ];
1810 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1814 charid=(U16)SvIV( *svpp );
1817 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1824 if ( !trie->states[ state ].trans.list ) {
1825 TRIE_LIST_NEW( state );
1827 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1828 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1829 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1834 newstate = next_alloc++;
1835 prev_states[newstate] = state;
1836 TRIE_LIST_PUSH( state, charid, newstate );
1841 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1845 TRIE_HANDLE_WORD(state);
1847 } /* end second pass */
1849 /* next alloc is the NEXT state to be allocated */
1850 trie->statecount = next_alloc;
1851 trie->states = (reg_trie_state *)
1852 PerlMemShared_realloc( trie->states,
1854 * sizeof(reg_trie_state) );
1856 /* and now dump it out before we compress it */
1857 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1858 revcharmap, next_alloc,
1862 trie->trans = (reg_trie_trans *)
1863 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1870 for( state=1 ; state < next_alloc ; state ++ ) {
1874 DEBUG_TRIE_COMPILE_MORE_r(
1875 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1879 if (trie->states[state].trans.list) {
1880 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1884 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1885 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1886 if ( forid < minid ) {
1888 } else if ( forid > maxid ) {
1892 if ( transcount < tp + maxid - minid + 1) {
1894 trie->trans = (reg_trie_trans *)
1895 PerlMemShared_realloc( trie->trans,
1897 * sizeof(reg_trie_trans) );
1898 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1900 base = trie->uniquecharcount + tp - minid;
1901 if ( maxid == minid ) {
1903 for ( ; zp < tp ; zp++ ) {
1904 if ( ! trie->trans[ zp ].next ) {
1905 base = trie->uniquecharcount + zp - minid;
1906 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1907 trie->trans[ zp ].check = state;
1913 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1914 trie->trans[ tp ].check = state;
1919 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1920 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1921 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1922 trie->trans[ tid ].check = state;
1924 tp += ( maxid - minid + 1 );
1926 Safefree(trie->states[ state ].trans.list);
1929 DEBUG_TRIE_COMPILE_MORE_r(
1930 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1933 trie->states[ state ].trans.base=base;
1935 trie->lasttrans = tp + 1;
1939 Second Pass -- Flat Table Representation.
1941 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1942 We know that we will need Charcount+1 trans at most to store the data
1943 (one row per char at worst case) So we preallocate both structures
1944 assuming worst case.
1946 We then construct the trie using only the .next slots of the entry
1949 We use the .check field of the first entry of the node temporarily to
1950 make compression both faster and easier by keeping track of how many non
1951 zero fields are in the node.
1953 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1956 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1957 number representing the first entry of the node, and state as a
1958 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1959 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1960 are 2 entrys per node. eg:
1968 The table is internally in the right hand, idx form. However as we also
1969 have to deal with the states array which is indexed by nodenum we have to
1970 use TRIE_NODENUM() to convert.
1973 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1974 "%*sCompiling trie using table compiler\n",
1975 (int)depth * 2 + 2, ""));
1977 trie->trans = (reg_trie_trans *)
1978 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1979 * trie->uniquecharcount + 1,
1980 sizeof(reg_trie_trans) );
1981 trie->states = (reg_trie_state *)
1982 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1983 sizeof(reg_trie_state) );
1984 next_alloc = trie->uniquecharcount + 1;
1987 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1989 regnode *noper = NEXTOPER( cur );
1990 const U8 *uc = (U8*)STRING( noper );
1991 const U8 *e = uc + STR_LEN( noper );
1993 U32 state = 1; /* required init */
1995 U16 charid = 0; /* sanity init */
1996 U32 accept_state = 0; /* sanity init */
1997 U8 *scan = (U8*)NULL; /* sanity init */
1999 STRLEN foldlen = 0; /* required init */
2000 U32 wordlen = 0; /* required init */
2002 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2004 if (OP(noper) == NOTHING) {
2005 regnode *noper_next= regnext(noper);
2006 if (noper_next != tail && OP(noper_next) == flags) {
2008 uc= (U8*)STRING(noper);
2009 e= uc + STR_LEN(noper);
2013 if ( OP(noper) != NOTHING ) {
2014 for ( ; uc < e ; uc += len ) {
2019 charid = trie->charmap[ uvc ];
2021 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2022 charid = svpp ? (U16)SvIV(*svpp) : 0;
2026 if ( !trie->trans[ state + charid ].next ) {
2027 trie->trans[ state + charid ].next = next_alloc;
2028 trie->trans[ state ].check++;
2029 prev_states[TRIE_NODENUM(next_alloc)]
2030 = TRIE_NODENUM(state);
2031 next_alloc += trie->uniquecharcount;
2033 state = trie->trans[ state + charid ].next;
2035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2040 accept_state = TRIE_NODENUM( state );
2041 TRIE_HANDLE_WORD(accept_state);
2043 } /* end second pass */
2045 /* and now dump it out before we compress it */
2046 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2048 next_alloc, depth+1));
2052 * Inplace compress the table.*
2054 For sparse data sets the table constructed by the trie algorithm will
2055 be mostly 0/FAIL transitions or to put it another way mostly empty.
2056 (Note that leaf nodes will not contain any transitions.)
2058 This algorithm compresses the tables by eliminating most such
2059 transitions, at the cost of a modest bit of extra work during lookup:
2061 - Each states[] entry contains a .base field which indicates the
2062 index in the state[] array wheres its transition data is stored.
2064 - If .base is 0 there are no valid transitions from that node.
2066 - If .base is nonzero then charid is added to it to find an entry in
2069 -If trans[states[state].base+charid].check!=state then the
2070 transition is taken to be a 0/Fail transition. Thus if there are fail
2071 transitions at the front of the node then the .base offset will point
2072 somewhere inside the previous nodes data (or maybe even into a node
2073 even earlier), but the .check field determines if the transition is
2077 The following process inplace converts the table to the compressed
2078 table: We first do not compress the root node 1,and mark all its
2079 .check pointers as 1 and set its .base pointer as 1 as well. This
2080 allows us to do a DFA construction from the compressed table later,
2081 and ensures that any .base pointers we calculate later are greater
2084 - We set 'pos' to indicate the first entry of the second node.
2086 - We then iterate over the columns of the node, finding the first and
2087 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2088 and set the .check pointers accordingly, and advance pos
2089 appropriately and repreat for the next node. Note that when we copy
2090 the next pointers we have to convert them from the original
2091 NODEIDX form to NODENUM form as the former is not valid post
2094 - If a node has no transitions used we mark its base as 0 and do not
2095 advance the pos pointer.
2097 - If a node only has one transition we use a second pointer into the
2098 structure to fill in allocated fail transitions from other states.
2099 This pointer is independent of the main pointer and scans forward
2100 looking for null transitions that are allocated to a state. When it
2101 finds one it writes the single transition into the "hole". If the
2102 pointer doesnt find one the single transition is appended as normal.
2104 - Once compressed we can Renew/realloc the structures to release the
2107 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2108 specifically Fig 3.47 and the associated pseudocode.
2112 const U32 laststate = TRIE_NODENUM( next_alloc );
2115 trie->statecount = laststate;
2117 for ( state = 1 ; state < laststate ; state++ ) {
2119 const U32 stateidx = TRIE_NODEIDX( state );
2120 const U32 o_used = trie->trans[ stateidx ].check;
2121 U32 used = trie->trans[ stateidx ].check;
2122 trie->trans[ stateidx ].check = 0;
2124 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2125 if ( flag || trie->trans[ stateidx + charid ].next ) {
2126 if ( trie->trans[ stateidx + charid ].next ) {
2128 for ( ; zp < pos ; zp++ ) {
2129 if ( ! trie->trans[ zp ].next ) {
2133 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2134 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2135 trie->trans[ zp ].check = state;
2136 if ( ++zp > pos ) pos = zp;
2143 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2145 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2146 trie->trans[ pos ].check = state;
2151 trie->lasttrans = pos + 1;
2152 trie->states = (reg_trie_state *)
2153 PerlMemShared_realloc( trie->states, laststate
2154 * sizeof(reg_trie_state) );
2155 DEBUG_TRIE_COMPILE_MORE_r(
2156 PerlIO_printf( Perl_debug_log,
2157 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2158 (int)depth * 2 + 2,"",
2159 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2162 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2165 } /* end table compress */
2167 DEBUG_TRIE_COMPILE_MORE_r(
2168 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2169 (int)depth * 2 + 2, "",
2170 (UV)trie->statecount,
2171 (UV)trie->lasttrans)
2173 /* resize the trans array to remove unused space */
2174 trie->trans = (reg_trie_trans *)
2175 PerlMemShared_realloc( trie->trans, trie->lasttrans
2176 * sizeof(reg_trie_trans) );
2178 { /* Modify the program and insert the new TRIE node */
2179 U8 nodetype =(U8)(flags & 0xFF);
2183 regnode *optimize = NULL;
2184 #ifdef RE_TRACK_PATTERN_OFFSETS
2187 U32 mjd_nodelen = 0;
2188 #endif /* RE_TRACK_PATTERN_OFFSETS */
2189 #endif /* DEBUGGING */
2191 This means we convert either the first branch or the first Exact,
2192 depending on whether the thing following (in 'last') is a branch
2193 or not and whther first is the startbranch (ie is it a sub part of
2194 the alternation or is it the whole thing.)
2195 Assuming its a sub part we convert the EXACT otherwise we convert
2196 the whole branch sequence, including the first.
2198 /* Find the node we are going to overwrite */
2199 if ( first != startbranch || OP( last ) == BRANCH ) {
2200 /* branch sub-chain */
2201 NEXT_OFF( first ) = (U16)(last - first);
2202 #ifdef RE_TRACK_PATTERN_OFFSETS
2204 mjd_offset= Node_Offset((convert));
2205 mjd_nodelen= Node_Length((convert));
2208 /* whole branch chain */
2210 #ifdef RE_TRACK_PATTERN_OFFSETS
2213 const regnode *nop = NEXTOPER( convert );
2214 mjd_offset= Node_Offset((nop));
2215 mjd_nodelen= Node_Length((nop));
2219 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2220 (int)depth * 2 + 2, "",
2221 (UV)mjd_offset, (UV)mjd_nodelen)
2224 /* But first we check to see if there is a common prefix we can
2225 split out as an EXACT and put in front of the TRIE node. */
2226 trie->startstate= 1;
2227 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2229 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2233 const U32 base = trie->states[ state ].trans.base;
2235 if ( trie->states[state].wordnum )
2238 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2239 if ( ( base + ofs >= trie->uniquecharcount ) &&
2240 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2241 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2243 if ( ++count > 1 ) {
2244 SV **tmp = av_fetch( revcharmap, ofs, 0);
2245 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2246 if ( state == 1 ) break;
2248 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2250 PerlIO_printf(Perl_debug_log,
2251 "%*sNew Start State=%"UVuf" Class: [",
2252 (int)depth * 2 + 2, "",
2255 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2256 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2258 TRIE_BITMAP_SET(trie,*ch);
2260 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2262 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2266 TRIE_BITMAP_SET(trie,*ch);
2268 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2269 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2275 SV **tmp = av_fetch( revcharmap, idx, 0);
2277 char *ch = SvPV( *tmp, len );
2279 SV *sv=sv_newmortal();
2280 PerlIO_printf( Perl_debug_log,
2281 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2282 (int)depth * 2 + 2, "",
2284 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2285 PL_colors[0], PL_colors[1],
2286 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2287 PERL_PV_ESCAPE_FIRSTCHAR
2292 OP( convert ) = nodetype;
2293 str=STRING(convert);
2296 STR_LEN(convert) += len;
2302 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2307 trie->prefixlen = (state-1);
2309 regnode *n = convert+NODE_SZ_STR(convert);
2310 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2311 trie->startstate = state;
2312 trie->minlen -= (state - 1);
2313 trie->maxlen -= (state - 1);
2315 /* At least the UNICOS C compiler choked on this
2316 * being argument to DEBUG_r(), so let's just have
2319 #ifdef PERL_EXT_RE_BUILD
2325 regnode *fix = convert;
2326 U32 word = trie->wordcount;
2328 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2329 while( ++fix < n ) {
2330 Set_Node_Offset_Length(fix, 0, 0);
2333 SV ** const tmp = av_fetch( trie_words, word, 0 );
2335 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2336 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2338 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2346 NEXT_OFF(convert) = (U16)(tail - convert);
2347 DEBUG_r(optimize= n);
2353 if ( trie->maxlen ) {
2354 NEXT_OFF( convert ) = (U16)(tail - convert);
2355 ARG_SET( convert, data_slot );
2356 /* Store the offset to the first unabsorbed branch in
2357 jump[0], which is otherwise unused by the jump logic.
2358 We use this when dumping a trie and during optimisation. */
2360 trie->jump[0] = (U16)(nextbranch - convert);
2362 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2363 * and there is a bitmap
2364 * and the first "jump target" node we found leaves enough room
2365 * then convert the TRIE node into a TRIEC node, with the bitmap
2366 * embedded inline in the opcode - this is hypothetically faster.
2368 if ( !trie->states[trie->startstate].wordnum
2370 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2372 OP( convert ) = TRIEC;
2373 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2374 PerlMemShared_free(trie->bitmap);
2377 OP( convert ) = TRIE;
2379 /* store the type in the flags */
2380 convert->flags = nodetype;
2384 + regarglen[ OP( convert ) ];
2386 /* XXX We really should free up the resource in trie now,
2387 as we won't use them - (which resources?) dmq */
2389 /* needed for dumping*/
2390 DEBUG_r(if (optimize) {
2391 regnode *opt = convert;
2393 while ( ++opt < optimize) {
2394 Set_Node_Offset_Length(opt,0,0);
2397 Try to clean up some of the debris left after the
2400 while( optimize < jumper ) {
2401 mjd_nodelen += Node_Length((optimize));
2402 OP( optimize ) = OPTIMIZED;
2403 Set_Node_Offset_Length(optimize,0,0);
2406 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2408 } /* end node insert */
2409 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2411 /* Finish populating the prev field of the wordinfo array. Walk back
2412 * from each accept state until we find another accept state, and if
2413 * so, point the first word's .prev field at the second word. If the
2414 * second already has a .prev field set, stop now. This will be the
2415 * case either if we've already processed that word's accept state,
2416 * or that state had multiple words, and the overspill words were
2417 * already linked up earlier.
2424 for (word=1; word <= trie->wordcount; word++) {
2426 if (trie->wordinfo[word].prev)
2428 state = trie->wordinfo[word].accept;
2430 state = prev_states[state];
2433 prev = trie->states[state].wordnum;
2437 trie->wordinfo[word].prev = prev;
2439 Safefree(prev_states);
2443 /* and now dump out the compressed format */
2444 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2446 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2448 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2449 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2451 SvREFCNT_dec(revcharmap);
2455 : trie->startstate>1
2461 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2463 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2465 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2466 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2469 We find the fail state for each state in the trie, this state is the longest proper
2470 suffix of the current state's 'word' that is also a proper prefix of another word in our
2471 trie. State 1 represents the word '' and is thus the default fail state. This allows
2472 the DFA not to have to restart after its tried and failed a word at a given point, it
2473 simply continues as though it had been matching the other word in the first place.
2475 'abcdgu'=~/abcdefg|cdgu/
2476 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2477 fail, which would bring us to the state representing 'd' in the second word where we would
2478 try 'g' and succeed, proceeding to match 'cdgu'.
2480 /* add a fail transition */
2481 const U32 trie_offset = ARG(source);
2482 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2484 const U32 ucharcount = trie->uniquecharcount;
2485 const U32 numstates = trie->statecount;
2486 const U32 ubound = trie->lasttrans + ucharcount;
2490 U32 base = trie->states[ 1 ].trans.base;
2493 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2494 GET_RE_DEBUG_FLAGS_DECL;
2496 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2498 PERL_UNUSED_ARG(depth);
2502 ARG_SET( stclass, data_slot );
2503 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2504 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2505 aho->trie=trie_offset;
2506 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2507 Copy( trie->states, aho->states, numstates, reg_trie_state );
2508 Newxz( q, numstates, U32);
2509 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2512 /* initialize fail[0..1] to be 1 so that we always have
2513 a valid final fail state */
2514 fail[ 0 ] = fail[ 1 ] = 1;
2516 for ( charid = 0; charid < ucharcount ; charid++ ) {
2517 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2519 q[ q_write ] = newstate;
2520 /* set to point at the root */
2521 fail[ q[ q_write++ ] ]=1;
2524 while ( q_read < q_write) {
2525 const U32 cur = q[ q_read++ % numstates ];
2526 base = trie->states[ cur ].trans.base;
2528 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2529 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2531 U32 fail_state = cur;
2534 fail_state = fail[ fail_state ];
2535 fail_base = aho->states[ fail_state ].trans.base;
2536 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2538 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2539 fail[ ch_state ] = fail_state;
2540 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2542 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2544 q[ q_write++ % numstates] = ch_state;
2548 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2549 when we fail in state 1, this allows us to use the
2550 charclass scan to find a valid start char. This is based on the principle
2551 that theres a good chance the string being searched contains lots of stuff
2552 that cant be a start char.
2554 fail[ 0 ] = fail[ 1 ] = 0;
2555 DEBUG_TRIE_COMPILE_r({
2556 PerlIO_printf(Perl_debug_log,
2557 "%*sStclass Failtable (%"UVuf" states): 0",
2558 (int)(depth * 2), "", (UV)numstates
2560 for( q_read=1; q_read<numstates; q_read++ ) {
2561 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2563 PerlIO_printf(Perl_debug_log, "\n");
2566 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2571 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2572 * These need to be revisited when a newer toolchain becomes available.
2574 #if defined(__sparc64__) && defined(__GNUC__)
2575 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2576 # undef SPARC64_GCC_WORKAROUND
2577 # define SPARC64_GCC_WORKAROUND 1
2581 #define DEBUG_PEEP(str,scan,depth) \
2582 DEBUG_OPTIMISE_r({if (scan){ \
2583 SV * const mysv=sv_newmortal(); \
2584 regnode *Next = regnext(scan); \
2585 regprop(RExC_rx, mysv, scan); \
2586 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2587 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2588 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2592 /* The below joins as many adjacent EXACTish nodes as possible into a single
2593 * one. The regop may be changed if the node(s) contain certain sequences that
2594 * require special handling. The joining is only done if:
2595 * 1) there is room in the current conglomerated node to entirely contain the
2597 * 2) they are the exact same node type
2599 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2600 * these get optimized out
2602 * If a node is to match under /i (folded), the number of characters it matches
2603 * can be different than its character length if it contains a multi-character
2604 * fold. *min_subtract is set to the total delta of the input nodes.
2606 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2607 * and contains LATIN SMALL LETTER SHARP S
2609 * This is as good a place as any to discuss the design of handling these
2610 * multi-character fold sequences. It's been wrong in Perl for a very long
2611 * time. There are three code points in Unicode whose multi-character folds
2612 * were long ago discovered to mess things up. The previous designs for
2613 * dealing with these involved assigning a special node for them. This
2614 * approach doesn't work, as evidenced by this example:
2615 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2616 * Both these fold to "sss", but if the pattern is parsed to create a node that
2617 * would match just the \xDF, it won't be able to handle the case where a
2618 * successful match would have to cross the node's boundary. The new approach
2619 * that hopefully generally solves the problem generates an EXACTFU_SS node
2622 * It turns out that there are problems with all multi-character folds, and not
2623 * just these three. Now the code is general, for all such cases, but the
2624 * three still have some special handling. The approach taken is:
2625 * 1) This routine examines each EXACTFish node that could contain multi-
2626 * character fold sequences. It returns in *min_subtract how much to
2627 * subtract from the the actual length of the string to get a real minimum
2628 * match length; it is 0 if there are no multi-char folds. This delta is
2629 * used by the caller to adjust the min length of the match, and the delta
2630 * between min and max, so that the optimizer doesn't reject these
2631 * possibilities based on size constraints.
2632 * 2) Certain of these sequences require special handling by the trie code,
2633 * so, if found, this code changes the joined node type to special ops:
2634 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2635 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2636 * is used for an EXACTFU node that contains at least one "ss" sequence in
2637 * it. For non-UTF-8 patterns and strings, this is the only case where
2638 * there is a possible fold length change. That means that a regular
2639 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2640 * with length changes, and so can be processed faster. regexec.c takes
2641 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2642 * pre-folded by regcomp.c. This saves effort in regex matching.
2643 * However, the pre-folding isn't done for non-UTF8 patterns because the
2644 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2645 * down by forcing the pattern into UTF8 unless necessary. Also what
2646 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2647 * possibilities for the non-UTF8 patterns are quite simple, except for
2648 * the sharp s. All the ones that don't involve a UTF-8 target string are
2649 * members of a fold-pair, and arrays are set up for all of them so that
2650 * the other member of the pair can be found quickly. Code elsewhere in
2651 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2652 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2653 * described in the next item.
2654 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2655 * 'ss' or not is not knowable at compile time. It will match iff the
2656 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2657 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2658 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2659 * described in item 3). An assumption that the optimizer part of
2660 * regexec.c (probably unwittingly) makes is that a character in the
2661 * pattern corresponds to at most a single character in the target string.
2662 * (And I do mean character, and not byte here, unlike other parts of the
2663 * documentation that have never been updated to account for multibyte
2664 * Unicode.) This assumption is wrong only in this case, as all other
2665 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2666 * virtue of having this file pre-fold UTF-8 patterns. I'm
2667 * reluctant to try to change this assumption, so instead the code punts.
2668 * This routine examines EXACTF nodes for the sharp s, and returns a
2669 * boolean indicating whether or not the node is an EXACTF node that
2670 * contains a sharp s. When it is true, the caller sets a flag that later
2671 * causes the optimizer in this file to not set values for the floating
2672 * and fixed string lengths, and thus avoids the optimizer code in
2673 * regexec.c that makes the invalid assumption. Thus, there is no
2674 * optimization based on string lengths for EXACTF nodes that contain the
2675 * sharp s. This only happens for /id rules (which means the pattern
2679 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2680 if (PL_regkind[OP(scan)] == EXACT) \
2681 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2684 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2685 /* Merge several consecutive EXACTish nodes into one. */
2686 regnode *n = regnext(scan);
2688 regnode *next = scan + NODE_SZ_STR(scan);
2692 regnode *stop = scan;
2693 GET_RE_DEBUG_FLAGS_DECL;
2695 PERL_UNUSED_ARG(depth);
2698 PERL_ARGS_ASSERT_JOIN_EXACT;
2699 #ifndef EXPERIMENTAL_INPLACESCAN
2700 PERL_UNUSED_ARG(flags);
2701 PERL_UNUSED_ARG(val);
2703 DEBUG_PEEP("join",scan,depth);
2705 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2706 * EXACT ones that are mergeable to the current one. */
2708 && (PL_regkind[OP(n)] == NOTHING
2709 || (stringok && OP(n) == OP(scan)))
2711 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2714 if (OP(n) == TAIL || n > next)
2716 if (PL_regkind[OP(n)] == NOTHING) {
2717 DEBUG_PEEP("skip:",n,depth);
2718 NEXT_OFF(scan) += NEXT_OFF(n);
2719 next = n + NODE_STEP_REGNODE;
2726 else if (stringok) {
2727 const unsigned int oldl = STR_LEN(scan);
2728 regnode * const nnext = regnext(n);
2730 /* XXX I (khw) kind of doubt that this works on platforms where
2731 * U8_MAX is above 255 because of lots of other assumptions */
2732 if (oldl + STR_LEN(n) > U8_MAX)
2735 DEBUG_PEEP("merg",n,depth);
2738 NEXT_OFF(scan) += NEXT_OFF(n);
2739 STR_LEN(scan) += STR_LEN(n);
2740 next = n + NODE_SZ_STR(n);
2741 /* Now we can overwrite *n : */
2742 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2750 #ifdef EXPERIMENTAL_INPLACESCAN
2751 if (flags && !NEXT_OFF(n)) {
2752 DEBUG_PEEP("atch", val, depth);
2753 if (reg_off_by_arg[OP(n)]) {
2754 ARG_SET(n, val - n);
2757 NEXT_OFF(n) = val - n;
2765 *has_exactf_sharp_s = FALSE;
2767 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2768 * can now analyze for sequences of problematic code points. (Prior to
2769 * this final joining, sequences could have been split over boundaries, and
2770 * hence missed). The sequences only happen in folding, hence for any
2771 * non-EXACT EXACTish node */
2772 if (OP(scan) != EXACT) {
2773 const U8 * const s0 = (U8*) STRING(scan);
2775 const U8 * const s_end = s0 + STR_LEN(scan);
2777 /* One pass is made over the node's string looking for all the
2778 * possibilities. to avoid some tests in the loop, there are two main
2779 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2783 /* Examine the string for a multi-character fold sequence. UTF-8
2784 * patterns have all characters pre-folded by the time this code is
2786 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2787 length sequence we are looking for is 2 */
2790 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2791 if (! len) { /* Not a multi-char fold: get next char */
2796 /* Nodes with 'ss' require special handling, except for EXACTFL
2797 * and EXACTFA for which there is no multi-char fold to this */
2798 if (len == 2 && *s == 's' && *(s+1) == 's'
2799 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2802 OP(scan) = EXACTFU_SS;
2805 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2806 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2807 COMBINING_DIAERESIS_UTF8
2808 COMBINING_ACUTE_ACCENT_UTF8,
2810 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2811 COMBINING_DIAERESIS_UTF8
2812 COMBINING_ACUTE_ACCENT_UTF8,
2817 /* These two folds require special handling by trie's, so
2818 * change the node type to indicate this. If EXACTFA and
2819 * EXACTFL were ever to be handled by trie's, this would
2820 * have to be changed. If this node has already been
2821 * changed to EXACTFU_SS in this loop, leave it as is. (I
2822 * (khw) think it doesn't matter in regexec.c for UTF
2823 * patterns, but no need to change it */
2824 if (OP(scan) == EXACTFU) {
2825 OP(scan) = EXACTFU_TRICKYFOLD;
2829 else { /* Here is a generic multi-char fold. */
2830 const U8* multi_end = s + len;
2832 /* Count how many characters in it. In the case of /l and
2833 * /aa, no folds which contain ASCII code points are
2834 * allowed, so check for those, and skip if found. (In
2835 * EXACTFL, no folds are allowed to any Latin1 code point,
2836 * not just ASCII. But there aren't any of these
2837 * currently, nor ever likely, so don't take the time to
2838 * test for them. The code that generates the
2839 * is_MULTI_foo() macros croaks should one actually get put
2840 * into Unicode .) */
2841 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2842 count = utf8_length(s, multi_end);
2846 while (s < multi_end) {
2849 goto next_iteration;
2859 /* The delta is how long the sequence is minus 1 (1 is how long
2860 * the character that folds to the sequence is) */
2861 *min_subtract += count - 1;
2865 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2867 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2868 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2869 * nodes can't have multi-char folds to this range (and there are
2870 * no existing ones in the upper latin1 range). In the EXACTF
2871 * case we look also for the sharp s, which can be in the final
2872 * position. Otherwise we can stop looking 1 byte earlier because
2873 * have to find at least two characters for a multi-fold */
2874 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2876 /* The below is perhaps overboard, but this allows us to save a
2877 * test each time through the loop at the expense of a mask. This
2878 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2879 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2880 * are 64. This uses an exclusive 'or' to find that bit and then
2881 * inverts it to form a mask, with just a single 0, in the bit
2882 * position where 'S' and 's' differ. */
2883 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2884 const U8 s_masked = 's' & S_or_s_mask;
2887 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2888 if (! len) { /* Not a multi-char fold. */
2889 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2891 *has_exactf_sharp_s = TRUE;
2898 && ((*s & S_or_s_mask) == s_masked)
2899 && ((*(s+1) & S_or_s_mask) == s_masked))
2902 /* EXACTF nodes need to know that the minimum length
2903 * changed so that a sharp s in the string can match this
2904 * ss in the pattern, but they remain EXACTF nodes, as they
2905 * won't match this unless the target string is is UTF-8,
2906 * which we don't know until runtime */
2907 if (OP(scan) != EXACTF) {
2908 OP(scan) = EXACTFU_SS;
2912 *min_subtract += len - 1;
2919 /* Allow dumping but overwriting the collection of skipped
2920 * ops and/or strings with fake optimized ops */
2921 n = scan + NODE_SZ_STR(scan);
2929 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2933 /* REx optimizer. Converts nodes into quicker variants "in place".
2934 Finds fixed substrings. */
2936 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2937 to the position after last scanned or to NULL. */
2939 #define INIT_AND_WITHP \
2940 assert(!and_withp); \
2941 Newx(and_withp,1,struct regnode_charclass_class); \
2942 SAVEFREEPV(and_withp)
2944 /* this is a chain of data about sub patterns we are processing that
2945 need to be handled separately/specially in study_chunk. Its so
2946 we can simulate recursion without losing state. */
2948 typedef struct scan_frame {
2949 regnode *last; /* last node to process in this frame */
2950 regnode *next; /* next node to process when last is reached */
2951 struct scan_frame *prev; /*previous frame*/
2952 I32 stop; /* what stopparen do we use */
2956 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2958 #define CASE_SYNST_FNC(nAmE) \
2960 if (flags & SCF_DO_STCLASS_AND) { \
2961 for (value = 0; value < 256; value++) \
2962 if (!is_ ## nAmE ## _cp(value)) \
2963 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2966 for (value = 0; value < 256; value++) \
2967 if (is_ ## nAmE ## _cp(value)) \
2968 ANYOF_BITMAP_SET(data->start_class, value); \
2972 if (flags & SCF_DO_STCLASS_AND) { \
2973 for (value = 0; value < 256; value++) \
2974 if (is_ ## nAmE ## _cp(value)) \
2975 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2978 for (value = 0; value < 256; value++) \
2979 if (!is_ ## nAmE ## _cp(value)) \
2980 ANYOF_BITMAP_SET(data->start_class, value); \
2987 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2988 I32 *minlenp, I32 *deltap,
2993 struct regnode_charclass_class *and_withp,
2994 U32 flags, U32 depth)
2995 /* scanp: Start here (read-write). */
2996 /* deltap: Write maxlen-minlen here. */
2997 /* last: Stop before this one. */
2998 /* data: string data about the pattern */
2999 /* stopparen: treat close N as END */
3000 /* recursed: which subroutines have we recursed into */
3001 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3004 I32 min = 0; /* There must be at least this number of characters to match */
3006 regnode *scan = *scanp, *next;
3008 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3009 int is_inf_internal = 0; /* The studied chunk is infinite */
3010 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3011 scan_data_t data_fake;
3012 SV *re_trie_maxbuff = NULL;
3013 regnode *first_non_open = scan;
3014 I32 stopmin = I32_MAX;
3015 scan_frame *frame = NULL;
3016 GET_RE_DEBUG_FLAGS_DECL;
3018 PERL_ARGS_ASSERT_STUDY_CHUNK;
3021 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3025 while (first_non_open && OP(first_non_open) == OPEN)
3026 first_non_open=regnext(first_non_open);
3031 while ( scan && OP(scan) != END && scan < last ){
3032 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3033 node length to get a real minimum (because
3034 the folded version may be shorter) */
3035 bool has_exactf_sharp_s = FALSE;
3036 /* Peephole optimizer: */
3037 DEBUG_STUDYDATA("Peep:", data,depth);
3038 DEBUG_PEEP("Peep",scan,depth);
3040 /* Its not clear to khw or hv why this is done here, and not in the
3041 * clauses that deal with EXACT nodes. khw's guess is that it's
3042 * because of a previous design */
3043 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3045 /* Follow the next-chain of the current node and optimize
3046 away all the NOTHINGs from it. */
3047 if (OP(scan) != CURLYX) {
3048 const int max = (reg_off_by_arg[OP(scan)]
3050 /* I32 may be smaller than U16 on CRAYs! */
3051 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3052 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3056 /* Skip NOTHING and LONGJMP. */
3057 while ((n = regnext(n))
3058 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3059 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3060 && off + noff < max)
3062 if (reg_off_by_arg[OP(scan)])
3065 NEXT_OFF(scan) = off;
3070 /* The principal pseudo-switch. Cannot be a switch, since we
3071 look into several different things. */
3072 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3073 || OP(scan) == IFTHEN) {
3074 next = regnext(scan);
3076 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3078 if (OP(next) == code || code == IFTHEN) {
3079 /* NOTE - There is similar code to this block below for handling
3080 TRIE nodes on a re-study. If you change stuff here check there
3082 I32 max1 = 0, min1 = I32_MAX, num = 0;
3083 struct regnode_charclass_class accum;
3084 regnode * const startbranch=scan;
3086 if (flags & SCF_DO_SUBSTR)
3087 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3088 if (flags & SCF_DO_STCLASS)
3089 cl_init_zero(pRExC_state, &accum);
3091 while (OP(scan) == code) {
3092 I32 deltanext, minnext, f = 0, fake;
3093 struct regnode_charclass_class this_class;
3096 data_fake.flags = 0;
3098 data_fake.whilem_c = data->whilem_c;
3099 data_fake.last_closep = data->last_closep;
3102 data_fake.last_closep = &fake;
3104 data_fake.pos_delta = delta;
3105 next = regnext(scan);
3106 scan = NEXTOPER(scan);
3108 scan = NEXTOPER(scan);
3109 if (flags & SCF_DO_STCLASS) {
3110 cl_init(pRExC_state, &this_class);
3111 data_fake.start_class = &this_class;
3112 f = SCF_DO_STCLASS_AND;
3114 if (flags & SCF_WHILEM_VISITED_POS)
3115 f |= SCF_WHILEM_VISITED_POS;
3117 /* we suppose the run is continuous, last=next...*/
3118 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3120 stopparen, recursed, NULL, f,depth+1);
3123 if (max1 < minnext + deltanext)
3124 max1 = minnext + deltanext;
3125 if (deltanext == I32_MAX)
3126 is_inf = is_inf_internal = 1;
3128 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3130 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3131 if ( stopmin > minnext)
3132 stopmin = min + min1;
3133 flags &= ~SCF_DO_SUBSTR;
3135 data->flags |= SCF_SEEN_ACCEPT;
3138 if (data_fake.flags & SF_HAS_EVAL)
3139 data->flags |= SF_HAS_EVAL;
3140 data->whilem_c = data_fake.whilem_c;
3142 if (flags & SCF_DO_STCLASS)
3143 cl_or(pRExC_state, &accum, &this_class);
3145 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3147 if (flags & SCF_DO_SUBSTR) {
3148 data->pos_min += min1;
3149 data->pos_delta += max1 - min1;
3150 if (max1 != min1 || is_inf)
3151 data->longest = &(data->longest_float);
3154 delta += max1 - min1;
3155 if (flags & SCF_DO_STCLASS_OR) {
3156 cl_or(pRExC_state, data->start_class, &accum);
3158 cl_and(data->start_class, and_withp);
3159 flags &= ~SCF_DO_STCLASS;
3162 else if (flags & SCF_DO_STCLASS_AND) {
3164 cl_and(data->start_class, &accum);
3165 flags &= ~SCF_DO_STCLASS;
3168 /* Switch to OR mode: cache the old value of
3169 * data->start_class */
3171 StructCopy(data->start_class, and_withp,
3172 struct regnode_charclass_class);
3173 flags &= ~SCF_DO_STCLASS_AND;
3174 StructCopy(&accum, data->start_class,
3175 struct regnode_charclass_class);
3176 flags |= SCF_DO_STCLASS_OR;
3177 data->start_class->flags |= ANYOF_EOS;
3181 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3184 Assuming this was/is a branch we are dealing with: 'scan' now
3185 points at the item that follows the branch sequence, whatever
3186 it is. We now start at the beginning of the sequence and look
3193 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3195 If we can find such a subsequence we need to turn the first
3196 element into a trie and then add the subsequent branch exact
3197 strings to the trie.
3201 1. patterns where the whole set of branches can be converted.
3203 2. patterns where only a subset can be converted.
3205 In case 1 we can replace the whole set with a single regop
3206 for the trie. In case 2 we need to keep the start and end
3209 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3210 becomes BRANCH TRIE; BRANCH X;
3212 There is an additional case, that being where there is a
3213 common prefix, which gets split out into an EXACT like node
3214 preceding the TRIE node.
3216 If x(1..n)==tail then we can do a simple trie, if not we make
3217 a "jump" trie, such that when we match the appropriate word
3218 we "jump" to the appropriate tail node. Essentially we turn
3219 a nested if into a case structure of sorts.
3224 if (!re_trie_maxbuff) {
3225 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3226 if (!SvIOK(re_trie_maxbuff))
3227 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3229 if ( SvIV(re_trie_maxbuff)>=0 ) {
3231 regnode *first = (regnode *)NULL;
3232 regnode *last = (regnode *)NULL;
3233 regnode *tail = scan;
3238 SV * const mysv = sv_newmortal(); /* for dumping */
3240 /* var tail is used because there may be a TAIL
3241 regop in the way. Ie, the exacts will point to the
3242 thing following the TAIL, but the last branch will
3243 point at the TAIL. So we advance tail. If we
3244 have nested (?:) we may have to move through several
3248 while ( OP( tail ) == TAIL ) {
3249 /* this is the TAIL generated by (?:) */
3250 tail = regnext( tail );
3254 DEBUG_TRIE_COMPILE_r({
3255 regprop(RExC_rx, mysv, tail );
3256 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3257 (int)depth * 2 + 2, "",
3258 "Looking for TRIE'able sequences. Tail node is: ",
3259 SvPV_nolen_const( mysv )
3265 Step through the branches
3266 cur represents each branch,
3267 noper is the first thing to be matched as part of that branch
3268 noper_next is the regnext() of that node.
3270 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3271 via a "jump trie" but we also support building with NOJUMPTRIE,
3272 which restricts the trie logic to structures like /FOO|BAR/.
3274 If noper is a trieable nodetype then the branch is a possible optimization
3275 target. If we are building under NOJUMPTRIE then we require that noper_next
3276 is the same as scan (our current position in the regex program).
3278 Once we have two or more consecutive such branches we can create a
3279 trie of the EXACT's contents and stitch it in place into the program.
3281 If the sequence represents all of the branches in the alternation we
3282 replace the entire thing with a single TRIE node.
3284 Otherwise when it is a subsequence we need to stitch it in place and
3285 replace only the relevant branches. This means the first branch has
3286 to remain as it is used by the alternation logic, and its next pointer,
3287 and needs to be repointed at the item on the branch chain following
3288 the last branch we have optimized away.
3290 This could be either a BRANCH, in which case the subsequence is internal,
3291 or it could be the item following the branch sequence in which case the
3292 subsequence is at the end (which does not necessarily mean the first node
3293 is the start of the alternation).
3295 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3298 ----------------+-----------
3302 EXACTFU_SS | EXACTFU
3303 EXACTFU_TRICKYFOLD | EXACTFU
3308 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3309 ( EXACT == (X) ) ? EXACT : \
3310 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3313 /* dont use tail as the end marker for this traverse */
3314 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3315 regnode * const noper = NEXTOPER( cur );
3316 U8 noper_type = OP( noper );
3317 U8 noper_trietype = TRIE_TYPE( noper_type );
3318 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3319 regnode * const noper_next = regnext( noper );
3320 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3321 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3324 DEBUG_TRIE_COMPILE_r({
3325 regprop(RExC_rx, mysv, cur);
3326 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3327 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3329 regprop(RExC_rx, mysv, noper);
3330 PerlIO_printf( Perl_debug_log, " -> %s",
3331 SvPV_nolen_const(mysv));
3334 regprop(RExC_rx, mysv, noper_next );
3335 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3336 SvPV_nolen_const(mysv));
3338 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3339 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3340 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3344 /* Is noper a trieable nodetype that can be merged with the
3345 * current trie (if there is one)? */
3349 ( noper_trietype == NOTHING)
3350 || ( trietype == NOTHING )
3351 || ( trietype == noper_trietype )
3354 && noper_next == tail
3358 /* Handle mergable triable node
3359 * Either we are the first node in a new trieable sequence,
3360 * in which case we do some bookkeeping, otherwise we update
3361 * the end pointer. */
3364 if ( noper_trietype == NOTHING ) {
3365 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3366 regnode * const noper_next = regnext( noper );
3367 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3368 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3371 if ( noper_next_trietype ) {
3372 trietype = noper_next_trietype;
3373 } else if (noper_next_type) {
3374 /* a NOTHING regop is 1 regop wide. We need at least two
3375 * for a trie so we can't merge this in */
3379 trietype = noper_trietype;
3382 if ( trietype == NOTHING )
3383 trietype = noper_trietype;
3388 } /* end handle mergable triable node */
3390 /* handle unmergable node -
3391 * noper may either be a triable node which can not be tried
3392 * together with the current trie, or a non triable node */
3394 /* If last is set and trietype is not NOTHING then we have found
3395 * at least two triable branch sequences in a row of a similar
3396 * trietype so we can turn them into a trie. If/when we
3397 * allow NOTHING to start a trie sequence this condition will be
3398 * required, and it isn't expensive so we leave it in for now. */
3399 if ( trietype && trietype != NOTHING )
3400 make_trie( pRExC_state,
3401 startbranch, first, cur, tail, count,
3402 trietype, depth+1 );
3403 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3407 && noper_next == tail
3410 /* noper is triable, so we can start a new trie sequence */
3413 trietype = noper_trietype;
3415 /* if we already saw a first but the current node is not triable then we have
3416 * to reset the first information. */
3421 } /* end handle unmergable node */
3422 } /* loop over branches */
3423 DEBUG_TRIE_COMPILE_r({
3424 regprop(RExC_rx, mysv, cur);
3425 PerlIO_printf( Perl_debug_log,
3426 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3427 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3430 if ( last && trietype ) {
3431 if ( trietype != NOTHING ) {
3432 /* the last branch of the sequence was part of a trie,
3433 * so we have to construct it here outside of the loop
3435 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3436 #ifdef TRIE_STUDY_OPT
3437 if ( ((made == MADE_EXACT_TRIE &&
3438 startbranch == first)
3439 || ( first_non_open == first )) &&
3441 flags |= SCF_TRIE_RESTUDY;
3442 if ( startbranch == first
3445 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3450 /* at this point we know whatever we have is a NOTHING sequence/branch
3451 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3453 if ( startbranch == first ) {
3455 /* the entire thing is a NOTHING sequence, something like this:
3456 * (?:|) So we can turn it into a plain NOTHING op. */
3457 DEBUG_TRIE_COMPILE_r({
3458 regprop(RExC_rx, mysv, cur);
3459 PerlIO_printf( Perl_debug_log,
3460 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3461 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3464 OP(startbranch)= NOTHING;
3465 NEXT_OFF(startbranch)= tail - startbranch;
3466 for ( opt= startbranch + 1; opt < tail ; opt++ )
3470 } /* end if ( last) */
3471 } /* TRIE_MAXBUF is non zero */
3476 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3477 scan = NEXTOPER(NEXTOPER(scan));
3478 } else /* single branch is optimized. */
3479 scan = NEXTOPER(scan);
3481 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3482 scan_frame *newframe = NULL;
3487 if (OP(scan) != SUSPEND) {
3488 /* set the pointer */
3489 if (OP(scan) == GOSUB) {
3491 RExC_recurse[ARG2L(scan)] = scan;
3492 start = RExC_open_parens[paren-1];
3493 end = RExC_close_parens[paren-1];
3496 start = RExC_rxi->program + 1;
3500 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3501 SAVEFREEPV(recursed);
3503 if (!PAREN_TEST(recursed,paren+1)) {
3504 PAREN_SET(recursed,paren+1);
3505 Newx(newframe,1,scan_frame);
3507 if (flags & SCF_DO_SUBSTR) {
3508 SCAN_COMMIT(pRExC_state,data,minlenp);
3509 data->longest = &(data->longest_float);
3511 is_inf = is_inf_internal = 1;
3512 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3513 cl_anything(pRExC_state, data->start_class);
3514 flags &= ~SCF_DO_STCLASS;
3517 Newx(newframe,1,scan_frame);
3520 end = regnext(scan);
3525 SAVEFREEPV(newframe);
3526 newframe->next = regnext(scan);
3527 newframe->last = last;
3528 newframe->stop = stopparen;
3529 newframe->prev = frame;
3539 else if (OP(scan) == EXACT) {
3540 I32 l = STR_LEN(scan);
3543 const U8 * const s = (U8*)STRING(scan);
3544 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3545 l = utf8_length(s, s + l);
3547 uc = *((U8*)STRING(scan));
3550 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3551 /* The code below prefers earlier match for fixed
3552 offset, later match for variable offset. */
3553 if (data->last_end == -1) { /* Update the start info. */
3554 data->last_start_min = data->pos_min;
3555 data->last_start_max = is_inf
3556 ? I32_MAX : data->pos_min + data->pos_delta;
3558 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3560 SvUTF8_on(data->last_found);
3562 SV * const sv = data->last_found;
3563 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3564 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3565 if (mg && mg->mg_len >= 0)
3566 mg->mg_len += utf8_length((U8*)STRING(scan),
3567 (U8*)STRING(scan)+STR_LEN(scan));
3569 data->last_end = data->pos_min + l;
3570 data->pos_min += l; /* As in the first entry. */
3571 data->flags &= ~SF_BEFORE_EOL;
3573 if (flags & SCF_DO_STCLASS_AND) {
3574 /* Check whether it is compatible with what we know already! */
3578 /* If compatible, we or it in below. It is compatible if is
3579 * in the bitmp and either 1) its bit or its fold is set, or 2)
3580 * it's for a locale. Even if there isn't unicode semantics
3581 * here, at runtime there may be because of matching against a
3582 * utf8 string, so accept a possible false positive for
3583 * latin1-range folds */
3585 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3586 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3587 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3588 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3593 ANYOF_CLASS_ZERO(data->start_class);
3594 ANYOF_BITMAP_ZERO(data->start_class);
3596 ANYOF_BITMAP_SET(data->start_class, uc);
3597 else if (uc >= 0x100) {
3600 /* Some Unicode code points fold to the Latin1 range; as
3601 * XXX temporary code, instead of figuring out if this is
3602 * one, just assume it is and set all the start class bits
3603 * that could be some such above 255 code point's fold
3604 * which will generate fals positives. As the code
3605 * elsewhere that does compute the fold settles down, it
3606 * can be extracted out and re-used here */
3607 for (i = 0; i < 256; i++){
3608 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3609 ANYOF_BITMAP_SET(data->start_class, i);
3613 data->start_class->flags &= ~ANYOF_EOS;
3615 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3617 else if (flags & SCF_DO_STCLASS_OR) {
3618 /* false positive possible if the class is case-folded */
3620 ANYOF_BITMAP_SET(data->start_class, uc);
3622 data->start_class->flags |= ANYOF_UNICODE_ALL;
3623 data->start_class->flags &= ~ANYOF_EOS;
3624 cl_and(data->start_class, and_withp);
3626 flags &= ~SCF_DO_STCLASS;
3628 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3629 I32 l = STR_LEN(scan);
3630 UV uc = *((U8*)STRING(scan));
3632 /* Search for fixed substrings supports EXACT only. */
3633 if (flags & SCF_DO_SUBSTR) {
3635 SCAN_COMMIT(pRExC_state, data, minlenp);
3638 const U8 * const s = (U8 *)STRING(scan);
3639 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3640 l = utf8_length(s, s + l);
3642 if (has_exactf_sharp_s) {
3643 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3645 min += l - min_subtract;
3647 delta += min_subtract;
3648 if (flags & SCF_DO_SUBSTR) {
3649 data->pos_min += l - min_subtract;
3650 if (data->pos_min < 0) {
3653 data->pos_delta += min_subtract;
3655 data->longest = &(data->longest_float);
3658 if (flags & SCF_DO_STCLASS_AND) {
3659 /* Check whether it is compatible with what we know already! */
3662 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3663 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3664 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3668 ANYOF_CLASS_ZERO(data->start_class);
3669 ANYOF_BITMAP_ZERO(data->start_class);
3671 ANYOF_BITMAP_SET(data->start_class, uc);
3672 data->start_class->flags &= ~ANYOF_EOS;
3673 if (OP(scan) == EXACTFL) {
3674 /* XXX This set is probably no longer necessary, and
3675 * probably wrong as LOCALE now is on in the initial
3677 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3681 /* Also set the other member of the fold pair. In case
3682 * that unicode semantics is called for at runtime, use
3683 * the full latin1 fold. (Can't do this for locale,
3684 * because not known until runtime) */
3685 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3687 /* All other (EXACTFL handled above) folds except under
3688 * /iaa that include s, S, and sharp_s also may include
3690 if (OP(scan) != EXACTFA) {
3691 if (uc == 's' || uc == 'S') {
3692 ANYOF_BITMAP_SET(data->start_class,
3693 LATIN_SMALL_LETTER_SHARP_S);
3695 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3696 ANYOF_BITMAP_SET(data->start_class, 's');
3697 ANYOF_BITMAP_SET(data->start_class, 'S');
3702 else if (uc >= 0x100) {
3704 for (i = 0; i < 256; i++){
3705 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3706 ANYOF_BITMAP_SET(data->start_class, i);
3711 else if (flags & SCF_DO_STCLASS_OR) {
3712 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3713 /* false positive possible if the class is case-folded.
3714 Assume that the locale settings are the same... */
3716 ANYOF_BITMAP_SET(data->start_class, uc);
3717 if (OP(scan) != EXACTFL) {
3719 /* And set the other member of the fold pair, but
3720 * can't do that in locale because not known until
3722 ANYOF_BITMAP_SET(data->start_class,
3723 PL_fold_latin1[uc]);
3725 /* All folds except under /iaa that include s, S,
3726 * and sharp_s also may include the others */
3727 if (OP(scan) != EXACTFA) {
3728 if (uc == 's' || uc == 'S') {
3729 ANYOF_BITMAP_SET(data->start_class,
3730 LATIN_SMALL_LETTER_SHARP_S);
3732 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3733 ANYOF_BITMAP_SET(data->start_class, 's');
3734 ANYOF_BITMAP_SET(data->start_class, 'S');
3739 data->start_class->flags &= ~ANYOF_EOS;
3741 cl_and(data->start_class, and_withp);
3743 flags &= ~SCF_DO_STCLASS;
3745 else if (REGNODE_VARIES(OP(scan))) {
3746 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3747 I32 f = flags, pos_before = 0;
3748 regnode * const oscan = scan;
3749 struct regnode_charclass_class this_class;
3750 struct regnode_charclass_class *oclass = NULL;
3751 I32 next_is_eval = 0;
3753 switch (PL_regkind[OP(scan)]) {
3754 case WHILEM: /* End of (?:...)* . */
3755 scan = NEXTOPER(scan);
3758 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3759 next = NEXTOPER(scan);
3760 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3762 maxcount = REG_INFTY;
3763 next = regnext(scan);
3764 scan = NEXTOPER(scan);
3768 if (flags & SCF_DO_SUBSTR)
3773 if (flags & SCF_DO_STCLASS) {
3775 maxcount = REG_INFTY;
3776 next = regnext(scan);
3777 scan = NEXTOPER(scan);
3780 is_inf = is_inf_internal = 1;
3781 scan = regnext(scan);
3782 if (flags & SCF_DO_SUBSTR) {
3783 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3784 data->longest = &(data->longest_float);
3786 goto optimize_curly_tail;
3788 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3789 && (scan->flags == stopparen))
3794 mincount = ARG1(scan);
3795 maxcount = ARG2(scan);
3797 next = regnext(scan);
3798 if (OP(scan) == CURLYX) {
3799 I32 lp = (data ? *(data->last_closep) : 0);
3800 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3802 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3803 next_is_eval = (OP(scan) == EVAL);
3805 if (flags & SCF_DO_SUBSTR) {
3806 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3807 pos_before = data->pos_min;
3811 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3813 data->flags |= SF_IS_INF;
3815 if (flags & SCF_DO_STCLASS) {
3816 cl_init(pRExC_state, &this_class);
3817 oclass = data->start_class;
3818 data->start_class = &this_class;
3819 f |= SCF_DO_STCLASS_AND;
3820 f &= ~SCF_DO_STCLASS_OR;
3822 /* Exclude from super-linear cache processing any {n,m}
3823 regops for which the combination of input pos and regex
3824 pos is not enough information to determine if a match
3827 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3828 regex pos at the \s*, the prospects for a match depend not
3829 only on the input position but also on how many (bar\s*)
3830 repeats into the {4,8} we are. */
3831 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3832 f &= ~SCF_WHILEM_VISITED_POS;
3834 /* This will finish on WHILEM, setting scan, or on NULL: */
3835 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3836 last, data, stopparen, recursed, NULL,
3838 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3840 if (flags & SCF_DO_STCLASS)
3841 data->start_class = oclass;
3842 if (mincount == 0 || minnext == 0) {
3843 if (flags & SCF_DO_STCLASS_OR) {
3844 cl_or(pRExC_state, data->start_class, &this_class);
3846 else if (flags & SCF_DO_STCLASS_AND) {
3847 /* Switch to OR mode: cache the old value of
3848 * data->start_class */
3850 StructCopy(data->start_class, and_withp,
3851 struct regnode_charclass_class);
3852 flags &= ~SCF_DO_STCLASS_AND;
3853 StructCopy(&this_class, data->start_class,
3854 struct regnode_charclass_class);
3855 flags |= SCF_DO_STCLASS_OR;
3856 data->start_class->flags |= ANYOF_EOS;
3858 } else { /* Non-zero len */
3859 if (flags & SCF_DO_STCLASS_OR) {
3860 cl_or(pRExC_state, data->start_class, &this_class);
3861 cl_and(data->start_class, and_withp);
3863 else if (flags & SCF_DO_STCLASS_AND)
3864 cl_and(data->start_class, &this_class);
3865 flags &= ~SCF_DO_STCLASS;
3867 if (!scan) /* It was not CURLYX, but CURLY. */
3869 if ( /* ? quantifier ok, except for (?{ ... }) */
3870 (next_is_eval || !(mincount == 0 && maxcount == 1))
3871 && (minnext == 0) && (deltanext == 0)
3872 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3873 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3875 ckWARNreg(RExC_parse,
3876 "Quantifier unexpected on zero-length expression");
3879 min += minnext * mincount;
3880 is_inf_internal |= ((maxcount == REG_INFTY
3881 && (minnext + deltanext) > 0)
3882 || deltanext == I32_MAX);
3883 is_inf |= is_inf_internal;
3884 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3886 /* Try powerful optimization CURLYX => CURLYN. */
3887 if ( OP(oscan) == CURLYX && data
3888 && data->flags & SF_IN_PAR
3889 && !(data->flags & SF_HAS_EVAL)
3890 && !deltanext && minnext == 1 ) {
3891 /* Try to optimize to CURLYN. */
3892 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3893 regnode * const nxt1 = nxt;
3900 if (!REGNODE_SIMPLE(OP(nxt))
3901 && !(PL_regkind[OP(nxt)] == EXACT
3902 && STR_LEN(nxt) == 1))
3908 if (OP(nxt) != CLOSE)
3910 if (RExC_open_parens) {
3911 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3912 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3914 /* Now we know that nxt2 is the only contents: */
3915 oscan->flags = (U8)ARG(nxt);
3917 OP(nxt1) = NOTHING; /* was OPEN. */
3920 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3921 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3922 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3923 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3924 OP(nxt + 1) = OPTIMIZED; /* was count. */
3925 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3930 /* Try optimization CURLYX => CURLYM. */
3931 if ( OP(oscan) == CURLYX && data
3932 && !(data->flags & SF_HAS_PAR)
3933 && !(data->flags & SF_HAS_EVAL)
3934 && !deltanext /* atom is fixed width */
3935 && minnext != 0 /* CURLYM can't handle zero width */
3936 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3938 /* XXXX How to optimize if data == 0? */
3939 /* Optimize to a simpler form. */
3940 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3944 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3945 && (OP(nxt2) != WHILEM))
3947 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3948 /* Need to optimize away parenths. */
3949 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3950 /* Set the parenth number. */
3951 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3953 oscan->flags = (U8)ARG(nxt);
3954 if (RExC_open_parens) {
3955 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3956 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3958 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3959 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3962 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3963 OP(nxt + 1) = OPTIMIZED; /* was count. */
3964 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3965 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3968 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3969 regnode *nnxt = regnext(nxt1);
3971 if (reg_off_by_arg[OP(nxt1)])
3972 ARG_SET(nxt1, nxt2 - nxt1);
3973 else if (nxt2 - nxt1 < U16_MAX)
3974 NEXT_OFF(nxt1) = nxt2 - nxt1;
3976 OP(nxt) = NOTHING; /* Cannot beautify */
3981 /* Optimize again: */
3982 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3983 NULL, stopparen, recursed, NULL, 0,depth+1);
3988 else if ((OP(oscan) == CURLYX)
3989 && (flags & SCF_WHILEM_VISITED_POS)
3990 /* See the comment on a similar expression above.
3991 However, this time it's not a subexpression
3992 we care about, but the expression itself. */
3993 && (maxcount == REG_INFTY)
3994 && data && ++data->whilem_c < 16) {
3995 /* This stays as CURLYX, we can put the count/of pair. */
3996 /* Find WHILEM (as in regexec.c) */
3997 regnode *nxt = oscan + NEXT_OFF(oscan);
3999 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4001 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4002 | (RExC_whilem_seen << 4)); /* On WHILEM */
4004 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4006 if (flags & SCF_DO_SUBSTR) {
4007 SV *last_str = NULL;
4008 int counted = mincount != 0;
4010 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4011 #if defined(SPARC64_GCC_WORKAROUND)
4014 const char *s = NULL;
4017 if (pos_before >= data->last_start_min)
4020 b = data->last_start_min;
4023 s = SvPV_const(data->last_found, l);
4024 old = b - data->last_start_min;
4027 I32 b = pos_before >= data->last_start_min
4028 ? pos_before : data->last_start_min;
4030 const char * const s = SvPV_const(data->last_found, l);
4031 I32 old = b - data->last_start_min;
4035 old = utf8_hop((U8*)s, old) - (U8*)s;
4037 /* Get the added string: */
4038 last_str = newSVpvn_utf8(s + old, l, UTF);
4039 if (deltanext == 0 && pos_before == b) {
4040 /* What was added is a constant string */
4042 SvGROW(last_str, (mincount * l) + 1);
4043 repeatcpy(SvPVX(last_str) + l,
4044 SvPVX_const(last_str), l, mincount - 1);
4045 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4046 /* Add additional parts. */
4047 SvCUR_set(data->last_found,
4048 SvCUR(data->last_found) - l);
4049 sv_catsv(data->last_found, last_str);
4051 SV * sv = data->last_found;
4053 SvUTF8(sv) && SvMAGICAL(sv) ?
4054 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4055 if (mg && mg->mg_len >= 0)
4056 mg->mg_len += CHR_SVLEN(last_str) - l;
4058 data->last_end += l * (mincount - 1);
4061 /* start offset must point into the last copy */
4062 data->last_start_min += minnext * (mincount - 1);
4063 data->last_start_max += is_inf ? I32_MAX
4064 : (maxcount - 1) * (minnext + data->pos_delta);
4067 /* It is counted once already... */
4068 data->pos_min += minnext * (mincount - counted);
4069 data->pos_delta += - counted * deltanext +
4070 (minnext + deltanext) * maxcount - minnext * mincount;
4071 if (mincount != maxcount) {
4072 /* Cannot extend fixed substrings found inside
4074 SCAN_COMMIT(pRExC_state,data,minlenp);
4075 if (mincount && last_str) {
4076 SV * const sv = data->last_found;
4077 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4078 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4082 sv_setsv(sv, last_str);
4083 data->last_end = data->pos_min;
4084 data->last_start_min =
4085 data->pos_min - CHR_SVLEN(last_str);
4086 data->last_start_max = is_inf
4088 : data->pos_min + data->pos_delta
4089 - CHR_SVLEN(last_str);
4091 data->longest = &(data->longest_float);
4093 SvREFCNT_dec(last_str);
4095 if (data && (fl & SF_HAS_EVAL))
4096 data->flags |= SF_HAS_EVAL;
4097 optimize_curly_tail:
4098 if (OP(oscan) != CURLYX) {
4099 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4101 NEXT_OFF(oscan) += NEXT_OFF(next);
4104 default: /* REF, ANYOFV, and CLUMP only? */
4105 if (flags & SCF_DO_SUBSTR) {
4106 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4107 data->longest = &(data->longest_float);
4109 is_inf = is_inf_internal = 1;
4110 if (flags & SCF_DO_STCLASS_OR)
4111 cl_anything(pRExC_state, data->start_class);
4112 flags &= ~SCF_DO_STCLASS;
4116 else if (OP(scan) == LNBREAK) {
4117 if (flags & SCF_DO_STCLASS) {
4119 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4120 if (flags & SCF_DO_STCLASS_AND) {
4121 for (value = 0; value < 256; value++)
4122 if (!is_VERTWS_cp(value))
4123 ANYOF_BITMAP_CLEAR(data->start_class, value);
4126 for (value = 0; value < 256; value++)
4127 if (is_VERTWS_cp(value))
4128 ANYOF_BITMAP_SET(data->start_class, value);
4130 if (flags & SCF_DO_STCLASS_OR)
4131 cl_and(data->start_class, and_withp);
4132 flags &= ~SCF_DO_STCLASS;
4135 delta++; /* Because of the 2 char string cr-lf */
4136 if (flags & SCF_DO_SUBSTR) {
4137 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4139 data->pos_delta += 1;
4140 data->longest = &(data->longest_float);
4143 else if (REGNODE_SIMPLE(OP(scan))) {
4146 if (flags & SCF_DO_SUBSTR) {
4147 SCAN_COMMIT(pRExC_state,data,minlenp);
4151 if (flags & SCF_DO_STCLASS) {
4152 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4154 /* Some of the logic below assumes that switching
4155 locale on will only add false positives. */
4156 switch (PL_regkind[OP(scan)]) {
4160 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4161 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4162 cl_anything(pRExC_state, data->start_class);
4165 if (OP(scan) == SANY)
4167 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4168 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4169 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4170 cl_anything(pRExC_state, data->start_class);
4172 if (flags & SCF_DO_STCLASS_AND || !value)
4173 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4176 if (flags & SCF_DO_STCLASS_AND)
4177 cl_and(data->start_class,
4178 (struct regnode_charclass_class*)scan);
4180 cl_or(pRExC_state, data->start_class,
4181 (struct regnode_charclass_class*)scan);
4184 if (flags & SCF_DO_STCLASS_AND) {
4185 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4186 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4187 if (OP(scan) == ALNUMU) {
4188 for (value = 0; value < 256; value++) {
4189 if (!isWORDCHAR_L1(value)) {
4190 ANYOF_BITMAP_CLEAR(data->start_class, value);
4194 for (value = 0; value < 256; value++) {
4195 if (!isALNUM(value)) {
4196 ANYOF_BITMAP_CLEAR(data->start_class, value);
4203 if (data->start_class->flags & ANYOF_LOCALE)
4204 ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4206 /* Even if under locale, set the bits for non-locale
4207 * in case it isn't a true locale-node. This will
4208 * create false positives if it truly is locale */
4209 if (OP(scan) == ALNUMU) {
4210 for (value = 0; value < 256; value++) {
4211 if (isWORDCHAR_L1(value)) {
4212 ANYOF_BITMAP_SET(data->start_class, value);
4216 for (value = 0; value < 256; value++) {
4217 if (isALNUM(value)) {
4218 ANYOF_BITMAP_SET(data->start_class, value);
4225 if (flags & SCF_DO_STCLASS_AND) {
4226 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4227 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4228 if (OP(scan) == NALNUMU) {
4229 for (value = 0; value < 256; value++) {
4230 if (isWORDCHAR_L1(value)) {
4231 ANYOF_BITMAP_CLEAR(data->start_class, value);
4235 for (value = 0; value < 256; value++) {
4236 if (isALNUM(value)) {
4237 ANYOF_BITMAP_CLEAR(data->start_class, value);
4244 if (data->start_class->flags & ANYOF_LOCALE)
4245 ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4247 /* Even if under locale, set the bits for non-locale in
4248 * case it isn't a true locale-node. This will create
4249 * false positives if it truly is locale */
4250 if (OP(scan) == NALNUMU) {
4251 for (value = 0; value < 256; value++) {
4252 if (! isWORDCHAR_L1(value)) {
4253 ANYOF_BITMAP_SET(data->start_class, value);
4257 for (value = 0; value < 256; value++) {
4258 if (! isALNUM(value)) {
4259 ANYOF_BITMAP_SET(data->start_class, value);
4266 if (flags & SCF_DO_STCLASS_AND) {
4267 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4268 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4269 if (OP(scan) == SPACEU) {
4270 for (value = 0; value < 256; value++) {
4271 if (!isSPACE_L1(value)) {
4272 ANYOF_BITMAP_CLEAR(data->start_class, value);
4276 for (value = 0; value < 256; value++) {
4277 if (!isSPACE(value)) {
4278 ANYOF_BITMAP_CLEAR(data->start_class, value);
4285 if (data->start_class->flags & ANYOF_LOCALE) {
4286 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4288 if (OP(scan) == SPACEU) {
4289 for (value = 0; value < 256; value++) {
4290 if (isSPACE_L1(value)) {
4291 ANYOF_BITMAP_SET(data->start_class, value);
4295 for (value = 0; value < 256; value++) {
4296 if (isSPACE(value)) {
4297 ANYOF_BITMAP_SET(data->start_class, value);
4304 if (flags & SCF_DO_STCLASS_AND) {
4305 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4306 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4307 if (OP(scan) == NSPACEU) {
4308 for (value = 0; value < 256; value++) {
4309 if (isSPACE_L1(value)) {
4310 ANYOF_BITMAP_CLEAR(data->start_class, value);
4314 for (value = 0; value < 256; value++) {
4315 if (isSPACE(value)) {
4316 ANYOF_BITMAP_CLEAR(data->start_class, value);
4323 if (data->start_class->flags & ANYOF_LOCALE)
4324 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4325 if (OP(scan) == NSPACEU) {
4326 for (value = 0; value < 256; value++) {
4327 if (!isSPACE_L1(value)) {
4328 ANYOF_BITMAP_SET(data->start_class, value);
4333 for (value = 0; value < 256; value++) {
4334 if (!isSPACE(value)) {
4335 ANYOF_BITMAP_SET(data->start_class, value);
4342 if (flags & SCF_DO_STCLASS_AND) {
4343 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4344 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4345 for (value = 0; value < 256; value++)
4346 if (!isDIGIT(value))
4347 ANYOF_BITMAP_CLEAR(data->start_class, value);
4351 if (data->start_class->flags & ANYOF_LOCALE)
4352 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4353 for (value = 0; value < 256; value++)
4355 ANYOF_BITMAP_SET(data->start_class, value);
4359 if (flags & SCF_DO_STCLASS_AND) {
4360 if (!(data->start_class->flags & ANYOF_LOCALE))
4361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4362 for (value = 0; value < 256; value++)
4364 ANYOF_BITMAP_CLEAR(data->start_class, value);
4367 if (data->start_class->flags & ANYOF_LOCALE)
4368 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4369 for (value = 0; value < 256; value++)
4370 if (!isDIGIT(value))
4371 ANYOF_BITMAP_SET(data->start_class, value);
4374 CASE_SYNST_FNC(VERTWS);
4375 CASE_SYNST_FNC(HORIZWS);
4378 if (flags & SCF_DO_STCLASS_OR)
4379 cl_and(data->start_class, and_withp);
4380 flags &= ~SCF_DO_STCLASS;
4383 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4384 data->flags |= (OP(scan) == MEOL
4387 SCAN_COMMIT(pRExC_state, data, minlenp);
4390 else if ( PL_regkind[OP(scan)] == BRANCHJ
4391 /* Lookbehind, or need to calculate parens/evals/stclass: */
4392 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4393 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4394 if ( OP(scan) == UNLESSM &&
4396 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4397 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4400 regnode *upto= regnext(scan);
4402 SV * const mysv_val=sv_newmortal();
4403 DEBUG_STUDYDATA("OPFAIL",data,depth);
4405 /*DEBUG_PARSE_MSG("opfail");*/
4406 regprop(RExC_rx, mysv_val, upto);
4407 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4408 SvPV_nolen_const(mysv_val),
4409 (IV)REG_NODE_NUM(upto),
4414 NEXT_OFF(scan) = upto - scan;
4415 for (opt= scan + 1; opt < upto ; opt++)
4416 OP(opt) = OPTIMIZED;
4420 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4421 || OP(scan) == UNLESSM )
4423 /* Negative Lookahead/lookbehind
4424 In this case we can't do fixed string optimisation.
4427 I32 deltanext, minnext, fake = 0;
4429 struct regnode_charclass_class intrnl;
4432 data_fake.flags = 0;
4434 data_fake.whilem_c = data->whilem_c;
4435 data_fake.last_closep = data->last_closep;
4438 data_fake.last_closep = &fake;
4439 data_fake.pos_delta = delta;
4440 if ( flags & SCF_DO_STCLASS && !scan->flags
4441 && OP(scan) == IFMATCH ) { /* Lookahead */
4442 cl_init(pRExC_state, &intrnl);
4443 data_fake.start_class = &intrnl;
4444 f |= SCF_DO_STCLASS_AND;
4446 if (flags & SCF_WHILEM_VISITED_POS)
4447 f |= SCF_WHILEM_VISITED_POS;
4448 next = regnext(scan);
4449 nscan = NEXTOPER(NEXTOPER(scan));
4450 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4451 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4454 FAIL("Variable length lookbehind not implemented");
4456 else if (minnext > (I32)U8_MAX) {
4457 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4459 scan->flags = (U8)minnext;
4462 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4464 if (data_fake.flags & SF_HAS_EVAL)
4465 data->flags |= SF_HAS_EVAL;
4466 data->whilem_c = data_fake.whilem_c;
4468 if (f & SCF_DO_STCLASS_AND) {
4469 if (flags & SCF_DO_STCLASS_OR) {
4470 /* OR before, AND after: ideally we would recurse with
4471 * data_fake to get the AND applied by study of the
4472 * remainder of the pattern, and then derecurse;
4473 * *** HACK *** for now just treat as "no information".
4474 * See [perl #56690].
4476 cl_init(pRExC_state, data->start_class);
4478 /* AND before and after: combine and continue */
4479 const int was = (data->start_class->flags & ANYOF_EOS);
4481 cl_and(data->start_class, &intrnl);
4483 data->start_class->flags |= ANYOF_EOS;
4487 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4489 /* Positive Lookahead/lookbehind
4490 In this case we can do fixed string optimisation,
4491 but we must be careful about it. Note in the case of
4492 lookbehind the positions will be offset by the minimum
4493 length of the pattern, something we won't know about
4494 until after the recurse.
4496 I32 deltanext, fake = 0;
4498 struct regnode_charclass_class intrnl;
4500 /* We use SAVEFREEPV so that when the full compile
4501 is finished perl will clean up the allocated
4502 minlens when it's all done. This way we don't
4503 have to worry about freeing them when we know
4504 they wont be used, which would be a pain.
4507 Newx( minnextp, 1, I32 );
4508 SAVEFREEPV(minnextp);
4511 StructCopy(data, &data_fake, scan_data_t);
4512 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4515 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4516 data_fake.last_found=newSVsv(data->last_found);
4520 data_fake.last_closep = &fake;
4521 data_fake.flags = 0;
4522 data_fake.pos_delta = delta;
4524 data_fake.flags |= SF_IS_INF;
4525 if ( flags & SCF_DO_STCLASS && !scan->flags
4526 && OP(scan) == IFMATCH ) { /* Lookahead */
4527 cl_init(pRExC_state, &intrnl);
4528 data_fake.start_class = &intrnl;
4529 f |= SCF_DO_STCLASS_AND;
4531 if (flags & SCF_WHILEM_VISITED_POS)
4532 f |= SCF_WHILEM_VISITED_POS;
4533 next = regnext(scan);
4534 nscan = NEXTOPER(NEXTOPER(scan));
4536 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4537 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4540 FAIL("Variable length lookbehind not implemented");
4542 else if (*minnextp > (I32)U8_MAX) {
4543 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4545 scan->flags = (U8)*minnextp;
4550 if (f & SCF_DO_STCLASS_AND) {
4551 const int was = (data->start_class->flags & ANYOF_EOS);
4553 cl_and(data->start_class, &intrnl);
4555 data->start_class->flags |= ANYOF_EOS;
4558 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4560 if (data_fake.flags & SF_HAS_EVAL)
4561 data->flags |= SF_HAS_EVAL;
4562 data->whilem_c = data_fake.whilem_c;
4563 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4564 if (RExC_rx->minlen<*minnextp)
4565 RExC_rx->minlen=*minnextp;
4566 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4567 SvREFCNT_dec(data_fake.last_found);
4569 if ( data_fake.minlen_fixed != minlenp )
4571 data->offset_fixed= data_fake.offset_fixed;
4572 data->minlen_fixed= data_fake.minlen_fixed;
4573 data->lookbehind_fixed+= scan->flags;
4575 if ( data_fake.minlen_float != minlenp )
4577 data->minlen_float= data_fake.minlen_float;
4578 data->offset_float_min=data_fake.offset_float_min;
4579 data->offset_float_max=data_fake.offset_float_max;
4580 data->lookbehind_float+= scan->flags;
4587 else if (OP(scan) == OPEN) {
4588 if (stopparen != (I32)ARG(scan))
4591 else if (OP(scan) == CLOSE) {
4592 if (stopparen == (I32)ARG(scan)) {
4595 if ((I32)ARG(scan) == is_par) {
4596 next = regnext(scan);
4598 if ( next && (OP(next) != WHILEM) && next < last)
4599 is_par = 0; /* Disable optimization */
4602 *(data->last_closep) = ARG(scan);
4604 else if (OP(scan) == EVAL) {
4606 data->flags |= SF_HAS_EVAL;
4608 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4609 if (flags & SCF_DO_SUBSTR) {
4610 SCAN_COMMIT(pRExC_state,data,minlenp);
4611 flags &= ~SCF_DO_SUBSTR;
4613 if (data && OP(scan)==ACCEPT) {
4614 data->flags |= SCF_SEEN_ACCEPT;
4619 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4621 if (flags & SCF_DO_SUBSTR) {
4622 SCAN_COMMIT(pRExC_state,data,minlenp);
4623 data->longest = &(data->longest_float);
4625 is_inf = is_inf_internal = 1;
4626 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4627 cl_anything(pRExC_state, data->start_class);
4628 flags &= ~SCF_DO_STCLASS;
4630 else if (OP(scan) == GPOS) {
4631 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4632 !(delta || is_inf || (data && data->pos_delta)))
4634 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4635 RExC_rx->extflags |= RXf_ANCH_GPOS;
4636 if (RExC_rx->gofs < (U32)min)
4637 RExC_rx->gofs = min;
4639 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4643 #ifdef TRIE_STUDY_OPT
4644 #ifdef FULL_TRIE_STUDY
4645 else if (PL_regkind[OP(scan)] == TRIE) {
4646 /* NOTE - There is similar code to this block above for handling
4647 BRANCH nodes on the initial study. If you change stuff here
4649 regnode *trie_node= scan;
4650 regnode *tail= regnext(scan);
4651 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4652 I32 max1 = 0, min1 = I32_MAX;
4653 struct regnode_charclass_class accum;
4655 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4656 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4657 if (flags & SCF_DO_STCLASS)
4658 cl_init_zero(pRExC_state, &accum);
4664 const regnode *nextbranch= NULL;
4667 for ( word=1 ; word <= trie->wordcount ; word++)
4669 I32 deltanext=0, minnext=0, f = 0, fake;
4670 struct regnode_charclass_class this_class;
4672 data_fake.flags = 0;
4674 data_fake.whilem_c = data->whilem_c;
4675 data_fake.last_closep = data->last_closep;
4678 data_fake.last_closep = &fake;
4679 data_fake.pos_delta = delta;
4680 if (flags & SCF_DO_STCLASS) {
4681 cl_init(pRExC_state, &this_class);
4682 data_fake.start_class = &this_class;
4683 f = SCF_DO_STCLASS_AND;
4685 if (flags & SCF_WHILEM_VISITED_POS)
4686 f |= SCF_WHILEM_VISITED_POS;
4688 if (trie->jump[word]) {
4690 nextbranch = trie_node + trie->jump[0];
4691 scan= trie_node + trie->jump[word];
4692 /* We go from the jump point to the branch that follows
4693 it. Note this means we need the vestigal unused branches
4694 even though they arent otherwise used.
4696 minnext = study_chunk(pRExC_state, &scan, minlenp,
4697 &deltanext, (regnode *)nextbranch, &data_fake,
4698 stopparen, recursed, NULL, f,depth+1);
4700 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4701 nextbranch= regnext((regnode*)nextbranch);
4703 if (min1 > (I32)(minnext + trie->minlen))
4704 min1 = minnext + trie->minlen;
4705 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4706 max1 = minnext + deltanext + trie->maxlen;
4707 if (deltanext == I32_MAX)
4708 is_inf = is_inf_internal = 1;
4710 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4712 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4713 if ( stopmin > min + min1)
4714 stopmin = min + min1;
4715 flags &= ~SCF_DO_SUBSTR;
4717 data->flags |= SCF_SEEN_ACCEPT;
4720 if (data_fake.flags & SF_HAS_EVAL)
4721 data->flags |= SF_HAS_EVAL;
4722 data->whilem_c = data_fake.whilem_c;
4724 if (flags & SCF_DO_STCLASS)
4725 cl_or(pRExC_state, &accum, &this_class);
4728 if (flags & SCF_DO_SUBSTR) {
4729 data->pos_min += min1;
4730 data->pos_delta += max1 - min1;
4731 if (max1 != min1 || is_inf)
4732 data->longest = &(data->longest_float);
4735 delta += max1 - min1;
4736 if (flags & SCF_DO_STCLASS_OR) {
4737 cl_or(pRExC_state, data->start_class, &accum);
4739 cl_and(data->start_class, and_withp);
4740 flags &= ~SCF_DO_STCLASS;
4743 else if (flags & SCF_DO_STCLASS_AND) {
4745 cl_and(data->start_class, &accum);
4746 flags &= ~SCF_DO_STCLASS;
4749 /* Switch to OR mode: cache the old value of
4750 * data->start_class */
4752 StructCopy(data->start_class, and_withp,
4753 struct regnode_charclass_class);
4754 flags &= ~SCF_DO_STCLASS_AND;
4755 StructCopy(&accum, data->start_class,
4756 struct regnode_charclass_class);
4757 flags |= SCF_DO_STCLASS_OR;
4758 data->start_class->flags |= ANYOF_EOS;
4765 else if (PL_regkind[OP(scan)] == TRIE) {
4766 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4769 min += trie->minlen;
4770 delta += (trie->maxlen - trie->minlen);
4771 flags &= ~SCF_DO_STCLASS; /* xxx */
4772 if (flags & SCF_DO_SUBSTR) {
4773 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4774 data->pos_min += trie->minlen;
4775 data->pos_delta += (trie->maxlen - trie->minlen);
4776 if (trie->maxlen != trie->minlen)
4777 data->longest = &(data->longest_float);
4779 if (trie->jump) /* no more substrings -- for now /grr*/
4780 flags &= ~SCF_DO_SUBSTR;
4782 #endif /* old or new */
4783 #endif /* TRIE_STUDY_OPT */
4785 /* Else: zero-length, ignore. */
4786 scan = regnext(scan);
4791 stopparen = frame->stop;
4792 frame = frame->prev;
4793 goto fake_study_recurse;
4798 DEBUG_STUDYDATA("pre-fin:",data,depth);
4801 *deltap = is_inf_internal ? I32_MAX : delta;
4802 if (flags & SCF_DO_SUBSTR && is_inf)
4803 data->pos_delta = I32_MAX - data->pos_min;
4804 if (is_par > (I32)U8_MAX)
4806 if (is_par && pars==1 && data) {
4807 data->flags |= SF_IN_PAR;
4808 data->flags &= ~SF_HAS_PAR;
4810 else if (pars && data) {
4811 data->flags |= SF_HAS_PAR;
4812 data->flags &= ~SF_IN_PAR;
4814 if (flags & SCF_DO_STCLASS_OR)
4815 cl_and(data->start_class, and_withp);
4816 if (flags & SCF_TRIE_RESTUDY)
4817 data->flags |= SCF_TRIE_RESTUDY;
4819 DEBUG_STUDYDATA("post-fin:",data,depth);
4821 return min < stopmin ? min : stopmin;
4825 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4827 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4829 PERL_ARGS_ASSERT_ADD_DATA;
4831 Renewc(RExC_rxi->data,
4832 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4833 char, struct reg_data);
4835 Renew(RExC_rxi->data->what, count + n, U8);
4837 Newx(RExC_rxi->data->what, n, U8);
4838 RExC_rxi->data->count = count + n;
4839 Copy(s, RExC_rxi->data->what + count, n, U8);
4843 /*XXX: todo make this not included in a non debugging perl */
4844 #ifndef PERL_IN_XSUB_RE
4846 Perl_reginitcolors(pTHX)
4849 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4851 char *t = savepv(s);
4855 t = strchr(t, '\t');
4861 PL_colors[i] = t = (char *)"";
4866 PL_colors[i++] = (char *)"";
4873 #ifdef TRIE_STUDY_OPT
4874 #define CHECK_RESTUDY_GOTO \
4876 (data.flags & SCF_TRIE_RESTUDY) \
4880 #define CHECK_RESTUDY_GOTO
4884 * pregcomp - compile a regular expression into internal code
4886 * Decides which engine's compiler to call based on the hint currently in
4890 #ifndef PERL_IN_XSUB_RE
4892 /* return the currently in-scope regex engine (or the default if none) */
4894 regexp_engine const *
4895 Perl_current_re_engine(pTHX)
4899 if (IN_PERL_COMPILETIME) {
4900 HV * const table = GvHV(PL_hintgv);
4904 return &reh_regexp_engine;
4905 ptr = hv_fetchs(table, "regcomp", FALSE);
4906 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4907 return &reh_regexp_engine;
4908 return INT2PTR(regexp_engine*,SvIV(*ptr));
4912 if (!PL_curcop->cop_hints_hash)
4913 return &reh_regexp_engine;
4914 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4915 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4916 return &reh_regexp_engine;
4917 return INT2PTR(regexp_engine*,SvIV(ptr));
4923 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4926 regexp_engine const *eng = current_re_engine();
4927 GET_RE_DEBUG_FLAGS_DECL;
4929 PERL_ARGS_ASSERT_PREGCOMP;
4931 /* Dispatch a request to compile a regexp to correct regexp engine. */
4933 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4936 return CALLREGCOMP_ENG(eng, pattern, flags);
4940 /* public(ish) entry point for the perl core's own regex compiling code.
4941 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4942 * pattern rather than a list of OPs, and uses the internal engine rather
4943 * than the current one */
4946 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4948 SV *pat = pattern; /* defeat constness! */
4949 PERL_ARGS_ASSERT_RE_COMPILE;
4950 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4951 #ifdef PERL_IN_XSUB_RE
4956 NULL, NULL, rx_flags, 0);
4959 /* see if there are any run-time code blocks in the pattern.
4960 * False positives are allowed */
4963 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4964 U32 pm_flags, char *pat, STRLEN plen)
4969 /* avoid infinitely recursing when we recompile the pattern parcelled up
4970 * as qr'...'. A single constant qr// string can't have have any
4971 * run-time component in it, and thus, no runtime code. (A non-qr
4972 * string, however, can, e.g. $x =~ '(?{})') */
4973 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4976 for (s = 0; s < plen; s++) {
4977 if (n < pRExC_state->num_code_blocks
4978 && s == pRExC_state->code_blocks[n].start)
4980 s = pRExC_state->code_blocks[n].end;
4984 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4986 if (pat[s] == '(' && pat[s+1] == '?' &&
4987 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4994 /* Handle run-time code blocks. We will already have compiled any direct
4995 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4996 * copy of it, but with any literal code blocks blanked out and
4997 * appropriate chars escaped; then feed it into
4999 * eval "qr'modified_pattern'"
5003 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5007 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5009 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5010 * and merge them with any code blocks of the original regexp.
5012 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5013 * instead, just save the qr and return FALSE; this tells our caller that
5014 * the original pattern needs upgrading to utf8.
5018 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5019 char *pat, STRLEN plen)
5023 GET_RE_DEBUG_FLAGS_DECL;
5025 if (pRExC_state->runtime_code_qr) {
5026 /* this is the second time we've been called; this should
5027 * only happen if the main pattern got upgraded to utf8
5028 * during compilation; re-use the qr we compiled first time
5029 * round (which should be utf8 too)
5031 qr = pRExC_state->runtime_code_qr;
5032 pRExC_state->runtime_code_qr = NULL;
5033 assert(RExC_utf8 && SvUTF8(qr));
5039 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5043 /* determine how many extra chars we need for ' and \ escaping */
5044 for (s = 0; s < plen; s++) {
5045 if (pat[s] == '\'' || pat[s] == '\\')
5049 Newx(newpat, newlen, char);
5051 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5053 for (s = 0; s < plen; s++) {
5054 if (n < pRExC_state->num_code_blocks
5055 && s == pRExC_state->code_blocks[n].start)
5057 /* blank out literal code block */
5058 assert(pat[s] == '(');
5059 while (s <= pRExC_state->code_blocks[n].end) {
5067 if (pat[s] == '\'' || pat[s] == '\\')
5072 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5076 PerlIO_printf(Perl_debug_log,
5077 "%sre-parsing pattern for runtime code:%s %s\n",
5078 PL_colors[4],PL_colors[5],newpat);
5081 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5087 PUSHSTACKi(PERLSI_REQUIRE);
5088 /* this causes the toker to collapse \\ into \ when parsing
5089 * qr''; normally only q'' does this. It also alters hints
5091 PL_reg_state.re_reparsing = TRUE;
5092 eval_sv(sv, G_SCALAR);
5098 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5099 assert(SvROK(qr_ref));
5101 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5102 /* the leaving below frees the tmp qr_ref.
5103 * Give qr a life of its own */
5111 if (!RExC_utf8 && SvUTF8(qr)) {
5112 /* first time through; the pattern got upgraded; save the
5113 * qr for the next time through */
5114 assert(!pRExC_state->runtime_code_qr);
5115 pRExC_state->runtime_code_qr = qr;
5120 /* extract any code blocks within the returned qr// */
5123 /* merge the main (r1) and run-time (r2) code blocks into one */
5125 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5126 struct reg_code_block *new_block, *dst;
5127 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5130 if (!r2->num_code_blocks) /* we guessed wrong */
5134 r1->num_code_blocks + r2->num_code_blocks,
5135 struct reg_code_block);
5138 while ( i1 < r1->num_code_blocks
5139 || i2 < r2->num_code_blocks)
5141 struct reg_code_block *src;
5144 if (i1 == r1->num_code_blocks) {
5145 src = &r2->code_blocks[i2++];
5148 else if (i2 == r2->num_code_blocks)
5149 src = &r1->code_blocks[i1++];
5150 else if ( r1->code_blocks[i1].start
5151 < r2->code_blocks[i2].start)
5153 src = &r1->code_blocks[i1++];
5154 assert(src->end < r2->code_blocks[i2].start);
5157 assert( r1->code_blocks[i1].start
5158 > r2->code_blocks[i2].start);
5159 src = &r2->code_blocks[i2++];
5161 assert(src->end < r1->code_blocks[i1].start);
5164 assert(pat[src->start] == '(');
5165 assert(pat[src->end] == ')');
5166 dst->start = src->start;
5167 dst->end = src->end;
5168 dst->block = src->block;
5169 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5173 r1->num_code_blocks += r2->num_code_blocks;
5174 Safefree(r1->code_blocks);
5175 r1->code_blocks = new_block;
5184 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5186 /* This is the common code for setting up the floating and fixed length
5187 * string data extracted from Perlre_op_compile() below. Returns a boolean
5188 * as to whether succeeded or not */
5192 if (! (longest_length
5193 || (eol /* Can't have SEOL and MULTI */
5194 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5196 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5197 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5202 /* copy the information about the longest from the reg_scan_data
5203 over to the program. */
5204 if (SvUTF8(sv_longest)) {
5205 *rx_utf8 = sv_longest;
5208 *rx_substr = sv_longest;
5211 /* end_shift is how many chars that must be matched that
5212 follow this item. We calculate it ahead of time as once the
5213 lookbehind offset is added in we lose the ability to correctly
5215 ml = minlen ? *(minlen) : (I32)longest_length;
5216 *rx_end_shift = ml - offset
5217 - longest_length + (SvTAIL(sv_longest) != 0)
5220 t = (eol/* Can't have SEOL and MULTI */
5221 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5222 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5228 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5229 * regular expression into internal code.
5230 * The pattern may be passed either as:
5231 * a list of SVs (patternp plus pat_count)
5232 * a list of OPs (expr)
5233 * If both are passed, the SV list is used, but the OP list indicates
5234 * which SVs are actually pre-compiled code blocks
5236 * The SVs in the list have magic and qr overloading applied to them (and
5237 * the list may be modified in-place with replacement SVs in the latter
5240 * If the pattern hasn't changed from old_re, then old_re will be
5243 * eng is the current engine. If that engine has an op_comp method, then
5244 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5245 * do the initial concatenation of arguments and pass on to the external
5248 * If is_bare_re is not null, set it to a boolean indicating whether the
5249 * arg list reduced (after overloading) to a single bare regex which has
5250 * been returned (i.e. /$qr/).
5252 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5254 * pm_flags contains the PMf_* flags, typically based on those from the
5255 * pm_flags field of the related PMOP. Currently we're only interested in
5256 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5258 * We can't allocate space until we know how big the compiled form will be,
5259 * but we can't compile it (and thus know how big it is) until we've got a
5260 * place to put the code. So we cheat: we compile it twice, once with code
5261 * generation turned off and size counting turned on, and once "for real".
5262 * This also means that we don't allocate space until we are sure that the
5263 * thing really will compile successfully, and we never have to move the
5264 * code and thus invalidate pointers into it. (Note that it has to be in
5265 * one piece because free() must be able to free it all.) [NB: not true in perl]
5267 * Beware that the optimization-preparation code in here knows about some
5268 * of the structure of the compiled regexp. [I'll say.]
5272 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5273 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5274 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5279 regexp_internal *ri;
5289 /* these are all flags - maybe they should be turned
5290 * into a single int with different bit masks */
5291 I32 sawlookahead = 0;
5294 bool used_setjump = FALSE;
5295 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5296 bool code_is_utf8 = 0;
5297 bool VOL recompile = 0;
5298 bool runtime_code = 0;
5302 RExC_state_t RExC_state;
5303 RExC_state_t * const pRExC_state = &RExC_state;
5304 #ifdef TRIE_STUDY_OPT
5306 RExC_state_t copyRExC_state;
5308 GET_RE_DEBUG_FLAGS_DECL;
5310 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5312 DEBUG_r(if (!PL_colorset) reginitcolors());
5314 #ifndef PERL_IN_XSUB_RE
5315 /* Initialize these here instead of as-needed, as is quick and avoids
5316 * having to test them each time otherwise */
5317 if (! PL_AboveLatin1) {
5318 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5319 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5320 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5322 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5323 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5325 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5326 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5328 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5329 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5331 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5333 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5334 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5336 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5338 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5339 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5341 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5342 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5344 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5345 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5347 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5348 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5350 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5351 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5353 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5354 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5356 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5357 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5359 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5361 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5362 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5364 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5365 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5367 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5371 pRExC_state->code_blocks = NULL;
5372 pRExC_state->num_code_blocks = 0;
5375 *is_bare_re = FALSE;
5377 if (expr && (expr->op_type == OP_LIST ||
5378 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5380 /* is the source UTF8, and how many code blocks are there? */
5384 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5385 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5387 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5388 /* count of DO blocks */
5392 pRExC_state->num_code_blocks = ncode;
5393 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5398 /* handle a list of SVs */
5402 /* apply magic and RE overloading to each arg */
5403 for (svp = patternp; svp < patternp + pat_count; svp++) {
5406 if (SvROK(rx) && SvAMAGIC(rx)) {
5407 SV *sv = AMG_CALLunary(rx, regexp_amg);
5411 if (SvTYPE(sv) != SVt_REGEXP)
5412 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5418 if (pat_count > 1) {
5419 /* concat multiple args and find any code block indexes */
5424 STRLEN orig_patlen = 0;
5426 if (pRExC_state->num_code_blocks) {
5427 o = cLISTOPx(expr)->op_first;
5428 assert(o->op_type == OP_PUSHMARK);
5432 pat = newSVpvn("", 0);
5435 /* determine if the pattern is going to be utf8 (needed
5436 * in advance to align code block indices correctly).
5437 * XXX This could fail to be detected for an arg with
5438 * overloading but not concat overloading; but the main effect
5439 * in this obscure case is to need a 'use re eval' for a
5440 * literal code block */
5441 for (svp = patternp; svp < patternp + pat_count; svp++) {
5448 for (svp = patternp; svp < patternp + pat_count; svp++) {
5449 SV *sv, *msv = *svp;
5453 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5454 assert(n < pRExC_state->num_code_blocks);
5455 pRExC_state->code_blocks[n].start = SvCUR(pat);
5456 pRExC_state->code_blocks[n].block = o;
5457 pRExC_state->code_blocks[n].src_regex = NULL;
5460 o = o->op_sibling; /* skip CONST */
5466 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5467 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5470 /* overloading involved: all bets are off over literal
5471 * code. Pretend we haven't seen it */
5472 pRExC_state->num_code_blocks -= n;
5478 while (SvAMAGIC(msv)
5479 && (sv = AMG_CALLunary(msv, string_amg))
5483 && SvRV(msv) == SvRV(sv))
5488 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5490 orig_patlen = SvCUR(pat);
5491 sv_catsv_nomg(pat, msv);
5494 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5497 /* extract any code blocks within any embedded qr//'s */
5498 if (rx && SvTYPE(rx) == SVt_REGEXP
5499 && RX_ENGINE((REGEXP*)rx)->op_comp)
5502 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5503 if (ri->num_code_blocks) {
5505 /* the presence of an embedded qr// with code means
5506 * we should always recompile: the text of the
5507 * qr// may not have changed, but it may be a
5508 * different closure than last time */
5510 Renew(pRExC_state->code_blocks,
5511 pRExC_state->num_code_blocks + ri->num_code_blocks,
5512 struct reg_code_block);
5513 pRExC_state->num_code_blocks += ri->num_code_blocks;
5514 for (i=0; i < ri->num_code_blocks; i++) {
5515 struct reg_code_block *src, *dst;
5516 STRLEN offset = orig_patlen
5517 + ((struct regexp *)SvANY(rx))->pre_prefix;
5518 assert(n < pRExC_state->num_code_blocks);
5519 src = &ri->code_blocks[i];
5520 dst = &pRExC_state->code_blocks[n];
5521 dst->start = src->start + offset;
5522 dst->end = src->end + offset;
5523 dst->block = src->block;
5524 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5538 while (SvAMAGIC(pat)
5539 && (sv = AMG_CALLunary(pat, string_amg))
5547 /* handle bare regex: foo =~ $re */
5552 if (SvTYPE(re) == SVt_REGEXP) {
5556 Safefree(pRExC_state->code_blocks);
5562 /* not a list of SVs, so must be a list of OPs */
5564 if (expr->op_type == OP_LIST) {
5569 pat = newSVpvn("", 0);
5574 /* given a list of CONSTs and DO blocks in expr, append all
5575 * the CONSTs to pat, and record the start and end of each
5576 * code block in code_blocks[] (each DO{} op is followed by an
5577 * OP_CONST containing the corresponding literal '(?{...})
5580 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5581 if (o->op_type == OP_CONST) {
5582 sv_catsv(pat, cSVOPo_sv);
5584 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5588 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5589 assert(i+1 < pRExC_state->num_code_blocks);
5590 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5591 pRExC_state->code_blocks[i].block = o;
5592 pRExC_state->code_blocks[i].src_regex = NULL;
5598 assert(expr->op_type == OP_CONST);
5599 pat = cSVOPx_sv(expr);
5603 exp = SvPV_nomg(pat, plen);
5605 if (!eng->op_comp) {
5606 if ((SvUTF8(pat) && IN_BYTES)
5607 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5609 /* make a temporary copy; either to convert to bytes,
5610 * or to avoid repeating get-magic / overloaded stringify */
5611 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5612 (IN_BYTES ? 0 : SvUTF8(pat)));
5614 Safefree(pRExC_state->code_blocks);
5615 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5618 /* ignore the utf8ness if the pattern is 0 length */
5619 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5620 RExC_uni_semantics = 0;
5621 RExC_contains_locale = 0;
5622 pRExC_state->runtime_code_qr = NULL;
5624 /****************** LONG JUMP TARGET HERE***********************/
5625 /* Longjmp back to here if have to switch in midstream to utf8 */
5626 if (! RExC_orig_utf8) {
5627 JMPENV_PUSH(jump_ret);
5628 used_setjump = TRUE;
5631 if (jump_ret == 0) { /* First time through */
5635 SV *dsv= sv_newmortal();
5636 RE_PV_QUOTED_DECL(s, RExC_utf8,
5637 dsv, exp, plen, 60);
5638 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5639 PL_colors[4],PL_colors[5],s);
5642 else { /* longjumped back */
5645 STRLEN s = 0, d = 0;
5648 /* If the cause for the longjmp was other than changing to utf8, pop
5649 * our own setjmp, and longjmp to the correct handler */
5650 if (jump_ret != UTF8_LONGJMP) {
5652 JMPENV_JUMP(jump_ret);
5657 /* It's possible to write a regexp in ascii that represents Unicode
5658 codepoints outside of the byte range, such as via \x{100}. If we
5659 detect such a sequence we have to convert the entire pattern to utf8
5660 and then recompile, as our sizing calculation will have been based
5661 on 1 byte == 1 character, but we will need to use utf8 to encode
5662 at least some part of the pattern, and therefore must convert the whole
5665 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5666 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5668 /* upgrade pattern to UTF8, and if there are code blocks,
5669 * recalculate the indices.
5670 * This is essentially an unrolled Perl_bytes_to_utf8() */
5672 src = (U8*)SvPV_nomg(pat, plen);
5673 Newx(dst, plen * 2 + 1, U8);
5676 const UV uv = NATIVE_TO_ASCII(src[s]);
5677 if (UNI_IS_INVARIANT(uv))
5678 dst[d] = (U8)UTF_TO_NATIVE(uv);
5680 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5681 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5683 if (n < pRExC_state->num_code_blocks) {
5684 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5685 pRExC_state->code_blocks[n].start = d;
5686 assert(dst[d] == '(');
5689 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5690 pRExC_state->code_blocks[n].end = d;
5691 assert(dst[d] == ')');
5704 RExC_orig_utf8 = RExC_utf8 = 1;
5707 /* return old regex if pattern hasn't changed */
5711 && !!RX_UTF8(old_re) == !!RExC_utf8
5712 && RX_PRECOMP(old_re)
5713 && RX_PRELEN(old_re) == plen
5714 && memEQ(RX_PRECOMP(old_re), exp, plen))
5716 /* with runtime code, always recompile */
5717 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5719 if (!runtime_code) {
5723 Safefree(pRExC_state->code_blocks);
5727 else if ((pm_flags & PMf_USE_RE_EVAL)
5728 /* this second condition covers the non-regex literal case,
5729 * i.e. $foo =~ '(?{})'. */
5730 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5731 && (PL_hints & HINT_RE_EVAL))
5733 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5736 #ifdef TRIE_STUDY_OPT
5740 rx_flags = orig_rx_flags;
5742 if (initial_charset == REGEX_LOCALE_CHARSET) {
5743 RExC_contains_locale = 1;
5745 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5747 /* Set to use unicode semantics if the pattern is in utf8 and has the
5748 * 'depends' charset specified, as it means unicode when utf8 */
5749 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5753 RExC_flags = rx_flags;
5754 RExC_pm_flags = pm_flags;
5757 if (PL_tainting && PL_tainted)
5758 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5760 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5761 /* whoops, we have a non-utf8 pattern, whilst run-time code
5762 * got compiled as utf8. Try again with a utf8 pattern */
5763 JMPENV_JUMP(UTF8_LONGJMP);
5766 assert(!pRExC_state->runtime_code_qr);
5771 RExC_in_lookbehind = 0;
5772 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5774 RExC_override_recoding = 0;
5775 RExC_in_multi_char_class = 0;
5777 /* First pass: determine size, legality. */
5785 RExC_emit = &PL_regdummy;
5786 RExC_whilem_seen = 0;
5787 RExC_open_parens = NULL;
5788 RExC_close_parens = NULL;
5790 RExC_paren_names = NULL;
5792 RExC_paren_name_list = NULL;
5794 RExC_recurse = NULL;
5795 RExC_recurse_count = 0;
5796 pRExC_state->code_index = 0;
5798 #if 0 /* REGC() is (currently) a NOP at the first pass.
5799 * Clever compilers notice this and complain. --jhi */
5800 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5803 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5805 RExC_lastparse=NULL;
5807 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5808 RExC_precomp = NULL;
5809 Safefree(pRExC_state->code_blocks);
5813 /* Here, finished first pass. Get rid of any added setjmp */
5819 PerlIO_printf(Perl_debug_log,
5820 "Required size %"IVdf" nodes\n"
5821 "Starting second pass (creation)\n",
5824 RExC_lastparse=NULL;
5827 /* The first pass could have found things that force Unicode semantics */
5828 if ((RExC_utf8 || RExC_uni_semantics)
5829 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5831 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5834 /* Small enough for pointer-storage convention?
5835 If extralen==0, this means that we will not need long jumps. */
5836 if (RExC_size >= 0x10000L && RExC_extralen)
5837 RExC_size += RExC_extralen;
5840 if (RExC_whilem_seen > 15)
5841 RExC_whilem_seen = 15;
5843 /* Allocate space and zero-initialize. Note, the two step process
5844 of zeroing when in debug mode, thus anything assigned has to
5845 happen after that */
5846 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5847 r = (struct regexp*)SvANY(rx);
5848 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5849 char, regexp_internal);
5850 if ( r == NULL || ri == NULL )
5851 FAIL("Regexp out of space");
5853 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5854 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5856 /* bulk initialize base fields with 0. */
5857 Zero(ri, sizeof(regexp_internal), char);
5860 /* non-zero initialization begins here */
5863 r->extflags = rx_flags;
5864 if (pm_flags & PMf_IS_QR) {
5865 ri->code_blocks = pRExC_state->code_blocks;
5866 ri->num_code_blocks = pRExC_state->num_code_blocks;
5869 SAVEFREEPV(pRExC_state->code_blocks);
5872 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5873 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5875 /* The caret is output if there are any defaults: if not all the STD
5876 * flags are set, or if no character set specifier is needed */
5878 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5880 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5881 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5882 >> RXf_PMf_STD_PMMOD_SHIFT);
5883 const char *fptr = STD_PAT_MODS; /*"msix"*/
5885 /* Allocate for the worst case, which is all the std flags are turned
5886 * on. If more precision is desired, we could do a population count of
5887 * the flags set. This could be done with a small lookup table, or by
5888 * shifting, masking and adding, or even, when available, assembly
5889 * language for a machine-language population count.
5890 * We never output a minus, as all those are defaults, so are
5891 * covered by the caret */
5892 const STRLEN wraplen = plen + has_p + has_runon
5893 + has_default /* If needs a caret */
5895 /* If needs a character set specifier */
5896 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5897 + (sizeof(STD_PAT_MODS) - 1)
5898 + (sizeof("(?:)") - 1);
5900 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5903 SvFLAGS(rx) |= SVf_UTF8;
5906 /* If a default, cover it using the caret */
5908 *p++= DEFAULT_PAT_MOD;
5912 const char* const name = get_regex_charset_name(r->extflags, &len);
5913 Copy(name, p, len, char);
5917 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5920 while((ch = *fptr++)) {
5928 Copy(RExC_precomp, p, plen, char);
5929 assert ((RX_WRAPPED(rx) - p) < 16);
5930 r->pre_prefix = p - RX_WRAPPED(rx);
5936 SvCUR_set(rx, p - SvPVX_const(rx));
5940 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5942 if (RExC_seen & REG_SEEN_RECURSE) {
5943 Newxz(RExC_open_parens, RExC_npar,regnode *);
5944 SAVEFREEPV(RExC_open_parens);
5945 Newxz(RExC_close_parens,RExC_npar,regnode *);
5946 SAVEFREEPV(RExC_close_parens);
5949 /* Useful during FAIL. */
5950 #ifdef RE_TRACK_PATTERN_OFFSETS
5951 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5952 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5953 "%s %"UVuf" bytes for offset annotations.\n",
5954 ri->u.offsets ? "Got" : "Couldn't get",
5955 (UV)((2*RExC_size+1) * sizeof(U32))));
5957 SetProgLen(ri,RExC_size);
5961 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5963 /* Second pass: emit code. */
5964 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5965 RExC_pm_flags = pm_flags;
5970 RExC_emit_start = ri->program;
5971 RExC_emit = ri->program;
5972 RExC_emit_bound = ri->program + RExC_size + 1;
5973 pRExC_state->code_index = 0;
5975 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5976 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5980 /* XXXX To minimize changes to RE engine we always allocate
5981 3-units-long substrs field. */
5982 Newx(r->substrs, 1, struct reg_substr_data);
5983 if (RExC_recurse_count) {
5984 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5985 SAVEFREEPV(RExC_recurse);
5989 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5990 Zero(r->substrs, 1, struct reg_substr_data);
5992 #ifdef TRIE_STUDY_OPT
5994 StructCopy(&zero_scan_data, &data, scan_data_t);
5995 copyRExC_state = RExC_state;
5998 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6000 RExC_state = copyRExC_state;
6001 if (seen & REG_TOP_LEVEL_BRANCHES)
6002 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6004 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6005 if (data.last_found) {
6006 SvREFCNT_dec(data.longest_fixed);
6007 SvREFCNT_dec(data.longest_float);
6008 SvREFCNT_dec(data.last_found);
6010 StructCopy(&zero_scan_data, &data, scan_data_t);
6013 StructCopy(&zero_scan_data, &data, scan_data_t);
6016 /* Dig out information for optimizations. */
6017 r->extflags = RExC_flags; /* was pm_op */
6018 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6021 SvUTF8_on(rx); /* Unicode in it? */
6022 ri->regstclass = NULL;
6023 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6024 r->intflags |= PREGf_NAUGHTY;
6025 scan = ri->program + 1; /* First BRANCH. */
6027 /* testing for BRANCH here tells us whether there is "must appear"
6028 data in the pattern. If there is then we can use it for optimisations */
6029 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6031 STRLEN longest_float_length, longest_fixed_length;
6032 struct regnode_charclass_class ch_class; /* pointed to by data */
6034 I32 last_close = 0; /* pointed to by data */
6035 regnode *first= scan;
6036 regnode *first_next= regnext(first);
6038 * Skip introductions and multiplicators >= 1
6039 * so that we can extract the 'meat' of the pattern that must
6040 * match in the large if() sequence following.
6041 * NOTE that EXACT is NOT covered here, as it is normally
6042 * picked up by the optimiser separately.
6044 * This is unfortunate as the optimiser isnt handling lookahead
6045 * properly currently.
6048 while ((OP(first) == OPEN && (sawopen = 1)) ||
6049 /* An OR of *one* alternative - should not happen now. */
6050 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6051 /* for now we can't handle lookbehind IFMATCH*/
6052 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6053 (OP(first) == PLUS) ||
6054 (OP(first) == MINMOD) ||
6055 /* An {n,m} with n>0 */
6056 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6057 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6060 * the only op that could be a regnode is PLUS, all the rest
6061 * will be regnode_1 or regnode_2.
6064 if (OP(first) == PLUS)
6067 first += regarglen[OP(first)];
6069 first = NEXTOPER(first);
6070 first_next= regnext(first);
6073 /* Starting-point info. */
6075 DEBUG_PEEP("first:",first,0);
6076 /* Ignore EXACT as we deal with it later. */
6077 if (PL_regkind[OP(first)] == EXACT) {
6078 if (OP(first) == EXACT)
6079 NOOP; /* Empty, get anchored substr later. */
6081 ri->regstclass = first;
6084 else if (PL_regkind[OP(first)] == TRIE &&
6085 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6088 /* this can happen only on restudy */
6089 if ( OP(first) == TRIE ) {
6090 struct regnode_1 *trieop = (struct regnode_1 *)
6091 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6092 StructCopy(first,trieop,struct regnode_1);
6093 trie_op=(regnode *)trieop;
6095 struct regnode_charclass *trieop = (struct regnode_charclass *)
6096 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6097 StructCopy(first,trieop,struct regnode_charclass);
6098 trie_op=(regnode *)trieop;
6101 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6102 ri->regstclass = trie_op;
6105 else if (REGNODE_SIMPLE(OP(first)))
6106 ri->regstclass = first;
6107 else if (PL_regkind[OP(first)] == BOUND ||
6108 PL_regkind[OP(first)] == NBOUND)
6109 ri->regstclass = first;
6110 else if (PL_regkind[OP(first)] == BOL) {
6111 r->extflags |= (OP(first) == MBOL
6113 : (OP(first) == SBOL
6116 first = NEXTOPER(first);
6119 else if (OP(first) == GPOS) {
6120 r->extflags |= RXf_ANCH_GPOS;
6121 first = NEXTOPER(first);
6124 else if ((!sawopen || !RExC_sawback) &&
6125 (OP(first) == STAR &&
6126 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6127 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6129 /* turn .* into ^.* with an implied $*=1 */
6131 (OP(NEXTOPER(first)) == REG_ANY)
6134 r->extflags |= type;
6135 r->intflags |= PREGf_IMPLICIT;
6136 first = NEXTOPER(first);
6139 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6140 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6141 /* x+ must match at the 1st pos of run of x's */
6142 r->intflags |= PREGf_SKIP;
6144 /* Scan is after the zeroth branch, first is atomic matcher. */
6145 #ifdef TRIE_STUDY_OPT
6148 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6149 (IV)(first - scan + 1))
6153 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6154 (IV)(first - scan + 1))
6160 * If there's something expensive in the r.e., find the
6161 * longest literal string that must appear and make it the
6162 * regmust. Resolve ties in favor of later strings, since
6163 * the regstart check works with the beginning of the r.e.
6164 * and avoiding duplication strengthens checking. Not a
6165 * strong reason, but sufficient in the absence of others.
6166 * [Now we resolve ties in favor of the earlier string if
6167 * it happens that c_offset_min has been invalidated, since the
6168 * earlier string may buy us something the later one won't.]
6171 data.longest_fixed = newSVpvs("");
6172 data.longest_float = newSVpvs("");
6173 data.last_found = newSVpvs("");
6174 data.longest = &(data.longest_fixed);
6176 if (!ri->regstclass) {
6177 cl_init(pRExC_state, &ch_class);
6178 data.start_class = &ch_class;
6179 stclass_flag = SCF_DO_STCLASS_AND;
6180 } else /* XXXX Check for BOUND? */
6182 data.last_closep = &last_close;
6184 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6185 &data, -1, NULL, NULL,
6186 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6192 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6193 && data.last_start_min == 0 && data.last_end > 0
6194 && !RExC_seen_zerolen
6195 && !(RExC_seen & REG_SEEN_VERBARG)
6196 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6197 r->extflags |= RXf_CHECK_ALL;
6198 scan_commit(pRExC_state, &data,&minlen,0);
6199 SvREFCNT_dec(data.last_found);
6201 longest_float_length = CHR_SVLEN(data.longest_float);
6203 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6204 && data.offset_fixed == data.offset_float_min
6205 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6206 && S_setup_longest (aTHX_ pRExC_state,
6210 &(r->float_end_shift),
6211 data.lookbehind_float,
6212 data.offset_float_min,
6214 longest_float_length,
6215 data.flags & SF_FL_BEFORE_EOL,
6216 data.flags & SF_FL_BEFORE_MEOL))
6218 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6219 r->float_max_offset = data.offset_float_max;
6220 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6221 r->float_max_offset -= data.lookbehind_float;
6224 r->float_substr = r->float_utf8 = NULL;
6225 SvREFCNT_dec(data.longest_float);
6226 longest_float_length = 0;
6229 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6231 if (S_setup_longest (aTHX_ pRExC_state,
6233 &(r->anchored_utf8),
6234 &(r->anchored_substr),
6235 &(r->anchored_end_shift),
6236 data.lookbehind_fixed,
6239 longest_fixed_length,
6240 data.flags & SF_FIX_BEFORE_EOL,
6241 data.flags & SF_FIX_BEFORE_MEOL))
6243 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6246 r->anchored_substr = r->anchored_utf8 = NULL;
6247 SvREFCNT_dec(data.longest_fixed);
6248 longest_fixed_length = 0;
6252 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6253 ri->regstclass = NULL;
6255 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6257 && !(data.start_class->flags & ANYOF_EOS)
6258 && !cl_is_anything(data.start_class))
6260 const U32 n = add_data(pRExC_state, 1, "f");
6261 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6263 Newx(RExC_rxi->data->data[n], 1,
6264 struct regnode_charclass_class);
6265 StructCopy(data.start_class,
6266 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6267 struct regnode_charclass_class);
6268 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6269 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6270 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6271 regprop(r, sv, (regnode*)data.start_class);
6272 PerlIO_printf(Perl_debug_log,
6273 "synthetic stclass \"%s\".\n",
6274 SvPVX_const(sv));});
6277 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6278 if (longest_fixed_length > longest_float_length) {
6279 r->check_end_shift = r->anchored_end_shift;
6280 r->check_substr = r->anchored_substr;
6281 r->check_utf8 = r->anchored_utf8;
6282 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6283 if (r->extflags & RXf_ANCH_SINGLE)
6284 r->extflags |= RXf_NOSCAN;
6287 r->check_end_shift = r->float_end_shift;
6288 r->check_substr = r->float_substr;
6289 r->check_utf8 = r->float_utf8;
6290 r->check_offset_min = r->float_min_offset;
6291 r->check_offset_max = r->float_max_offset;
6293 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6294 This should be changed ASAP! */
6295 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6296 r->extflags |= RXf_USE_INTUIT;
6297 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6298 r->extflags |= RXf_INTUIT_TAIL;
6300 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6301 if ( (STRLEN)minlen < longest_float_length )
6302 minlen= longest_float_length;
6303 if ( (STRLEN)minlen < longest_fixed_length )
6304 minlen= longest_fixed_length;
6308 /* Several toplevels. Best we can is to set minlen. */
6310 struct regnode_charclass_class ch_class;
6313 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6315 scan = ri->program + 1;
6316 cl_init(pRExC_state, &ch_class);
6317 data.start_class = &ch_class;
6318 data.last_closep = &last_close;
6321 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6322 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6326 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6327 = r->float_substr = r->float_utf8 = NULL;
6329 if (!(data.start_class->flags & ANYOF_EOS)
6330 && !cl_is_anything(data.start_class))
6332 const U32 n = add_data(pRExC_state, 1, "f");
6333 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6335 Newx(RExC_rxi->data->data[n], 1,
6336 struct regnode_charclass_class);
6337 StructCopy(data.start_class,
6338 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6339 struct regnode_charclass_class);
6340 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6341 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6342 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6343 regprop(r, sv, (regnode*)data.start_class);
6344 PerlIO_printf(Perl_debug_log,
6345 "synthetic stclass \"%s\".\n",
6346 SvPVX_const(sv));});
6350 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6351 the "real" pattern. */
6353 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6354 (IV)minlen, (IV)r->minlen);
6356 r->minlenret = minlen;
6357 if (r->minlen < minlen)
6360 if (RExC_seen & REG_SEEN_GPOS)
6361 r->extflags |= RXf_GPOS_SEEN;
6362 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6363 r->extflags |= RXf_LOOKBEHIND_SEEN;
6364 if (pRExC_state->num_code_blocks)
6365 r->extflags |= RXf_EVAL_SEEN;
6366 if (RExC_seen & REG_SEEN_CANY)
6367 r->extflags |= RXf_CANY_SEEN;
6368 if (RExC_seen & REG_SEEN_VERBARG)
6370 r->intflags |= PREGf_VERBARG_SEEN;
6371 r->extflags |= RXf_MODIFIES_VARS;
6373 if (RExC_seen & REG_SEEN_CUTGROUP)
6374 r->intflags |= PREGf_CUTGROUP_SEEN;
6375 if (pm_flags & PMf_USE_RE_EVAL)
6376 r->intflags |= PREGf_USE_RE_EVAL;
6377 if (RExC_paren_names)
6378 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6380 RXp_PAREN_NAMES(r) = NULL;
6382 #ifdef STUPID_PATTERN_CHECKS
6383 if (RX_PRELEN(rx) == 0)
6384 r->extflags |= RXf_NULL;
6385 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6386 r->extflags |= RXf_WHITE;
6387 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6388 r->extflags |= RXf_START_ONLY;
6391 regnode *first = ri->program + 1;
6394 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6395 r->extflags |= RXf_NULL;
6396 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6397 r->extflags |= RXf_START_ONLY;
6398 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6399 && OP(regnext(first)) == END)
6400 r->extflags |= RXf_WHITE;
6404 if (RExC_paren_names) {
6405 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6406 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6409 ri->name_list_idx = 0;
6411 if (RExC_recurse_count) {
6412 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6413 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6414 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6417 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6418 /* assume we don't need to swap parens around before we match */
6421 PerlIO_printf(Perl_debug_log,"Final program:\n");
6424 #ifdef RE_TRACK_PATTERN_OFFSETS
6425 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6426 const U32 len = ri->u.offsets[0];
6428 GET_RE_DEBUG_FLAGS_DECL;
6429 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6430 for (i = 1; i <= len; i++) {
6431 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6432 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6433 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6435 PerlIO_printf(Perl_debug_log, "\n");
6443 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6446 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6448 PERL_UNUSED_ARG(value);
6450 if (flags & RXapif_FETCH) {
6451 return reg_named_buff_fetch(rx, key, flags);
6452 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6453 Perl_croak_no_modify(aTHX);
6455 } else if (flags & RXapif_EXISTS) {
6456 return reg_named_buff_exists(rx, key, flags)
6459 } else if (flags & RXapif_REGNAMES) {
6460 return reg_named_buff_all(rx, flags);
6461 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6462 return reg_named_buff_scalar(rx, flags);
6464 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6470 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6473 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6474 PERL_UNUSED_ARG(lastkey);
6476 if (flags & RXapif_FIRSTKEY)
6477 return reg_named_buff_firstkey(rx, flags);
6478 else if (flags & RXapif_NEXTKEY)
6479 return reg_named_buff_nextkey(rx, flags);
6481 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6487 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6490 AV *retarray = NULL;
6492 struct regexp *const rx = (struct regexp *)SvANY(r);
6494 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6496 if (flags & RXapif_ALL)
6499 if (rx && RXp_PAREN_NAMES(rx)) {
6500 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6503 SV* sv_dat=HeVAL(he_str);
6504 I32 *nums=(I32*)SvPVX(sv_dat);
6505 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6506 if ((I32)(rx->nparens) >= nums[i]
6507 && rx->offs[nums[i]].start != -1
6508 && rx->offs[nums[i]].end != -1)
6511 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6516 ret = newSVsv(&PL_sv_undef);
6519 av_push(retarray, ret);
6522 return newRV_noinc(MUTABLE_SV(retarray));
6529 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6532 struct regexp *const rx = (struct regexp *)SvANY(r);
6534 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6536 if (rx && RXp_PAREN_NAMES(rx)) {
6537 if (flags & RXapif_ALL) {
6538 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6540 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6554 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6556 struct regexp *const rx = (struct regexp *)SvANY(r);
6558 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6560 if ( rx && RXp_PAREN_NAMES(rx) ) {
6561 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6563 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6570 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6572 struct regexp *const rx = (struct regexp *)SvANY(r);
6573 GET_RE_DEBUG_FLAGS_DECL;
6575 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6577 if (rx && RXp_PAREN_NAMES(rx)) {
6578 HV *hv = RXp_PAREN_NAMES(rx);
6580 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6583 SV* sv_dat = HeVAL(temphe);
6584 I32 *nums = (I32*)SvPVX(sv_dat);
6585 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6586 if ((I32)(rx->lastparen) >= nums[i] &&
6587 rx->offs[nums[i]].start != -1 &&
6588 rx->offs[nums[i]].end != -1)
6594 if (parno || flags & RXapif_ALL) {
6595 return newSVhek(HeKEY_hek(temphe));
6603 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6608 struct regexp *const rx = (struct regexp *)SvANY(r);
6610 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6612 if (rx && RXp_PAREN_NAMES(rx)) {
6613 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6614 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6615 } else if (flags & RXapif_ONE) {
6616 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6617 av = MUTABLE_AV(SvRV(ret));
6618 length = av_len(av);
6620 return newSViv(length + 1);
6622 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6626 return &PL_sv_undef;
6630 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6632 struct regexp *const rx = (struct regexp *)SvANY(r);
6635 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6637 if (rx && RXp_PAREN_NAMES(rx)) {
6638 HV *hv= RXp_PAREN_NAMES(rx);
6640 (void)hv_iterinit(hv);
6641 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6644 SV* sv_dat = HeVAL(temphe);
6645 I32 *nums = (I32*)SvPVX(sv_dat);
6646 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6647 if ((I32)(rx->lastparen) >= nums[i] &&
6648 rx->offs[nums[i]].start != -1 &&
6649 rx->offs[nums[i]].end != -1)
6655 if (parno || flags & RXapif_ALL) {
6656 av_push(av, newSVhek(HeKEY_hek(temphe)));
6661 return newRV_noinc(MUTABLE_SV(av));
6665 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6668 struct regexp *const rx = (struct regexp *)SvANY(r);
6674 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6676 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6677 || n == RX_BUFF_IDX_CARET_FULLMATCH
6678 || n == RX_BUFF_IDX_CARET_POSTMATCH
6680 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6687 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6688 /* no need to distinguish between them any more */
6689 n = RX_BUFF_IDX_FULLMATCH;
6691 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6692 && rx->offs[0].start != -1)
6694 /* $`, ${^PREMATCH} */
6695 i = rx->offs[0].start;
6699 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6700 && rx->offs[0].end != -1)
6702 /* $', ${^POSTMATCH} */
6703 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6704 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6707 if ( 0 <= n && n <= (I32)rx->nparens &&
6708 (s1 = rx->offs[n].start) != -1 &&
6709 (t1 = rx->offs[n].end) != -1)
6711 /* $&, ${^MATCH}, $1 ... */
6713 s = rx->subbeg + s1 - rx->suboffset;
6718 assert(s >= rx->subbeg);
6719 assert(rx->sublen >= (s - rx->subbeg) + i );
6721 const int oldtainted = PL_tainted;
6723 sv_setpvn(sv, s, i);
6724 PL_tainted = oldtainted;
6725 if ( (rx->extflags & RXf_CANY_SEEN)
6726 ? (RXp_MATCH_UTF8(rx)
6727 && (!i || is_utf8_string((U8*)s, i)))
6728 : (RXp_MATCH_UTF8(rx)) )
6735 if (RXp_MATCH_TAINTED(rx)) {
6736 if (SvTYPE(sv) >= SVt_PVMG) {
6737 MAGIC* const mg = SvMAGIC(sv);
6740 SvMAGIC_set(sv, mg->mg_moremagic);
6742 if ((mgt = SvMAGIC(sv))) {
6743 mg->mg_moremagic = mgt;
6744 SvMAGIC_set(sv, mg);
6755 sv_setsv(sv,&PL_sv_undef);
6761 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6762 SV const * const value)
6764 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6766 PERL_UNUSED_ARG(rx);
6767 PERL_UNUSED_ARG(paren);
6768 PERL_UNUSED_ARG(value);
6771 Perl_croak_no_modify(aTHX);
6775 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6778 struct regexp *const rx = (struct regexp *)SvANY(r);
6782 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6784 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6786 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6787 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6791 case RX_BUFF_IDX_PREMATCH: /* $` */
6792 if (rx->offs[0].start != -1) {
6793 i = rx->offs[0].start;
6802 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6803 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6805 case RX_BUFF_IDX_POSTMATCH: /* $' */
6806 if (rx->offs[0].end != -1) {
6807 i = rx->sublen - rx->offs[0].end;
6809 s1 = rx->offs[0].end;
6816 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6817 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6821 /* $& / ${^MATCH}, $1, $2, ... */
6823 if (paren <= (I32)rx->nparens &&
6824 (s1 = rx->offs[paren].start) != -1 &&
6825 (t1 = rx->offs[paren].end) != -1)
6831 if (ckWARN(WARN_UNINITIALIZED))
6832 report_uninit((const SV *)sv);
6837 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6838 const char * const s = rx->subbeg - rx->suboffset + s1;
6843 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6850 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6852 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6853 PERL_UNUSED_ARG(rx);
6857 return newSVpvs("Regexp");
6860 /* Scans the name of a named buffer from the pattern.
6861 * If flags is REG_RSN_RETURN_NULL returns null.
6862 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6863 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6864 * to the parsed name as looked up in the RExC_paren_names hash.
6865 * If there is an error throws a vFAIL().. type exception.
6868 #define REG_RSN_RETURN_NULL 0
6869 #define REG_RSN_RETURN_NAME 1
6870 #define REG_RSN_RETURN_DATA 2
6873 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6875 char *name_start = RExC_parse;
6877 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6879 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6880 /* skip IDFIRST by using do...while */
6883 RExC_parse += UTF8SKIP(RExC_parse);
6884 } while (isALNUM_utf8((U8*)RExC_parse));
6888 } while (isALNUM(*RExC_parse));
6890 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6891 vFAIL("Group name must start with a non-digit word character");
6895 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6896 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6897 if ( flags == REG_RSN_RETURN_NAME)
6899 else if (flags==REG_RSN_RETURN_DATA) {
6902 if ( ! sv_name ) /* should not happen*/
6903 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6904 if (RExC_paren_names)
6905 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6907 sv_dat = HeVAL(he_str);
6909 vFAIL("Reference to nonexistent named group");
6913 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6914 (unsigned long) flags);
6916 assert(0); /* NOT REACHED */
6921 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6922 int rem=(int)(RExC_end - RExC_parse); \
6931 if (RExC_lastparse!=RExC_parse) \
6932 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6935 iscut ? "..." : "<" \
6938 PerlIO_printf(Perl_debug_log,"%16s",""); \
6941 num = RExC_size + 1; \
6943 num=REG_NODE_NUM(RExC_emit); \
6944 if (RExC_lastnum!=num) \
6945 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6947 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6948 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6949 (int)((depth*2)), "", \
6953 RExC_lastparse=RExC_parse; \
6958 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6959 DEBUG_PARSE_MSG((funcname)); \
6960 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6962 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6963 DEBUG_PARSE_MSG((funcname)); \
6964 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6967 /* This section of code defines the inversion list object and its methods. The
6968 * interfaces are highly subject to change, so as much as possible is static to
6969 * this file. An inversion list is here implemented as a malloc'd C UV array
6970 * with some added info that is placed as UVs at the beginning in a header
6971 * portion. An inversion list for Unicode is an array of code points, sorted
6972 * by ordinal number. The zeroth element is the first code point in the list.
6973 * The 1th element is the first element beyond that not in the list. In other
6974 * words, the first range is
6975 * invlist[0]..(invlist[1]-1)
6976 * The other ranges follow. Thus every element whose index is divisible by two
6977 * marks the beginning of a range that is in the list, and every element not
6978 * divisible by two marks the beginning of a range not in the list. A single
6979 * element inversion list that contains the single code point N generally
6980 * consists of two elements
6983 * (The exception is when N is the highest representable value on the
6984 * machine, in which case the list containing just it would be a single
6985 * element, itself. By extension, if the last range in the list extends to
6986 * infinity, then the first element of that range will be in the inversion list
6987 * at a position that is divisible by two, and is the final element in the
6989 * Taking the complement (inverting) an inversion list is quite simple, if the
6990 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6991 * This implementation reserves an element at the beginning of each inversion
6992 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
6993 * actual beginning of the list is either that element if 0, or the next one if
6996 * More about inversion lists can be found in "Unicode Demystified"
6997 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6998 * More will be coming when functionality is added later.
7000 * The inversion list data structure is currently implemented as an SV pointing
7001 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7002 * array of UV whose memory management is automatically handled by the existing
7003 * facilities for SV's.
7005 * Some of the methods should always be private to the implementation, and some
7006 * should eventually be made public */
7008 /* The header definitions are in F<inline_invlist.c> */
7010 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7011 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7013 #define INVLIST_INITIAL_LEN 10
7015 PERL_STATIC_INLINE UV*
7016 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7018 /* Returns a pointer to the first element in the inversion list's array.
7019 * This is called upon initialization of an inversion list. Where the
7020 * array begins depends on whether the list has the code point U+0000
7021 * in it or not. The other parameter tells it whether the code that
7022 * follows this call is about to put a 0 in the inversion list or not.
7023 * The first element is either the element with 0, if 0, or the next one,
7026 UV* zero = get_invlist_zero_addr(invlist);
7028 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7031 assert(! *_get_invlist_len_addr(invlist));
7033 /* 1^1 = 0; 1^0 = 1 */
7034 *zero = 1 ^ will_have_0;
7035 return zero + *zero;
7038 PERL_STATIC_INLINE UV*
7039 S_invlist_array(pTHX_ SV* const invlist)
7041 /* Returns the pointer to the inversion list's array. Every time the
7042 * length changes, this needs to be called in case malloc or realloc moved
7045 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7047 /* Must not be empty. If these fail, you probably didn't check for <len>
7048 * being non-zero before trying to get the array */
7049 assert(*_get_invlist_len_addr(invlist));
7050 assert(*get_invlist_zero_addr(invlist) == 0
7051 || *get_invlist_zero_addr(invlist) == 1);
7053 /* The array begins either at the element reserved for zero if the
7054 * list contains 0 (that element will be set to 0), or otherwise the next
7055 * element (in which case the reserved element will be set to 1). */
7056 return (UV *) (get_invlist_zero_addr(invlist)
7057 + *get_invlist_zero_addr(invlist));
7060 PERL_STATIC_INLINE void
7061 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7063 /* Sets the current number of elements stored in the inversion list */
7065 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7067 *_get_invlist_len_addr(invlist) = len;
7069 assert(len <= SvLEN(invlist));
7071 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7072 /* If the list contains U+0000, that element is part of the header,
7073 * and should not be counted as part of the array. It will contain
7074 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7076 * SvCUR_set(invlist,
7077 * TO_INTERNAL_SIZE(len
7078 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7079 * But, this is only valid if len is not 0. The consequences of not doing
7080 * this is that the memory allocation code may think that 1 more UV is
7081 * being used than actually is, and so might do an unnecessary grow. That
7082 * seems worth not bothering to make this the precise amount.
7084 * Note that when inverting, SvCUR shouldn't change */
7087 PERL_STATIC_INLINE IV*
7088 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7090 /* Return the address of the UV that is reserved to hold the cached index
7093 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7095 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7098 PERL_STATIC_INLINE IV
7099 S_invlist_previous_index(pTHX_ SV* const invlist)
7101 /* Returns cached index of previous search */
7103 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7105 return *get_invlist_previous_index_addr(invlist);
7108 PERL_STATIC_INLINE void
7109 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7111 /* Caches <index> for later retrieval */
7113 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7115 assert(index == 0 || index < (int) _invlist_len(invlist));
7117 *get_invlist_previous_index_addr(invlist) = index;
7120 PERL_STATIC_INLINE UV
7121 S_invlist_max(pTHX_ SV* const invlist)
7123 /* Returns the maximum number of elements storable in the inversion list's
7124 * array, without having to realloc() */
7126 PERL_ARGS_ASSERT_INVLIST_MAX;
7128 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7131 PERL_STATIC_INLINE UV*
7132 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7134 /* Return the address of the UV that is reserved to hold 0 if the inversion
7135 * list contains 0. This has to be the last element of the heading, as the
7136 * list proper starts with either it if 0, or the next element if not.
7137 * (But we force it to contain either 0 or 1) */
7139 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7141 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7144 #ifndef PERL_IN_XSUB_RE
7146 Perl__new_invlist(pTHX_ IV initial_size)
7149 /* Return a pointer to a newly constructed inversion list, with enough
7150 * space to store 'initial_size' elements. If that number is negative, a
7151 * system default is used instead */
7155 if (initial_size < 0) {
7156 initial_size = INVLIST_INITIAL_LEN;
7159 /* Allocate the initial space */
7160 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7161 invlist_set_len(new_list, 0);
7163 /* Force iterinit() to be used to get iteration to work */
7164 *get_invlist_iter_addr(new_list) = UV_MAX;
7166 /* This should force a segfault if a method doesn't initialize this
7168 *get_invlist_zero_addr(new_list) = UV_MAX;
7170 *get_invlist_previous_index_addr(new_list) = 0;
7171 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7172 #if HEADER_LENGTH != 5
7173 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7181 S__new_invlist_C_array(pTHX_ UV* list)
7183 /* Return a pointer to a newly constructed inversion list, initialized to
7184 * point to <list>, which has to be in the exact correct inversion list
7185 * form, including internal fields. Thus this is a dangerous routine that
7186 * should not be used in the wrong hands */
7188 SV* invlist = newSV_type(SVt_PV);
7190 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7192 SvPV_set(invlist, (char *) list);
7193 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7194 shouldn't touch it */
7195 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7197 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7198 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7205 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7207 /* Grow the maximum size of an inversion list */
7209 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7211 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7214 PERL_STATIC_INLINE void
7215 S_invlist_trim(pTHX_ SV* const invlist)
7217 PERL_ARGS_ASSERT_INVLIST_TRIM;
7219 /* Change the length of the inversion list to how many entries it currently
7222 SvPV_shrink_to_cur((SV *) invlist);
7225 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7228 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7230 /* Subject to change or removal. Append the range from 'start' to 'end' at
7231 * the end of the inversion list. The range must be above any existing
7235 UV max = invlist_max(invlist);
7236 UV len = _invlist_len(invlist);
7238 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7240 if (len == 0) { /* Empty lists must be initialized */
7241 array = _invlist_array_init(invlist, start == 0);
7244 /* Here, the existing list is non-empty. The current max entry in the
7245 * list is generally the first value not in the set, except when the
7246 * set extends to the end of permissible values, in which case it is
7247 * the first entry in that final set, and so this call is an attempt to
7248 * append out-of-order */
7250 UV final_element = len - 1;
7251 array = invlist_array(invlist);
7252 if (array[final_element] > start
7253 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7255 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7256 array[final_element], start,
7257 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7260 /* Here, it is a legal append. If the new range begins with the first
7261 * value not in the set, it is extending the set, so the new first
7262 * value not in the set is one greater than the newly extended range.
7264 if (array[final_element] == start) {
7265 if (end != UV_MAX) {
7266 array[final_element] = end + 1;
7269 /* But if the end is the maximum representable on the machine,
7270 * just let the range that this would extend to have no end */
7271 invlist_set_len(invlist, len - 1);
7277 /* Here the new range doesn't extend any existing set. Add it */
7279 len += 2; /* Includes an element each for the start and end of range */
7281 /* If overflows the existing space, extend, which may cause the array to be
7284 invlist_extend(invlist, len);
7285 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7286 failure in invlist_array() */
7287 array = invlist_array(invlist);
7290 invlist_set_len(invlist, len);
7293 /* The next item on the list starts the range, the one after that is
7294 * one past the new range. */
7295 array[len - 2] = start;
7296 if (end != UV_MAX) {
7297 array[len - 1] = end + 1;
7300 /* But if the end is the maximum representable on the machine, just let
7301 * the range have no end */
7302 invlist_set_len(invlist, len - 1);
7306 #ifndef PERL_IN_XSUB_RE
7309 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7311 /* Searches the inversion list for the entry that contains the input code
7312 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7313 * return value is the index into the list's array of the range that
7318 IV high = _invlist_len(invlist);
7319 const IV highest_element = high - 1;
7322 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7324 /* If list is empty, return failure. */
7329 /* If the code point is before the first element, return failure. (We
7330 * can't combine this with the test above, because we can't get the array
7331 * unless we know the list is non-empty) */
7332 array = invlist_array(invlist);
7334 mid = invlist_previous_index(invlist);
7335 assert(mid >=0 && mid <= highest_element);
7337 /* <mid> contains the cache of the result of the previous call to this
7338 * function (0 the first time). See if this call is for the same result,
7339 * or if it is for mid-1. This is under the theory that calls to this
7340 * function will often be for related code points that are near each other.
7341 * And benchmarks show that caching gives better results. We also test
7342 * here if the code point is within the bounds of the list. These tests
7343 * replace others that would have had to be made anyway to make sure that
7344 * the array bounds were not exceeded, and give us extra information at the
7346 if (cp >= array[mid]) {
7347 if (cp >= array[highest_element]) {
7348 return highest_element;
7351 /* Here, array[mid] <= cp < array[highest_element]. This means that
7352 * the final element is not the answer, so can exclude it; it also
7353 * means that <mid> is not the final element, so can refer to 'mid + 1'
7355 if (cp < array[mid + 1]) {
7361 else { /* cp < aray[mid] */
7362 if (cp < array[0]) { /* Fail if outside the array */
7366 if (cp >= array[mid - 1]) {
7371 /* Binary search. What we are looking for is <i> such that
7372 * array[i] <= cp < array[i+1]
7373 * The loop below converges on the i+1. Note that there may not be an
7374 * (i+1)th element in the array, and things work nonetheless */
7375 while (low < high) {
7376 mid = (low + high) / 2;
7377 assert(mid <= highest_element);
7378 if (array[mid] <= cp) { /* cp >= array[mid] */
7381 /* We could do this extra test to exit the loop early.
7382 if (cp < array[low]) {
7387 else { /* cp < array[mid] */
7394 invlist_set_previous_index(invlist, high);
7399 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7401 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7402 * but is used when the swash has an inversion list. This makes this much
7403 * faster, as it uses a binary search instead of a linear one. This is
7404 * intimately tied to that function, and perhaps should be in utf8.c,
7405 * except it is intimately tied to inversion lists as well. It assumes
7406 * that <swatch> is all 0's on input */
7409 const IV len = _invlist_len(invlist);
7413 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7415 if (len == 0) { /* Empty inversion list */
7419 array = invlist_array(invlist);
7421 /* Find which element it is */
7422 i = _invlist_search(invlist, start);
7424 /* We populate from <start> to <end> */
7425 while (current < end) {
7428 /* The inversion list gives the results for every possible code point
7429 * after the first one in the list. Only those ranges whose index is
7430 * even are ones that the inversion list matches. For the odd ones,
7431 * and if the initial code point is not in the list, we have to skip
7432 * forward to the next element */
7433 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7435 if (i >= len) { /* Finished if beyond the end of the array */
7439 if (current >= end) { /* Finished if beyond the end of what we
7441 if (LIKELY(end < UV_MAX)) {
7445 /* We get here when the upper bound is the maximum
7446 * representable on the machine, and we are looking for just
7447 * that code point. Have to special case it */
7449 goto join_end_of_list;
7452 assert(current >= start);
7454 /* The current range ends one below the next one, except don't go past
7457 upper = (i < len && array[i] < end) ? array[i] : end;
7459 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7460 * for each code point in it */
7461 for (; current < upper; current++) {
7462 const STRLEN offset = (STRLEN)(current - start);
7463 swatch[offset >> 3] |= 1 << (offset & 7);
7468 /* Quit if at the end of the list */
7471 /* But first, have to deal with the highest possible code point on
7472 * the platform. The previous code assumes that <end> is one
7473 * beyond where we want to populate, but that is impossible at the
7474 * platform's infinity, so have to handle it specially */
7475 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7477 const STRLEN offset = (STRLEN)(end - start);
7478 swatch[offset >> 3] |= 1 << (offset & 7);
7483 /* Advance to the next range, which will be for code points not in the
7492 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7494 /* Take the union of two inversion lists and point <output> to it. *output
7495 * should be defined upon input, and if it points to one of the two lists,
7496 * the reference count to that list will be decremented. The first list,
7497 * <a>, may be NULL, in which case a copy of the second list is returned.
7498 * If <complement_b> is TRUE, the union is taken of the complement
7499 * (inversion) of <b> instead of b itself.
7501 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7502 * Richard Gillam, published by Addison-Wesley, and explained at some
7503 * length there. The preface says to incorporate its examples into your
7504 * code at your own risk.
7506 * The algorithm is like a merge sort.
7508 * XXX A potential performance improvement is to keep track as we go along
7509 * if only one of the inputs contributes to the result, meaning the other
7510 * is a subset of that one. In that case, we can skip the final copy and
7511 * return the larger of the input lists, but then outside code might need
7512 * to keep track of whether to free the input list or not */
7514 UV* array_a; /* a's array */
7516 UV len_a; /* length of a's array */
7519 SV* u; /* the resulting union */
7523 UV i_a = 0; /* current index into a's array */
7527 /* running count, as explained in the algorithm source book; items are
7528 * stopped accumulating and are output when the count changes to/from 0.
7529 * The count is incremented when we start a range that's in the set, and
7530 * decremented when we start a range that's not in the set. So its range
7531 * is 0 to 2. Only when the count is zero is something not in the set.
7535 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7538 /* If either one is empty, the union is the other one */
7539 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7546 *output = invlist_clone(b);
7548 _invlist_invert(*output);
7550 } /* else *output already = b; */
7553 else if ((len_b = _invlist_len(b)) == 0) {
7558 /* The complement of an empty list is a list that has everything in it,
7559 * so the union with <a> includes everything too */
7564 *output = _new_invlist(1);
7565 _append_range_to_invlist(*output, 0, UV_MAX);
7567 else if (*output != a) {
7568 *output = invlist_clone(a);
7570 /* else *output already = a; */
7574 /* Here both lists exist and are non-empty */
7575 array_a = invlist_array(a);
7576 array_b = invlist_array(b);
7578 /* If are to take the union of 'a' with the complement of b, set it
7579 * up so are looking at b's complement. */
7582 /* To complement, we invert: if the first element is 0, remove it. To
7583 * do this, we just pretend the array starts one later, and clear the
7584 * flag as we don't have to do anything else later */
7585 if (array_b[0] == 0) {
7588 complement_b = FALSE;
7592 /* But if the first element is not zero, we unshift a 0 before the
7593 * array. The data structure reserves a space for that 0 (which
7594 * should be a '1' right now), so physical shifting is unneeded,
7595 * but temporarily change that element to 0. Before exiting the
7596 * routine, we must restore the element to '1' */
7603 /* Size the union for the worst case: that the sets are completely
7605 u = _new_invlist(len_a + len_b);
7607 /* Will contain U+0000 if either component does */
7608 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7609 || (len_b > 0 && array_b[0] == 0));
7611 /* Go through each list item by item, stopping when exhausted one of
7613 while (i_a < len_a && i_b < len_b) {
7614 UV cp; /* The element to potentially add to the union's array */
7615 bool cp_in_set; /* is it in the the input list's set or not */
7617 /* We need to take one or the other of the two inputs for the union.
7618 * Since we are merging two sorted lists, we take the smaller of the
7619 * next items. In case of a tie, we take the one that is in its set
7620 * first. If we took one not in the set first, it would decrement the
7621 * count, possibly to 0 which would cause it to be output as ending the
7622 * range, and the next time through we would take the same number, and
7623 * output it again as beginning the next range. By doing it the
7624 * opposite way, there is no possibility that the count will be
7625 * momentarily decremented to 0, and thus the two adjoining ranges will
7626 * be seamlessly merged. (In a tie and both are in the set or both not
7627 * in the set, it doesn't matter which we take first.) */
7628 if (array_a[i_a] < array_b[i_b]
7629 || (array_a[i_a] == array_b[i_b]
7630 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7632 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7636 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7640 /* Here, have chosen which of the two inputs to look at. Only output
7641 * if the running count changes to/from 0, which marks the
7642 * beginning/end of a range in that's in the set */
7645 array_u[i_u++] = cp;
7652 array_u[i_u++] = cp;
7657 /* Here, we are finished going through at least one of the lists, which
7658 * means there is something remaining in at most one. We check if the list
7659 * that hasn't been exhausted is positioned such that we are in the middle
7660 * of a range in its set or not. (i_a and i_b point to the element beyond
7661 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7662 * is potentially more to output.
7663 * There are four cases:
7664 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7665 * in the union is entirely from the non-exhausted set.
7666 * 2) Both were in their sets, count is 2. Nothing further should
7667 * be output, as everything that remains will be in the exhausted
7668 * list's set, hence in the union; decrementing to 1 but not 0 insures
7670 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7671 * Nothing further should be output because the union includes
7672 * everything from the exhausted set. Not decrementing ensures that.
7673 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7674 * decrementing to 0 insures that we look at the remainder of the
7675 * non-exhausted set */
7676 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7677 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7682 /* The final length is what we've output so far, plus what else is about to
7683 * be output. (If 'count' is non-zero, then the input list we exhausted
7684 * has everything remaining up to the machine's limit in its set, and hence
7685 * in the union, so there will be no further output. */
7688 /* At most one of the subexpressions will be non-zero */
7689 len_u += (len_a - i_a) + (len_b - i_b);
7692 /* Set result to final length, which can change the pointer to array_u, so
7694 if (len_u != _invlist_len(u)) {
7695 invlist_set_len(u, len_u);
7697 array_u = invlist_array(u);
7700 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7701 * the other) ended with everything above it not in its set. That means
7702 * that the remaining part of the union is precisely the same as the
7703 * non-exhausted list, so can just copy it unchanged. (If both list were
7704 * exhausted at the same time, then the operations below will be both 0.)
7707 IV copy_count; /* At most one will have a non-zero copy count */
7708 if ((copy_count = len_a - i_a) > 0) {
7709 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7711 else if ((copy_count = len_b - i_b) > 0) {
7712 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7716 /* We may be removing a reference to one of the inputs */
7717 if (a == *output || b == *output) {
7718 SvREFCNT_dec(*output);
7721 /* If we've changed b, restore it */
7731 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7733 /* Take the intersection of two inversion lists and point <i> to it. *i
7734 * should be defined upon input, and if it points to one of the two lists,
7735 * the reference count to that list will be decremented.
7736 * If <complement_b> is TRUE, the result will be the intersection of <a>
7737 * and the complement (or inversion) of <b> instead of <b> directly.
7739 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7740 * Richard Gillam, published by Addison-Wesley, and explained at some
7741 * length there. The preface says to incorporate its examples into your
7742 * code at your own risk. In fact, it had bugs
7744 * The algorithm is like a merge sort, and is essentially the same as the
7748 UV* array_a; /* a's array */
7750 UV len_a; /* length of a's array */
7753 SV* r; /* the resulting intersection */
7757 UV i_a = 0; /* current index into a's array */
7761 /* running count, as explained in the algorithm source book; items are
7762 * stopped accumulating and are output when the count changes to/from 2.
7763 * The count is incremented when we start a range that's in the set, and
7764 * decremented when we start a range that's not in the set. So its range
7765 * is 0 to 2. Only when the count is 2 is something in the intersection.
7769 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7772 /* Special case if either one is empty */
7773 len_a = _invlist_len(a);
7774 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7776 if (len_a != 0 && complement_b) {
7778 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7779 * be empty. Here, also we are using 'b's complement, which hence
7780 * must be every possible code point. Thus the intersection is
7783 *i = invlist_clone(a);
7789 /* else *i is already 'a' */
7793 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7794 * intersection must be empty */
7801 *i = _new_invlist(0);
7805 /* Here both lists exist and are non-empty */
7806 array_a = invlist_array(a);
7807 array_b = invlist_array(b);
7809 /* If are to take the intersection of 'a' with the complement of b, set it
7810 * up so are looking at b's complement. */
7813 /* To complement, we invert: if the first element is 0, remove it. To
7814 * do this, we just pretend the array starts one later, and clear the
7815 * flag as we don't have to do anything else later */
7816 if (array_b[0] == 0) {
7819 complement_b = FALSE;
7823 /* But if the first element is not zero, we unshift a 0 before the
7824 * array. The data structure reserves a space for that 0 (which
7825 * should be a '1' right now), so physical shifting is unneeded,
7826 * but temporarily change that element to 0. Before exiting the
7827 * routine, we must restore the element to '1' */
7834 /* Size the intersection for the worst case: that the intersection ends up
7835 * fragmenting everything to be completely disjoint */
7836 r= _new_invlist(len_a + len_b);
7838 /* Will contain U+0000 iff both components do */
7839 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7840 && len_b > 0 && array_b[0] == 0);
7842 /* Go through each list item by item, stopping when exhausted one of
7844 while (i_a < len_a && i_b < len_b) {
7845 UV cp; /* The element to potentially add to the intersection's
7847 bool cp_in_set; /* Is it in the input list's set or not */
7849 /* We need to take one or the other of the two inputs for the
7850 * intersection. Since we are merging two sorted lists, we take the
7851 * smaller of the next items. In case of a tie, we take the one that
7852 * is not in its set first (a difference from the union algorithm). If
7853 * we took one in the set first, it would increment the count, possibly
7854 * to 2 which would cause it to be output as starting a range in the
7855 * intersection, and the next time through we would take that same
7856 * number, and output it again as ending the set. By doing it the
7857 * opposite of this, there is no possibility that the count will be
7858 * momentarily incremented to 2. (In a tie and both are in the set or
7859 * both not in the set, it doesn't matter which we take first.) */
7860 if (array_a[i_a] < array_b[i_b]
7861 || (array_a[i_a] == array_b[i_b]
7862 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7864 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7868 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7872 /* Here, have chosen which of the two inputs to look at. Only output
7873 * if the running count changes to/from 2, which marks the
7874 * beginning/end of a range that's in the intersection */
7878 array_r[i_r++] = cp;
7883 array_r[i_r++] = cp;
7889 /* Here, we are finished going through at least one of the lists, which
7890 * means there is something remaining in at most one. We check if the list
7891 * that has been exhausted is positioned such that we are in the middle
7892 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7893 * the ones we care about.) There are four cases:
7894 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7895 * nothing left in the intersection.
7896 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7897 * above 2. What should be output is exactly that which is in the
7898 * non-exhausted set, as everything it has is also in the intersection
7899 * set, and everything it doesn't have can't be in the intersection
7900 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7901 * gets incremented to 2. Like the previous case, the intersection is
7902 * everything that remains in the non-exhausted set.
7903 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7904 * remains 1. And the intersection has nothing more. */
7905 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7906 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7911 /* The final length is what we've output so far plus what else is in the
7912 * intersection. At most one of the subexpressions below will be non-zero */
7915 len_r += (len_a - i_a) + (len_b - i_b);
7918 /* Set result to final length, which can change the pointer to array_r, so
7920 if (len_r != _invlist_len(r)) {
7921 invlist_set_len(r, len_r);
7923 array_r = invlist_array(r);
7926 /* Finish outputting any remaining */
7927 if (count >= 2) { /* At most one will have a non-zero copy count */
7929 if ((copy_count = len_a - i_a) > 0) {
7930 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7932 else if ((copy_count = len_b - i_b) > 0) {
7933 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7937 /* We may be removing a reference to one of the inputs */
7938 if (a == *i || b == *i) {
7942 /* If we've changed b, restore it */
7952 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7954 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7955 * set. A pointer to the inversion list is returned. This may actually be
7956 * a new list, in which case the passed in one has been destroyed. The
7957 * passed in inversion list can be NULL, in which case a new one is created
7958 * with just the one range in it */
7963 if (invlist == NULL) {
7964 invlist = _new_invlist(2);
7968 len = _invlist_len(invlist);
7971 /* If comes after the final entry, can just append it to the end */
7973 || start >= invlist_array(invlist)
7974 [_invlist_len(invlist) - 1])
7976 _append_range_to_invlist(invlist, start, end);
7980 /* Here, can't just append things, create and return a new inversion list
7981 * which is the union of this range and the existing inversion list */
7982 range_invlist = _new_invlist(2);
7983 _append_range_to_invlist(range_invlist, start, end);
7985 _invlist_union(invlist, range_invlist, &invlist);
7987 /* The temporary can be freed */
7988 SvREFCNT_dec(range_invlist);
7995 PERL_STATIC_INLINE SV*
7996 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7997 return _add_range_to_invlist(invlist, cp, cp);
8000 #ifndef PERL_IN_XSUB_RE
8002 Perl__invlist_invert(pTHX_ SV* const invlist)
8004 /* Complement the input inversion list. This adds a 0 if the list didn't
8005 * have a zero; removes it otherwise. As described above, the data
8006 * structure is set up so that this is very efficient */
8008 UV* len_pos = _get_invlist_len_addr(invlist);
8010 PERL_ARGS_ASSERT__INVLIST_INVERT;
8012 /* The inverse of matching nothing is matching everything */
8013 if (*len_pos == 0) {
8014 _append_range_to_invlist(invlist, 0, UV_MAX);
8018 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8019 * zero element was a 0, so it is being removed, so the length decrements
8020 * by 1; and vice-versa. SvCUR is unaffected */
8021 if (*get_invlist_zero_addr(invlist) ^= 1) {
8030 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8032 /* Complement the input inversion list (which must be a Unicode property,
8033 * all of which don't match above the Unicode maximum code point.) And
8034 * Perl has chosen to not have the inversion match above that either. This
8035 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8041 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8043 _invlist_invert(invlist);
8045 len = _invlist_len(invlist);
8047 if (len != 0) { /* If empty do nothing */
8048 array = invlist_array(invlist);
8049 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8050 /* Add 0x110000. First, grow if necessary */
8052 if (invlist_max(invlist) < len) {
8053 invlist_extend(invlist, len);
8054 array = invlist_array(invlist);
8056 invlist_set_len(invlist, len);
8057 array[len - 1] = PERL_UNICODE_MAX + 1;
8059 else { /* Remove the 0x110000 */
8060 invlist_set_len(invlist, len - 1);
8068 PERL_STATIC_INLINE SV*
8069 S_invlist_clone(pTHX_ SV* const invlist)
8072 /* Return a new inversion list that is a copy of the input one, which is
8075 /* Need to allocate extra space to accommodate Perl's addition of a
8076 * trailing NUL to SvPV's, since it thinks they are always strings */
8077 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8078 STRLEN length = SvCUR(invlist);
8080 PERL_ARGS_ASSERT_INVLIST_CLONE;
8082 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8083 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8088 PERL_STATIC_INLINE UV*
8089 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8091 /* Return the address of the UV that contains the current iteration
8094 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8096 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8099 PERL_STATIC_INLINE UV*
8100 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8102 /* Return the address of the UV that contains the version id. */
8104 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8106 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8109 PERL_STATIC_INLINE void
8110 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8112 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8114 *get_invlist_iter_addr(invlist) = 0;
8118 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8120 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8121 * This call sets in <*start> and <*end>, the next range in <invlist>.
8122 * Returns <TRUE> if successful and the next call will return the next
8123 * range; <FALSE> if was already at the end of the list. If the latter,
8124 * <*start> and <*end> are unchanged, and the next call to this function
8125 * will start over at the beginning of the list */
8127 UV* pos = get_invlist_iter_addr(invlist);
8128 UV len = _invlist_len(invlist);
8131 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8134 *pos = UV_MAX; /* Force iternit() to be required next time */
8138 array = invlist_array(invlist);
8140 *start = array[(*pos)++];
8146 *end = array[(*pos)++] - 1;
8152 PERL_STATIC_INLINE UV
8153 S_invlist_highest(pTHX_ SV* const invlist)
8155 /* Returns the highest code point that matches an inversion list. This API
8156 * has an ambiguity, as it returns 0 under either the highest is actually
8157 * 0, or if the list is empty. If this distinction matters to you, check
8158 * for emptiness before calling this function */
8160 UV len = _invlist_len(invlist);
8163 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8169 array = invlist_array(invlist);
8171 /* The last element in the array in the inversion list always starts a
8172 * range that goes to infinity. That range may be for code points that are
8173 * matched in the inversion list, or it may be for ones that aren't
8174 * matched. In the latter case, the highest code point in the set is one
8175 * less than the beginning of this range; otherwise it is the final element
8176 * of this range: infinity */
8177 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8179 : array[len - 1] - 1;
8182 #ifndef PERL_IN_XSUB_RE
8184 Perl__invlist_contents(pTHX_ SV* const invlist)
8186 /* Get the contents of an inversion list into a string SV so that they can
8187 * be printed out. It uses the format traditionally done for debug tracing
8191 SV* output = newSVpvs("\n");
8193 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8195 invlist_iterinit(invlist);
8196 while (invlist_iternext(invlist, &start, &end)) {
8197 if (end == UV_MAX) {
8198 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8200 else if (end != start) {
8201 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8205 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8215 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8217 /* Dumps out the ranges in an inversion list. The string 'header'
8218 * if present is output on a line before the first range */
8222 if (header && strlen(header)) {
8223 PerlIO_printf(Perl_debug_log, "%s\n", header);
8225 invlist_iterinit(invlist);
8226 while (invlist_iternext(invlist, &start, &end)) {
8227 if (end == UV_MAX) {
8228 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8231 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8239 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8241 /* Return a boolean as to if the two passed in inversion lists are
8242 * identical. The final argument, if TRUE, says to take the complement of
8243 * the second inversion list before doing the comparison */
8245 UV* array_a = invlist_array(a);
8246 UV* array_b = invlist_array(b);
8247 UV len_a = _invlist_len(a);
8248 UV len_b = _invlist_len(b);
8250 UV i = 0; /* current index into the arrays */
8251 bool retval = TRUE; /* Assume are identical until proven otherwise */
8253 PERL_ARGS_ASSERT__INVLISTEQ;
8255 /* If are to compare 'a' with the complement of b, set it
8256 * up so are looking at b's complement. */
8259 /* The complement of nothing is everything, so <a> would have to have
8260 * just one element, starting at zero (ending at infinity) */
8262 return (len_a == 1 && array_a[0] == 0);
8264 else if (array_b[0] == 0) {
8266 /* Otherwise, to complement, we invert. Here, the first element is
8267 * 0, just remove it. To do this, we just pretend the array starts
8268 * one later, and clear the flag as we don't have to do anything
8273 complement_b = FALSE;
8277 /* But if the first element is not zero, we unshift a 0 before the
8278 * array. The data structure reserves a space for that 0 (which
8279 * should be a '1' right now), so physical shifting is unneeded,
8280 * but temporarily change that element to 0. Before exiting the
8281 * routine, we must restore the element to '1' */
8288 /* Make sure that the lengths are the same, as well as the final element
8289 * before looping through the remainder. (Thus we test the length, final,
8290 * and first elements right off the bat) */
8291 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8294 else for (i = 0; i < len_a - 1; i++) {
8295 if (array_a[i] != array_b[i]) {
8308 #undef HEADER_LENGTH
8309 #undef INVLIST_INITIAL_LENGTH
8310 #undef TO_INTERNAL_SIZE
8311 #undef FROM_INTERNAL_SIZE
8312 #undef INVLIST_LEN_OFFSET
8313 #undef INVLIST_ZERO_OFFSET
8314 #undef INVLIST_ITER_OFFSET
8315 #undef INVLIST_VERSION_ID
8317 /* End of inversion list object */
8320 - reg - regular expression, i.e. main body or parenthesized thing
8322 * Caller must absorb opening parenthesis.
8324 * Combining parenthesis handling with the base level of regular expression
8325 * is a trifle forced, but the need to tie the tails of the branches to what
8326 * follows makes it hard to avoid.
8328 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8330 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8332 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8336 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8337 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8340 regnode *ret; /* Will be the head of the group. */
8343 regnode *ender = NULL;
8346 U32 oregflags = RExC_flags;
8347 bool have_branch = 0;
8349 I32 freeze_paren = 0;
8350 I32 after_freeze = 0;
8352 /* for (?g), (?gc), and (?o) warnings; warning
8353 about (?c) will warn about (?g) -- japhy */
8355 #define WASTED_O 0x01
8356 #define WASTED_G 0x02
8357 #define WASTED_C 0x04
8358 #define WASTED_GC (0x02|0x04)
8359 I32 wastedflags = 0x00;
8361 char * parse_start = RExC_parse; /* MJD */
8362 char * const oregcomp_parse = RExC_parse;
8364 GET_RE_DEBUG_FLAGS_DECL;
8366 PERL_ARGS_ASSERT_REG;
8367 DEBUG_PARSE("reg ");
8369 *flagp = 0; /* Tentatively. */
8372 /* Make an OPEN node, if parenthesized. */
8374 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8375 char *start_verb = RExC_parse;
8376 STRLEN verb_len = 0;
8377 char *start_arg = NULL;
8378 unsigned char op = 0;
8380 int internal_argval = 0; /* internal_argval is only useful if !argok */
8381 while ( *RExC_parse && *RExC_parse != ')' ) {
8382 if ( *RExC_parse == ':' ) {
8383 start_arg = RExC_parse + 1;
8389 verb_len = RExC_parse - start_verb;
8392 while ( *RExC_parse && *RExC_parse != ')' )
8394 if ( *RExC_parse != ')' )
8395 vFAIL("Unterminated verb pattern argument");
8396 if ( RExC_parse == start_arg )
8399 if ( *RExC_parse != ')' )
8400 vFAIL("Unterminated verb pattern");
8403 switch ( *start_verb ) {
8404 case 'A': /* (*ACCEPT) */
8405 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8407 internal_argval = RExC_nestroot;
8410 case 'C': /* (*COMMIT) */
8411 if ( memEQs(start_verb,verb_len,"COMMIT") )
8414 case 'F': /* (*FAIL) */
8415 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8420 case ':': /* (*:NAME) */
8421 case 'M': /* (*MARK:NAME) */
8422 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8427 case 'P': /* (*PRUNE) */
8428 if ( memEQs(start_verb,verb_len,"PRUNE") )
8431 case 'S': /* (*SKIP) */
8432 if ( memEQs(start_verb,verb_len,"SKIP") )
8435 case 'T': /* (*THEN) */
8436 /* [19:06] <TimToady> :: is then */
8437 if ( memEQs(start_verb,verb_len,"THEN") ) {
8439 RExC_seen |= REG_SEEN_CUTGROUP;
8445 vFAIL3("Unknown verb pattern '%.*s'",
8446 verb_len, start_verb);
8449 if ( start_arg && internal_argval ) {
8450 vFAIL3("Verb pattern '%.*s' may not have an argument",
8451 verb_len, start_verb);
8452 } else if ( argok < 0 && !start_arg ) {
8453 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8454 verb_len, start_verb);
8456 ret = reganode(pRExC_state, op, internal_argval);
8457 if ( ! internal_argval && ! SIZE_ONLY ) {
8459 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8460 ARG(ret) = add_data( pRExC_state, 1, "S" );
8461 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8468 if (!internal_argval)
8469 RExC_seen |= REG_SEEN_VERBARG;
8470 } else if ( start_arg ) {
8471 vFAIL3("Verb pattern '%.*s' may not have an argument",
8472 verb_len, start_verb);
8474 ret = reg_node(pRExC_state, op);
8476 nextchar(pRExC_state);
8479 if (*RExC_parse == '?') { /* (?...) */
8480 bool is_logical = 0;
8481 const char * const seqstart = RExC_parse;
8482 bool has_use_defaults = FALSE;
8485 paren = *RExC_parse++;
8486 ret = NULL; /* For look-ahead/behind. */
8489 case 'P': /* (?P...) variants for those used to PCRE/Python */
8490 paren = *RExC_parse++;
8491 if ( paren == '<') /* (?P<...>) named capture */
8493 else if (paren == '>') { /* (?P>name) named recursion */
8494 goto named_recursion;
8496 else if (paren == '=') { /* (?P=...) named backref */
8497 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8498 you change this make sure you change that */
8499 char* name_start = RExC_parse;
8501 SV *sv_dat = reg_scan_name(pRExC_state,
8502 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8503 if (RExC_parse == name_start || *RExC_parse != ')')
8504 vFAIL2("Sequence %.3s... not terminated",parse_start);
8507 num = add_data( pRExC_state, 1, "S" );
8508 RExC_rxi->data->data[num]=(void*)sv_dat;
8509 SvREFCNT_inc_simple_void(sv_dat);
8512 ret = reganode(pRExC_state,
8515 : (ASCII_FOLD_RESTRICTED)
8517 : (AT_LEAST_UNI_SEMANTICS)
8525 Set_Node_Offset(ret, parse_start+1);
8526 Set_Node_Cur_Length(ret); /* MJD */
8528 nextchar(pRExC_state);
8532 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8534 case '<': /* (?<...) */
8535 if (*RExC_parse == '!')
8537 else if (*RExC_parse != '=')
8543 case '\'': /* (?'...') */
8544 name_start= RExC_parse;
8545 svname = reg_scan_name(pRExC_state,
8546 SIZE_ONLY ? /* reverse test from the others */
8547 REG_RSN_RETURN_NAME :
8548 REG_RSN_RETURN_NULL);
8549 if (RExC_parse == name_start) {
8551 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8554 if (*RExC_parse != paren)
8555 vFAIL2("Sequence (?%c... not terminated",
8556 paren=='>' ? '<' : paren);
8560 if (!svname) /* shouldn't happen */
8562 "panic: reg_scan_name returned NULL");
8563 if (!RExC_paren_names) {
8564 RExC_paren_names= newHV();
8565 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8567 RExC_paren_name_list= newAV();
8568 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8571 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8573 sv_dat = HeVAL(he_str);
8575 /* croak baby croak */
8577 "panic: paren_name hash element allocation failed");
8578 } else if ( SvPOK(sv_dat) ) {
8579 /* (?|...) can mean we have dupes so scan to check
8580 its already been stored. Maybe a flag indicating
8581 we are inside such a construct would be useful,
8582 but the arrays are likely to be quite small, so
8583 for now we punt -- dmq */
8584 IV count = SvIV(sv_dat);
8585 I32 *pv = (I32*)SvPVX(sv_dat);
8587 for ( i = 0 ; i < count ; i++ ) {
8588 if ( pv[i] == RExC_npar ) {
8594 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8595 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8596 pv[count] = RExC_npar;
8597 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8600 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8601 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8603 SvIV_set(sv_dat, 1);
8606 /* Yes this does cause a memory leak in debugging Perls */
8607 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8608 SvREFCNT_dec(svname);
8611 /*sv_dump(sv_dat);*/
8613 nextchar(pRExC_state);
8615 goto capturing_parens;
8617 RExC_seen |= REG_SEEN_LOOKBEHIND;
8618 RExC_in_lookbehind++;
8620 case '=': /* (?=...) */
8621 RExC_seen_zerolen++;
8623 case '!': /* (?!...) */
8624 RExC_seen_zerolen++;
8625 if (*RExC_parse == ')') {
8626 ret=reg_node(pRExC_state, OPFAIL);
8627 nextchar(pRExC_state);
8631 case '|': /* (?|...) */
8632 /* branch reset, behave like a (?:...) except that
8633 buffers in alternations share the same numbers */
8635 after_freeze = freeze_paren = RExC_npar;
8637 case ':': /* (?:...) */
8638 case '>': /* (?>...) */
8640 case '$': /* (?$...) */
8641 case '@': /* (?@...) */
8642 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8644 case '#': /* (?#...) */
8645 while (*RExC_parse && *RExC_parse != ')')
8647 if (*RExC_parse != ')')
8648 FAIL("Sequence (?#... not terminated");
8649 nextchar(pRExC_state);
8652 case '0' : /* (?0) */
8653 case 'R' : /* (?R) */
8654 if (*RExC_parse != ')')
8655 FAIL("Sequence (?R) not terminated");
8656 ret = reg_node(pRExC_state, GOSTART);
8657 *flagp |= POSTPONED;
8658 nextchar(pRExC_state);
8661 { /* named and numeric backreferences */
8663 case '&': /* (?&NAME) */
8664 parse_start = RExC_parse - 1;
8667 SV *sv_dat = reg_scan_name(pRExC_state,
8668 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8669 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8671 goto gen_recurse_regop;
8672 assert(0); /* NOT REACHED */
8674 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8676 vFAIL("Illegal pattern");
8678 goto parse_recursion;
8680 case '-': /* (?-1) */
8681 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8682 RExC_parse--; /* rewind to let it be handled later */
8686 case '1': case '2': case '3': case '4': /* (?1) */
8687 case '5': case '6': case '7': case '8': case '9':
8690 num = atoi(RExC_parse);
8691 parse_start = RExC_parse - 1; /* MJD */
8692 if (*RExC_parse == '-')
8694 while (isDIGIT(*RExC_parse))
8696 if (*RExC_parse!=')')
8697 vFAIL("Expecting close bracket");
8700 if ( paren == '-' ) {
8702 Diagram of capture buffer numbering.
8703 Top line is the normal capture buffer numbers
8704 Bottom line is the negative indexing as from
8708 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8712 num = RExC_npar + num;
8715 vFAIL("Reference to nonexistent group");
8717 } else if ( paren == '+' ) {
8718 num = RExC_npar + num - 1;
8721 ret = reganode(pRExC_state, GOSUB, num);
8723 if (num > (I32)RExC_rx->nparens) {
8725 vFAIL("Reference to nonexistent group");
8727 ARG2L_SET( ret, RExC_recurse_count++);
8729 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8730 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8734 RExC_seen |= REG_SEEN_RECURSE;
8735 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8736 Set_Node_Offset(ret, parse_start); /* MJD */
8738 *flagp |= POSTPONED;
8739 nextchar(pRExC_state);
8741 } /* named and numeric backreferences */
8742 assert(0); /* NOT REACHED */
8744 case '?': /* (??...) */
8746 if (*RExC_parse != '{') {
8748 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8751 *flagp |= POSTPONED;
8752 paren = *RExC_parse++;
8754 case '{': /* (?{...}) */
8757 struct reg_code_block *cb;
8759 RExC_seen_zerolen++;
8761 if ( !pRExC_state->num_code_blocks
8762 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8763 || pRExC_state->code_blocks[pRExC_state->code_index].start
8764 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8767 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8768 FAIL("panic: Sequence (?{...}): no code block found\n");
8769 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8771 /* this is a pre-compiled code block (?{...}) */
8772 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8773 RExC_parse = RExC_start + cb->end;
8776 if (cb->src_regex) {
8777 n = add_data(pRExC_state, 2, "rl");
8778 RExC_rxi->data->data[n] =
8779 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8780 RExC_rxi->data->data[n+1] = (void*)o;
8783 n = add_data(pRExC_state, 1,
8784 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8785 RExC_rxi->data->data[n] = (void*)o;
8788 pRExC_state->code_index++;
8789 nextchar(pRExC_state);
8793 ret = reg_node(pRExC_state, LOGICAL);
8794 eval = reganode(pRExC_state, EVAL, n);
8797 /* for later propagation into (??{}) return value */
8798 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8800 REGTAIL(pRExC_state, ret, eval);
8801 /* deal with the length of this later - MJD */
8804 ret = reganode(pRExC_state, EVAL, n);
8805 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8806 Set_Node_Offset(ret, parse_start);
8809 case '(': /* (?(?{...})...) and (?(?=...)...) */
8812 if (RExC_parse[0] == '?') { /* (?(?...)) */
8813 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8814 || RExC_parse[1] == '<'
8815 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8818 ret = reg_node(pRExC_state, LOGICAL);
8821 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8825 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8826 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8828 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8829 char *name_start= RExC_parse++;
8831 SV *sv_dat=reg_scan_name(pRExC_state,
8832 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8833 if (RExC_parse == name_start || *RExC_parse != ch)
8834 vFAIL2("Sequence (?(%c... not terminated",
8835 (ch == '>' ? '<' : ch));
8838 num = add_data( pRExC_state, 1, "S" );
8839 RExC_rxi->data->data[num]=(void*)sv_dat;
8840 SvREFCNT_inc_simple_void(sv_dat);
8842 ret = reganode(pRExC_state,NGROUPP,num);
8843 goto insert_if_check_paren;
8845 else if (RExC_parse[0] == 'D' &&
8846 RExC_parse[1] == 'E' &&
8847 RExC_parse[2] == 'F' &&
8848 RExC_parse[3] == 'I' &&
8849 RExC_parse[4] == 'N' &&
8850 RExC_parse[5] == 'E')
8852 ret = reganode(pRExC_state,DEFINEP,0);
8855 goto insert_if_check_paren;
8857 else if (RExC_parse[0] == 'R') {
8860 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8861 parno = atoi(RExC_parse++);
8862 while (isDIGIT(*RExC_parse))
8864 } else if (RExC_parse[0] == '&') {
8867 sv_dat = reg_scan_name(pRExC_state,
8868 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8869 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8871 ret = reganode(pRExC_state,INSUBP,parno);
8872 goto insert_if_check_paren;
8874 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8877 parno = atoi(RExC_parse++);
8879 while (isDIGIT(*RExC_parse))
8881 ret = reganode(pRExC_state, GROUPP, parno);
8883 insert_if_check_paren:
8884 if ((c = *nextchar(pRExC_state)) != ')')
8885 vFAIL("Switch condition not recognized");
8887 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8888 br = regbranch(pRExC_state, &flags, 1,depth+1);
8890 br = reganode(pRExC_state, LONGJMP, 0);
8892 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8893 c = *nextchar(pRExC_state);
8898 vFAIL("(?(DEFINE)....) does not allow branches");
8899 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8900 regbranch(pRExC_state, &flags, 1,depth+1);
8901 REGTAIL(pRExC_state, ret, lastbr);
8904 c = *nextchar(pRExC_state);
8909 vFAIL("Switch (?(condition)... contains too many branches");
8910 ender = reg_node(pRExC_state, TAIL);
8911 REGTAIL(pRExC_state, br, ender);
8913 REGTAIL(pRExC_state, lastbr, ender);
8914 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8917 REGTAIL(pRExC_state, ret, ender);
8918 RExC_size++; /* XXX WHY do we need this?!!
8919 For large programs it seems to be required
8920 but I can't figure out why. -- dmq*/
8924 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8928 RExC_parse--; /* for vFAIL to print correctly */
8929 vFAIL("Sequence (? incomplete");
8931 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8933 has_use_defaults = TRUE;
8934 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8935 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8936 ? REGEX_UNICODE_CHARSET
8937 : REGEX_DEPENDS_CHARSET);
8941 parse_flags: /* (?i) */
8943 U32 posflags = 0, negflags = 0;
8944 U32 *flagsp = &posflags;
8945 char has_charset_modifier = '\0';
8946 regex_charset cs = get_regex_charset(RExC_flags);
8947 if (cs == REGEX_DEPENDS_CHARSET
8948 && (RExC_utf8 || RExC_uni_semantics))
8950 cs = REGEX_UNICODE_CHARSET;
8953 while (*RExC_parse) {
8954 /* && strchr("iogcmsx", *RExC_parse) */
8955 /* (?g), (?gc) and (?o) are useless here
8956 and must be globally applied -- japhy */
8957 switch (*RExC_parse) {
8958 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8959 case LOCALE_PAT_MOD:
8960 if (has_charset_modifier) {
8961 goto excess_modifier;
8963 else if (flagsp == &negflags) {
8966 cs = REGEX_LOCALE_CHARSET;
8967 has_charset_modifier = LOCALE_PAT_MOD;
8968 RExC_contains_locale = 1;
8970 case UNICODE_PAT_MOD:
8971 if (has_charset_modifier) {
8972 goto excess_modifier;
8974 else if (flagsp == &negflags) {
8977 cs = REGEX_UNICODE_CHARSET;
8978 has_charset_modifier = UNICODE_PAT_MOD;
8980 case ASCII_RESTRICT_PAT_MOD:
8981 if (flagsp == &negflags) {
8984 if (has_charset_modifier) {
8985 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8986 goto excess_modifier;
8988 /* Doubled modifier implies more restricted */
8989 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8992 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8994 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8996 case DEPENDS_PAT_MOD:
8997 if (has_use_defaults) {
8998 goto fail_modifiers;
9000 else if (flagsp == &negflags) {
9003 else if (has_charset_modifier) {
9004 goto excess_modifier;
9007 /* The dual charset means unicode semantics if the
9008 * pattern (or target, not known until runtime) are
9009 * utf8, or something in the pattern indicates unicode
9011 cs = (RExC_utf8 || RExC_uni_semantics)
9012 ? REGEX_UNICODE_CHARSET
9013 : REGEX_DEPENDS_CHARSET;
9014 has_charset_modifier = DEPENDS_PAT_MOD;
9018 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9019 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9021 else if (has_charset_modifier == *(RExC_parse - 1)) {
9022 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9025 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9030 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9032 case ONCE_PAT_MOD: /* 'o' */
9033 case GLOBAL_PAT_MOD: /* 'g' */
9034 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9035 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9036 if (! (wastedflags & wflagbit) ) {
9037 wastedflags |= wflagbit;
9040 "Useless (%s%c) - %suse /%c modifier",
9041 flagsp == &negflags ? "?-" : "?",
9043 flagsp == &negflags ? "don't " : "",
9050 case CONTINUE_PAT_MOD: /* 'c' */
9051 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9052 if (! (wastedflags & WASTED_C) ) {
9053 wastedflags |= WASTED_GC;
9056 "Useless (%sc) - %suse /gc modifier",
9057 flagsp == &negflags ? "?-" : "?",
9058 flagsp == &negflags ? "don't " : ""
9063 case KEEPCOPY_PAT_MOD: /* 'p' */
9064 if (flagsp == &negflags) {
9066 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9068 *flagsp |= RXf_PMf_KEEPCOPY;
9072 /* A flag is a default iff it is following a minus, so
9073 * if there is a minus, it means will be trying to
9074 * re-specify a default which is an error */
9075 if (has_use_defaults || flagsp == &negflags) {
9078 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9082 wastedflags = 0; /* reset so (?g-c) warns twice */
9088 RExC_flags |= posflags;
9089 RExC_flags &= ~negflags;
9090 set_regex_charset(&RExC_flags, cs);
9092 oregflags |= posflags;
9093 oregflags &= ~negflags;
9094 set_regex_charset(&oregflags, cs);
9096 nextchar(pRExC_state);
9107 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9112 }} /* one for the default block, one for the switch */
9119 ret = reganode(pRExC_state, OPEN, parno);
9122 RExC_nestroot = parno;
9123 if (RExC_seen & REG_SEEN_RECURSE
9124 && !RExC_open_parens[parno-1])
9126 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9127 "Setting open paren #%"IVdf" to %d\n",
9128 (IV)parno, REG_NODE_NUM(ret)));
9129 RExC_open_parens[parno-1]= ret;
9132 Set_Node_Length(ret, 1); /* MJD */
9133 Set_Node_Offset(ret, RExC_parse); /* MJD */
9141 /* Pick up the branches, linking them together. */
9142 parse_start = RExC_parse; /* MJD */
9143 br = regbranch(pRExC_state, &flags, 1,depth+1);
9145 /* branch_len = (paren != 0); */
9149 if (*RExC_parse == '|') {
9150 if (!SIZE_ONLY && RExC_extralen) {
9151 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9154 reginsert(pRExC_state, BRANCH, br, depth+1);
9155 Set_Node_Length(br, paren != 0);
9156 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9160 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9162 else if (paren == ':') {
9163 *flagp |= flags&SIMPLE;
9165 if (is_open) { /* Starts with OPEN. */
9166 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9168 else if (paren != '?') /* Not Conditional */
9170 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9172 while (*RExC_parse == '|') {
9173 if (!SIZE_ONLY && RExC_extralen) {
9174 ender = reganode(pRExC_state, LONGJMP,0);
9175 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9178 RExC_extralen += 2; /* Account for LONGJMP. */
9179 nextchar(pRExC_state);
9181 if (RExC_npar > after_freeze)
9182 after_freeze = RExC_npar;
9183 RExC_npar = freeze_paren;
9185 br = regbranch(pRExC_state, &flags, 0, depth+1);
9189 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9191 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9194 if (have_branch || paren != ':') {
9195 /* Make a closing node, and hook it on the end. */
9198 ender = reg_node(pRExC_state, TAIL);
9201 ender = reganode(pRExC_state, CLOSE, parno);
9202 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9203 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9204 "Setting close paren #%"IVdf" to %d\n",
9205 (IV)parno, REG_NODE_NUM(ender)));
9206 RExC_close_parens[parno-1]= ender;
9207 if (RExC_nestroot == parno)
9210 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9211 Set_Node_Length(ender,1); /* MJD */
9217 *flagp &= ~HASWIDTH;
9220 ender = reg_node(pRExC_state, SUCCEED);
9223 ender = reg_node(pRExC_state, END);
9225 assert(!RExC_opend); /* there can only be one! */
9230 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9231 SV * const mysv_val1=sv_newmortal();
9232 SV * const mysv_val2=sv_newmortal();
9233 DEBUG_PARSE_MSG("lsbr");
9234 regprop(RExC_rx, mysv_val1, lastbr);
9235 regprop(RExC_rx, mysv_val2, ender);
9236 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9237 SvPV_nolen_const(mysv_val1),
9238 (IV)REG_NODE_NUM(lastbr),
9239 SvPV_nolen_const(mysv_val2),
9240 (IV)REG_NODE_NUM(ender),
9241 (IV)(ender - lastbr)
9244 REGTAIL(pRExC_state, lastbr, ender);
9246 if (have_branch && !SIZE_ONLY) {
9249 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9251 /* Hook the tails of the branches to the closing node. */
9252 for (br = ret; br; br = regnext(br)) {
9253 const U8 op = PL_regkind[OP(br)];
9255 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9256 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9259 else if (op == BRANCHJ) {
9260 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9261 /* for now we always disable this optimisation * /
9262 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9268 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9269 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9270 SV * const mysv_val1=sv_newmortal();
9271 SV * const mysv_val2=sv_newmortal();
9272 DEBUG_PARSE_MSG("NADA");
9273 regprop(RExC_rx, mysv_val1, ret);
9274 regprop(RExC_rx, mysv_val2, ender);
9275 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9276 SvPV_nolen_const(mysv_val1),
9277 (IV)REG_NODE_NUM(ret),
9278 SvPV_nolen_const(mysv_val2),
9279 (IV)REG_NODE_NUM(ender),
9284 if (OP(ender) == TAIL) {
9289 for ( opt= br + 1; opt < ender ; opt++ )
9291 NEXT_OFF(br)= ender - br;
9299 static const char parens[] = "=!<,>";
9301 if (paren && (p = strchr(parens, paren))) {
9302 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9303 int flag = (p - parens) > 1;
9306 node = SUSPEND, flag = 0;
9307 reginsert(pRExC_state, node,ret, depth+1);
9308 Set_Node_Cur_Length(ret);
9309 Set_Node_Offset(ret, parse_start + 1);
9311 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9315 /* Check for proper termination. */
9317 RExC_flags = oregflags;
9318 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9319 RExC_parse = oregcomp_parse;
9320 vFAIL("Unmatched (");
9323 else if (!paren && RExC_parse < RExC_end) {
9324 if (*RExC_parse == ')') {
9326 vFAIL("Unmatched )");
9329 FAIL("Junk on end of regexp"); /* "Can't happen". */
9330 assert(0); /* NOTREACHED */
9333 if (RExC_in_lookbehind) {
9334 RExC_in_lookbehind--;
9336 if (after_freeze > RExC_npar)
9337 RExC_npar = after_freeze;
9342 - regbranch - one alternative of an | operator
9344 * Implements the concatenation operator.
9347 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9351 regnode *chain = NULL;
9353 I32 flags = 0, c = 0;
9354 GET_RE_DEBUG_FLAGS_DECL;
9356 PERL_ARGS_ASSERT_REGBRANCH;
9358 DEBUG_PARSE("brnc");
9363 if (!SIZE_ONLY && RExC_extralen)
9364 ret = reganode(pRExC_state, BRANCHJ,0);
9366 ret = reg_node(pRExC_state, BRANCH);
9367 Set_Node_Length(ret, 1);
9371 if (!first && SIZE_ONLY)
9372 RExC_extralen += 1; /* BRANCHJ */
9374 *flagp = WORST; /* Tentatively. */
9377 nextchar(pRExC_state);
9378 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9380 latest = regpiece(pRExC_state, &flags,depth+1);
9381 if (latest == NULL) {
9382 if (flags & TRYAGAIN)
9386 else if (ret == NULL)
9388 *flagp |= flags&(HASWIDTH|POSTPONED);
9389 if (chain == NULL) /* First piece. */
9390 *flagp |= flags&SPSTART;
9393 REGTAIL(pRExC_state, chain, latest);
9398 if (chain == NULL) { /* Loop ran zero times. */
9399 chain = reg_node(pRExC_state, NOTHING);
9404 *flagp |= flags&SIMPLE;
9411 - regpiece - something followed by possible [*+?]
9413 * Note that the branching code sequences used for ? and the general cases
9414 * of * and + are somewhat optimized: they use the same NOTHING node as
9415 * both the endmarker for their branch list and the body of the last branch.
9416 * It might seem that this node could be dispensed with entirely, but the
9417 * endmarker role is not redundant.
9420 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9427 const char * const origparse = RExC_parse;
9429 I32 max = REG_INFTY;
9430 #ifdef RE_TRACK_PATTERN_OFFSETS
9433 const char *maxpos = NULL;
9435 /* Save the original in case we change the emitted regop to a FAIL. */
9436 regnode * const orig_emit = RExC_emit;
9438 GET_RE_DEBUG_FLAGS_DECL;
9440 PERL_ARGS_ASSERT_REGPIECE;
9442 DEBUG_PARSE("piec");
9444 ret = regatom(pRExC_state, &flags,depth+1);
9446 if (flags & TRYAGAIN)
9453 if (op == '{' && regcurly(RExC_parse)) {
9455 #ifdef RE_TRACK_PATTERN_OFFSETS
9456 parse_start = RExC_parse; /* MJD */
9458 next = RExC_parse + 1;
9459 while (isDIGIT(*next) || *next == ',') {
9468 if (*next == '}') { /* got one */
9472 min = atoi(RExC_parse);
9476 maxpos = RExC_parse;
9478 if (!max && *maxpos != '0')
9479 max = REG_INFTY; /* meaning "infinity" */
9480 else if (max >= REG_INFTY)
9481 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9483 nextchar(pRExC_state);
9484 if (max < min) { /* If can't match, warn and optimize to fail
9487 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9489 /* We can't back off the size because we have to reserve
9490 * enough space for all the things we are about to throw
9491 * away, but we can shrink it by the ammount we are about
9493 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9496 RExC_emit = orig_emit;
9498 ret = reg_node(pRExC_state, OPFAIL);
9503 if ((flags&SIMPLE)) {
9504 RExC_naughty += 2 + RExC_naughty / 2;
9505 reginsert(pRExC_state, CURLY, ret, depth+1);
9506 Set_Node_Offset(ret, parse_start+1); /* MJD */
9507 Set_Node_Cur_Length(ret);
9510 regnode * const w = reg_node(pRExC_state, WHILEM);
9513 REGTAIL(pRExC_state, ret, w);
9514 if (!SIZE_ONLY && RExC_extralen) {
9515 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9516 reginsert(pRExC_state, NOTHING,ret, depth+1);
9517 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9519 reginsert(pRExC_state, CURLYX,ret, depth+1);
9521 Set_Node_Offset(ret, parse_start+1);
9522 Set_Node_Length(ret,
9523 op == '{' ? (RExC_parse - parse_start) : 1);
9525 if (!SIZE_ONLY && RExC_extralen)
9526 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9527 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9529 RExC_whilem_seen++, RExC_extralen += 3;
9530 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9539 ARG1_SET(ret, (U16)min);
9540 ARG2_SET(ret, (U16)max);
9552 #if 0 /* Now runtime fix should be reliable. */
9554 /* if this is reinstated, don't forget to put this back into perldiag:
9556 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9558 (F) The part of the regexp subject to either the * or + quantifier
9559 could match an empty string. The {#} shows in the regular
9560 expression about where the problem was discovered.
9564 if (!(flags&HASWIDTH) && op != '?')
9565 vFAIL("Regexp *+ operand could be empty");
9568 #ifdef RE_TRACK_PATTERN_OFFSETS
9569 parse_start = RExC_parse;
9571 nextchar(pRExC_state);
9573 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9575 if (op == '*' && (flags&SIMPLE)) {
9576 reginsert(pRExC_state, STAR, ret, depth+1);
9580 else if (op == '*') {
9584 else if (op == '+' && (flags&SIMPLE)) {
9585 reginsert(pRExC_state, PLUS, ret, depth+1);
9589 else if (op == '+') {
9593 else if (op == '?') {
9598 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9599 ckWARN3reg(RExC_parse,
9600 "%.*s matches null string many times",
9601 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9605 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9606 nextchar(pRExC_state);
9607 reginsert(pRExC_state, MINMOD, ret, depth+1);
9608 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9610 #ifndef REG_ALLOW_MINMOD_SUSPEND
9613 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9615 nextchar(pRExC_state);
9616 ender = reg_node(pRExC_state, SUCCEED);
9617 REGTAIL(pRExC_state, ret, ender);
9618 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9620 ender = reg_node(pRExC_state, TAIL);
9621 REGTAIL(pRExC_state, ret, ender);
9625 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9627 vFAIL("Nested quantifiers");
9634 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9637 /* This is expected to be called by a parser routine that has recognized '\N'
9638 and needs to handle the rest. RExC_parse is expected to point at the first
9639 char following the N at the time of the call. On successful return,
9640 RExC_parse has been updated to point to just after the sequence identified
9641 by this routine, and <*flagp> has been updated.
9643 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9646 \N may begin either a named sequence, or if outside a character class, mean
9647 to match a non-newline. For non single-quoted regexes, the tokenizer has
9648 attempted to decide which, and in the case of a named sequence, converted it
9649 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9650 where c1... are the characters in the sequence. For single-quoted regexes,
9651 the tokenizer passes the \N sequence through unchanged; this code will not
9652 attempt to determine this nor expand those, instead raising a syntax error.
9653 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9654 or there is no '}', it signals that this \N occurrence means to match a
9657 Only the \N{U+...} form should occur in a character class, for the same
9658 reason that '.' inside a character class means to just match a period: it
9659 just doesn't make sense.
9661 The function raises an error (via vFAIL), and doesn't return for various
9662 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9663 success; it returns FALSE otherwise.
9665 If <valuep> is non-null, it means the caller can accept an input sequence
9666 consisting of a just a single code point; <*valuep> is set to that value
9667 if the input is such.
9669 If <node_p> is non-null it signifies that the caller can accept any other
9670 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9672 1) \N means not-a-NL: points to a newly created REG_ANY node;
9673 2) \N{}: points to a new NOTHING node;
9674 3) otherwise: points to a new EXACT node containing the resolved
9676 Note that FALSE is returned for single code point sequences if <valuep> is
9680 char * endbrace; /* '}' following the name */
9682 char *endchar; /* Points to '.' or '}' ending cur char in the input
9684 bool has_multiple_chars; /* true if the input stream contains a sequence of
9685 more than one character */
9687 GET_RE_DEBUG_FLAGS_DECL;
9689 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9693 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9695 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9696 * modifier. The other meaning does not */
9697 p = (RExC_flags & RXf_PMf_EXTENDED)
9698 ? regwhite( pRExC_state, RExC_parse )
9701 /* Disambiguate between \N meaning a named character versus \N meaning
9702 * [^\n]. The former is assumed when it can't be the latter. */
9703 if (*p != '{' || regcurly(p)) {
9706 /* no bare \N in a charclass */
9707 if (in_char_class) {
9708 vFAIL("\\N in a character class must be a named character: \\N{...}");
9712 nextchar(pRExC_state);
9713 *node_p = reg_node(pRExC_state, REG_ANY);
9714 *flagp |= HASWIDTH|SIMPLE;
9717 Set_Node_Length(*node_p, 1); /* MJD */
9721 /* Here, we have decided it should be a named character or sequence */
9723 /* The test above made sure that the next real character is a '{', but
9724 * under the /x modifier, it could be separated by space (or a comment and
9725 * \n) and this is not allowed (for consistency with \x{...} and the
9726 * tokenizer handling of \N{NAME}). */
9727 if (*RExC_parse != '{') {
9728 vFAIL("Missing braces on \\N{}");
9731 RExC_parse++; /* Skip past the '{' */
9733 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9734 || ! (endbrace == RExC_parse /* nothing between the {} */
9735 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9736 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9738 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9739 vFAIL("\\N{NAME} must be resolved by the lexer");
9742 if (endbrace == RExC_parse) { /* empty: \N{} */
9745 *node_p = reg_node(pRExC_state,NOTHING);
9747 else if (in_char_class) {
9748 if (SIZE_ONLY && in_char_class) {
9749 ckWARNreg(RExC_parse,
9750 "Ignoring zero length \\N{} in character class"
9758 nextchar(pRExC_state);
9762 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9763 RExC_parse += 2; /* Skip past the 'U+' */
9765 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9767 /* Code points are separated by dots. If none, there is only one code
9768 * point, and is terminated by the brace */
9769 has_multiple_chars = (endchar < endbrace);
9771 if (valuep && (! has_multiple_chars || in_char_class)) {
9772 /* We only pay attention to the first char of
9773 multichar strings being returned in char classes. I kinda wonder
9774 if this makes sense as it does change the behaviour
9775 from earlier versions, OTOH that behaviour was broken
9776 as well. XXX Solution is to recharacterize as
9777 [rest-of-class]|multi1|multi2... */
9779 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9780 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9781 | PERL_SCAN_DISALLOW_PREFIX
9782 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9784 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9786 /* The tokenizer should have guaranteed validity, but it's possible to
9787 * bypass it by using single quoting, so check */
9788 if (length_of_hex == 0
9789 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9791 RExC_parse += length_of_hex; /* Includes all the valid */
9792 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9793 ? UTF8SKIP(RExC_parse)
9795 /* Guard against malformed utf8 */
9796 if (RExC_parse >= endchar) {
9797 RExC_parse = endchar;
9799 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9802 if (in_char_class && has_multiple_chars) {
9803 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9806 RExC_parse = endbrace + 1;
9808 else if (! node_p || ! has_multiple_chars) {
9810 /* Here, the input is legal, but not according to the caller's
9811 * options. We fail without advancing the parse, so that the
9812 * caller can try again */
9818 /* What is done here is to convert this to a sub-pattern of the form
9819 * (?:\x{char1}\x{char2}...)
9820 * and then call reg recursively. That way, it retains its atomicness,
9821 * while not having to worry about special handling that some code
9822 * points may have. toke.c has converted the original Unicode values
9823 * to native, so that we can just pass on the hex values unchanged. We
9824 * do have to set a flag to keep recoding from happening in the
9827 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9829 char *orig_end = RExC_end;
9832 while (RExC_parse < endbrace) {
9834 /* Convert to notation the rest of the code understands */
9835 sv_catpv(substitute_parse, "\\x{");
9836 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9837 sv_catpv(substitute_parse, "}");
9839 /* Point to the beginning of the next character in the sequence. */
9840 RExC_parse = endchar + 1;
9841 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9843 sv_catpv(substitute_parse, ")");
9845 RExC_parse = SvPV(substitute_parse, len);
9847 /* Don't allow empty number */
9849 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9851 RExC_end = RExC_parse + len;
9853 /* The values are Unicode, and therefore not subject to recoding */
9854 RExC_override_recoding = 1;
9856 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9857 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9859 RExC_parse = endbrace;
9860 RExC_end = orig_end;
9861 RExC_override_recoding = 0;
9863 nextchar(pRExC_state);
9873 * It returns the code point in utf8 for the value in *encp.
9874 * value: a code value in the source encoding
9875 * encp: a pointer to an Encode object
9877 * If the result from Encode is not a single character,
9878 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9881 S_reg_recode(pTHX_ const char value, SV **encp)
9884 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9885 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9886 const STRLEN newlen = SvCUR(sv);
9887 UV uv = UNICODE_REPLACEMENT;
9889 PERL_ARGS_ASSERT_REG_RECODE;
9893 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9896 if (!newlen || numlen != newlen) {
9897 uv = UNICODE_REPLACEMENT;
9903 PERL_STATIC_INLINE U8
9904 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9908 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9914 op = get_regex_charset(RExC_flags);
9915 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9916 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9917 been, so there is no hole */
9923 PERL_STATIC_INLINE void
9924 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9926 /* This knows the details about sizing an EXACTish node, setting flags for
9927 * it (by setting <*flagp>, and potentially populating it with a single
9930 * If <len> (the length in bytes) is non-zero, this function assumes that
9931 * the node has already been populated, and just does the sizing. In this
9932 * case <code_point> should be the final code point that has already been
9933 * placed into the node. This value will be ignored except that under some
9934 * circumstances <*flagp> is set based on it.
9936 * If <len> is zero, the function assumes that the node is to contain only
9937 * the single character given by <code_point> and calculates what <len>
9938 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9939 * additionally will populate the node's STRING with <code_point>, if <len>
9940 * is 0. In both cases <*flagp> is appropriately set
9942 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9943 * folded (the latter only when the rules indicate it can match 'ss') */
9945 bool len_passed_in = cBOOL(len != 0);
9946 U8 character[UTF8_MAXBYTES_CASE+1];
9948 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9950 if (! len_passed_in) {
9953 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9956 uvchr_to_utf8( character, code_point);
9957 len = UTF8SKIP(character);
9961 || code_point != LATIN_SMALL_LETTER_SHARP_S
9962 || ASCII_FOLD_RESTRICTED
9963 || ! AT_LEAST_UNI_SEMANTICS)
9965 *character = (U8) code_point;
9970 *(character + 1) = 's';
9976 RExC_size += STR_SZ(len);
9979 RExC_emit += STR_SZ(len);
9980 STR_LEN(node) = len;
9981 if (! len_passed_in) {
9982 Copy((char *) character, STRING(node), len, char);
9988 /* A single character node is SIMPLE, except for the special-cased SHARP S
9990 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
9991 && (code_point != LATIN_SMALL_LETTER_SHARP_S
9992 || ! FOLD || ! DEPENDS_SEMANTICS))
9999 - regatom - the lowest level
10001 Try to identify anything special at the start of the pattern. If there
10002 is, then handle it as required. This may involve generating a single regop,
10003 such as for an assertion; or it may involve recursing, such as to
10004 handle a () structure.
10006 If the string doesn't start with something special then we gobble up
10007 as much literal text as we can.
10009 Once we have been able to handle whatever type of thing started the
10010 sequence, we return.
10012 Note: we have to be careful with escapes, as they can be both literal
10013 and special, and in the case of \10 and friends, context determines which.
10015 A summary of the code structure is:
10017 switch (first_byte) {
10018 cases for each special:
10019 handle this special;
10022 switch (2nd byte) {
10023 cases for each unambiguous special:
10024 handle this special;
10026 cases for each ambigous special/literal:
10028 if (special) handle here
10030 default: // unambiguously literal:
10033 default: // is a literal char
10036 create EXACTish node for literal;
10037 while (more input and node isn't full) {
10038 switch (input_byte) {
10039 cases for each special;
10040 make sure parse pointer is set so that the next call to
10041 regatom will see this special first
10042 goto loopdone; // EXACTish node terminated by prev. char
10044 append char to EXACTISH node;
10046 get next input byte;
10050 return the generated node;
10052 Specifically there are two separate switches for handling
10053 escape sequences, with the one for handling literal escapes requiring
10054 a dummy entry for all of the special escapes that are actually handled
10059 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10062 regnode *ret = NULL;
10064 char *parse_start = RExC_parse;
10066 GET_RE_DEBUG_FLAGS_DECL;
10067 DEBUG_PARSE("atom");
10068 *flagp = WORST; /* Tentatively. */
10070 PERL_ARGS_ASSERT_REGATOM;
10073 switch ((U8)*RExC_parse) {
10075 RExC_seen_zerolen++;
10076 nextchar(pRExC_state);
10077 if (RExC_flags & RXf_PMf_MULTILINE)
10078 ret = reg_node(pRExC_state, MBOL);
10079 else if (RExC_flags & RXf_PMf_SINGLELINE)
10080 ret = reg_node(pRExC_state, SBOL);
10082 ret = reg_node(pRExC_state, BOL);
10083 Set_Node_Length(ret, 1); /* MJD */
10086 nextchar(pRExC_state);
10088 RExC_seen_zerolen++;
10089 if (RExC_flags & RXf_PMf_MULTILINE)
10090 ret = reg_node(pRExC_state, MEOL);
10091 else if (RExC_flags & RXf_PMf_SINGLELINE)
10092 ret = reg_node(pRExC_state, SEOL);
10094 ret = reg_node(pRExC_state, EOL);
10095 Set_Node_Length(ret, 1); /* MJD */
10098 nextchar(pRExC_state);
10099 if (RExC_flags & RXf_PMf_SINGLELINE)
10100 ret = reg_node(pRExC_state, SANY);
10102 ret = reg_node(pRExC_state, REG_ANY);
10103 *flagp |= HASWIDTH|SIMPLE;
10105 Set_Node_Length(ret, 1); /* MJD */
10109 char * const oregcomp_parse = ++RExC_parse;
10110 ret = regclass(pRExC_state, flagp,depth+1);
10111 if (*RExC_parse != ']') {
10112 RExC_parse = oregcomp_parse;
10113 vFAIL("Unmatched [");
10115 nextchar(pRExC_state);
10116 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10120 nextchar(pRExC_state);
10121 ret = reg(pRExC_state, 1, &flags,depth+1);
10123 if (flags & TRYAGAIN) {
10124 if (RExC_parse == RExC_end) {
10125 /* Make parent create an empty node if needed. */
10126 *flagp |= TRYAGAIN;
10133 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10137 if (flags & TRYAGAIN) {
10138 *flagp |= TRYAGAIN;
10141 vFAIL("Internal urp");
10142 /* Supposed to be caught earlier. */
10148 vFAIL("Quantifier follows nothing");
10153 This switch handles escape sequences that resolve to some kind
10154 of special regop and not to literal text. Escape sequnces that
10155 resolve to literal text are handled below in the switch marked
10158 Every entry in this switch *must* have a corresponding entry
10159 in the literal escape switch. However, the opposite is not
10160 required, as the default for this switch is to jump to the
10161 literal text handling code.
10163 switch ((U8)*++RExC_parse) {
10164 /* Special Escapes */
10166 RExC_seen_zerolen++;
10167 ret = reg_node(pRExC_state, SBOL);
10169 goto finish_meta_pat;
10171 ret = reg_node(pRExC_state, GPOS);
10172 RExC_seen |= REG_SEEN_GPOS;
10174 goto finish_meta_pat;
10176 RExC_seen_zerolen++;
10177 ret = reg_node(pRExC_state, KEEPS);
10179 /* XXX:dmq : disabling in-place substitution seems to
10180 * be necessary here to avoid cases of memory corruption, as
10181 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10183 RExC_seen |= REG_SEEN_LOOKBEHIND;
10184 goto finish_meta_pat;
10186 ret = reg_node(pRExC_state, SEOL);
10188 RExC_seen_zerolen++; /* Do not optimize RE away */
10189 goto finish_meta_pat;
10191 ret = reg_node(pRExC_state, EOS);
10193 RExC_seen_zerolen++; /* Do not optimize RE away */
10194 goto finish_meta_pat;
10196 ret = reg_node(pRExC_state, CANY);
10197 RExC_seen |= REG_SEEN_CANY;
10198 *flagp |= HASWIDTH|SIMPLE;
10199 goto finish_meta_pat;
10201 ret = reg_node(pRExC_state, CLUMP);
10202 *flagp |= HASWIDTH;
10203 goto finish_meta_pat;
10205 op = ALNUM + get_regex_charset(RExC_flags);
10206 if (op > ALNUMA) { /* /aa is same as /a */
10209 ret = reg_node(pRExC_state, op);
10210 *flagp |= HASWIDTH|SIMPLE;
10211 goto finish_meta_pat;
10213 op = NALNUM + get_regex_charset(RExC_flags);
10214 if (op > NALNUMA) { /* /aa is same as /a */
10217 ret = reg_node(pRExC_state, op);
10218 *flagp |= HASWIDTH|SIMPLE;
10219 goto finish_meta_pat;
10221 RExC_seen_zerolen++;
10222 RExC_seen |= REG_SEEN_LOOKBEHIND;
10223 op = BOUND + get_regex_charset(RExC_flags);
10224 if (op > BOUNDA) { /* /aa is same as /a */
10227 ret = reg_node(pRExC_state, op);
10228 FLAGS(ret) = get_regex_charset(RExC_flags);
10230 goto finish_meta_pat;
10232 RExC_seen_zerolen++;
10233 RExC_seen |= REG_SEEN_LOOKBEHIND;
10234 op = NBOUND + get_regex_charset(RExC_flags);
10235 if (op > NBOUNDA) { /* /aa is same as /a */
10238 ret = reg_node(pRExC_state, op);
10239 FLAGS(ret) = get_regex_charset(RExC_flags);
10241 goto finish_meta_pat;
10243 op = SPACE + get_regex_charset(RExC_flags);
10244 if (op > SPACEA) { /* /aa is same as /a */
10247 ret = reg_node(pRExC_state, op);
10248 *flagp |= HASWIDTH|SIMPLE;
10249 goto finish_meta_pat;
10251 op = NSPACE + get_regex_charset(RExC_flags);
10252 if (op > NSPACEA) { /* /aa is same as /a */
10255 ret = reg_node(pRExC_state, op);
10256 *flagp |= HASWIDTH|SIMPLE;
10257 goto finish_meta_pat;
10265 U8 offset = get_regex_charset(RExC_flags);
10266 if (offset == REGEX_UNICODE_CHARSET) {
10267 offset = REGEX_DEPENDS_CHARSET;
10269 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10270 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10274 ret = reg_node(pRExC_state, op);
10275 *flagp |= HASWIDTH|SIMPLE;
10276 goto finish_meta_pat;
10278 ret = reg_node(pRExC_state, LNBREAK);
10279 *flagp |= HASWIDTH;
10280 goto finish_meta_pat;
10282 ret = reg_node(pRExC_state, HORIZWS);
10283 *flagp |= HASWIDTH|SIMPLE;
10284 goto finish_meta_pat;
10286 ret = reg_node(pRExC_state, NHORIZWS);
10287 *flagp |= HASWIDTH|SIMPLE;
10288 goto finish_meta_pat;
10290 ret = reg_node(pRExC_state, VERTWS);
10291 *flagp |= HASWIDTH|SIMPLE;
10292 goto finish_meta_pat;
10294 ret = reg_node(pRExC_state, NVERTWS);
10295 *flagp |= HASWIDTH|SIMPLE;
10297 nextchar(pRExC_state);
10298 Set_Node_Length(ret, 2); /* MJD */
10303 char* const oldregxend = RExC_end;
10305 char* parse_start = RExC_parse - 2;
10308 if (RExC_parse[1] == '{') {
10309 /* a lovely hack--pretend we saw [\pX] instead */
10310 RExC_end = strchr(RExC_parse, '}');
10312 const U8 c = (U8)*RExC_parse;
10314 RExC_end = oldregxend;
10315 vFAIL2("Missing right brace on \\%c{}", c);
10320 RExC_end = RExC_parse + 2;
10321 if (RExC_end > oldregxend)
10322 RExC_end = oldregxend;
10326 ret = regclass(pRExC_state, flagp,depth+1);
10328 RExC_end = oldregxend;
10331 Set_Node_Offset(ret, parse_start + 2);
10332 Set_Node_Cur_Length(ret);
10333 nextchar(pRExC_state);
10337 /* Handle \N and \N{NAME} with multiple code points here and not
10338 * below because it can be multicharacter. join_exact() will join
10339 * them up later on. Also this makes sure that things like
10340 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10341 * The options to the grok function call causes it to fail if the
10342 * sequence is just a single code point. We then go treat it as
10343 * just another character in the current EXACT node, and hence it
10344 * gets uniform treatment with all the other characters. The
10345 * special treatment for quantifiers is not needed for such single
10346 * character sequences */
10348 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10353 case 'k': /* Handle \k<NAME> and \k'NAME' */
10356 char ch= RExC_parse[1];
10357 if (ch != '<' && ch != '\'' && ch != '{') {
10359 vFAIL2("Sequence %.2s... not terminated",parse_start);
10361 /* this pretty much dupes the code for (?P=...) in reg(), if
10362 you change this make sure you change that */
10363 char* name_start = (RExC_parse += 2);
10365 SV *sv_dat = reg_scan_name(pRExC_state,
10366 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10367 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10368 if (RExC_parse == name_start || *RExC_parse != ch)
10369 vFAIL2("Sequence %.3s... not terminated",parse_start);
10372 num = add_data( pRExC_state, 1, "S" );
10373 RExC_rxi->data->data[num]=(void*)sv_dat;
10374 SvREFCNT_inc_simple_void(sv_dat);
10378 ret = reganode(pRExC_state,
10381 : (ASCII_FOLD_RESTRICTED)
10383 : (AT_LEAST_UNI_SEMANTICS)
10389 *flagp |= HASWIDTH;
10391 /* override incorrect value set in reganode MJD */
10392 Set_Node_Offset(ret, parse_start+1);
10393 Set_Node_Cur_Length(ret); /* MJD */
10394 nextchar(pRExC_state);
10400 case '1': case '2': case '3': case '4':
10401 case '5': case '6': case '7': case '8': case '9':
10404 bool isg = *RExC_parse == 'g';
10409 if (*RExC_parse == '{') {
10413 if (*RExC_parse == '-') {
10417 if (hasbrace && !isDIGIT(*RExC_parse)) {
10418 if (isrel) RExC_parse--;
10420 goto parse_named_seq;
10422 num = atoi(RExC_parse);
10423 if (isg && num == 0)
10424 vFAIL("Reference to invalid group 0");
10426 num = RExC_npar - num;
10428 vFAIL("Reference to nonexistent or unclosed group");
10430 if (!isg && num > 9 && num >= RExC_npar)
10431 /* Probably a character specified in octal, e.g. \35 */
10434 char * const parse_start = RExC_parse - 1; /* MJD */
10435 while (isDIGIT(*RExC_parse))
10437 if (parse_start == RExC_parse - 1)
10438 vFAIL("Unterminated \\g... pattern");
10440 if (*RExC_parse != '}')
10441 vFAIL("Unterminated \\g{...} pattern");
10445 if (num > (I32)RExC_rx->nparens)
10446 vFAIL("Reference to nonexistent group");
10449 ret = reganode(pRExC_state,
10452 : (ASCII_FOLD_RESTRICTED)
10454 : (AT_LEAST_UNI_SEMANTICS)
10460 *flagp |= HASWIDTH;
10462 /* override incorrect value set in reganode MJD */
10463 Set_Node_Offset(ret, parse_start+1);
10464 Set_Node_Cur_Length(ret); /* MJD */
10466 nextchar(pRExC_state);
10471 if (RExC_parse >= RExC_end)
10472 FAIL("Trailing \\");
10475 /* Do not generate "unrecognized" warnings here, we fall
10476 back into the quick-grab loop below */
10483 if (RExC_flags & RXf_PMf_EXTENDED) {
10484 if ( reg_skipcomment( pRExC_state ) )
10491 parse_start = RExC_parse - 1;
10500 #define MAX_NODE_STRING_SIZE 127
10501 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10503 U8 upper_parse = MAX_NODE_STRING_SIZE;
10506 bool next_is_quantifier;
10507 char * oldp = NULL;
10509 /* If a folding node contains only code points that don't
10510 * participate in folds, it can be changed into an EXACT node,
10511 * which allows the optimizer more things to look for */
10515 node_type = compute_EXACTish(pRExC_state);
10516 ret = reg_node(pRExC_state, node_type);
10518 /* In pass1, folded, we use a temporary buffer instead of the
10519 * actual node, as the node doesn't exist yet */
10520 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10526 /* We do the EXACTFish to EXACT node only if folding, and not if in
10527 * locale, as whether a character folds or not isn't known until
10529 maybe_exact = FOLD && ! LOC;
10531 /* XXX The node can hold up to 255 bytes, yet this only goes to
10532 * 127. I (khw) do not know why. Keeping it somewhat less than
10533 * 255 allows us to not have to worry about overflow due to
10534 * converting to utf8 and fold expansion, but that value is
10535 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10536 * split up by this limit into a single one using the real max of
10537 * 255. Even at 127, this breaks under rare circumstances. If
10538 * folding, we do not want to split a node at a character that is a
10539 * non-final in a multi-char fold, as an input string could just
10540 * happen to want to match across the node boundary. The join
10541 * would solve that problem if the join actually happens. But a
10542 * series of more than two nodes in a row each of 127 would cause
10543 * the first join to succeed to get to 254, but then there wouldn't
10544 * be room for the next one, which could at be one of those split
10545 * multi-char folds. I don't know of any fool-proof solution. One
10546 * could back off to end with only a code point that isn't such a
10547 * non-final, but it is possible for there not to be any in the
10549 for (p = RExC_parse - 1;
10550 len < upper_parse && p < RExC_end;
10555 if (RExC_flags & RXf_PMf_EXTENDED)
10556 p = regwhite( pRExC_state, p );
10567 /* Literal Escapes Switch
10569 This switch is meant to handle escape sequences that
10570 resolve to a literal character.
10572 Every escape sequence that represents something
10573 else, like an assertion or a char class, is handled
10574 in the switch marked 'Special Escapes' above in this
10575 routine, but also has an entry here as anything that
10576 isn't explicitly mentioned here will be treated as
10577 an unescaped equivalent literal.
10580 switch ((U8)*++p) {
10581 /* These are all the special escapes. */
10582 case 'A': /* Start assertion */
10583 case 'b': case 'B': /* Word-boundary assertion*/
10584 case 'C': /* Single char !DANGEROUS! */
10585 case 'd': case 'D': /* digit class */
10586 case 'g': case 'G': /* generic-backref, pos assertion */
10587 case 'h': case 'H': /* HORIZWS */
10588 case 'k': case 'K': /* named backref, keep marker */
10589 case 'p': case 'P': /* Unicode property */
10590 case 'R': /* LNBREAK */
10591 case 's': case 'S': /* space class */
10592 case 'v': case 'V': /* VERTWS */
10593 case 'w': case 'W': /* word class */
10594 case 'X': /* eXtended Unicode "combining character sequence" */
10595 case 'z': case 'Z': /* End of line/string assertion */
10599 /* Anything after here is an escape that resolves to a
10600 literal. (Except digits, which may or may not)
10606 case 'N': /* Handle a single-code point named character. */
10607 /* The options cause it to fail if a multiple code
10608 * point sequence. Handle those in the switch() above
10610 RExC_parse = p + 1;
10611 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10612 flagp, depth, FALSE))
10614 RExC_parse = p = oldp;
10618 if (ender > 0xff) {
10635 ender = ASCII_TO_NATIVE('\033');
10639 ender = ASCII_TO_NATIVE('\007');
10644 STRLEN brace_len = len;
10646 const char* error_msg;
10648 bool valid = grok_bslash_o(p,
10655 RExC_parse = p; /* going to die anyway; point
10656 to exact spot of failure */
10663 if (PL_encoding && ender < 0x100) {
10664 goto recode_encoding;
10666 if (ender > 0xff) {
10673 STRLEN brace_len = len;
10675 const char* error_msg;
10677 bool valid = grok_bslash_x(p,
10684 RExC_parse = p; /* going to die anyway; point
10685 to exact spot of failure */
10691 if (PL_encoding && ender < 0x100) {
10692 goto recode_encoding;
10694 if (ender > 0xff) {
10701 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10703 case '0': case '1': case '2': case '3':case '4':
10704 case '5': case '6': case '7':
10706 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10708 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10710 ender = grok_oct(p, &numlen, &flags, NULL);
10711 if (ender > 0xff) {
10720 if (PL_encoding && ender < 0x100)
10721 goto recode_encoding;
10724 if (! RExC_override_recoding) {
10725 SV* enc = PL_encoding;
10726 ender = reg_recode((const char)(U8)ender, &enc);
10727 if (!enc && SIZE_ONLY)
10728 ckWARNreg(p, "Invalid escape in the specified encoding");
10734 FAIL("Trailing \\");
10737 if (!SIZE_ONLY&& isALNUMC(*p)) {
10738 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10740 goto normal_default;
10744 /* Currently we don't warn when the lbrace is at the start
10745 * of a construct. This catches it in the middle of a
10746 * literal string, or when its the first thing after
10747 * something like "\b" */
10749 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10751 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10756 if (UTF8_IS_START(*p) && UTF) {
10758 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10759 &numlen, UTF8_ALLOW_DEFAULT);
10765 } /* End of switch on the literal */
10767 /* Here, have looked at the literal character and <ender>
10768 * contains its ordinal, <p> points to the character after it
10771 if ( RExC_flags & RXf_PMf_EXTENDED)
10772 p = regwhite( pRExC_state, p );
10774 /* If the next thing is a quantifier, it applies to this
10775 * character only, which means that this character has to be in
10776 * its own node and can't just be appended to the string in an
10777 * existing node, so if there are already other characters in
10778 * the node, close the node with just them, and set up to do
10779 * this character again next time through, when it will be the
10780 * only thing in its new node */
10781 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10789 /* See comments for join_exact() as to why we fold
10790 * this non-UTF at compile time */
10791 || (node_type == EXACTFU
10792 && ender == LATIN_SMALL_LETTER_SHARP_S))
10796 /* Prime the casefolded buffer. Locale rules, which
10797 * apply only to code points < 256, aren't known until
10798 * execution, so for them, just output the original
10799 * character using utf8. If we start to fold non-UTF
10800 * patterns, be sure to update join_exact() */
10801 if (LOC && ender < 256) {
10802 if (UNI_IS_INVARIANT(ender)) {
10806 *s = UTF8_TWO_BYTE_HI(ender);
10807 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10812 UV folded = _to_uni_fold_flags(
10817 | ((LOC) ? FOLD_FLAGS_LOCALE
10818 : (ASCII_FOLD_RESTRICTED)
10819 ? FOLD_FLAGS_NOMIX_ASCII
10823 /* If this node only contains non-folding code
10824 * points so far, see if this new one is also
10827 if (folded != ender) {
10828 maybe_exact = FALSE;
10831 /* Here the fold is the original; we have
10832 * to check further to see if anything
10834 if (! PL_utf8_foldable) {
10835 SV* swash = swash_init("utf8",
10837 &PL_sv_undef, 1, 0);
10839 _get_swash_invlist(swash);
10840 SvREFCNT_dec(swash);
10842 if (_invlist_contains_cp(PL_utf8_foldable,
10845 maybe_exact = FALSE;
10853 /* The loop increments <len> each time, as all but this
10854 * path (and the one just below for UTF) through it add
10855 * a single byte to the EXACTish node. But this one
10856 * has changed len to be the correct final value, so
10857 * subtract one to cancel out the increment that
10859 len += foldlen - 1;
10863 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10867 const STRLEN unilen = reguni(pRExC_state, ender, s);
10873 /* See comment just above for - 1 */
10877 REGC((char)ender, s++);
10880 if (next_is_quantifier) {
10882 /* Here, the next input is a quantifier, and to get here,
10883 * the current character is the only one in the node.
10884 * Also, here <len> doesn't include the final byte for this
10890 } /* End of loop through literal characters */
10892 /* Here we have either exhausted the input or ran out of room in
10893 * the node. (If we encountered a character that can't be in the
10894 * node, transfer is made directly to <loopdone>, and so we
10895 * wouldn't have fallen off the end of the loop.) In the latter
10896 * case, we artificially have to split the node into two, because
10897 * we just don't have enough space to hold everything. This
10898 * creates a problem if the final character participates in a
10899 * multi-character fold in the non-final position, as a match that
10900 * should have occurred won't, due to the way nodes are matched,
10901 * and our artificial boundary. So back off until we find a non-
10902 * problematic character -- one that isn't at the beginning or
10903 * middle of such a fold. (Either it doesn't participate in any
10904 * folds, or appears only in the final position of all the folds it
10905 * does participate in.) A better solution with far fewer false
10906 * positives, and that would fill the nodes more completely, would
10907 * be to actually have available all the multi-character folds to
10908 * test against, and to back-off only far enough to be sure that
10909 * this node isn't ending with a partial one. <upper_parse> is set
10910 * further below (if we need to reparse the node) to include just
10911 * up through that final non-problematic character that this code
10912 * identifies, so when it is set to less than the full node, we can
10913 * skip the rest of this */
10914 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10916 const STRLEN full_len = len;
10918 assert(len >= MAX_NODE_STRING_SIZE);
10920 /* Here, <s> points to the final byte of the final character.
10921 * Look backwards through the string until find a non-
10922 * problematic character */
10926 /* These two have no multi-char folds to non-UTF characters
10928 if (ASCII_FOLD_RESTRICTED || LOC) {
10932 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10936 if (! PL_NonL1NonFinalFold) {
10937 PL_NonL1NonFinalFold = _new_invlist_C_array(
10938 NonL1_Perl_Non_Final_Folds_invlist);
10941 /* Point to the first byte of the final character */
10942 s = (char *) utf8_hop((U8 *) s, -1);
10944 while (s >= s0) { /* Search backwards until find
10945 non-problematic char */
10946 if (UTF8_IS_INVARIANT(*s)) {
10948 /* There are no ascii characters that participate
10949 * in multi-char folds under /aa. In EBCDIC, the
10950 * non-ascii invariants are all control characters,
10951 * so don't ever participate in any folds. */
10952 if (ASCII_FOLD_RESTRICTED
10953 || ! IS_NON_FINAL_FOLD(*s))
10958 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10960 /* No Latin1 characters participate in multi-char
10961 * folds under /l */
10963 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10969 else if (! _invlist_contains_cp(
10970 PL_NonL1NonFinalFold,
10971 valid_utf8_to_uvchr((U8 *) s, NULL)))
10976 /* Here, the current character is problematic in that
10977 * it does occur in the non-final position of some
10978 * fold, so try the character before it, but have to
10979 * special case the very first byte in the string, so
10980 * we don't read outside the string */
10981 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10982 } /* End of loop backwards through the string */
10984 /* If there were only problematic characters in the string,
10985 * <s> will point to before s0, in which case the length
10986 * should be 0, otherwise include the length of the
10987 * non-problematic character just found */
10988 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10991 /* Here, have found the final character, if any, that is
10992 * non-problematic as far as ending the node without splitting
10993 * it across a potential multi-char fold. <len> contains the
10994 * number of bytes in the node up-to and including that
10995 * character, or is 0 if there is no such character, meaning
10996 * the whole node contains only problematic characters. In
10997 * this case, give up and just take the node as-is. We can't
11003 /* Here, the node does contain some characters that aren't
11004 * problematic. If one such is the final character in the
11005 * node, we are done */
11006 if (len == full_len) {
11009 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11011 /* If the final character is problematic, but the
11012 * penultimate is not, back-off that last character to
11013 * later start a new node with it */
11018 /* Here, the final non-problematic character is earlier
11019 * in the input than the penultimate character. What we do
11020 * is reparse from the beginning, going up only as far as
11021 * this final ok one, thus guaranteeing that the node ends
11022 * in an acceptable character. The reason we reparse is
11023 * that we know how far in the character is, but we don't
11024 * know how to correlate its position with the input parse.
11025 * An alternate implementation would be to build that
11026 * correlation as we go along during the original parse,
11027 * but that would entail extra work for every node, whereas
11028 * this code gets executed only when the string is too
11029 * large for the node, and the final two characters are
11030 * problematic, an infrequent occurrence. Yet another
11031 * possible strategy would be to save the tail of the
11032 * string, and the next time regatom is called, initialize
11033 * with that. The problem with this is that unless you
11034 * back off one more character, you won't be guaranteed
11035 * regatom will get called again, unless regbranch,
11036 * regpiece ... are also changed. If you do back off that
11037 * extra character, so that there is input guaranteed to
11038 * force calling regatom, you can't handle the case where
11039 * just the first character in the node is acceptable. I
11040 * (khw) decided to try this method which doesn't have that
11041 * pitfall; if performance issues are found, we can do a
11042 * combination of the current approach plus that one */
11048 } /* End of verifying node ends with an appropriate char */
11050 loopdone: /* Jumped to when encounters something that shouldn't be in
11053 /* If 'maybe_exact' is still set here, means there are no
11054 * code points in the node that participate in folds */
11055 if (FOLD && maybe_exact) {
11059 /* I (khw) don't know if you can get here with zero length, but the
11060 * old code handled this situation by creating a zero-length EXACT
11061 * node. Might as well be NOTHING instead */
11066 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11069 RExC_parse = p - 1;
11070 Set_Node_Cur_Length(ret); /* MJD */
11071 nextchar(pRExC_state);
11073 /* len is STRLEN which is unsigned, need to copy to signed */
11076 vFAIL("Internal disaster");
11079 } /* End of label 'defchar:' */
11081 } /* End of giant switch on input character */
11087 S_regwhite( RExC_state_t *pRExC_state, char *p )
11089 const char *e = RExC_end;
11091 PERL_ARGS_ASSERT_REGWHITE;
11096 else if (*p == '#') {
11099 if (*p++ == '\n') {
11105 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11113 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11114 Character classes ([:foo:]) can also be negated ([:^foo:]).
11115 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11116 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11117 but trigger failures because they are currently unimplemented. */
11119 #define POSIXCC_DONE(c) ((c) == ':')
11120 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11121 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11124 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11127 I32 namedclass = OOB_NAMEDCLASS;
11129 PERL_ARGS_ASSERT_REGPPOSIXCC;
11131 if (value == '[' && RExC_parse + 1 < RExC_end &&
11132 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11133 POSIXCC(UCHARAT(RExC_parse))) {
11134 const char c = UCHARAT(RExC_parse);
11135 char* const s = RExC_parse++;
11137 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11139 if (RExC_parse == RExC_end)
11140 /* Grandfather lone [:, [=, [. */
11143 const char* const t = RExC_parse++; /* skip over the c */
11146 if (UCHARAT(RExC_parse) == ']') {
11147 const char *posixcc = s + 1;
11148 RExC_parse++; /* skip over the ending ] */
11151 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11152 const I32 skip = t - posixcc;
11154 /* Initially switch on the length of the name. */
11157 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11158 namedclass = ANYOF_WORDCHAR;
11161 /* Names all of length 5. */
11162 /* alnum alpha ascii blank cntrl digit graph lower
11163 print punct space upper */
11164 /* Offset 4 gives the best switch position. */
11165 switch (posixcc[4]) {
11167 if (memEQ(posixcc, "alph", 4)) /* alpha */
11168 namedclass = ANYOF_ALPHA;
11171 if (memEQ(posixcc, "spac", 4)) /* space */
11172 namedclass = ANYOF_PSXSPC;
11175 if (memEQ(posixcc, "grap", 4)) /* graph */
11176 namedclass = ANYOF_GRAPH;
11179 if (memEQ(posixcc, "asci", 4)) /* ascii */
11180 namedclass = ANYOF_ASCII;
11183 if (memEQ(posixcc, "blan", 4)) /* blank */
11184 namedclass = ANYOF_BLANK;
11187 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11188 namedclass = ANYOF_CNTRL;
11191 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11192 namedclass = ANYOF_ALNUMC;
11195 if (memEQ(posixcc, "lowe", 4)) /* lower */
11196 namedclass = ANYOF_LOWER;
11197 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11198 namedclass = ANYOF_UPPER;
11201 if (memEQ(posixcc, "digi", 4)) /* digit */
11202 namedclass = ANYOF_DIGIT;
11203 else if (memEQ(posixcc, "prin", 4)) /* print */
11204 namedclass = ANYOF_PRINT;
11205 else if (memEQ(posixcc, "punc", 4)) /* punct */
11206 namedclass = ANYOF_PUNCT;
11211 if (memEQ(posixcc, "xdigit", 6))
11212 namedclass = ANYOF_XDIGIT;
11216 if (namedclass == OOB_NAMEDCLASS)
11217 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11220 /* The #defines are structured so each complement is +1 to
11221 * the normal one */
11225 assert (posixcc[skip] == ':');
11226 assert (posixcc[skip+1] == ']');
11227 } else if (!SIZE_ONLY) {
11228 /* [[=foo=]] and [[.foo.]] are still future. */
11230 /* adjust RExC_parse so the warning shows after
11231 the class closes */
11232 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11234 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11237 /* Maternal grandfather:
11238 * "[:" ending in ":" but not in ":]" */
11248 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11252 PERL_ARGS_ASSERT_CHECKPOSIXCC;
11254 if (POSIXCC(UCHARAT(RExC_parse))) {
11255 const char *s = RExC_parse;
11256 const char c = *s++;
11258 while (isALNUM(*s))
11260 if (*s && c == *s && s[1] == ']') {
11262 "POSIX syntax [%c %c] belongs inside character classes",
11265 /* [[=foo=]] and [[.foo.]] are still future. */
11266 if (POSIXCC_NOTYET(c)) {
11267 /* adjust RExC_parse so the error shows after
11268 the class closes */
11269 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11271 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11277 /* Generate the code to add a full posix character <class> to the bracketed
11278 * character class given by <node>. (<node> is needed only under locale rules)
11279 * destlist is the inversion list for non-locale rules that this class is
11281 * sourcelist is the ASCII-range inversion list to add under /a rules
11282 * Xsourcelist is the full Unicode range list to use otherwise. */
11283 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11285 SV* scratch_list = NULL; \
11287 /* Set this class in the node for runtime matching */ \
11288 ANYOF_CLASS_SET(node, class); \
11290 /* For above Latin1 code points, we use the full Unicode range */ \
11291 _invlist_intersection(PL_AboveLatin1, \
11294 /* And set the output to it, adding instead if there already is an \
11295 * output. Checking if <destlist> is NULL first saves an extra \
11296 * clone. Its reference count will be decremented at the next \
11297 * union, etc, or if this is the only instance, at the end of the \
11299 if (! destlist) { \
11300 destlist = scratch_list; \
11303 _invlist_union(destlist, scratch_list, &destlist); \
11304 SvREFCNT_dec(scratch_list); \
11308 /* For non-locale, just add it to any existing list */ \
11309 _invlist_union(destlist, \
11310 (AT_LEAST_ASCII_RESTRICTED) \
11316 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11318 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11320 SV* scratch_list = NULL; \
11321 ANYOF_CLASS_SET(node, class); \
11322 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11323 if (! destlist) { \
11324 destlist = scratch_list; \
11327 _invlist_union(destlist, scratch_list, &destlist); \
11328 SvREFCNT_dec(scratch_list); \
11332 _invlist_union_complement_2nd(destlist, \
11333 (AT_LEAST_ASCII_RESTRICTED) \
11337 /* Under /d, everything in the upper half of the Latin1 range \
11338 * matches this complement */ \
11339 if (DEPENDS_SEMANTICS) { \
11340 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11344 /* Generate the code to add a posix character <class> to the bracketed
11345 * character class given by <node>. (<node> is needed only under locale rules)
11346 * destlist is the inversion list for non-locale rules that this class is
11348 * sourcelist is the ASCII-range inversion list to add under /a rules
11349 * l1_sourcelist is the Latin1 range list to use otherwise.
11350 * Xpropertyname is the name to add to <run_time_list> of the property to
11351 * specify the code points above Latin1 that will have to be
11352 * determined at run-time
11353 * run_time_list is a SV* that contains text names of properties that are to
11354 * be computed at run time. This concatenates <Xpropertyname>
11355 * to it, appropriately
11356 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11358 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11359 l1_sourcelist, Xpropertyname, run_time_list) \
11360 /* First, resolve whether to use the ASCII-only list or the L1 \
11362 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11363 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11364 Xpropertyname, run_time_list)
11366 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11367 Xpropertyname, run_time_list) \
11368 /* If not /a matching, there are going to be code points we will have \
11369 * to defer to runtime to look-up */ \
11370 if (! AT_LEAST_ASCII_RESTRICTED) { \
11371 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11374 ANYOF_CLASS_SET(node, class); \
11377 _invlist_union(destlist, sourcelist, &destlist); \
11380 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11381 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11383 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11384 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11385 if (AT_LEAST_ASCII_RESTRICTED) { \
11386 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11389 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11390 matches_above_unicode = TRUE; \
11392 ANYOF_CLASS_SET(node, namedclass); \
11395 SV* scratch_list = NULL; \
11396 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11397 if (! destlist) { \
11398 destlist = scratch_list; \
11401 _invlist_union(destlist, scratch_list, &destlist); \
11402 SvREFCNT_dec(scratch_list); \
11404 if (DEPENDS_SEMANTICS) { \
11405 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11410 /* The names of properties whose definitions are not known at compile time are
11411 * stored in this SV, after a constant heading. So if the length has been
11412 * changed since initialization, then there is a run-time definition. */
11413 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11415 /* This converts the named class defined in regcomp.h to its equivalent class
11416 * number defined in handy.h. */
11417 #define namedclass_to_classnum(class) ((class) / 2)
11420 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11422 /* parse a bracketed class specification. Most of these will produce an ANYOF node;
11423 * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11424 * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
11425 * multi-character folds: it will be rewritten following the paradigm of
11426 * this example, where the <multi-fold>s are characters which fold to
11427 * multiple character sequences:
11428 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11429 * gets effectively rewritten as:
11430 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11431 * reg() gets called (recursively) on the rewritten version, and this
11432 * function will return what it constructs. (Actually the <multi-fold>s
11433 * aren't physically removed from the [abcdefghi], it's just that they are
11434 * ignored in the recursion by means of a a flag:
11435 * <RExC_in_multi_char_class>.)
11437 * ANYOF nodes contain a bit map for the first 256 characters, with the
11438 * corresponding bit set if that character is in the list. For characters
11439 * above 255, a range list or swash is used. There are extra bits for \w,
11440 * etc. in locale ANYOFs, as what these match is not determinable at
11445 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11447 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11450 IV namedclass = OOB_NAMEDCLASS;
11451 char *rangebegin = NULL;
11452 bool need_class = 0;
11453 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
11455 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11456 than just initialized. */
11457 SV* properties = NULL; /* Code points that match \p{} \P{} */
11458 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11459 extended beyond the Latin1 range */
11460 UV element_count = 0; /* Number of distinct elements in the class.
11461 Optimizations may be possible if this is tiny */
11462 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11463 character; used under /i */
11466 /* Unicode properties are stored in a swash; this holds the current one
11467 * being parsed. If this swash is the only above-latin1 component of the
11468 * character class, an optimization is to pass it directly on to the
11469 * execution engine. Otherwise, it is set to NULL to indicate that there
11470 * are other things in the class that have to be dealt with at execution
11472 SV* swash = NULL; /* Code points that match \p{} \P{} */
11474 /* Set if a component of this character class is user-defined; just passed
11475 * on to the engine */
11476 bool has_user_defined_property = FALSE;
11478 /* inversion list of code points this node matches only when the target
11479 * string is in UTF-8. (Because is under /d) */
11480 SV* depends_list = NULL;
11482 /* inversion list of code points this node matches. For much of the
11483 * function, it includes only those that match regardless of the utf8ness
11484 * of the target string */
11485 SV* cp_list = NULL;
11488 /* In a range, counts how many 0-2 of the ends of it came from literals,
11489 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11490 UV literal_endpoint = 0;
11492 bool invert = FALSE; /* Is this class to be complemented */
11494 /* Is there any thing like \W or [:^digit:] that matches above the legal
11495 * Unicode range? */
11496 bool runtime_posix_matches_above_Unicode = FALSE;
11498 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11499 case we need to change the emitted regop to an EXACT. */
11500 const char * orig_parse = RExC_parse;
11501 const I32 orig_size = RExC_size;
11502 GET_RE_DEBUG_FLAGS_DECL;
11504 PERL_ARGS_ASSERT_REGCLASS;
11506 PERL_UNUSED_ARG(depth);
11509 DEBUG_PARSE("clas");
11511 /* Assume we are going to generate an ANYOF node. */
11512 ret = reganode(pRExC_state, ANYOF, 0);
11515 ANYOF_FLAGS(ret) = 0;
11518 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11520 if (! RExC_in_multi_char_class) {
11524 /* We have decided to not allow multi-char folds in inverted
11525 * character classes, due to the confusion that can happen,
11526 * especially with classes that are designed for a non-Unicode
11527 * world: You have the peculiar case that:
11528 "s s" =~ /^[^\xDF]+$/i => Y
11529 "ss" =~ /^[^\xDF]+$/i => N
11531 * See [perl #89750] */
11532 allow_full_fold = FALSE;
11537 RExC_size += ANYOF_SKIP;
11538 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11541 RExC_emit += ANYOF_SKIP;
11543 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11545 listsv = newSVpvs("# comment\n");
11546 initial_listsv_len = SvCUR(listsv);
11549 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11551 if (!SIZE_ONLY && POSIXCC(nextvalue))
11552 checkposixcc(pRExC_state);
11554 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11555 if (UCHARAT(RExC_parse) == ']')
11556 goto charclassloop;
11559 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11563 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11564 save_value = value;
11565 save_prevvalue = prevvalue;
11568 rangebegin = RExC_parse;
11572 value = utf8n_to_uvchr((U8*)RExC_parse,
11573 RExC_end - RExC_parse,
11574 &numlen, UTF8_ALLOW_DEFAULT);
11575 RExC_parse += numlen;
11578 value = UCHARAT(RExC_parse++);
11580 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11581 if (value == '[' && POSIXCC(nextvalue))
11582 namedclass = regpposixcc(pRExC_state, value);
11583 else if (value == '\\') {
11585 value = utf8n_to_uvchr((U8*)RExC_parse,
11586 RExC_end - RExC_parse,
11587 &numlen, UTF8_ALLOW_DEFAULT);
11588 RExC_parse += numlen;
11591 value = UCHARAT(RExC_parse++);
11592 /* Some compilers cannot handle switching on 64-bit integer
11593 * values, therefore value cannot be an UV. Yes, this will
11594 * be a problem later if we want switch on Unicode.
11595 * A similar issue a little bit later when switching on
11596 * namedclass. --jhi */
11597 switch ((I32)value) {
11598 case 'w': namedclass = ANYOF_WORDCHAR; break;
11599 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11600 case 's': namedclass = ANYOF_SPACE; break;
11601 case 'S': namedclass = ANYOF_NSPACE; break;
11602 case 'd': namedclass = ANYOF_DIGIT; break;
11603 case 'D': namedclass = ANYOF_NDIGIT; break;
11604 case 'v': namedclass = ANYOF_VERTWS; break;
11605 case 'V': namedclass = ANYOF_NVERTWS; break;
11606 case 'h': namedclass = ANYOF_HORIZWS; break;
11607 case 'H': namedclass = ANYOF_NHORIZWS; break;
11608 case 'N': /* Handle \N{NAME} in class */
11610 /* We only pay attention to the first char of
11611 multichar strings being returned. I kinda wonder
11612 if this makes sense as it does change the behaviour
11613 from earlier versions, OTOH that behaviour was broken
11615 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11616 TRUE /* => charclass */))
11627 /* This routine will handle any undefined properties */
11628 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11630 if (RExC_parse >= RExC_end)
11631 vFAIL2("Empty \\%c{}", (U8)value);
11632 if (*RExC_parse == '{') {
11633 const U8 c = (U8)value;
11634 e = strchr(RExC_parse++, '}');
11636 vFAIL2("Missing right brace on \\%c{}", c);
11637 while (isSPACE(UCHARAT(RExC_parse)))
11639 if (e == RExC_parse)
11640 vFAIL2("Empty \\%c{}", c);
11641 n = e - RExC_parse;
11642 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11653 if (UCHARAT(RExC_parse) == '^') {
11656 value = value == 'p' ? 'P' : 'p'; /* toggle */
11657 while (isSPACE(UCHARAT(RExC_parse))) {
11662 /* Try to get the definition of the property into
11663 * <invlist>. If /i is in effect, the effective property
11664 * will have its name be <__NAME_i>. The design is
11665 * discussed in commit
11666 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11667 Newx(name, n + sizeof("_i__\n"), char);
11669 sprintf(name, "%s%.*s%s\n",
11670 (FOLD) ? "__" : "",
11676 /* Look up the property name, and get its swash and
11677 * inversion list, if the property is found */
11679 SvREFCNT_dec(swash);
11681 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11684 NULL, /* No inversion list */
11687 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11689 SvREFCNT_dec(swash);
11693 /* Here didn't find it. It could be a user-defined
11694 * property that will be available at run-time. Add it
11695 * to the list to look up then */
11696 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11697 (value == 'p' ? '+' : '!'),
11699 has_user_defined_property = TRUE;
11701 /* We don't know yet, so have to assume that the
11702 * property could match something in the Latin1 range,
11703 * hence something that isn't utf8. Note that this
11704 * would cause things in <depends_list> to match
11705 * inappropriately, except that any \p{}, including
11706 * this one forces Unicode semantics, which means there
11707 * is <no depends_list> */
11708 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11712 /* Here, did get the swash and its inversion list. If
11713 * the swash is from a user-defined property, then this
11714 * whole character class should be regarded as such */
11715 has_user_defined_property =
11717 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11719 /* Invert if asking for the complement */
11720 if (value == 'P') {
11721 _invlist_union_complement_2nd(properties,
11725 /* The swash can't be used as-is, because we've
11726 * inverted things; delay removing it to here after
11727 * have copied its invlist above */
11728 SvREFCNT_dec(swash);
11732 _invlist_union(properties, invlist, &properties);
11737 RExC_parse = e + 1;
11738 namedclass = ANYOF_MAX; /* no official name, but it's named */
11740 /* \p means they want Unicode semantics */
11741 RExC_uni_semantics = 1;
11744 case 'n': value = '\n'; break;
11745 case 'r': value = '\r'; break;
11746 case 't': value = '\t'; break;
11747 case 'f': value = '\f'; break;
11748 case 'b': value = '\b'; break;
11749 case 'e': value = ASCII_TO_NATIVE('\033');break;
11750 case 'a': value = ASCII_TO_NATIVE('\007');break;
11752 RExC_parse--; /* function expects to be pointed at the 'o' */
11754 const char* error_msg;
11755 bool valid = grok_bslash_o(RExC_parse,
11760 RExC_parse += numlen;
11765 if (PL_encoding && value < 0x100) {
11766 goto recode_encoding;
11770 RExC_parse--; /* function expects to be pointed at the 'x' */
11772 const char* error_msg;
11773 bool valid = grok_bslash_x(RExC_parse,
11778 RExC_parse += numlen;
11783 if (PL_encoding && value < 0x100)
11784 goto recode_encoding;
11787 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11789 case '0': case '1': case '2': case '3': case '4':
11790 case '5': case '6': case '7':
11792 /* Take 1-3 octal digits */
11793 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11795 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11796 RExC_parse += numlen;
11797 if (PL_encoding && value < 0x100)
11798 goto recode_encoding;
11802 if (! RExC_override_recoding) {
11803 SV* enc = PL_encoding;
11804 value = reg_recode((const char)(U8)value, &enc);
11805 if (!enc && SIZE_ONLY)
11806 ckWARNreg(RExC_parse,
11807 "Invalid escape in the specified encoding");
11811 /* Allow \_ to not give an error */
11812 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11813 ckWARN2reg(RExC_parse,
11814 "Unrecognized escape \\%c in character class passed through",
11819 } /* end of \blah */
11822 literal_endpoint++;
11825 /* What matches in a locale is not known until runtime. This
11826 * includes what the Posix classes (like \w, [:space:]) match.
11827 * Room must be reserved (one time per class) to store such
11828 * classes, either if Perl is compiled so that locale nodes always
11829 * should have this space, or if there is such class info to be
11830 * stored. The space will contain a bit for each named class that
11831 * is to be matched against. This isn't needed for \p{} and
11832 * pseudo-classes, as they are not affected by locale, and hence
11833 * are dealt with separately */
11836 && (ANYOF_LOCALE == ANYOF_CLASS
11837 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11841 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11844 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11845 ANYOF_CLASS_ZERO(ret);
11847 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11850 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11852 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11853 * literal, as is the character that began the false range, i.e.
11854 * the 'a' in the examples */
11858 RExC_parse >= rangebegin ?
11859 RExC_parse - rangebegin : 0;
11860 ckWARN4reg(RExC_parse,
11861 "False [] range \"%*.*s\"",
11863 cp_list = add_cp_to_invlist(cp_list, '-');
11864 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11867 range = 0; /* this was not a true range */
11868 element_count += 2; /* So counts for three values */
11872 switch ((I32)namedclass) {
11874 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11875 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11876 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11878 case ANYOF_NALNUMC:
11879 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11880 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11881 runtime_posix_matches_above_Unicode);
11884 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11885 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11888 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11889 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11890 runtime_posix_matches_above_Unicode);
11895 ANYOF_CLASS_SET(ret, namedclass);
11898 #endif /* Not isascii(); just use the hard-coded definition for it */
11899 _invlist_union(posixes, PL_ASCII, &posixes);
11904 ANYOF_CLASS_SET(ret, namedclass);
11908 _invlist_union_complement_2nd(posixes,
11909 PL_ASCII, &posixes);
11910 if (DEPENDS_SEMANTICS) {
11911 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11918 if (hasISBLANK || ! LOC) {
11919 DO_POSIX(ret, namedclass, posixes,
11920 PL_PosixBlank, PL_XPosixBlank);
11922 else { /* There is no isblank() and we are in locale: We
11923 use the ASCII range and the above-Latin1 range
11925 SV* scratch_list = NULL;
11927 /* Include all above-Latin1 blanks */
11928 _invlist_intersection(PL_AboveLatin1,
11931 /* Add it to the running total of posix classes */
11933 posixes = scratch_list;
11936 _invlist_union(posixes, scratch_list, &posixes);
11937 SvREFCNT_dec(scratch_list);
11939 /* Add the ASCII-range blanks to the running total. */
11940 _invlist_union(posixes, PL_PosixBlank, &posixes);
11944 if (hasISBLANK || ! LOC) {
11945 DO_N_POSIX(ret, namedclass, posixes,
11946 PL_PosixBlank, PL_XPosixBlank);
11948 else { /* There is no isblank() and we are in locale */
11949 SV* scratch_list = NULL;
11951 /* Include all above-Latin1 non-blanks */
11952 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11955 /* Add them to the running total of posix classes */
11956 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11959 posixes = scratch_list;
11962 _invlist_union(posixes, scratch_list, &posixes);
11963 SvREFCNT_dec(scratch_list);
11966 /* Get the list of all non-ASCII-blanks in Latin 1, and
11967 * add them to the running total */
11968 _invlist_subtract(PL_Latin1, PL_PosixBlank,
11970 _invlist_union(posixes, scratch_list, &posixes);
11971 SvREFCNT_dec(scratch_list);
11975 DO_POSIX(ret, namedclass, posixes,
11976 PL_PosixCntrl, PL_XPosixCntrl);
11979 DO_N_POSIX(ret, namedclass, posixes,
11980 PL_PosixCntrl, PL_XPosixCntrl);
11983 /* There are no digits in the Latin1 range outside of
11984 * ASCII, so call the macro that doesn't have to resolve
11986 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11987 PL_PosixDigit, "XPosixDigit", listsv);
11990 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11991 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11992 runtime_posix_matches_above_Unicode);
11995 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11996 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11999 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12000 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
12001 runtime_posix_matches_above_Unicode);
12003 case ANYOF_HORIZWS:
12004 /* For these, we use the cp_list, as /d doesn't make a
12005 * difference in what these match. There would be problems
12006 * if these characters had folds other than themselves, as
12007 * cp_list is subject to folding. It turns out that \h
12008 * is just a synonym for XPosixBlank */
12009 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12011 case ANYOF_NHORIZWS:
12012 _invlist_union_complement_2nd(cp_list,
12013 PL_XPosixBlank, &cp_list);
12017 { /* These require special handling, as they differ under
12018 folding, matching Cased there (which in the ASCII range
12019 is the same as Alpha */
12025 if (FOLD && ! LOC) {
12026 ascii_source = PL_PosixAlpha;
12027 l1_source = PL_L1Cased;
12031 ascii_source = PL_PosixLower;
12032 l1_source = PL_L1PosixLower;
12033 Xname = "XPosixLower";
12035 if (namedclass == ANYOF_LOWER) {
12036 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12037 ascii_source, l1_source, Xname, listsv);
12040 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12041 posixes, ascii_source, l1_source, Xname, listsv,
12042 runtime_posix_matches_above_Unicode);
12047 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12048 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12051 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12052 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12053 runtime_posix_matches_above_Unicode);
12056 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12057 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12060 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12061 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12062 runtime_posix_matches_above_Unicode);
12065 DO_POSIX(ret, namedclass, posixes,
12066 PL_PosixSpace, PL_XPosixSpace);
12068 case ANYOF_NPSXSPC:
12069 DO_N_POSIX(ret, namedclass, posixes,
12070 PL_PosixSpace, PL_XPosixSpace);
12073 DO_POSIX(ret, namedclass, posixes,
12074 PL_PerlSpace, PL_XPerlSpace);
12077 DO_N_POSIX(ret, namedclass, posixes,
12078 PL_PerlSpace, PL_XPerlSpace);
12080 case ANYOF_UPPER: /* Same as LOWER, above */
12087 if (FOLD && ! LOC) {
12088 ascii_source = PL_PosixAlpha;
12089 l1_source = PL_L1Cased;
12093 ascii_source = PL_PosixUpper;
12094 l1_source = PL_L1PosixUpper;
12095 Xname = "XPosixUpper";
12097 if (namedclass == ANYOF_UPPER) {
12098 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12099 ascii_source, l1_source, Xname, listsv);
12102 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12103 posixes, ascii_source, l1_source, Xname, listsv,
12104 runtime_posix_matches_above_Unicode);
12108 case ANYOF_WORDCHAR:
12109 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12110 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12112 case ANYOF_NWORDCHAR:
12113 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12114 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12115 runtime_posix_matches_above_Unicode);
12118 /* For these, we use the cp_list, as /d doesn't make a
12119 * difference in what these match. There would be problems
12120 * if these characters had folds other than themselves, as
12121 * cp_list is subject to folding */
12122 _invlist_union(cp_list, PL_VertSpace, &cp_list);
12124 case ANYOF_NVERTWS:
12125 _invlist_union_complement_2nd(cp_list,
12126 PL_VertSpace, &cp_list);
12129 DO_POSIX(ret, namedclass, posixes,
12130 PL_PosixXDigit, PL_XPosixXDigit);
12132 case ANYOF_NXDIGIT:
12133 DO_N_POSIX(ret, namedclass, posixes,
12134 PL_PosixXDigit, PL_XPosixXDigit);
12137 /* this is to handle \p and \P */
12140 vFAIL("Invalid [::] class");
12144 continue; /* Go get next character */
12146 } /* end of namedclass \blah */
12149 if (prevvalue > value) /* b-a */ {
12150 const int w = RExC_parse - rangebegin;
12151 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12152 range = 0; /* not a valid range */
12156 prevvalue = value; /* save the beginning of the potential range */
12157 if (RExC_parse+1 < RExC_end
12158 && *RExC_parse == '-'
12159 && RExC_parse[1] != ']')
12163 /* a bad range like \w-, [:word:]- ? */
12164 if (namedclass > OOB_NAMEDCLASS) {
12165 if (ckWARN(WARN_REGEXP)) {
12167 RExC_parse >= rangebegin ?
12168 RExC_parse - rangebegin : 0;
12170 "False [] range \"%*.*s\"",
12174 cp_list = add_cp_to_invlist(cp_list, '-');
12178 range = 1; /* yeah, it's a range! */
12179 continue; /* but do it the next time */
12183 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12186 /* non-Latin1 code point implies unicode semantics. Must be set in
12187 * pass1 so is there for the whole of pass 2 */
12189 RExC_uni_semantics = 1;
12192 /* Ready to process either the single value, or the completed range.
12193 * For single-valued non-inverted ranges, we consider the possibility
12194 * of multi-char folds. (We made a conscious decision to not do this
12195 * for the other cases because it can often lead to non-intuitive
12197 if (FOLD && ! invert && value == prevvalue) {
12198 if (value == LATIN_SMALL_LETTER_SHARP_S
12199 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12202 /* Here <value> is indeed a multi-char fold. Get what it is */
12204 U8 foldbuf[UTF8_MAXBYTES_CASE];
12207 UV folded = _to_uni_fold_flags(
12212 | ((LOC) ? FOLD_FLAGS_LOCALE
12213 : (ASCII_FOLD_RESTRICTED)
12214 ? FOLD_FLAGS_NOMIX_ASCII
12218 /* Here, <folded> should be the first character of the
12219 * multi-char fold of <value>, with <foldbuf> containing the
12220 * whole thing. But, if this fold is not allowed (because of
12221 * the flags), <fold> will be the same as <value>, and should
12222 * be processed like any other character, so skip the special
12224 if (folded != value) {
12226 /* Skip if we are recursed, currently parsing the class
12227 * again. Otherwise add this character to the list of
12228 * multi-char folds. */
12229 if (! RExC_in_multi_char_class) {
12230 AV** this_array_ptr;
12232 STRLEN cp_count = utf8_length(foldbuf,
12233 foldbuf + foldlen);
12234 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12236 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12239 if (! multi_char_matches) {
12240 multi_char_matches = newAV();
12243 /* <multi_char_matches> is actually an array of arrays.
12244 * There will be one or two top-level elements: [2],
12245 * and/or [3]. The [2] element is an array, each
12246 * element thereof is a character which folds to two
12247 * characters; likewise for [3]. (Unicode guarantees a
12248 * maximum of 3 characters in any fold.) When we
12249 * rewrite the character class below, we will do so
12250 * such that the longest folds are written first, so
12251 * that it prefers the longest matching strings first.
12252 * This is done even if it turns out that any
12253 * quantifier is non-greedy, out of programmer
12254 * laziness. Tom Christiansen has agreed that this is
12255 * ok. This makes the test for the ligature 'ffi' come
12256 * before the test for 'ff' */
12257 if (av_exists(multi_char_matches, cp_count)) {
12258 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12260 this_array = *this_array_ptr;
12263 this_array = newAV();
12264 av_store(multi_char_matches, cp_count,
12267 av_push(this_array, multi_fold);
12270 /* This element should not be processed further in this
12273 value = save_value;
12274 prevvalue = save_prevvalue;
12280 /* Deal with this element of the class */
12283 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12285 UV* this_range = _new_invlist(1);
12286 _append_range_to_invlist(this_range, prevvalue, value);
12288 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12289 * If this range was specified using something like 'i-j', we want
12290 * to include only the 'i' and the 'j', and not anything in
12291 * between, so exclude non-ASCII, non-alphabetics from it.
12292 * However, if the range was specified with something like
12293 * [\x89-\x91] or [\x89-j], all code points within it should be
12294 * included. literal_endpoint==2 means both ends of the range used
12295 * a literal character, not \x{foo} */
12296 if (literal_endpoint == 2
12297 && (prevvalue >= 'a' && value <= 'z')
12298 || (prevvalue >= 'A' && value <= 'Z'))
12300 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12301 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12303 _invlist_union(cp_list, this_range, &cp_list);
12304 literal_endpoint = 0;
12308 range = 0; /* this range (if it was one) is done now */
12309 } /* End of loop through all the text within the brackets */
12311 /* If anything in the class expands to more than one character, we have to
12312 * deal with them by building up a substitute parse string, and recursively
12313 * calling reg() on it, instead of proceeding */
12314 if (multi_char_matches) {
12315 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12318 char *save_end = RExC_end;
12319 char *save_parse = RExC_parse;
12320 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12325 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12326 because too confusing */
12328 sv_catpv(substitute_parse, "(?:");
12332 /* Look at the longest folds first */
12333 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12335 if (av_exists(multi_char_matches, cp_count)) {
12336 AV** this_array_ptr;
12339 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12341 while ((this_sequence = av_pop(*this_array_ptr)) !=
12344 if (! first_time) {
12345 sv_catpv(substitute_parse, "|");
12347 first_time = FALSE;
12349 sv_catpv(substitute_parse, SvPVX(this_sequence));
12354 /* If the character class contains anything else besides these
12355 * multi-character folds, have to include it in recursive parsing */
12356 if (element_count) {
12357 sv_catpv(substitute_parse, "|[");
12358 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12359 sv_catpv(substitute_parse, "]");
12362 sv_catpv(substitute_parse, ")");
12365 /* This is a way to get the parse to skip forward a whole named
12366 * sequence instead of matching the 2nd character when it fails the
12368 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12372 RExC_parse = SvPV(substitute_parse, len);
12373 RExC_end = RExC_parse + len;
12374 RExC_in_multi_char_class = 1;
12375 RExC_emit = (regnode *)orig_emit;
12377 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12379 *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED);
12381 RExC_parse = save_parse;
12382 RExC_end = save_end;
12383 RExC_in_multi_char_class = 0;
12384 SvREFCNT_dec(multi_char_matches);
12388 /* If the character class contains only a single element, it may be
12389 * optimizable into another node type which is smaller and runs faster.
12390 * Check if this is the case for this class */
12391 if (element_count == 1) {
12395 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12396 [:digit:] or \p{foo} */
12398 /* Certain named classes have equivalents that can appear outside a
12399 * character class, e.g. \w, \H. We use these instead of a
12400 * character class. */
12401 switch ((I32)namedclass) {
12404 /* The first group is for node types that depend on the charset
12405 * modifier to the regex. We first calculate the base node
12406 * type, and if it should be inverted */
12408 case ANYOF_NWORDCHAR:
12411 case ANYOF_WORDCHAR:
12413 goto join_charset_classes;
12420 goto join_charset_classes;
12428 join_charset_classes:
12430 /* Now that we have the base node type, we take advantage
12431 * of the enum ordering of the charset modifiers to get the
12432 * exact node type, For example the base SPACE also has
12433 * SPACEL, SPACEU, and SPACEA */
12435 offset = get_regex_charset(RExC_flags);
12437 /* /aa is the same as /a for these */
12438 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12439 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12441 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12442 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12447 /* The number of varieties of each of these is the same,
12448 * hence, so is the delta between the normal and
12449 * complemented nodes */
12451 op += NALNUM - ALNUM;
12453 *flagp |= HASWIDTH|SIMPLE;
12456 /* The second group doesn't depend of the charset modifiers.
12457 * We just have normal and complemented */
12458 case ANYOF_NHORIZWS:
12461 case ANYOF_HORIZWS:
12463 op = (invert) ? NHORIZWS : HORIZWS;
12464 *flagp |= HASWIDTH|SIMPLE;
12467 case ANYOF_NVERTWS:
12471 op = (invert) ? NVERTWS : VERTWS;
12472 *flagp |= HASWIDTH|SIMPLE;
12482 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12487 /* A generic posix class. All the /a ones can be handled
12488 * by the POSIXA opcode. And all are closed under folding
12489 * in the ASCII range, so FOLD doesn't matter */
12490 if (AT_LEAST_ASCII_RESTRICTED
12491 || (! LOC && namedclass == ANYOF_ASCII))
12493 /* The odd numbered ones are the complements of the
12494 * next-lower even number one */
12495 if (namedclass % 2 == 1) {
12499 arg = namedclass_to_classnum(namedclass);
12500 op = (invert) ? NPOSIXA : POSIXA;
12505 else if (value == prevvalue) {
12507 /* Here, the class consists of just a single code point */
12510 if (! LOC && value == '\n') {
12511 op = REG_ANY; /* Optimize [^\n] */
12512 *flagp |= HASWIDTH|SIMPLE;
12516 else if (value < 256 || UTF) {
12518 /* Optimize a single value into an EXACTish node, but not if it
12519 * would require converting the pattern to UTF-8. */
12520 op = compute_EXACTish(pRExC_state);
12522 } /* Otherwise is a range */
12523 else if (! LOC) { /* locale could vary these */
12524 if (prevvalue == '0') {
12525 if (value == '9') {
12526 op = (invert) ? NDIGITA : DIGITA;
12527 *flagp |= HASWIDTH|SIMPLE;
12532 /* Here, we have changed <op> away from its initial value iff we found
12533 * an optimization */
12536 /* Throw away this ANYOF regnode, and emit the calculated one,
12537 * which should correspond to the beginning, not current, state of
12539 const char * cur_parse = RExC_parse;
12540 RExC_parse = (char *)orig_parse;
12544 /* To get locale nodes to not use the full ANYOF size would
12545 * require moving the code above that writes the portions
12546 * of it that aren't in other nodes to after this point.
12547 * e.g. ANYOF_CLASS_SET */
12548 RExC_size = orig_size;
12552 RExC_emit = (regnode *)orig_emit;
12555 ret = reg_node(pRExC_state, op);
12557 if (PL_regkind[op] == POSIXD) {
12561 *flagp |= HASWIDTH|SIMPLE;
12563 else if (PL_regkind[op] == EXACT) {
12564 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12567 RExC_parse = (char *) cur_parse;
12569 SvREFCNT_dec(listsv);
12576 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12578 /* If folding, we calculate all characters that could fold to or from the
12579 * ones already on the list */
12580 if (FOLD && cp_list) {
12581 UV start, end; /* End points of code point ranges */
12583 SV* fold_intersection = NULL;
12585 /* If the highest code point is within Latin1, we can use the
12586 * compiled-in Alphas list, and not have to go out to disk. This
12587 * yields two false positives, the masculine and feminine oridinal
12588 * indicators, which are weeded out below using the
12589 * IS_IN_SOME_FOLD_L1() macro */
12590 if (invlist_highest(cp_list) < 256) {
12591 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12595 /* Here, there are non-Latin1 code points, so we will have to go
12596 * fetch the list of all the characters that participate in folds
12598 if (! PL_utf8_foldable) {
12599 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12600 &PL_sv_undef, 1, 0);
12601 PL_utf8_foldable = _get_swash_invlist(swash);
12602 SvREFCNT_dec(swash);
12605 /* This is a hash that for a particular fold gives all characters
12606 * that are involved in it */
12607 if (! PL_utf8_foldclosures) {
12609 /* If we were unable to find any folds, then we likely won't be
12610 * able to find the closures. So just create an empty list.
12611 * Folding will effectively be restricted to the non-Unicode
12612 * rules hard-coded into Perl. (This case happens legitimately
12613 * during compilation of Perl itself before the Unicode tables
12614 * are generated) */
12615 if (_invlist_len(PL_utf8_foldable) == 0) {
12616 PL_utf8_foldclosures = newHV();
12619 /* If the folds haven't been read in, call a fold function
12621 if (! PL_utf8_tofold) {
12622 U8 dummy[UTF8_MAXBYTES+1];
12624 /* This string is just a short named one above \xff */
12625 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12626 assert(PL_utf8_tofold); /* Verify that worked */
12628 PL_utf8_foldclosures =
12629 _swash_inversion_hash(PL_utf8_tofold);
12633 /* Only the characters in this class that participate in folds need
12634 * be checked. Get the intersection of this class and all the
12635 * possible characters that are foldable. This can quickly narrow
12636 * down a large class */
12637 _invlist_intersection(PL_utf8_foldable, cp_list,
12638 &fold_intersection);
12641 /* Now look at the foldable characters in this class individually */
12642 invlist_iterinit(fold_intersection);
12643 while (invlist_iternext(fold_intersection, &start, &end)) {
12646 /* Locale folding for Latin1 characters is deferred until runtime */
12647 if (LOC && start < 256) {
12651 /* Look at every character in the range */
12652 for (j = start; j <= end; j++) {
12654 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12661 /* We have the latin1 folding rules hard-coded here so that
12662 * an innocent-looking character class, like /[ks]/i won't
12663 * have to go out to disk to find the possible matches.
12664 * XXX It would be better to generate these via regen, in
12665 * case a new version of the Unicode standard adds new
12666 * mappings, though that is not really likely, and may be
12667 * caught by the default: case of the switch below. */
12669 if (IS_IN_SOME_FOLD_L1(j)) {
12671 /* ASCII is always matched; non-ASCII is matched only
12672 * under Unicode rules */
12673 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12675 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12679 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12683 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12684 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12686 /* Certain Latin1 characters have matches outside
12687 * Latin1. To get here, <j> is one of those
12688 * characters. None of these matches is valid for
12689 * ASCII characters under /aa, which is why the 'if'
12690 * just above excludes those. These matches only
12691 * happen when the target string is utf8. The code
12692 * below adds the single fold closures for <j> to the
12693 * inversion list. */
12698 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12702 cp_list = add_cp_to_invlist(cp_list,
12703 LATIN_SMALL_LETTER_LONG_S);
12706 cp_list = add_cp_to_invlist(cp_list,
12707 GREEK_CAPITAL_LETTER_MU);
12708 cp_list = add_cp_to_invlist(cp_list,
12709 GREEK_SMALL_LETTER_MU);
12711 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12712 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12714 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12716 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12717 cp_list = add_cp_to_invlist(cp_list,
12718 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12720 case LATIN_SMALL_LETTER_SHARP_S:
12721 cp_list = add_cp_to_invlist(cp_list,
12722 LATIN_CAPITAL_LETTER_SHARP_S);
12724 case 'F': case 'f':
12725 case 'I': case 'i':
12726 case 'L': case 'l':
12727 case 'T': case 't':
12728 case 'A': case 'a':
12729 case 'H': case 'h':
12730 case 'J': case 'j':
12731 case 'N': case 'n':
12732 case 'W': case 'w':
12733 case 'Y': case 'y':
12734 /* These all are targets of multi-character
12735 * folds from code points that require UTF8 to
12736 * express, so they can't match unless the
12737 * target string is in UTF-8, so no action here
12738 * is necessary, as regexec.c properly handles
12739 * the general case for UTF-8 matching and
12740 * multi-char folds */
12743 /* Use deprecated warning to increase the
12744 * chances of this being output */
12745 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12752 /* Here is an above Latin1 character. We don't have the rules
12753 * hard-coded for it. First, get its fold. This is the simple
12754 * fold, as the multi-character folds have been handled earlier
12755 * and separated out */
12756 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12758 ? FOLD_FLAGS_LOCALE
12759 : (ASCII_FOLD_RESTRICTED)
12760 ? FOLD_FLAGS_NOMIX_ASCII
12763 /* Single character fold of above Latin1. Add everything in
12764 * its fold closure to the list that this node should match.
12765 * The fold closures data structure is a hash with the keys
12766 * being the UTF-8 of every character that is folded to, like
12767 * 'k', and the values each an array of all code points that
12768 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
12769 * Multi-character folds are not included */
12770 if ((listp = hv_fetch(PL_utf8_foldclosures,
12771 (char *) foldbuf, foldlen, FALSE)))
12773 AV* list = (AV*) *listp;
12775 for (k = 0; k <= av_len(list); k++) {
12776 SV** c_p = av_fetch(list, k, FALSE);
12779 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12783 /* /aa doesn't allow folds between ASCII and non-; /l
12784 * doesn't allow them between above and below 256 */
12785 if ((ASCII_FOLD_RESTRICTED
12786 && (isASCII(c) != isASCII(j)))
12787 || (LOC && ((c < 256) != (j < 256))))
12792 /* Folds involving non-ascii Latin1 characters
12793 * under /d are added to a separate list */
12794 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12796 cp_list = add_cp_to_invlist(cp_list, c);
12799 depends_list = add_cp_to_invlist(depends_list, c);
12805 SvREFCNT_dec(fold_intersection);
12808 /* And combine the result (if any) with any inversion list from posix
12809 * classes. The lists are kept separate up to now because we don't want to
12810 * fold the classes (folding of those is automatically handled by the swash
12811 * fetching code) */
12813 if (! DEPENDS_SEMANTICS) {
12815 _invlist_union(cp_list, posixes, &cp_list);
12816 SvREFCNT_dec(posixes);
12823 /* Under /d, we put into a separate list the Latin1 things that
12824 * match only when the target string is utf8 */
12825 SV* nonascii_but_latin1_properties = NULL;
12826 _invlist_intersection(posixes, PL_Latin1,
12827 &nonascii_but_latin1_properties);
12828 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12829 &nonascii_but_latin1_properties);
12830 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12833 _invlist_union(cp_list, posixes, &cp_list);
12834 SvREFCNT_dec(posixes);
12840 if (depends_list) {
12841 _invlist_union(depends_list, nonascii_but_latin1_properties,
12843 SvREFCNT_dec(nonascii_but_latin1_properties);
12846 depends_list = nonascii_but_latin1_properties;
12851 /* And combine the result (if any) with any inversion list from properties.
12852 * The lists are kept separate up to now so that we can distinguish the two
12853 * in regards to matching above-Unicode. A run-time warning is generated
12854 * if a Unicode property is matched against a non-Unicode code point. But,
12855 * we allow user-defined properties to match anything, without any warning,
12856 * and we also suppress the warning if there is a portion of the character
12857 * class that isn't a Unicode property, and which matches above Unicode, \W
12858 * or [\x{110000}] for example.
12859 * (Note that in this case, unlike the Posix one above, there is no
12860 * <depends_list>, because having a Unicode property forces Unicode
12863 bool warn_super = ! has_user_defined_property;
12866 /* If it matters to the final outcome, see if a non-property
12867 * component of the class matches above Unicode. If so, the
12868 * warning gets suppressed. This is true even if just a single
12869 * such code point is specified, as though not strictly correct if
12870 * another such code point is matched against, the fact that they
12871 * are using above-Unicode code points indicates they should know
12872 * the issues involved */
12874 bool non_prop_matches_above_Unicode =
12875 runtime_posix_matches_above_Unicode
12876 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12878 non_prop_matches_above_Unicode =
12879 ! non_prop_matches_above_Unicode;
12881 warn_super = ! non_prop_matches_above_Unicode;
12884 _invlist_union(properties, cp_list, &cp_list);
12885 SvREFCNT_dec(properties);
12888 cp_list = properties;
12892 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12896 /* Here, we have calculated what code points should be in the character
12899 * Now we can see about various optimizations. Fold calculation (which we
12900 * did above) needs to take place before inversion. Otherwise /[^k]/i
12901 * would invert to include K, which under /i would match k, which it
12902 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12903 * folded until runtime */
12905 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12906 * at compile time. Besides not inverting folded locale now, we can't
12907 * invert if there are things such as \w, which aren't known until runtime
12910 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12912 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12914 _invlist_invert(cp_list);
12916 /* Any swash can't be used as-is, because we've inverted things */
12918 SvREFCNT_dec(swash);
12922 /* Clear the invert flag since have just done it here */
12926 /* If we didn't do folding, it's because some information isn't available
12927 * until runtime; set the run-time fold flag for these. (We don't have to
12928 * worry about properties folding, as that is taken care of by the swash
12932 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12935 /* Some character classes are equivalent to other nodes. Such nodes take
12936 * up less room and generally fewer operations to execute than ANYOF nodes.
12937 * Above, we checked for and optimized into some such equivalents for
12938 * certain common classes that are easy to test. Getting to this point in
12939 * the code means that the class didn't get optimized there. Since this
12940 * code is only executed in Pass 2, it is too late to save space--it has
12941 * been allocated in Pass 1, and currently isn't given back. But turning
12942 * things into an EXACTish node can allow the optimizer to join it to any
12943 * adjacent such nodes. And if the class is equivalent to things like /./,
12944 * expensive run-time swashes can be avoided. Now that we have more
12945 * complete information, we can find things necessarily missed by the
12946 * earlier code. I (khw) am not sure how much to look for here. It would
12947 * be easy, but perhaps too slow, to check any candidates against all the
12948 * node types they could possibly match using _invlistEQ(). */
12953 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12954 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12957 U8 op = END; /* The optimzation node-type */
12958 const char * cur_parse= RExC_parse;
12960 invlist_iterinit(cp_list);
12961 if (! invlist_iternext(cp_list, &start, &end)) {
12963 /* Here, the list is empty. This happens, for example, when a
12964 * Unicode property is the only thing in the character class, and
12965 * it doesn't match anything. (perluniprops.pod notes such
12968 *flagp |= HASWIDTH|SIMPLE;
12970 else if (start == end) { /* The range is a single code point */
12971 if (! invlist_iternext(cp_list, &start, &end)
12973 /* Don't do this optimization if it would require changing
12974 * the pattern to UTF-8 */
12975 && (start < 256 || UTF))
12977 /* Here, the list contains a single code point. Can optimize
12978 * into an EXACT node */
12987 /* A locale node under folding with one code point can be
12988 * an EXACTFL, as its fold won't be calculated until
12994 /* Here, we are generally folding, but there is only one
12995 * code point to match. If we have to, we use an EXACT
12996 * node, but it would be better for joining with adjacent
12997 * nodes in the optimization pass if we used the same
12998 * EXACTFish node that any such are likely to be. We can
12999 * do this iff the code point doesn't participate in any
13000 * folds. For example, an EXACTF of a colon is the same as
13001 * an EXACT one, since nothing folds to or from a colon. */
13003 if (IS_IN_SOME_FOLD_L1(value)) {
13008 if (! PL_utf8_foldable) {
13009 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13010 &PL_sv_undef, 1, 0);
13011 PL_utf8_foldable = _get_swash_invlist(swash);
13012 SvREFCNT_dec(swash);
13014 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13019 /* If we haven't found the node type, above, it means we
13020 * can use the prevailing one */
13022 op = compute_EXACTish(pRExC_state);
13027 else if (start == 0) {
13028 if (end == UV_MAX) {
13030 *flagp |= HASWIDTH|SIMPLE;
13033 else if (end == '\n' - 1
13034 && invlist_iternext(cp_list, &start, &end)
13035 && start == '\n' + 1 && end == UV_MAX)
13038 *flagp |= HASWIDTH|SIMPLE;
13044 RExC_parse = (char *)orig_parse;
13045 RExC_emit = (regnode *)orig_emit;
13047 ret = reg_node(pRExC_state, op);
13049 RExC_parse = (char *)cur_parse;
13051 if (PL_regkind[op] == EXACT) {
13052 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13055 SvREFCNT_dec(listsv);
13060 /* Here, <cp_list> contains all the code points we can determine at
13061 * compile time that match under all conditions. Go through it, and
13062 * for things that belong in the bitmap, put them there, and delete from
13063 * <cp_list>. While we are at it, see if everything above 255 is in the
13064 * list, and if so, set a flag to speed up execution */
13065 ANYOF_BITMAP_ZERO(ret);
13068 /* This gets set if we actually need to modify things */
13069 bool change_invlist = FALSE;
13073 /* Start looking through <cp_list> */
13074 invlist_iterinit(cp_list);
13075 while (invlist_iternext(cp_list, &start, &end)) {
13079 if (end == UV_MAX && start <= 256) {
13080 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13083 /* Quit if are above what we should change */
13088 change_invlist = TRUE;
13090 /* Set all the bits in the range, up to the max that we are doing */
13091 high = (end < 255) ? end : 255;
13092 for (i = start; i <= (int) high; i++) {
13093 if (! ANYOF_BITMAP_TEST(ret, i)) {
13094 ANYOF_BITMAP_SET(ret, i);
13101 /* Done with loop; remove any code points that are in the bitmap from
13103 if (change_invlist) {
13104 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13107 /* If have completely emptied it, remove it completely */
13108 if (_invlist_len(cp_list) == 0) {
13109 SvREFCNT_dec(cp_list);
13115 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13118 /* Here, the bitmap has been populated with all the Latin1 code points that
13119 * always match. Can now add to the overall list those that match only
13120 * when the target string is UTF-8 (<depends_list>). */
13121 if (depends_list) {
13123 _invlist_union(cp_list, depends_list, &cp_list);
13124 SvREFCNT_dec(depends_list);
13127 cp_list = depends_list;
13131 /* If there is a swash and more than one element, we can't use the swash in
13132 * the optimization below. */
13133 if (swash && element_count > 1) {
13134 SvREFCNT_dec(swash);
13139 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13141 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13142 SvREFCNT_dec(listsv);
13145 /* av[0] stores the character class description in its textual form:
13146 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13147 * appropriate swash, and is also useful for dumping the regnode.
13148 * av[1] if NULL, is a placeholder to later contain the swash computed
13149 * from av[0]. But if no further computation need be done, the
13150 * swash is stored there now.
13151 * av[2] stores the cp_list inversion list for use in addition or
13152 * instead of av[0]; used only if av[1] is NULL
13153 * av[3] is set if any component of the class is from a user-defined
13154 * property; used only if av[1] is NULL */
13155 AV * const av = newAV();
13158 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13162 av_store(av, 1, swash);
13163 SvREFCNT_dec(cp_list);
13166 av_store(av, 1, NULL);
13168 av_store(av, 2, cp_list);
13169 av_store(av, 3, newSVuv(has_user_defined_property));
13173 rv = newRV_noinc(MUTABLE_SV(av));
13174 n = add_data(pRExC_state, 1, "s");
13175 RExC_rxi->data->data[n] = (void*)rv;
13179 *flagp |= HASWIDTH|SIMPLE;
13182 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13185 /* reg_skipcomment()
13187 Absorbs an /x style # comments from the input stream.
13188 Returns true if there is more text remaining in the stream.
13189 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13190 terminates the pattern without including a newline.
13192 Note its the callers responsibility to ensure that we are
13193 actually in /x mode
13198 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13202 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13204 while (RExC_parse < RExC_end)
13205 if (*RExC_parse++ == '\n') {
13210 /* we ran off the end of the pattern without ending
13211 the comment, so we have to add an \n when wrapping */
13212 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13220 Advances the parse position, and optionally absorbs
13221 "whitespace" from the inputstream.
13223 Without /x "whitespace" means (?#...) style comments only,
13224 with /x this means (?#...) and # comments and whitespace proper.
13226 Returns the RExC_parse point from BEFORE the scan occurs.
13228 This is the /x friendly way of saying RExC_parse++.
13232 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13234 char* const retval = RExC_parse++;
13236 PERL_ARGS_ASSERT_NEXTCHAR;
13239 if (RExC_end - RExC_parse >= 3
13240 && *RExC_parse == '('
13241 && RExC_parse[1] == '?'
13242 && RExC_parse[2] == '#')
13244 while (*RExC_parse != ')') {
13245 if (RExC_parse == RExC_end)
13246 FAIL("Sequence (?#... not terminated");
13252 if (RExC_flags & RXf_PMf_EXTENDED) {
13253 if (isSPACE(*RExC_parse)) {
13257 else if (*RExC_parse == '#') {
13258 if ( reg_skipcomment( pRExC_state ) )
13267 - reg_node - emit a node
13269 STATIC regnode * /* Location. */
13270 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13274 regnode * const ret = RExC_emit;
13275 GET_RE_DEBUG_FLAGS_DECL;
13277 PERL_ARGS_ASSERT_REG_NODE;
13280 SIZE_ALIGN(RExC_size);
13284 if (RExC_emit >= RExC_emit_bound)
13285 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13286 op, RExC_emit, RExC_emit_bound);
13288 NODE_ALIGN_FILL(ret);
13290 FILL_ADVANCE_NODE(ptr, op);
13291 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13292 #ifdef RE_TRACK_PATTERN_OFFSETS
13293 if (RExC_offsets) { /* MJD */
13294 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13295 "reg_node", __LINE__,
13297 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13298 ? "Overwriting end of array!\n" : "OK",
13299 (UV)(RExC_emit - RExC_emit_start),
13300 (UV)(RExC_parse - RExC_start),
13301 (UV)RExC_offsets[0]));
13302 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13310 - reganode - emit a node with an argument
13312 STATIC regnode * /* Location. */
13313 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13317 regnode * const ret = RExC_emit;
13318 GET_RE_DEBUG_FLAGS_DECL;
13320 PERL_ARGS_ASSERT_REGANODE;
13323 SIZE_ALIGN(RExC_size);
13328 assert(2==regarglen[op]+1);
13330 Anything larger than this has to allocate the extra amount.
13331 If we changed this to be:
13333 RExC_size += (1 + regarglen[op]);
13335 then it wouldn't matter. Its not clear what side effect
13336 might come from that so its not done so far.
13341 if (RExC_emit >= RExC_emit_bound)
13342 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13343 op, RExC_emit, RExC_emit_bound);
13345 NODE_ALIGN_FILL(ret);
13347 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13348 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13349 #ifdef RE_TRACK_PATTERN_OFFSETS
13350 if (RExC_offsets) { /* MJD */
13351 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13355 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13356 "Overwriting end of array!\n" : "OK",
13357 (UV)(RExC_emit - RExC_emit_start),
13358 (UV)(RExC_parse - RExC_start),
13359 (UV)RExC_offsets[0]));
13360 Set_Cur_Node_Offset;
13368 - reguni - emit (if appropriate) a Unicode character
13371 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13375 PERL_ARGS_ASSERT_REGUNI;
13377 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13381 - reginsert - insert an operator in front of already-emitted operand
13383 * Means relocating the operand.
13386 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13392 const int offset = regarglen[(U8)op];
13393 const int size = NODE_STEP_REGNODE + offset;
13394 GET_RE_DEBUG_FLAGS_DECL;
13396 PERL_ARGS_ASSERT_REGINSERT;
13397 PERL_UNUSED_ARG(depth);
13398 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13399 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13408 if (RExC_open_parens) {
13410 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13411 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13412 if ( RExC_open_parens[paren] >= opnd ) {
13413 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13414 RExC_open_parens[paren] += size;
13416 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13418 if ( RExC_close_parens[paren] >= opnd ) {
13419 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13420 RExC_close_parens[paren] += size;
13422 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13427 while (src > opnd) {
13428 StructCopy(--src, --dst, regnode);
13429 #ifdef RE_TRACK_PATTERN_OFFSETS
13430 if (RExC_offsets) { /* MJD 20010112 */
13431 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13435 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13436 ? "Overwriting end of array!\n" : "OK",
13437 (UV)(src - RExC_emit_start),
13438 (UV)(dst - RExC_emit_start),
13439 (UV)RExC_offsets[0]));
13440 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13441 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13447 place = opnd; /* Op node, where operand used to be. */
13448 #ifdef RE_TRACK_PATTERN_OFFSETS
13449 if (RExC_offsets) { /* MJD */
13450 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13454 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13455 ? "Overwriting end of array!\n" : "OK",
13456 (UV)(place - RExC_emit_start),
13457 (UV)(RExC_parse - RExC_start),
13458 (UV)RExC_offsets[0]));
13459 Set_Node_Offset(place, RExC_parse);
13460 Set_Node_Length(place, 1);
13463 src = NEXTOPER(place);
13464 FILL_ADVANCE_NODE(place, op);
13465 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13466 Zero(src, offset, regnode);
13470 - regtail - set the next-pointer at the end of a node chain of p to val.
13471 - SEE ALSO: regtail_study
13473 /* TODO: All three parms should be const */
13475 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13479 GET_RE_DEBUG_FLAGS_DECL;
13481 PERL_ARGS_ASSERT_REGTAIL;
13483 PERL_UNUSED_ARG(depth);
13489 /* Find last node. */
13492 regnode * const temp = regnext(scan);
13494 SV * const mysv=sv_newmortal();
13495 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13496 regprop(RExC_rx, mysv, scan);
13497 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13498 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13499 (temp == NULL ? "->" : ""),
13500 (temp == NULL ? PL_reg_name[OP(val)] : "")
13508 if (reg_off_by_arg[OP(scan)]) {
13509 ARG_SET(scan, val - scan);
13512 NEXT_OFF(scan) = val - scan;
13518 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13519 - Look for optimizable sequences at the same time.
13520 - currently only looks for EXACT chains.
13522 This is experimental code. The idea is to use this routine to perform
13523 in place optimizations on branches and groups as they are constructed,
13524 with the long term intention of removing optimization from study_chunk so
13525 that it is purely analytical.
13527 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13528 to control which is which.
13531 /* TODO: All four parms should be const */
13534 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13539 #ifdef EXPERIMENTAL_INPLACESCAN
13542 GET_RE_DEBUG_FLAGS_DECL;
13544 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13550 /* Find last node. */
13554 regnode * const temp = regnext(scan);
13555 #ifdef EXPERIMENTAL_INPLACESCAN
13556 if (PL_regkind[OP(scan)] == EXACT) {
13557 bool has_exactf_sharp_s; /* Unexamined in this routine */
13558 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13563 switch (OP(scan)) {
13569 case EXACTFU_TRICKYFOLD:
13571 if( exact == PSEUDO )
13573 else if ( exact != OP(scan) )
13582 SV * const mysv=sv_newmortal();
13583 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13584 regprop(RExC_rx, mysv, scan);
13585 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13586 SvPV_nolen_const(mysv),
13587 REG_NODE_NUM(scan),
13588 PL_reg_name[exact]);
13595 SV * const mysv_val=sv_newmortal();
13596 DEBUG_PARSE_MSG("");
13597 regprop(RExC_rx, mysv_val, val);
13598 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13599 SvPV_nolen_const(mysv_val),
13600 (IV)REG_NODE_NUM(val),
13604 if (reg_off_by_arg[OP(scan)]) {
13605 ARG_SET(scan, val - scan);
13608 NEXT_OFF(scan) = val - scan;
13616 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13620 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13626 for (bit=0; bit<32; bit++) {
13627 if (flags & (1<<bit)) {
13628 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13631 if (!set++ && lead)
13632 PerlIO_printf(Perl_debug_log, "%s",lead);
13633 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13636 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13637 if (!set++ && lead) {
13638 PerlIO_printf(Perl_debug_log, "%s",lead);
13641 case REGEX_UNICODE_CHARSET:
13642 PerlIO_printf(Perl_debug_log, "UNICODE");
13644 case REGEX_LOCALE_CHARSET:
13645 PerlIO_printf(Perl_debug_log, "LOCALE");
13647 case REGEX_ASCII_RESTRICTED_CHARSET:
13648 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13650 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13651 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13654 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13660 PerlIO_printf(Perl_debug_log, "\n");
13662 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13668 Perl_regdump(pTHX_ const regexp *r)
13672 SV * const sv = sv_newmortal();
13673 SV *dsv= sv_newmortal();
13674 RXi_GET_DECL(r,ri);
13675 GET_RE_DEBUG_FLAGS_DECL;
13677 PERL_ARGS_ASSERT_REGDUMP;
13679 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13681 /* Header fields of interest. */
13682 if (r->anchored_substr) {
13683 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13684 RE_SV_DUMPLEN(r->anchored_substr), 30);
13685 PerlIO_printf(Perl_debug_log,
13686 "anchored %s%s at %"IVdf" ",
13687 s, RE_SV_TAIL(r->anchored_substr),
13688 (IV)r->anchored_offset);
13689 } else if (r->anchored_utf8) {
13690 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13691 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13692 PerlIO_printf(Perl_debug_log,
13693 "anchored utf8 %s%s at %"IVdf" ",
13694 s, RE_SV_TAIL(r->anchored_utf8),
13695 (IV)r->anchored_offset);
13697 if (r->float_substr) {
13698 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13699 RE_SV_DUMPLEN(r->float_substr), 30);
13700 PerlIO_printf(Perl_debug_log,
13701 "floating %s%s at %"IVdf"..%"UVuf" ",
13702 s, RE_SV_TAIL(r->float_substr),
13703 (IV)r->float_min_offset, (UV)r->float_max_offset);
13704 } else if (r->float_utf8) {
13705 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13706 RE_SV_DUMPLEN(r->float_utf8), 30);
13707 PerlIO_printf(Perl_debug_log,
13708 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13709 s, RE_SV_TAIL(r->float_utf8),
13710 (IV)r->float_min_offset, (UV)r->float_max_offset);
13712 if (r->check_substr || r->check_utf8)
13713 PerlIO_printf(Perl_debug_log,
13715 (r->check_substr == r->float_substr
13716 && r->check_utf8 == r->float_utf8
13717 ? "(checking floating" : "(checking anchored"));
13718 if (r->extflags & RXf_NOSCAN)
13719 PerlIO_printf(Perl_debug_log, " noscan");
13720 if (r->extflags & RXf_CHECK_ALL)
13721 PerlIO_printf(Perl_debug_log, " isall");
13722 if (r->check_substr || r->check_utf8)
13723 PerlIO_printf(Perl_debug_log, ") ");
13725 if (ri->regstclass) {
13726 regprop(r, sv, ri->regstclass);
13727 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13729 if (r->extflags & RXf_ANCH) {
13730 PerlIO_printf(Perl_debug_log, "anchored");
13731 if (r->extflags & RXf_ANCH_BOL)
13732 PerlIO_printf(Perl_debug_log, "(BOL)");
13733 if (r->extflags & RXf_ANCH_MBOL)
13734 PerlIO_printf(Perl_debug_log, "(MBOL)");
13735 if (r->extflags & RXf_ANCH_SBOL)
13736 PerlIO_printf(Perl_debug_log, "(SBOL)");
13737 if (r->extflags & RXf_ANCH_GPOS)
13738 PerlIO_printf(Perl_debug_log, "(GPOS)");
13739 PerlIO_putc(Perl_debug_log, ' ');
13741 if (r->extflags & RXf_GPOS_SEEN)
13742 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13743 if (r->intflags & PREGf_SKIP)
13744 PerlIO_printf(Perl_debug_log, "plus ");
13745 if (r->intflags & PREGf_IMPLICIT)
13746 PerlIO_printf(Perl_debug_log, "implicit ");
13747 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13748 if (r->extflags & RXf_EVAL_SEEN)
13749 PerlIO_printf(Perl_debug_log, "with eval ");
13750 PerlIO_printf(Perl_debug_log, "\n");
13751 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13753 PERL_ARGS_ASSERT_REGDUMP;
13754 PERL_UNUSED_CONTEXT;
13755 PERL_UNUSED_ARG(r);
13756 #endif /* DEBUGGING */
13760 - regprop - printable representation of opcode
13762 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13765 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13766 if (flags & ANYOF_INVERT) \
13767 /*make sure the invert info is in each */ \
13768 sv_catpvs(sv, "^"); \
13774 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13780 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13781 static const char * const anyofs[] = {
13813 RXi_GET_DECL(prog,progi);
13814 GET_RE_DEBUG_FLAGS_DECL;
13816 PERL_ARGS_ASSERT_REGPROP;
13820 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13821 /* It would be nice to FAIL() here, but this may be called from
13822 regexec.c, and it would be hard to supply pRExC_state. */
13823 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13824 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13826 k = PL_regkind[OP(o)];
13829 sv_catpvs(sv, " ");
13830 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13831 * is a crude hack but it may be the best for now since
13832 * we have no flag "this EXACTish node was UTF-8"
13834 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13835 PERL_PV_ESCAPE_UNI_DETECT |
13836 PERL_PV_ESCAPE_NONASCII |
13837 PERL_PV_PRETTY_ELLIPSES |
13838 PERL_PV_PRETTY_LTGT |
13839 PERL_PV_PRETTY_NOCLEAR
13841 } else if (k == TRIE) {
13842 /* print the details of the trie in dumpuntil instead, as
13843 * progi->data isn't available here */
13844 const char op = OP(o);
13845 const U32 n = ARG(o);
13846 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13847 (reg_ac_data *)progi->data->data[n] :
13849 const reg_trie_data * const trie
13850 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13852 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13853 DEBUG_TRIE_COMPILE_r(
13854 Perl_sv_catpvf(aTHX_ sv,
13855 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13856 (UV)trie->startstate,
13857 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13858 (UV)trie->wordcount,
13861 (UV)TRIE_CHARCOUNT(trie),
13862 (UV)trie->uniquecharcount
13865 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13867 int rangestart = -1;
13868 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13869 sv_catpvs(sv, "[");
13870 for (i = 0; i <= 256; i++) {
13871 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13872 if (rangestart == -1)
13874 } else if (rangestart != -1) {
13875 if (i <= rangestart + 3)
13876 for (; rangestart < i; rangestart++)
13877 put_byte(sv, rangestart);
13879 put_byte(sv, rangestart);
13880 sv_catpvs(sv, "-");
13881 put_byte(sv, i - 1);
13886 sv_catpvs(sv, "]");
13889 } else if (k == CURLY) {
13890 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13891 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13892 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13894 else if (k == WHILEM && o->flags) /* Ordinal/of */
13895 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13896 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13897 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13898 if ( RXp_PAREN_NAMES(prog) ) {
13899 if ( k != REF || (OP(o) < NREF)) {
13900 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13901 SV **name= av_fetch(list, ARG(o), 0 );
13903 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13906 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13907 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13908 I32 *nums=(I32*)SvPVX(sv_dat);
13909 SV **name= av_fetch(list, nums[0], 0 );
13912 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13913 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13914 (n ? "," : ""), (IV)nums[n]);
13916 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13920 } else if (k == GOSUB)
13921 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13922 else if (k == VERB) {
13924 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13925 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13926 } else if (k == LOGICAL)
13927 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13928 else if (k == ANYOF) {
13929 int i, rangestart = -1;
13930 const U8 flags = ANYOF_FLAGS(o);
13934 if (flags & ANYOF_LOCALE)
13935 sv_catpvs(sv, "{loc}");
13936 if (flags & ANYOF_LOC_FOLD)
13937 sv_catpvs(sv, "{i}");
13938 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13939 if (flags & ANYOF_INVERT)
13940 sv_catpvs(sv, "^");
13942 /* output what the standard cp 0-255 bitmap matches */
13943 for (i = 0; i <= 256; i++) {
13944 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13945 if (rangestart == -1)
13947 } else if (rangestart != -1) {
13948 if (i <= rangestart + 3)
13949 for (; rangestart < i; rangestart++)
13950 put_byte(sv, rangestart);
13952 put_byte(sv, rangestart);
13953 sv_catpvs(sv, "-");
13954 put_byte(sv, i - 1);
13961 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13962 /* output any special charclass tests (used entirely under use locale) */
13963 if (ANYOF_CLASS_TEST_ANY_SET(o))
13964 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13965 if (ANYOF_CLASS_TEST(o,i)) {
13966 sv_catpv(sv, anyofs[i]);
13970 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13972 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13973 sv_catpvs(sv, "{non-utf8-latin1-all}");
13976 /* output information about the unicode matching */
13977 if (flags & ANYOF_UNICODE_ALL)
13978 sv_catpvs(sv, "{unicode_all}");
13979 else if (ANYOF_NONBITMAP(o))
13980 sv_catpvs(sv, "{unicode}");
13981 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13982 sv_catpvs(sv, "{outside bitmap}");
13984 if (ANYOF_NONBITMAP(o)) {
13985 SV *lv; /* Set if there is something outside the bit map */
13986 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
13987 bool byte_output = FALSE; /* If something in the bitmap has been
13990 if (lv && lv != &PL_sv_undef) {
13992 U8 s[UTF8_MAXBYTES_CASE+1];
13994 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13995 uvchr_to_utf8(s, i);
13998 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14002 && swash_fetch(sw, s, TRUE))
14004 if (rangestart == -1)
14006 } else if (rangestart != -1) {
14007 byte_output = TRUE;
14008 if (i <= rangestart + 3)
14009 for (; rangestart < i; rangestart++) {
14010 put_byte(sv, rangestart);
14013 put_byte(sv, rangestart);
14014 sv_catpvs(sv, "-");
14023 char *s = savesvpv(lv);
14024 char * const origs = s;
14026 while (*s && *s != '\n')
14030 const char * const t = ++s;
14033 sv_catpvs(sv, " ");
14039 /* Truncate very long output */
14040 if (s - origs > 256) {
14041 Perl_sv_catpvf(aTHX_ sv,
14043 (int) (s - origs - 1),
14049 else if (*s == '\t') {
14068 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14070 else if (k == POSIXD) {
14071 U8 index = FLAGS(o) * 2;
14072 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14073 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14076 sv_catpv(sv, anyofs[index]);
14079 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14080 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14082 PERL_UNUSED_CONTEXT;
14083 PERL_UNUSED_ARG(sv);
14084 PERL_UNUSED_ARG(o);
14085 PERL_UNUSED_ARG(prog);
14086 #endif /* DEBUGGING */
14090 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14091 { /* Assume that RE_INTUIT is set */
14093 struct regexp *const prog = (struct regexp *)SvANY(r);
14094 GET_RE_DEBUG_FLAGS_DECL;
14096 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14097 PERL_UNUSED_CONTEXT;
14101 const char * const s = SvPV_nolen_const(prog->check_substr
14102 ? prog->check_substr : prog->check_utf8);
14104 if (!PL_colorset) reginitcolors();
14105 PerlIO_printf(Perl_debug_log,
14106 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14108 prog->check_substr ? "" : "utf8 ",
14109 PL_colors[5],PL_colors[0],
14112 (strlen(s) > 60 ? "..." : ""));
14115 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14121 handles refcounting and freeing the perl core regexp structure. When
14122 it is necessary to actually free the structure the first thing it
14123 does is call the 'free' method of the regexp_engine associated to
14124 the regexp, allowing the handling of the void *pprivate; member
14125 first. (This routine is not overridable by extensions, which is why
14126 the extensions free is called first.)
14128 See regdupe and regdupe_internal if you change anything here.
14130 #ifndef PERL_IN_XSUB_RE
14132 Perl_pregfree(pTHX_ REGEXP *r)
14138 Perl_pregfree2(pTHX_ REGEXP *rx)
14141 struct regexp *const r = (struct regexp *)SvANY(rx);
14142 GET_RE_DEBUG_FLAGS_DECL;
14144 PERL_ARGS_ASSERT_PREGFREE2;
14146 if (r->mother_re) {
14147 ReREFCNT_dec(r->mother_re);
14149 CALLREGFREE_PVT(rx); /* free the private data */
14150 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14153 SvREFCNT_dec(r->anchored_substr);
14154 SvREFCNT_dec(r->anchored_utf8);
14155 SvREFCNT_dec(r->float_substr);
14156 SvREFCNT_dec(r->float_utf8);
14157 Safefree(r->substrs);
14159 RX_MATCH_COPY_FREE(rx);
14160 #ifdef PERL_OLD_COPY_ON_WRITE
14161 SvREFCNT_dec(r->saved_copy);
14164 SvREFCNT_dec(r->qr_anoncv);
14169 This is a hacky workaround to the structural issue of match results
14170 being stored in the regexp structure which is in turn stored in
14171 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14172 could be PL_curpm in multiple contexts, and could require multiple
14173 result sets being associated with the pattern simultaneously, such
14174 as when doing a recursive match with (??{$qr})
14176 The solution is to make a lightweight copy of the regexp structure
14177 when a qr// is returned from the code executed by (??{$qr}) this
14178 lightweight copy doesn't actually own any of its data except for
14179 the starp/end and the actual regexp structure itself.
14185 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14187 struct regexp *ret;
14188 struct regexp *const r = (struct regexp *)SvANY(rx);
14190 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14193 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14194 ret = (struct regexp *)SvANY(ret_x);
14196 (void)ReREFCNT_inc(rx);
14197 /* We can take advantage of the existing "copied buffer" mechanism in SVs
14198 by pointing directly at the buffer, but flagging that the allocated
14199 space in the copy is zero. As we've just done a struct copy, it's now
14200 a case of zero-ing that, rather than copying the current length. */
14201 SvPV_set(ret_x, RX_WRAPPED(rx));
14202 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
14203 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14204 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14205 SvLEN_set(ret_x, 0);
14206 SvSTASH_set(ret_x, NULL);
14207 SvMAGIC_set(ret_x, NULL);
14209 const I32 npar = r->nparens+1;
14210 Newx(ret->offs, npar, regexp_paren_pair);
14211 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14214 Newx(ret->substrs, 1, struct reg_substr_data);
14215 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14217 SvREFCNT_inc_void(ret->anchored_substr);
14218 SvREFCNT_inc_void(ret->anchored_utf8);
14219 SvREFCNT_inc_void(ret->float_substr);
14220 SvREFCNT_inc_void(ret->float_utf8);
14222 /* check_substr and check_utf8, if non-NULL, point to either their
14223 anchored or float namesakes, and don't hold a second reference. */
14225 RX_MATCH_COPIED_off(ret_x);
14226 #ifdef PERL_OLD_COPY_ON_WRITE
14227 ret->saved_copy = NULL;
14229 ret->mother_re = rx;
14230 SvREFCNT_inc_void(ret->qr_anoncv);
14236 /* regfree_internal()
14238 Free the private data in a regexp. This is overloadable by
14239 extensions. Perl takes care of the regexp structure in pregfree(),
14240 this covers the *pprivate pointer which technically perl doesn't
14241 know about, however of course we have to handle the
14242 regexp_internal structure when no extension is in use.
14244 Note this is called before freeing anything in the regexp
14249 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14252 struct regexp *const r = (struct regexp *)SvANY(rx);
14253 RXi_GET_DECL(r,ri);
14254 GET_RE_DEBUG_FLAGS_DECL;
14256 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14262 SV *dsv= sv_newmortal();
14263 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14264 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14265 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14266 PL_colors[4],PL_colors[5],s);
14269 #ifdef RE_TRACK_PATTERN_OFFSETS
14271 Safefree(ri->u.offsets); /* 20010421 MJD */
14273 if (ri->code_blocks) {
14275 for (n = 0; n < ri->num_code_blocks; n++)
14276 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14277 Safefree(ri->code_blocks);
14281 int n = ri->data->count;
14284 /* If you add a ->what type here, update the comment in regcomp.h */
14285 switch (ri->data->what[n]) {
14291 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14294 Safefree(ri->data->data[n]);
14300 { /* Aho Corasick add-on structure for a trie node.
14301 Used in stclass optimization only */
14303 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14305 refcount = --aho->refcount;
14308 PerlMemShared_free(aho->states);
14309 PerlMemShared_free(aho->fail);
14310 /* do this last!!!! */
14311 PerlMemShared_free(ri->data->data[n]);
14312 PerlMemShared_free(ri->regstclass);
14318 /* trie structure. */
14320 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14322 refcount = --trie->refcount;
14325 PerlMemShared_free(trie->charmap);
14326 PerlMemShared_free(trie->states);
14327 PerlMemShared_free(trie->trans);
14329 PerlMemShared_free(trie->bitmap);
14331 PerlMemShared_free(trie->jump);
14332 PerlMemShared_free(trie->wordinfo);
14333 /* do this last!!!! */
14334 PerlMemShared_free(ri->data->data[n]);
14339 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14342 Safefree(ri->data->what);
14343 Safefree(ri->data);
14349 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14350 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14351 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14354 re_dup - duplicate a regexp.
14356 This routine is expected to clone a given regexp structure. It is only
14357 compiled under USE_ITHREADS.
14359 After all of the core data stored in struct regexp is duplicated
14360 the regexp_engine.dupe method is used to copy any private data
14361 stored in the *pprivate pointer. This allows extensions to handle
14362 any duplication it needs to do.
14364 See pregfree() and regfree_internal() if you change anything here.
14366 #if defined(USE_ITHREADS)
14367 #ifndef PERL_IN_XSUB_RE
14369 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14373 const struct regexp *r = (const struct regexp *)SvANY(sstr);
14374 struct regexp *ret = (struct regexp *)SvANY(dstr);
14376 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14378 npar = r->nparens+1;
14379 Newx(ret->offs, npar, regexp_paren_pair);
14380 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14382 /* no need to copy these */
14383 Newx(ret->swap, npar, regexp_paren_pair);
14386 if (ret->substrs) {
14387 /* Do it this way to avoid reading from *r after the StructCopy().
14388 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14389 cache, it doesn't matter. */
14390 const bool anchored = r->check_substr
14391 ? r->check_substr == r->anchored_substr
14392 : r->check_utf8 == r->anchored_utf8;
14393 Newx(ret->substrs, 1, struct reg_substr_data);
14394 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14396 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14397 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14398 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14399 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14401 /* check_substr and check_utf8, if non-NULL, point to either their
14402 anchored or float namesakes, and don't hold a second reference. */
14404 if (ret->check_substr) {
14406 assert(r->check_utf8 == r->anchored_utf8);
14407 ret->check_substr = ret->anchored_substr;
14408 ret->check_utf8 = ret->anchored_utf8;
14410 assert(r->check_substr == r->float_substr);
14411 assert(r->check_utf8 == r->float_utf8);
14412 ret->check_substr = ret->float_substr;
14413 ret->check_utf8 = ret->float_utf8;
14415 } else if (ret->check_utf8) {
14417 ret->check_utf8 = ret->anchored_utf8;
14419 ret->check_utf8 = ret->float_utf8;
14424 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14425 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14428 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14430 if (RX_MATCH_COPIED(dstr))
14431 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14433 ret->subbeg = NULL;
14434 #ifdef PERL_OLD_COPY_ON_WRITE
14435 ret->saved_copy = NULL;
14438 if (ret->mother_re) {
14439 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14440 /* Our storage points directly to our mother regexp, but that's
14441 1: a buffer in a different thread
14442 2: something we no longer hold a reference on
14443 so we need to copy it locally. */
14444 /* Note we need to use SvCUR(), rather than
14445 SvLEN(), on our mother_re, because it, in
14446 turn, may well be pointing to its own mother_re. */
14447 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14448 SvCUR(ret->mother_re)+1));
14449 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14451 ret->mother_re = NULL;
14455 #endif /* PERL_IN_XSUB_RE */
14460 This is the internal complement to regdupe() which is used to copy
14461 the structure pointed to by the *pprivate pointer in the regexp.
14462 This is the core version of the extension overridable cloning hook.
14463 The regexp structure being duplicated will be copied by perl prior
14464 to this and will be provided as the regexp *r argument, however
14465 with the /old/ structures pprivate pointer value. Thus this routine
14466 may override any copying normally done by perl.
14468 It returns a pointer to the new regexp_internal structure.
14472 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14475 struct regexp *const r = (struct regexp *)SvANY(rx);
14476 regexp_internal *reti;
14478 RXi_GET_DECL(r,ri);
14480 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14484 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14485 Copy(ri->program, reti->program, len+1, regnode);
14487 reti->num_code_blocks = ri->num_code_blocks;
14488 if (ri->code_blocks) {
14490 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14491 struct reg_code_block);
14492 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14493 struct reg_code_block);
14494 for (n = 0; n < ri->num_code_blocks; n++)
14495 reti->code_blocks[n].src_regex = (REGEXP*)
14496 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14499 reti->code_blocks = NULL;
14501 reti->regstclass = NULL;
14504 struct reg_data *d;
14505 const int count = ri->data->count;
14508 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14509 char, struct reg_data);
14510 Newx(d->what, count, U8);
14513 for (i = 0; i < count; i++) {
14514 d->what[i] = ri->data->what[i];
14515 switch (d->what[i]) {
14516 /* see also regcomp.h and regfree_internal() */
14517 case 'a': /* actually an AV, but the dup function is identical. */
14521 case 'u': /* actually an HV, but the dup function is identical. */
14522 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14525 /* This is cheating. */
14526 Newx(d->data[i], 1, struct regnode_charclass_class);
14527 StructCopy(ri->data->data[i], d->data[i],
14528 struct regnode_charclass_class);
14529 reti->regstclass = (regnode*)d->data[i];
14532 /* Trie stclasses are readonly and can thus be shared
14533 * without duplication. We free the stclass in pregfree
14534 * when the corresponding reg_ac_data struct is freed.
14536 reti->regstclass= ri->regstclass;
14540 ((reg_trie_data*)ri->data->data[i])->refcount++;
14545 d->data[i] = ri->data->data[i];
14548 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14557 reti->name_list_idx = ri->name_list_idx;
14559 #ifdef RE_TRACK_PATTERN_OFFSETS
14560 if (ri->u.offsets) {
14561 Newx(reti->u.offsets, 2*len+1, U32);
14562 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14565 SetProgLen(reti,len);
14568 return (void*)reti;
14571 #endif /* USE_ITHREADS */
14573 #ifndef PERL_IN_XSUB_RE
14576 - regnext - dig the "next" pointer out of a node
14579 Perl_regnext(pTHX_ register regnode *p)
14587 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14588 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14591 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14600 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14603 STRLEN l1 = strlen(pat1);
14604 STRLEN l2 = strlen(pat2);
14607 const char *message;
14609 PERL_ARGS_ASSERT_RE_CROAK2;
14615 Copy(pat1, buf, l1 , char);
14616 Copy(pat2, buf + l1, l2 , char);
14617 buf[l1 + l2] = '\n';
14618 buf[l1 + l2 + 1] = '\0';
14620 /* ANSI variant takes additional second argument */
14621 va_start(args, pat2);
14625 msv = vmess(buf, &args);
14627 message = SvPV_const(msv,l1);
14630 Copy(message, buf, l1 , char);
14631 buf[l1-1] = '\0'; /* Overwrite \n */
14632 Perl_croak(aTHX_ "%s", buf);
14635 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14637 #ifndef PERL_IN_XSUB_RE
14639 Perl_save_re_context(pTHX)
14643 struct re_save_state *state;
14645 SAVEVPTR(PL_curcop);
14646 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14648 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14649 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14650 SSPUSHUV(SAVEt_RE_STATE);
14652 Copy(&PL_reg_state, state, 1, struct re_save_state);
14654 PL_reg_oldsaved = NULL;
14655 PL_reg_oldsavedlen = 0;
14656 PL_reg_oldsavedoffset = 0;
14657 PL_reg_oldsavedcoffset = 0;
14658 PL_reg_maxiter = 0;
14659 PL_reg_leftiter = 0;
14660 PL_reg_poscache = NULL;
14661 PL_reg_poscache_size = 0;
14662 #ifdef PERL_OLD_COPY_ON_WRITE
14666 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14668 const REGEXP * const rx = PM_GETRE(PL_curpm);
14671 for (i = 1; i <= RX_NPARENS(rx); i++) {
14672 char digits[TYPE_CHARS(long)];
14673 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14674 GV *const *const gvp
14675 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14678 GV * const gv = *gvp;
14679 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14689 clear_re(pTHX_ void *r)
14692 ReREFCNT_dec((REGEXP *)r);
14698 S_put_byte(pTHX_ SV *sv, int c)
14700 PERL_ARGS_ASSERT_PUT_BYTE;
14702 /* Our definition of isPRINT() ignores locales, so only bytes that are
14703 not part of UTF-8 are considered printable. I assume that the same
14704 holds for UTF-EBCDIC.
14705 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14706 which Wikipedia says:
14708 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14709 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14710 identical, to the ASCII delete (DEL) or rubout control character.
14711 ) So the old condition can be simplified to !isPRINT(c) */
14714 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14717 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14721 const char string = c;
14722 if (c == '-' || c == ']' || c == '\\' || c == '^')
14723 sv_catpvs(sv, "\\");
14724 sv_catpvn(sv, &string, 1);
14729 #define CLEAR_OPTSTART \
14730 if (optstart) STMT_START { \
14731 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14735 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14737 STATIC const regnode *
14738 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14739 const regnode *last, const regnode *plast,
14740 SV* sv, I32 indent, U32 depth)
14743 U8 op = PSEUDO; /* Arbitrary non-END op. */
14744 const regnode *next;
14745 const regnode *optstart= NULL;
14747 RXi_GET_DECL(r,ri);
14748 GET_RE_DEBUG_FLAGS_DECL;
14750 PERL_ARGS_ASSERT_DUMPUNTIL;
14752 #ifdef DEBUG_DUMPUNTIL
14753 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14754 last ? last-start : 0,plast ? plast-start : 0);
14757 if (plast && plast < last)
14760 while (PL_regkind[op] != END && (!last || node < last)) {
14761 /* While that wasn't END last time... */
14764 if (op == CLOSE || op == WHILEM)
14766 next = regnext((regnode *)node);
14769 if (OP(node) == OPTIMIZED) {
14770 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14777 regprop(r, sv, node);
14778 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14779 (int)(2*indent + 1), "", SvPVX_const(sv));
14781 if (OP(node) != OPTIMIZED) {
14782 if (next == NULL) /* Next ptr. */
14783 PerlIO_printf(Perl_debug_log, " (0)");
14784 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14785 PerlIO_printf(Perl_debug_log, " (FAIL)");
14787 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14788 (void)PerlIO_putc(Perl_debug_log, '\n');
14792 if (PL_regkind[(U8)op] == BRANCHJ) {
14795 const regnode *nnode = (OP(next) == LONGJMP
14796 ? regnext((regnode *)next)
14798 if (last && nnode > last)
14800 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14803 else if (PL_regkind[(U8)op] == BRANCH) {
14805 DUMPUNTIL(NEXTOPER(node), next);
14807 else if ( PL_regkind[(U8)op] == TRIE ) {
14808 const regnode *this_trie = node;
14809 const char op = OP(node);
14810 const U32 n = ARG(node);
14811 const reg_ac_data * const ac = op>=AHOCORASICK ?
14812 (reg_ac_data *)ri->data->data[n] :
14814 const reg_trie_data * const trie =
14815 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14817 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14819 const regnode *nextbranch= NULL;
14822 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14823 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14825 PerlIO_printf(Perl_debug_log, "%*s%s ",
14826 (int)(2*(indent+3)), "",
14827 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14828 PL_colors[0], PL_colors[1],
14829 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14830 PERL_PV_PRETTY_ELLIPSES |
14831 PERL_PV_PRETTY_LTGT
14836 U16 dist= trie->jump[word_idx+1];
14837 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14838 (UV)((dist ? this_trie + dist : next) - start));
14841 nextbranch= this_trie + trie->jump[0];
14842 DUMPUNTIL(this_trie + dist, nextbranch);
14844 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14845 nextbranch= regnext((regnode *)nextbranch);
14847 PerlIO_printf(Perl_debug_log, "\n");
14850 if (last && next > last)
14855 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14856 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14857 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14859 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14861 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14863 else if ( op == PLUS || op == STAR) {
14864 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14866 else if (PL_regkind[(U8)op] == ANYOF) {
14867 /* arglen 1 + class block */
14868 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14869 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14870 node = NEXTOPER(node);
14872 else if (PL_regkind[(U8)op] == EXACT) {
14873 /* Literal string, where present. */
14874 node += NODE_SZ_STR(node) - 1;
14875 node = NEXTOPER(node);
14878 node = NEXTOPER(node);
14879 node += regarglen[(U8)op];
14881 if (op == CURLYX || op == OPEN)
14885 #ifdef DEBUG_DUMPUNTIL
14886 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14891 #endif /* DEBUGGING */
14895 * c-indentation-style: bsd
14896 * c-basic-offset: 4
14897 * indent-tabs-mode: nil
14900 * ex: set ts=8 sts=4 sw=4 et: