5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
122 typedef struct RExC_state_t {
123 U32 flags; /* RXf_* are we folding, multilining? */
124 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
125 char *precomp; /* uncompiled string. */
126 REGEXP *rx_sv; /* The SV that is the regexp. */
127 regexp *rx; /* perl core regexp structure */
128 regexp_internal *rxi; /* internal data for regexp object pprivate field */
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 I32 whilem_seen; /* number of WHILEM in this expr */
133 regnode *emit_start; /* Start of emitted-code area */
134 regnode *emit_bound; /* First regnode outside of the allocated space */
135 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
136 I32 naughty; /* How bad is this pattern? */
137 I32 sawback; /* Did we see \1, ...? */
139 I32 size; /* Code size. */
140 I32 npar; /* Capture buffer count, (OPEN). */
141 I32 cpar; /* Capture buffer count, (CLOSE). */
142 I32 nestroot; /* root parens we are in - used by accept */
145 regnode **open_parens; /* pointers to open parens */
146 regnode **close_parens; /* pointers to close parens */
147 regnode *opend; /* END node in program */
148 I32 utf8; /* whether the pattern is utf8 or not */
149 I32 orig_utf8; /* whether the pattern was originally in utf8 */
150 /* XXX use this for future optimisation of case
151 * where pattern must be upgraded to utf8. */
152 I32 uni_semantics; /* If a d charset modifier should use unicode
153 rules, even if the pattern is not in
155 HV *paren_names; /* Paren names */
157 regnode **recurse; /* Recurse regops */
158 I32 recurse_count; /* Number of recurse regops */
161 I32 override_recoding;
162 I32 in_multi_char_class;
163 struct reg_code_block *code_blocks; /* positions of literal (?{})
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
168 char *starttry; /* -Dr: where regtry was called. */
169 #define RExC_starttry (pRExC_state->starttry)
171 SV *runtime_code_qr; /* qr with the runtime code blocks */
173 const char *lastparse;
175 AV *paren_name_list; /* idx -> name */
176 #define RExC_lastparse (pRExC_state->lastparse)
177 #define RExC_lastnum (pRExC_state->lastnum)
178 #define RExC_paren_name_list (pRExC_state->paren_name_list)
182 #define RExC_flags (pRExC_state->flags)
183 #define RExC_pm_flags (pRExC_state->pm_flags)
184 #define RExC_precomp (pRExC_state->precomp)
185 #define RExC_rx_sv (pRExC_state->rx_sv)
186 #define RExC_rx (pRExC_state->rx)
187 #define RExC_rxi (pRExC_state->rxi)
188 #define RExC_start (pRExC_state->start)
189 #define RExC_end (pRExC_state->end)
190 #define RExC_parse (pRExC_state->parse)
191 #define RExC_whilem_seen (pRExC_state->whilem_seen)
192 #ifdef RE_TRACK_PATTERN_OFFSETS
193 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
195 #define RExC_emit (pRExC_state->emit)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_npar (pRExC_state->npar)
203 #define RExC_nestroot (pRExC_state->nestroot)
204 #define RExC_extralen (pRExC_state->extralen)
205 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
206 #define RExC_utf8 (pRExC_state->utf8)
207 #define RExC_uni_semantics (pRExC_state->uni_semantics)
208 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
209 #define RExC_open_parens (pRExC_state->open_parens)
210 #define RExC_close_parens (pRExC_state->close_parens)
211 #define RExC_opend (pRExC_state->opend)
212 #define RExC_paren_names (pRExC_state->paren_names)
213 #define RExC_recurse (pRExC_state->recurse)
214 #define RExC_recurse_count (pRExC_state->recurse_count)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_override_recoding (pRExC_state->override_recoding)
218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
221 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
222 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
223 ((*s) == '{' && regcurly(s)))
226 #undef SPSTART /* dratted cpp namespace... */
229 * Flags to be passed up and down.
231 #define WORST 0 /* Worst case. */
232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
235 * character. (There needs to be a case: in the switch statement in regexec.c
236 * for any node marked SIMPLE.) Note that this is not the same thing as
239 #define SPSTART 0x04 /* Starts with * or + */
240 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
241 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
243 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
245 /* whether trie related optimizations are enabled */
246 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
247 #define TRIE_STUDY_OPT
248 #define FULL_TRIE_STUDY
254 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
255 #define PBITVAL(paren) (1 << ((paren) & 7))
256 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
257 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
258 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
260 /* If not already in utf8, do a longjmp back to the beginning */
261 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
262 #define REQUIRE_UTF8 STMT_START { \
263 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
266 /* About scan_data_t.
268 During optimisation we recurse through the regexp program performing
269 various inplace (keyhole style) optimisations. In addition study_chunk
270 and scan_commit populate this data structure with information about
271 what strings MUST appear in the pattern. We look for the longest
272 string that must appear at a fixed location, and we look for the
273 longest string that may appear at a floating location. So for instance
278 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
279 strings (because they follow a .* construct). study_chunk will identify
280 both FOO and BAR as being the longest fixed and floating strings respectively.
282 The strings can be composites, for instance
286 will result in a composite fixed substring 'foo'.
288 For each string some basic information is maintained:
290 - offset or min_offset
291 This is the position the string must appear at, or not before.
292 It also implicitly (when combined with minlenp) tells us how many
293 characters must match before the string we are searching for.
294 Likewise when combined with minlenp and the length of the string it
295 tells us how many characters must appear after the string we have
299 Only used for floating strings. This is the rightmost point that
300 the string can appear at. If set to I32 max it indicates that the
301 string can occur infinitely far to the right.
304 A pointer to the minimum number of characters of the pattern that the
305 string was found inside. This is important as in the case of positive
306 lookahead or positive lookbehind we can have multiple patterns
311 The minimum length of the pattern overall is 3, the minimum length
312 of the lookahead part is 3, but the minimum length of the part that
313 will actually match is 1. So 'FOO's minimum length is 3, but the
314 minimum length for the F is 1. This is important as the minimum length
315 is used to determine offsets in front of and behind the string being
316 looked for. Since strings can be composites this is the length of the
317 pattern at the time it was committed with a scan_commit. Note that
318 the length is calculated by study_chunk, so that the minimum lengths
319 are not known until the full pattern has been compiled, thus the
320 pointer to the value.
324 In the case of lookbehind the string being searched for can be
325 offset past the start point of the final matching string.
326 If this value was just blithely removed from the min_offset it would
327 invalidate some of the calculations for how many chars must match
328 before or after (as they are derived from min_offset and minlen and
329 the length of the string being searched for).
330 When the final pattern is compiled and the data is moved from the
331 scan_data_t structure into the regexp structure the information
332 about lookbehind is factored in, with the information that would
333 have been lost precalculated in the end_shift field for the
336 The fields pos_min and pos_delta are used to store the minimum offset
337 and the delta to the maximum offset at the current point in the pattern.
341 typedef struct scan_data_t {
342 /*I32 len_min; unused */
343 /*I32 len_delta; unused */
347 I32 last_end; /* min value, <0 unless valid. */
350 SV **longest; /* Either &l_fixed, or &l_float. */
351 SV *longest_fixed; /* longest fixed string found in pattern */
352 I32 offset_fixed; /* offset where it starts */
353 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
354 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
355 SV *longest_float; /* longest floating string found in pattern */
356 I32 offset_float_min; /* earliest point in string it can appear */
357 I32 offset_float_max; /* latest point in string it can appear */
358 I32 *minlen_float; /* pointer to the minlen relevant to the string */
359 I32 lookbehind_float; /* is the position of the string modified by LB */
363 struct regnode_charclass_class *start_class;
367 * Forward declarations for pregcomp()'s friends.
370 static const scan_data_t zero_scan_data =
371 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
373 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
374 #define SF_BEFORE_SEOL 0x0001
375 #define SF_BEFORE_MEOL 0x0002
376 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
377 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
380 # define SF_FIX_SHIFT_EOL (0+2)
381 # define SF_FL_SHIFT_EOL (0+4)
383 # define SF_FIX_SHIFT_EOL (+2)
384 # define SF_FL_SHIFT_EOL (+4)
387 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
388 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
390 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
391 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
392 #define SF_IS_INF 0x0040
393 #define SF_HAS_PAR 0x0080
394 #define SF_IN_PAR 0x0100
395 #define SF_HAS_EVAL 0x0200
396 #define SCF_DO_SUBSTR 0x0400
397 #define SCF_DO_STCLASS_AND 0x0800
398 #define SCF_DO_STCLASS_OR 0x1000
399 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
400 #define SCF_WHILEM_VISITED_POS 0x2000
402 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
403 #define SCF_SEEN_ACCEPT 0x8000
405 #define UTF cBOOL(RExC_utf8)
407 /* The enums for all these are ordered so things work out correctly */
408 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
409 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
410 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
411 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
412 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
413 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
414 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
416 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
418 #define OOB_NAMEDCLASS -1
420 /* There is no code point that is out-of-bounds, so this is problematic. But
421 * its only current use is to initialize a variable that is always set before
423 #define OOB_UNICODE 0xDEADBEEF
425 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
426 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
429 /* length of regex to show in messages that don't mark a position within */
430 #define RegexLengthToShowInErrorMessages 127
433 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
434 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
435 * op/pragma/warn/regcomp.
437 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
438 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
440 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
443 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
444 * arg. Show regex, up to a maximum length. If it's too long, chop and add
447 #define _FAIL(code) STMT_START { \
448 const char *ellipses = ""; \
449 IV len = RExC_end - RExC_precomp; \
452 SAVEFREESV(RExC_rx_sv); \
453 if (len > RegexLengthToShowInErrorMessages) { \
454 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
455 len = RegexLengthToShowInErrorMessages - 10; \
461 #define FAIL(msg) _FAIL( \
462 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
463 msg, (int)len, RExC_precomp, ellipses))
465 #define FAIL2(msg,arg) _FAIL( \
466 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
467 arg, (int)len, RExC_precomp, ellipses))
470 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
472 #define Simple_vFAIL(m) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
475 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
481 #define vFAIL(m) STMT_START { \
483 SAVEFREESV(RExC_rx_sv); \
488 * Like Simple_vFAIL(), but accepts two arguments.
490 #define Simple_vFAIL2(m,a1) STMT_START { \
491 const IV offset = RExC_parse - RExC_precomp; \
492 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
493 (int)offset, RExC_precomp, RExC_precomp + offset); \
497 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
499 #define vFAIL2(m,a1) STMT_START { \
501 SAVEFREESV(RExC_rx_sv); \
502 Simple_vFAIL2(m, a1); \
507 * Like Simple_vFAIL(), but accepts three arguments.
509 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
510 const IV offset = RExC_parse - RExC_precomp; \
511 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
512 (int)offset, RExC_precomp, RExC_precomp + offset); \
516 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
518 #define vFAIL3(m,a1,a2) STMT_START { \
520 SAVEFREESV(RExC_rx_sv); \
521 Simple_vFAIL3(m, a1, a2); \
525 * Like Simple_vFAIL(), but accepts four arguments.
527 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
528 const IV offset = RExC_parse - RExC_precomp; \
529 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
530 (int)offset, RExC_precomp, RExC_precomp + offset); \
533 #define ckWARNreg(loc,m) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 (int)offset, RExC_precomp, RExC_precomp + offset); \
539 #define ckWARNregdep(loc,m) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
543 (int)offset, RExC_precomp, RExC_precomp + offset); \
546 #define ckWARN2regdep(loc,m, a1) STMT_START { \
547 const IV offset = loc - RExC_precomp; \
548 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
550 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
553 #define ckWARN2reg(loc, m, a1) STMT_START { \
554 const IV offset = loc - RExC_precomp; \
555 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
559 #define vWARN3(loc, m, a1, a2) STMT_START { \
560 const IV offset = loc - RExC_precomp; \
561 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
562 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
566 const IV offset = loc - RExC_precomp; \
567 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
568 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
574 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
577 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
578 const IV offset = loc - RExC_precomp; \
579 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
580 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
583 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
584 const IV offset = loc - RExC_precomp; \
585 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
586 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
590 /* Allow for side effects in s */
591 #define REGC(c,s) STMT_START { \
592 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
595 /* Macros for recording node offsets. 20001227 mjd@plover.com
596 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
597 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
598 * Element 0 holds the number n.
599 * Position is 1 indexed.
601 #ifndef RE_TRACK_PATTERN_OFFSETS
602 #define Set_Node_Offset_To_R(node,byte)
603 #define Set_Node_Offset(node,byte)
604 #define Set_Cur_Node_Offset
605 #define Set_Node_Length_To_R(node,len)
606 #define Set_Node_Length(node,len)
607 #define Set_Node_Cur_Length(node)
608 #define Node_Offset(n)
609 #define Node_Length(n)
610 #define Set_Node_Offset_Length(node,offset,len)
611 #define ProgLen(ri) ri->u.proglen
612 #define SetProgLen(ri,x) ri->u.proglen = x
614 #define ProgLen(ri) ri->u.offsets[0]
615 #define SetProgLen(ri,x) ri->u.offsets[0] = x
616 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
618 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
619 __LINE__, (int)(node), (int)(byte))); \
621 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
623 RExC_offsets[2*(node)-1] = (byte); \
628 #define Set_Node_Offset(node,byte) \
629 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
630 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
632 #define Set_Node_Length_To_R(node,len) STMT_START { \
634 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
635 __LINE__, (int)(node), (int)(len))); \
637 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
639 RExC_offsets[2*(node)] = (len); \
644 #define Set_Node_Length(node,len) \
645 Set_Node_Length_To_R((node)-RExC_emit_start, len)
646 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
647 #define Set_Node_Cur_Length(node) \
648 Set_Node_Length(node, RExC_parse - parse_start)
650 /* Get offsets and lengths */
651 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
652 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
654 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
655 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
656 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
660 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
661 #define EXPERIMENTAL_INPLACESCAN
662 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
664 #define DEBUG_STUDYDATA(str,data,depth) \
665 DEBUG_OPTIMISE_MORE_r(if(data){ \
666 PerlIO_printf(Perl_debug_log, \
667 "%*s" str "Pos:%"IVdf"/%"IVdf \
668 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
669 (int)(depth)*2, "", \
670 (IV)((data)->pos_min), \
671 (IV)((data)->pos_delta), \
672 (UV)((data)->flags), \
673 (IV)((data)->whilem_c), \
674 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
675 is_inf ? "INF " : "" \
677 if ((data)->last_found) \
678 PerlIO_printf(Perl_debug_log, \
679 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
680 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
681 SvPVX_const((data)->last_found), \
682 (IV)((data)->last_end), \
683 (IV)((data)->last_start_min), \
684 (IV)((data)->last_start_max), \
685 ((data)->longest && \
686 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
687 SvPVX_const((data)->longest_fixed), \
688 (IV)((data)->offset_fixed), \
689 ((data)->longest && \
690 (data)->longest==&((data)->longest_float)) ? "*" : "", \
691 SvPVX_const((data)->longest_float), \
692 (IV)((data)->offset_float_min), \
693 (IV)((data)->offset_float_max) \
695 PerlIO_printf(Perl_debug_log,"\n"); \
698 /* Mark that we cannot extend a found fixed substring at this point.
699 Update the longest found anchored substring and the longest found
700 floating substrings if needed. */
703 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
705 const STRLEN l = CHR_SVLEN(data->last_found);
706 const STRLEN old_l = CHR_SVLEN(*data->longest);
707 GET_RE_DEBUG_FLAGS_DECL;
709 PERL_ARGS_ASSERT_SCAN_COMMIT;
711 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
712 SvSetMagicSV(*data->longest, data->last_found);
713 if (*data->longest == data->longest_fixed) {
714 data->offset_fixed = l ? data->last_start_min : data->pos_min;
715 if (data->flags & SF_BEFORE_EOL)
717 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
719 data->flags &= ~SF_FIX_BEFORE_EOL;
720 data->minlen_fixed=minlenp;
721 data->lookbehind_fixed=0;
723 else { /* *data->longest == data->longest_float */
724 data->offset_float_min = l ? data->last_start_min : data->pos_min;
725 data->offset_float_max = (l
726 ? data->last_start_max
727 : data->pos_min + data->pos_delta);
728 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
729 data->offset_float_max = I32_MAX;
730 if (data->flags & SF_BEFORE_EOL)
732 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
734 data->flags &= ~SF_FL_BEFORE_EOL;
735 data->minlen_float=minlenp;
736 data->lookbehind_float=0;
739 SvCUR_set(data->last_found, 0);
741 SV * const sv = data->last_found;
742 if (SvUTF8(sv) && SvMAGICAL(sv)) {
743 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
749 data->flags &= ~SF_BEFORE_EOL;
750 DEBUG_STUDYDATA("commit: ",data,0);
753 /* Can match anything (initialization) */
755 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
757 PERL_ARGS_ASSERT_CL_ANYTHING;
759 ANYOF_BITMAP_SETALL(cl);
760 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
761 |ANYOF_NON_UTF8_LATIN1_ALL;
763 /* If any portion of the regex is to operate under locale rules,
764 * initialization includes it. The reason this isn't done for all regexes
765 * is that the optimizer was written under the assumption that locale was
766 * all-or-nothing. Given the complexity and lack of documentation in the
767 * optimizer, and that there are inadequate test cases for locale, so many
768 * parts of it may not work properly, it is safest to avoid locale unless
770 if (RExC_contains_locale) {
771 ANYOF_CLASS_SETALL(cl); /* /l uses class */
772 cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
775 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
779 /* Can match anything (initialization) */
781 S_cl_is_anything(const struct regnode_charclass_class *cl)
785 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
787 for (value = 0; value <= ANYOF_MAX; value += 2)
788 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
790 if (!(cl->flags & ANYOF_UNICODE_ALL))
792 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
797 /* Can match anything (initialization) */
799 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
801 PERL_ARGS_ASSERT_CL_INIT;
803 Zero(cl, 1, struct regnode_charclass_class);
805 cl_anything(pRExC_state, cl);
806 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
809 /* These two functions currently do the exact same thing */
810 #define cl_init_zero S_cl_init
812 /* 'AND' a given class with another one. Can create false positives. 'cl'
813 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
814 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
816 S_cl_and(struct regnode_charclass_class *cl,
817 const struct regnode_charclass_class *and_with)
819 PERL_ARGS_ASSERT_CL_AND;
821 assert(and_with->type == ANYOF);
823 /* I (khw) am not sure all these restrictions are necessary XXX */
824 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
825 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
826 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
827 && !(and_with->flags & ANYOF_LOC_FOLD)
828 && !(cl->flags & ANYOF_LOC_FOLD)) {
831 if (and_with->flags & ANYOF_INVERT)
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] &= ~and_with->bitmap[i];
835 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
836 cl->bitmap[i] &= and_with->bitmap[i];
837 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
839 if (and_with->flags & ANYOF_INVERT) {
841 /* Here, the and'ed node is inverted. Get the AND of the flags that
842 * aren't affected by the inversion. Those that are affected are
843 * handled individually below */
844 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
845 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
846 cl->flags |= affected_flags;
848 /* We currently don't know how to deal with things that aren't in the
849 * bitmap, but we know that the intersection is no greater than what
850 * is already in cl, so let there be false positives that get sorted
851 * out after the synthetic start class succeeds, and the node is
852 * matched for real. */
854 /* The inversion of these two flags indicate that the resulting
855 * intersection doesn't have them */
856 if (and_with->flags & ANYOF_UNICODE_ALL) {
857 cl->flags &= ~ANYOF_UNICODE_ALL;
859 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
860 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
863 else { /* and'd node is not inverted */
864 U8 outside_bitmap_but_not_utf8; /* Temp variable */
866 if (! ANYOF_NONBITMAP(and_with)) {
868 /* Here 'and_with' doesn't match anything outside the bitmap
869 * (except possibly ANYOF_UNICODE_ALL), which means the
870 * intersection can't either, except for ANYOF_UNICODE_ALL, in
871 * which case we don't know what the intersection is, but it's no
872 * greater than what cl already has, so can just leave it alone,
873 * with possible false positives */
874 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
875 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
876 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
879 else if (! ANYOF_NONBITMAP(cl)) {
881 /* Here, 'and_with' does match something outside the bitmap, and cl
882 * doesn't have a list of things to match outside the bitmap. If
883 * cl can match all code points above 255, the intersection will
884 * be those above-255 code points that 'and_with' matches. If cl
885 * can't match all Unicode code points, it means that it can't
886 * match anything outside the bitmap (since the 'if' that got us
887 * into this block tested for that), so we leave the bitmap empty.
889 if (cl->flags & ANYOF_UNICODE_ALL) {
890 ARG_SET(cl, ARG(and_with));
892 /* and_with's ARG may match things that don't require UTF8.
893 * And now cl's will too, in spite of this being an 'and'. See
894 * the comments below about the kludge */
895 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
899 /* Here, both 'and_with' and cl match something outside the
900 * bitmap. Currently we do not do the intersection, so just match
901 * whatever cl had at the beginning. */
905 /* Take the intersection of the two sets of flags. However, the
906 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
907 * kludge around the fact that this flag is not treated like the others
908 * which are initialized in cl_anything(). The way the optimizer works
909 * is that the synthetic start class (SSC) is initialized to match
910 * anything, and then the first time a real node is encountered, its
911 * values are AND'd with the SSC's with the result being the values of
912 * the real node. However, there are paths through the optimizer where
913 * the AND never gets called, so those initialized bits are set
914 * inappropriately, which is not usually a big deal, as they just cause
915 * false positives in the SSC, which will just mean a probably
916 * imperceptible slow down in execution. However this bit has a
917 * higher false positive consequence in that it can cause utf8.pm,
918 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
919 * bigger slowdown and also causes significant extra memory to be used.
920 * In order to prevent this, the code now takes a different tack. The
921 * bit isn't set unless some part of the regular expression needs it,
922 * but once set it won't get cleared. This means that these extra
923 * modules won't get loaded unless there was some path through the
924 * pattern that would have required them anyway, and so any false
925 * positives that occur by not ANDing them out when they could be
926 * aren't as severe as they would be if we treated this bit like all
928 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
929 & ANYOF_NONBITMAP_NON_UTF8;
930 cl->flags &= and_with->flags;
931 cl->flags |= outside_bitmap_but_not_utf8;
935 /* 'OR' a given class with another one. Can create false positives. 'cl'
936 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
937 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
939 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
941 PERL_ARGS_ASSERT_CL_OR;
943 if (or_with->flags & ANYOF_INVERT) {
945 /* Here, the or'd node is to be inverted. This means we take the
946 * complement of everything not in the bitmap, but currently we don't
947 * know what that is, so give up and match anything */
948 if (ANYOF_NONBITMAP(or_with)) {
949 cl_anything(pRExC_state, cl);
952 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
953 * <= (B1 | !B2) | (CL1 | !CL2)
954 * which is wasteful if CL2 is small, but we ignore CL2:
955 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
956 * XXXX Can we handle case-fold? Unclear:
957 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
958 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
960 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
961 && !(or_with->flags & ANYOF_LOC_FOLD)
962 && !(cl->flags & ANYOF_LOC_FOLD) ) {
965 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
966 cl->bitmap[i] |= ~or_with->bitmap[i];
967 } /* XXXX: logic is complicated otherwise */
969 cl_anything(pRExC_state, cl);
972 /* And, we can just take the union of the flags that aren't affected
973 * by the inversion */
974 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
976 /* For the remaining flags:
977 ANYOF_UNICODE_ALL and inverted means to not match anything above
978 255, which means that the union with cl should just be
979 what cl has in it, so can ignore this flag
980 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
981 is 127-255 to match them, but then invert that, so the
982 union with cl should just be what cl has in it, so can
985 } else { /* 'or_with' is not inverted */
986 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
987 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
988 && (!(or_with->flags & ANYOF_LOC_FOLD)
989 || (cl->flags & ANYOF_LOC_FOLD)) ) {
992 /* OR char bitmap and class bitmap separately */
993 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
994 cl->bitmap[i] |= or_with->bitmap[i];
995 ANYOF_CLASS_OR(or_with, cl);
997 else { /* XXXX: logic is complicated, leave it along for a moment. */
998 cl_anything(pRExC_state, cl);
1001 if (ANYOF_NONBITMAP(or_with)) {
1003 /* Use the added node's outside-the-bit-map match if there isn't a
1004 * conflict. If there is a conflict (both nodes match something
1005 * outside the bitmap, but what they match outside is not the same
1006 * pointer, and hence not easily compared until XXX we extend
1007 * inversion lists this far), give up and allow the start class to
1008 * match everything outside the bitmap. If that stuff is all above
1009 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1010 if (! ANYOF_NONBITMAP(cl)) {
1011 ARG_SET(cl, ARG(or_with));
1013 else if (ARG(cl) != ARG(or_with)) {
1015 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1016 cl_anything(pRExC_state, cl);
1019 cl->flags |= ANYOF_UNICODE_ALL;
1024 /* Take the union */
1025 cl->flags |= or_with->flags;
1029 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1030 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1031 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1032 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1037 dump_trie(trie,widecharmap,revcharmap)
1038 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1039 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1041 These routines dump out a trie in a somewhat readable format.
1042 The _interim_ variants are used for debugging the interim
1043 tables that are used to generate the final compressed
1044 representation which is what dump_trie expects.
1046 Part of the reason for their existence is to provide a form
1047 of documentation as to how the different representations function.
1052 Dumps the final compressed table form of the trie to Perl_debug_log.
1053 Used for debugging make_trie().
1057 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1058 AV *revcharmap, U32 depth)
1061 SV *sv=sv_newmortal();
1062 int colwidth= widecharmap ? 6 : 4;
1064 GET_RE_DEBUG_FLAGS_DECL;
1066 PERL_ARGS_ASSERT_DUMP_TRIE;
1068 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1069 (int)depth * 2 + 2,"",
1070 "Match","Base","Ofs" );
1072 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1073 SV ** const tmp = av_fetch( revcharmap, state, 0);
1075 PerlIO_printf( Perl_debug_log, "%*s",
1077 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1078 PL_colors[0], PL_colors[1],
1079 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1080 PERL_PV_ESCAPE_FIRSTCHAR
1085 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1086 (int)depth * 2 + 2,"");
1088 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1089 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1090 PerlIO_printf( Perl_debug_log, "\n");
1092 for( state = 1 ; state < trie->statecount ; state++ ) {
1093 const U32 base = trie->states[ state ].trans.base;
1095 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1097 if ( trie->states[ state ].wordnum ) {
1098 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1100 PerlIO_printf( Perl_debug_log, "%6s", "" );
1103 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1108 while( ( base + ofs < trie->uniquecharcount ) ||
1109 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1110 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1113 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1115 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1116 if ( ( base + ofs >= trie->uniquecharcount ) &&
1117 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1118 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1120 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1122 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1124 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1128 PerlIO_printf( Perl_debug_log, "]");
1131 PerlIO_printf( Perl_debug_log, "\n" );
1133 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1134 for (word=1; word <= trie->wordcount; word++) {
1135 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1136 (int)word, (int)(trie->wordinfo[word].prev),
1137 (int)(trie->wordinfo[word].len));
1139 PerlIO_printf(Perl_debug_log, "\n" );
1142 Dumps a fully constructed but uncompressed trie in list form.
1143 List tries normally only are used for construction when the number of
1144 possible chars (trie->uniquecharcount) is very high.
1145 Used for debugging make_trie().
1148 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1149 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1153 SV *sv=sv_newmortal();
1154 int colwidth= widecharmap ? 6 : 4;
1155 GET_RE_DEBUG_FLAGS_DECL;
1157 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1159 /* print out the table precompression. */
1160 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1161 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1162 "------:-----+-----------------\n" );
1164 for( state=1 ; state < next_alloc ; state ++ ) {
1167 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1168 (int)depth * 2 + 2,"", (UV)state );
1169 if ( ! trie->states[ state ].wordnum ) {
1170 PerlIO_printf( Perl_debug_log, "%5s| ","");
1172 PerlIO_printf( Perl_debug_log, "W%4x| ",
1173 trie->states[ state ].wordnum
1176 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1177 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1179 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1181 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1182 PL_colors[0], PL_colors[1],
1183 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1184 PERL_PV_ESCAPE_FIRSTCHAR
1186 TRIE_LIST_ITEM(state,charid).forid,
1187 (UV)TRIE_LIST_ITEM(state,charid).newstate
1190 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1191 (int)((depth * 2) + 14), "");
1194 PerlIO_printf( Perl_debug_log, "\n");
1199 Dumps a fully constructed but uncompressed trie in table form.
1200 This is the normal DFA style state transition table, with a few
1201 twists to facilitate compression later.
1202 Used for debugging make_trie().
1205 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1206 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1211 SV *sv=sv_newmortal();
1212 int colwidth= widecharmap ? 6 : 4;
1213 GET_RE_DEBUG_FLAGS_DECL;
1215 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1218 print out the table precompression so that we can do a visual check
1219 that they are identical.
1222 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1224 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1225 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1227 PerlIO_printf( Perl_debug_log, "%*s",
1229 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1230 PL_colors[0], PL_colors[1],
1231 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1232 PERL_PV_ESCAPE_FIRSTCHAR
1238 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1240 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1241 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1244 PerlIO_printf( Perl_debug_log, "\n" );
1246 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1248 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1249 (int)depth * 2 + 2,"",
1250 (UV)TRIE_NODENUM( state ) );
1252 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1253 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1255 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1257 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1259 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1260 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1262 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1263 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1271 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1272 startbranch: the first branch in the whole branch sequence
1273 first : start branch of sequence of branch-exact nodes.
1274 May be the same as startbranch
1275 last : Thing following the last branch.
1276 May be the same as tail.
1277 tail : item following the branch sequence
1278 count : words in the sequence
1279 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1280 depth : indent depth
1282 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1284 A trie is an N'ary tree where the branches are determined by digital
1285 decomposition of the key. IE, at the root node you look up the 1st character and
1286 follow that branch repeat until you find the end of the branches. Nodes can be
1287 marked as "accepting" meaning they represent a complete word. Eg:
1291 would convert into the following structure. Numbers represent states, letters
1292 following numbers represent valid transitions on the letter from that state, if
1293 the number is in square brackets it represents an accepting state, otherwise it
1294 will be in parenthesis.
1296 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1300 (1) +-i->(6)-+-s->[7]
1302 +-s->(3)-+-h->(4)-+-e->[5]
1304 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1306 This shows that when matching against the string 'hers' we will begin at state 1
1307 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1308 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1309 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1310 single traverse. We store a mapping from accepting to state to which word was
1311 matched, and then when we have multiple possibilities we try to complete the
1312 rest of the regex in the order in which they occured in the alternation.
1314 The only prior NFA like behaviour that would be changed by the TRIE support is
1315 the silent ignoring of duplicate alternations which are of the form:
1317 / (DUPE|DUPE) X? (?{ ... }) Y /x
1319 Thus EVAL blocks following a trie may be called a different number of times with
1320 and without the optimisation. With the optimisations dupes will be silently
1321 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1322 the following demonstrates:
1324 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1326 which prints out 'word' three times, but
1328 'words'=~/(word|word|word)(?{ print $1 })S/
1330 which doesnt print it out at all. This is due to other optimisations kicking in.
1332 Example of what happens on a structural level:
1334 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1336 1: CURLYM[1] {1,32767}(18)
1347 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1348 and should turn into:
1350 1: CURLYM[1] {1,32767}(18)
1352 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1360 Cases where tail != last would be like /(?foo|bar)baz/:
1370 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1371 and would end up looking like:
1374 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1381 d = uvuni_to_utf8_flags(d, uv, 0);
1383 is the recommended Unicode-aware way of saying
1388 #define TRIE_STORE_REVCHAR(val) \
1391 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1392 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1393 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1394 SvCUR_set(zlopp, kapow - flrbbbbb); \
1397 av_push(revcharmap, zlopp); \
1399 char ooooff = (char)val; \
1400 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1404 #define TRIE_READ_CHAR STMT_START { \
1407 /* if it is UTF then it is either already folded, or does not need folding */ \
1408 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1410 else if (folder == PL_fold_latin1) { \
1411 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1412 if ( foldlen > 0 ) { \
1413 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1419 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1420 skiplen = UNISKIP(uvc); \
1421 foldlen -= skiplen; \
1422 scan = foldbuf + skiplen; \
1425 /* raw data, will be folded later if needed */ \
1433 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1434 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1435 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1436 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1438 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1439 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1440 TRIE_LIST_CUR( state )++; \
1443 #define TRIE_LIST_NEW(state) STMT_START { \
1444 Newxz( trie->states[ state ].trans.list, \
1445 4, reg_trie_trans_le ); \
1446 TRIE_LIST_CUR( state ) = 1; \
1447 TRIE_LIST_LEN( state ) = 4; \
1450 #define TRIE_HANDLE_WORD(state) STMT_START { \
1451 U16 dupe= trie->states[ state ].wordnum; \
1452 regnode * const noper_next = regnext( noper ); \
1455 /* store the word for dumping */ \
1457 if (OP(noper) != NOTHING) \
1458 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1460 tmp = newSVpvn_utf8( "", 0, UTF ); \
1461 av_push( trie_words, tmp ); \
1465 trie->wordinfo[curword].prev = 0; \
1466 trie->wordinfo[curword].len = wordlen; \
1467 trie->wordinfo[curword].accept = state; \
1469 if ( noper_next < tail ) { \
1471 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1472 trie->jump[curword] = (U16)(noper_next - convert); \
1474 jumper = noper_next; \
1476 nextbranch= regnext(cur); \
1480 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1481 /* chain, so that when the bits of chain are later */\
1482 /* linked together, the dups appear in the chain */\
1483 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1484 trie->wordinfo[dupe].prev = curword; \
1486 /* we haven't inserted this word yet. */ \
1487 trie->states[ state ].wordnum = curword; \
1492 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1493 ( ( base + charid >= ucharcount \
1494 && base + charid < ubound \
1495 && state == trie->trans[ base - ucharcount + charid ].check \
1496 && trie->trans[ base - ucharcount + charid ].next ) \
1497 ? trie->trans[ base - ucharcount + charid ].next \
1498 : ( state==1 ? special : 0 ) \
1502 #define MADE_JUMP_TRIE 2
1503 #define MADE_EXACT_TRIE 4
1506 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1509 /* first pass, loop through and scan words */
1510 reg_trie_data *trie;
1511 HV *widecharmap = NULL;
1512 AV *revcharmap = newAV();
1514 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1519 regnode *jumper = NULL;
1520 regnode *nextbranch = NULL;
1521 regnode *convert = NULL;
1522 U32 *prev_states; /* temp array mapping each state to previous one */
1523 /* we just use folder as a flag in utf8 */
1524 const U8 * folder = NULL;
1527 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1528 AV *trie_words = NULL;
1529 /* along with revcharmap, this only used during construction but both are
1530 * useful during debugging so we store them in the struct when debugging.
1533 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1534 STRLEN trie_charcount=0;
1536 SV *re_trie_maxbuff;
1537 GET_RE_DEBUG_FLAGS_DECL;
1539 PERL_ARGS_ASSERT_MAKE_TRIE;
1541 PERL_UNUSED_ARG(depth);
1548 case EXACTFU_TRICKYFOLD:
1549 case EXACTFU: folder = PL_fold_latin1; break;
1550 case EXACTF: folder = PL_fold; break;
1551 case EXACTFL: folder = PL_fold_locale; break;
1552 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1555 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1557 trie->startstate = 1;
1558 trie->wordcount = word_count;
1559 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1560 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1562 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1563 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1564 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1567 trie_words = newAV();
1570 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1571 if (!SvIOK(re_trie_maxbuff)) {
1572 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1574 DEBUG_TRIE_COMPILE_r({
1575 PerlIO_printf( Perl_debug_log,
1576 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1577 (int)depth * 2 + 2, "",
1578 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1579 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1583 /* Find the node we are going to overwrite */
1584 if ( first == startbranch && OP( last ) != BRANCH ) {
1585 /* whole branch chain */
1588 /* branch sub-chain */
1589 convert = NEXTOPER( first );
1592 /* -- First loop and Setup --
1594 We first traverse the branches and scan each word to determine if it
1595 contains widechars, and how many unique chars there are, this is
1596 important as we have to build a table with at least as many columns as we
1599 We use an array of integers to represent the character codes 0..255
1600 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1601 native representation of the character value as the key and IV's for the
1604 *TODO* If we keep track of how many times each character is used we can
1605 remap the columns so that the table compression later on is more
1606 efficient in terms of memory by ensuring the most common value is in the
1607 middle and the least common are on the outside. IMO this would be better
1608 than a most to least common mapping as theres a decent chance the most
1609 common letter will share a node with the least common, meaning the node
1610 will not be compressible. With a middle is most common approach the worst
1611 case is when we have the least common nodes twice.
1615 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1616 regnode *noper = NEXTOPER( cur );
1617 const U8 *uc = (U8*)STRING( noper );
1618 const U8 *e = uc + STR_LEN( noper );
1620 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1622 const U8 *scan = (U8*)NULL;
1623 U32 wordlen = 0; /* required init */
1625 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1627 if (OP(noper) == NOTHING) {
1628 regnode *noper_next= regnext(noper);
1629 if (noper_next != tail && OP(noper_next) == flags) {
1631 uc= (U8*)STRING(noper);
1632 e= uc + STR_LEN(noper);
1633 trie->minlen= STR_LEN(noper);
1640 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1641 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1642 regardless of encoding */
1643 if (OP( noper ) == EXACTFU_SS) {
1644 /* false positives are ok, so just set this */
1645 TRIE_BITMAP_SET(trie,0xDF);
1648 for ( ; uc < e ; uc += len ) {
1649 TRIE_CHARCOUNT(trie)++;
1654 U8 folded= folder[ (U8) uvc ];
1655 if ( !trie->charmap[ folded ] ) {
1656 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1657 TRIE_STORE_REVCHAR( folded );
1660 if ( !trie->charmap[ uvc ] ) {
1661 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1662 TRIE_STORE_REVCHAR( uvc );
1665 /* store the codepoint in the bitmap, and its folded
1667 TRIE_BITMAP_SET(trie, uvc);
1669 /* store the folded codepoint */
1670 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1673 /* store first byte of utf8 representation of
1674 variant codepoints */
1675 if (! UNI_IS_INVARIANT(uvc)) {
1676 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1679 set_bit = 0; /* We've done our bit :-) */
1684 widecharmap = newHV();
1686 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1689 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1691 if ( !SvTRUE( *svpp ) ) {
1692 sv_setiv( *svpp, ++trie->uniquecharcount );
1693 TRIE_STORE_REVCHAR(uvc);
1697 if( cur == first ) {
1698 trie->minlen = chars;
1699 trie->maxlen = chars;
1700 } else if (chars < trie->minlen) {
1701 trie->minlen = chars;
1702 } else if (chars > trie->maxlen) {
1703 trie->maxlen = chars;
1705 if (OP( noper ) == EXACTFU_SS) {
1706 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1707 if (trie->minlen > 1)
1710 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1711 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1712 * - We assume that any such sequence might match a 2 byte string */
1713 if (trie->minlen > 2 )
1717 } /* end first pass */
1718 DEBUG_TRIE_COMPILE_r(
1719 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1720 (int)depth * 2 + 2,"",
1721 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1722 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1723 (int)trie->minlen, (int)trie->maxlen )
1727 We now know what we are dealing with in terms of unique chars and
1728 string sizes so we can calculate how much memory a naive
1729 representation using a flat table will take. If it's over a reasonable
1730 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1731 conservative but potentially much slower representation using an array
1734 At the end we convert both representations into the same compressed
1735 form that will be used in regexec.c for matching with. The latter
1736 is a form that cannot be used to construct with but has memory
1737 properties similar to the list form and access properties similar
1738 to the table form making it both suitable for fast searches and
1739 small enough that its feasable to store for the duration of a program.
1741 See the comment in the code where the compressed table is produced
1742 inplace from the flat tabe representation for an explanation of how
1743 the compression works.
1748 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1751 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1753 Second Pass -- Array Of Lists Representation
1755 Each state will be represented by a list of charid:state records
1756 (reg_trie_trans_le) the first such element holds the CUR and LEN
1757 points of the allocated array. (See defines above).
1759 We build the initial structure using the lists, and then convert
1760 it into the compressed table form which allows faster lookups
1761 (but cant be modified once converted).
1764 STRLEN transcount = 1;
1766 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1767 "%*sCompiling trie using list compiler\n",
1768 (int)depth * 2 + 2, ""));
1770 trie->states = (reg_trie_state *)
1771 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1772 sizeof(reg_trie_state) );
1776 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1778 regnode *noper = NEXTOPER( cur );
1779 U8 *uc = (U8*)STRING( noper );
1780 const U8 *e = uc + STR_LEN( noper );
1781 U32 state = 1; /* required init */
1782 U16 charid = 0; /* sanity init */
1783 U8 *scan = (U8*)NULL; /* sanity init */
1784 STRLEN foldlen = 0; /* required init */
1785 U32 wordlen = 0; /* required init */
1786 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1789 if (OP(noper) == NOTHING) {
1790 regnode *noper_next= regnext(noper);
1791 if (noper_next != tail && OP(noper_next) == flags) {
1793 uc= (U8*)STRING(noper);
1794 e= uc + STR_LEN(noper);
1798 if (OP(noper) != NOTHING) {
1799 for ( ; uc < e ; uc += len ) {
1804 charid = trie->charmap[ uvc ];
1806 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1810 charid=(U16)SvIV( *svpp );
1813 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1820 if ( !trie->states[ state ].trans.list ) {
1821 TRIE_LIST_NEW( state );
1823 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1824 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1825 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1830 newstate = next_alloc++;
1831 prev_states[newstate] = state;
1832 TRIE_LIST_PUSH( state, charid, newstate );
1837 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1841 TRIE_HANDLE_WORD(state);
1843 } /* end second pass */
1845 /* next alloc is the NEXT state to be allocated */
1846 trie->statecount = next_alloc;
1847 trie->states = (reg_trie_state *)
1848 PerlMemShared_realloc( trie->states,
1850 * sizeof(reg_trie_state) );
1852 /* and now dump it out before we compress it */
1853 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1854 revcharmap, next_alloc,
1858 trie->trans = (reg_trie_trans *)
1859 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1866 for( state=1 ; state < next_alloc ; state ++ ) {
1870 DEBUG_TRIE_COMPILE_MORE_r(
1871 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1875 if (trie->states[state].trans.list) {
1876 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1880 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1881 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1882 if ( forid < minid ) {
1884 } else if ( forid > maxid ) {
1888 if ( transcount < tp + maxid - minid + 1) {
1890 trie->trans = (reg_trie_trans *)
1891 PerlMemShared_realloc( trie->trans,
1893 * sizeof(reg_trie_trans) );
1894 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1896 base = trie->uniquecharcount + tp - minid;
1897 if ( maxid == minid ) {
1899 for ( ; zp < tp ; zp++ ) {
1900 if ( ! trie->trans[ zp ].next ) {
1901 base = trie->uniquecharcount + zp - minid;
1902 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1903 trie->trans[ zp ].check = state;
1909 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1910 trie->trans[ tp ].check = state;
1915 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1916 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1917 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1918 trie->trans[ tid ].check = state;
1920 tp += ( maxid - minid + 1 );
1922 Safefree(trie->states[ state ].trans.list);
1925 DEBUG_TRIE_COMPILE_MORE_r(
1926 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1929 trie->states[ state ].trans.base=base;
1931 trie->lasttrans = tp + 1;
1935 Second Pass -- Flat Table Representation.
1937 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1938 We know that we will need Charcount+1 trans at most to store the data
1939 (one row per char at worst case) So we preallocate both structures
1940 assuming worst case.
1942 We then construct the trie using only the .next slots of the entry
1945 We use the .check field of the first entry of the node temporarily to
1946 make compression both faster and easier by keeping track of how many non
1947 zero fields are in the node.
1949 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1952 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1953 number representing the first entry of the node, and state as a
1954 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1955 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1956 are 2 entrys per node. eg:
1964 The table is internally in the right hand, idx form. However as we also
1965 have to deal with the states array which is indexed by nodenum we have to
1966 use TRIE_NODENUM() to convert.
1969 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1970 "%*sCompiling trie using table compiler\n",
1971 (int)depth * 2 + 2, ""));
1973 trie->trans = (reg_trie_trans *)
1974 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1975 * trie->uniquecharcount + 1,
1976 sizeof(reg_trie_trans) );
1977 trie->states = (reg_trie_state *)
1978 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1979 sizeof(reg_trie_state) );
1980 next_alloc = trie->uniquecharcount + 1;
1983 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1985 regnode *noper = NEXTOPER( cur );
1986 const U8 *uc = (U8*)STRING( noper );
1987 const U8 *e = uc + STR_LEN( noper );
1989 U32 state = 1; /* required init */
1991 U16 charid = 0; /* sanity init */
1992 U32 accept_state = 0; /* sanity init */
1993 U8 *scan = (U8*)NULL; /* sanity init */
1995 STRLEN foldlen = 0; /* required init */
1996 U32 wordlen = 0; /* required init */
1998 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2000 if (OP(noper) == NOTHING) {
2001 regnode *noper_next= regnext(noper);
2002 if (noper_next != tail && OP(noper_next) == flags) {
2004 uc= (U8*)STRING(noper);
2005 e= uc + STR_LEN(noper);
2009 if ( OP(noper) != NOTHING ) {
2010 for ( ; uc < e ; uc += len ) {
2015 charid = trie->charmap[ uvc ];
2017 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2018 charid = svpp ? (U16)SvIV(*svpp) : 0;
2022 if ( !trie->trans[ state + charid ].next ) {
2023 trie->trans[ state + charid ].next = next_alloc;
2024 trie->trans[ state ].check++;
2025 prev_states[TRIE_NODENUM(next_alloc)]
2026 = TRIE_NODENUM(state);
2027 next_alloc += trie->uniquecharcount;
2029 state = trie->trans[ state + charid ].next;
2031 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2033 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2036 accept_state = TRIE_NODENUM( state );
2037 TRIE_HANDLE_WORD(accept_state);
2039 } /* end second pass */
2041 /* and now dump it out before we compress it */
2042 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2044 next_alloc, depth+1));
2048 * Inplace compress the table.*
2050 For sparse data sets the table constructed by the trie algorithm will
2051 be mostly 0/FAIL transitions or to put it another way mostly empty.
2052 (Note that leaf nodes will not contain any transitions.)
2054 This algorithm compresses the tables by eliminating most such
2055 transitions, at the cost of a modest bit of extra work during lookup:
2057 - Each states[] entry contains a .base field which indicates the
2058 index in the state[] array wheres its transition data is stored.
2060 - If .base is 0 there are no valid transitions from that node.
2062 - If .base is nonzero then charid is added to it to find an entry in
2065 -If trans[states[state].base+charid].check!=state then the
2066 transition is taken to be a 0/Fail transition. Thus if there are fail
2067 transitions at the front of the node then the .base offset will point
2068 somewhere inside the previous nodes data (or maybe even into a node
2069 even earlier), but the .check field determines if the transition is
2073 The following process inplace converts the table to the compressed
2074 table: We first do not compress the root node 1,and mark all its
2075 .check pointers as 1 and set its .base pointer as 1 as well. This
2076 allows us to do a DFA construction from the compressed table later,
2077 and ensures that any .base pointers we calculate later are greater
2080 - We set 'pos' to indicate the first entry of the second node.
2082 - We then iterate over the columns of the node, finding the first and
2083 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2084 and set the .check pointers accordingly, and advance pos
2085 appropriately and repreat for the next node. Note that when we copy
2086 the next pointers we have to convert them from the original
2087 NODEIDX form to NODENUM form as the former is not valid post
2090 - If a node has no transitions used we mark its base as 0 and do not
2091 advance the pos pointer.
2093 - If a node only has one transition we use a second pointer into the
2094 structure to fill in allocated fail transitions from other states.
2095 This pointer is independent of the main pointer and scans forward
2096 looking for null transitions that are allocated to a state. When it
2097 finds one it writes the single transition into the "hole". If the
2098 pointer doesnt find one the single transition is appended as normal.
2100 - Once compressed we can Renew/realloc the structures to release the
2103 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2104 specifically Fig 3.47 and the associated pseudocode.
2108 const U32 laststate = TRIE_NODENUM( next_alloc );
2111 trie->statecount = laststate;
2113 for ( state = 1 ; state < laststate ; state++ ) {
2115 const U32 stateidx = TRIE_NODEIDX( state );
2116 const U32 o_used = trie->trans[ stateidx ].check;
2117 U32 used = trie->trans[ stateidx ].check;
2118 trie->trans[ stateidx ].check = 0;
2120 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2121 if ( flag || trie->trans[ stateidx + charid ].next ) {
2122 if ( trie->trans[ stateidx + charid ].next ) {
2124 for ( ; zp < pos ; zp++ ) {
2125 if ( ! trie->trans[ zp ].next ) {
2129 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2130 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2131 trie->trans[ zp ].check = state;
2132 if ( ++zp > pos ) pos = zp;
2139 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2141 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2142 trie->trans[ pos ].check = state;
2147 trie->lasttrans = pos + 1;
2148 trie->states = (reg_trie_state *)
2149 PerlMemShared_realloc( trie->states, laststate
2150 * sizeof(reg_trie_state) );
2151 DEBUG_TRIE_COMPILE_MORE_r(
2152 PerlIO_printf( Perl_debug_log,
2153 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2154 (int)depth * 2 + 2,"",
2155 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2158 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2161 } /* end table compress */
2163 DEBUG_TRIE_COMPILE_MORE_r(
2164 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2165 (int)depth * 2 + 2, "",
2166 (UV)trie->statecount,
2167 (UV)trie->lasttrans)
2169 /* resize the trans array to remove unused space */
2170 trie->trans = (reg_trie_trans *)
2171 PerlMemShared_realloc( trie->trans, trie->lasttrans
2172 * sizeof(reg_trie_trans) );
2174 { /* Modify the program and insert the new TRIE node */
2175 U8 nodetype =(U8)(flags & 0xFF);
2179 regnode *optimize = NULL;
2180 #ifdef RE_TRACK_PATTERN_OFFSETS
2183 U32 mjd_nodelen = 0;
2184 #endif /* RE_TRACK_PATTERN_OFFSETS */
2185 #endif /* DEBUGGING */
2187 This means we convert either the first branch or the first Exact,
2188 depending on whether the thing following (in 'last') is a branch
2189 or not and whther first is the startbranch (ie is it a sub part of
2190 the alternation or is it the whole thing.)
2191 Assuming its a sub part we convert the EXACT otherwise we convert
2192 the whole branch sequence, including the first.
2194 /* Find the node we are going to overwrite */
2195 if ( first != startbranch || OP( last ) == BRANCH ) {
2196 /* branch sub-chain */
2197 NEXT_OFF( first ) = (U16)(last - first);
2198 #ifdef RE_TRACK_PATTERN_OFFSETS
2200 mjd_offset= Node_Offset((convert));
2201 mjd_nodelen= Node_Length((convert));
2204 /* whole branch chain */
2206 #ifdef RE_TRACK_PATTERN_OFFSETS
2209 const regnode *nop = NEXTOPER( convert );
2210 mjd_offset= Node_Offset((nop));
2211 mjd_nodelen= Node_Length((nop));
2215 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2216 (int)depth * 2 + 2, "",
2217 (UV)mjd_offset, (UV)mjd_nodelen)
2220 /* But first we check to see if there is a common prefix we can
2221 split out as an EXACT and put in front of the TRIE node. */
2222 trie->startstate= 1;
2223 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2225 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2229 const U32 base = trie->states[ state ].trans.base;
2231 if ( trie->states[state].wordnum )
2234 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2235 if ( ( base + ofs >= trie->uniquecharcount ) &&
2236 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2237 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2239 if ( ++count > 1 ) {
2240 SV **tmp = av_fetch( revcharmap, ofs, 0);
2241 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2242 if ( state == 1 ) break;
2244 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2246 PerlIO_printf(Perl_debug_log,
2247 "%*sNew Start State=%"UVuf" Class: [",
2248 (int)depth * 2 + 2, "",
2251 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2252 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2254 TRIE_BITMAP_SET(trie,*ch);
2256 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2258 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2262 TRIE_BITMAP_SET(trie,*ch);
2264 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2265 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2271 SV **tmp = av_fetch( revcharmap, idx, 0);
2273 char *ch = SvPV( *tmp, len );
2275 SV *sv=sv_newmortal();
2276 PerlIO_printf( Perl_debug_log,
2277 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2278 (int)depth * 2 + 2, "",
2280 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2281 PL_colors[0], PL_colors[1],
2282 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2283 PERL_PV_ESCAPE_FIRSTCHAR
2288 OP( convert ) = nodetype;
2289 str=STRING(convert);
2292 STR_LEN(convert) += len;
2298 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2303 trie->prefixlen = (state-1);
2305 regnode *n = convert+NODE_SZ_STR(convert);
2306 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2307 trie->startstate = state;
2308 trie->minlen -= (state - 1);
2309 trie->maxlen -= (state - 1);
2311 /* At least the UNICOS C compiler choked on this
2312 * being argument to DEBUG_r(), so let's just have
2315 #ifdef PERL_EXT_RE_BUILD
2321 regnode *fix = convert;
2322 U32 word = trie->wordcount;
2324 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2325 while( ++fix < n ) {
2326 Set_Node_Offset_Length(fix, 0, 0);
2329 SV ** const tmp = av_fetch( trie_words, word, 0 );
2331 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2332 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2334 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2342 NEXT_OFF(convert) = (U16)(tail - convert);
2343 DEBUG_r(optimize= n);
2349 if ( trie->maxlen ) {
2350 NEXT_OFF( convert ) = (U16)(tail - convert);
2351 ARG_SET( convert, data_slot );
2352 /* Store the offset to the first unabsorbed branch in
2353 jump[0], which is otherwise unused by the jump logic.
2354 We use this when dumping a trie and during optimisation. */
2356 trie->jump[0] = (U16)(nextbranch - convert);
2358 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2359 * and there is a bitmap
2360 * and the first "jump target" node we found leaves enough room
2361 * then convert the TRIE node into a TRIEC node, with the bitmap
2362 * embedded inline in the opcode - this is hypothetically faster.
2364 if ( !trie->states[trie->startstate].wordnum
2366 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2368 OP( convert ) = TRIEC;
2369 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2370 PerlMemShared_free(trie->bitmap);
2373 OP( convert ) = TRIE;
2375 /* store the type in the flags */
2376 convert->flags = nodetype;
2380 + regarglen[ OP( convert ) ];
2382 /* XXX We really should free up the resource in trie now,
2383 as we won't use them - (which resources?) dmq */
2385 /* needed for dumping*/
2386 DEBUG_r(if (optimize) {
2387 regnode *opt = convert;
2389 while ( ++opt < optimize) {
2390 Set_Node_Offset_Length(opt,0,0);
2393 Try to clean up some of the debris left after the
2396 while( optimize < jumper ) {
2397 mjd_nodelen += Node_Length((optimize));
2398 OP( optimize ) = OPTIMIZED;
2399 Set_Node_Offset_Length(optimize,0,0);
2402 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2404 } /* end node insert */
2405 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2407 /* Finish populating the prev field of the wordinfo array. Walk back
2408 * from each accept state until we find another accept state, and if
2409 * so, point the first word's .prev field at the second word. If the
2410 * second already has a .prev field set, stop now. This will be the
2411 * case either if we've already processed that word's accept state,
2412 * or that state had multiple words, and the overspill words were
2413 * already linked up earlier.
2420 for (word=1; word <= trie->wordcount; word++) {
2422 if (trie->wordinfo[word].prev)
2424 state = trie->wordinfo[word].accept;
2426 state = prev_states[state];
2429 prev = trie->states[state].wordnum;
2433 trie->wordinfo[word].prev = prev;
2435 Safefree(prev_states);
2439 /* and now dump out the compressed format */
2440 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2442 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2444 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2445 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2447 SvREFCNT_dec(revcharmap);
2451 : trie->startstate>1
2457 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2459 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2461 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2462 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2465 We find the fail state for each state in the trie, this state is the longest proper
2466 suffix of the current state's 'word' that is also a proper prefix of another word in our
2467 trie. State 1 represents the word '' and is thus the default fail state. This allows
2468 the DFA not to have to restart after its tried and failed a word at a given point, it
2469 simply continues as though it had been matching the other word in the first place.
2471 'abcdgu'=~/abcdefg|cdgu/
2472 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2473 fail, which would bring us to the state representing 'd' in the second word where we would
2474 try 'g' and succeed, proceeding to match 'cdgu'.
2476 /* add a fail transition */
2477 const U32 trie_offset = ARG(source);
2478 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2480 const U32 ucharcount = trie->uniquecharcount;
2481 const U32 numstates = trie->statecount;
2482 const U32 ubound = trie->lasttrans + ucharcount;
2486 U32 base = trie->states[ 1 ].trans.base;
2489 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2490 GET_RE_DEBUG_FLAGS_DECL;
2492 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2494 PERL_UNUSED_ARG(depth);
2498 ARG_SET( stclass, data_slot );
2499 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2500 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2501 aho->trie=trie_offset;
2502 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2503 Copy( trie->states, aho->states, numstates, reg_trie_state );
2504 Newxz( q, numstates, U32);
2505 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2508 /* initialize fail[0..1] to be 1 so that we always have
2509 a valid final fail state */
2510 fail[ 0 ] = fail[ 1 ] = 1;
2512 for ( charid = 0; charid < ucharcount ; charid++ ) {
2513 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2515 q[ q_write ] = newstate;
2516 /* set to point at the root */
2517 fail[ q[ q_write++ ] ]=1;
2520 while ( q_read < q_write) {
2521 const U32 cur = q[ q_read++ % numstates ];
2522 base = trie->states[ cur ].trans.base;
2524 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2525 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2527 U32 fail_state = cur;
2530 fail_state = fail[ fail_state ];
2531 fail_base = aho->states[ fail_state ].trans.base;
2532 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2534 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2535 fail[ ch_state ] = fail_state;
2536 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2538 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2540 q[ q_write++ % numstates] = ch_state;
2544 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2545 when we fail in state 1, this allows us to use the
2546 charclass scan to find a valid start char. This is based on the principle
2547 that theres a good chance the string being searched contains lots of stuff
2548 that cant be a start char.
2550 fail[ 0 ] = fail[ 1 ] = 0;
2551 DEBUG_TRIE_COMPILE_r({
2552 PerlIO_printf(Perl_debug_log,
2553 "%*sStclass Failtable (%"UVuf" states): 0",
2554 (int)(depth * 2), "", (UV)numstates
2556 for( q_read=1; q_read<numstates; q_read++ ) {
2557 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2559 PerlIO_printf(Perl_debug_log, "\n");
2562 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2567 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2568 * These need to be revisited when a newer toolchain becomes available.
2570 #if defined(__sparc64__) && defined(__GNUC__)
2571 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2572 # undef SPARC64_GCC_WORKAROUND
2573 # define SPARC64_GCC_WORKAROUND 1
2577 #define DEBUG_PEEP(str,scan,depth) \
2578 DEBUG_OPTIMISE_r({if (scan){ \
2579 SV * const mysv=sv_newmortal(); \
2580 regnode *Next = regnext(scan); \
2581 regprop(RExC_rx, mysv, scan); \
2582 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2583 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2584 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2588 /* The below joins as many adjacent EXACTish nodes as possible into a single
2589 * one. The regop may be changed if the node(s) contain certain sequences that
2590 * require special handling. The joining is only done if:
2591 * 1) there is room in the current conglomerated node to entirely contain the
2593 * 2) they are the exact same node type
2595 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2596 * these get optimized out
2598 * If a node is to match under /i (folded), the number of characters it matches
2599 * can be different than its character length if it contains a multi-character
2600 * fold. *min_subtract is set to the total delta of the input nodes.
2602 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2603 * and contains LATIN SMALL LETTER SHARP S
2605 * This is as good a place as any to discuss the design of handling these
2606 * multi-character fold sequences. It's been wrong in Perl for a very long
2607 * time. There are three code points in Unicode whose multi-character folds
2608 * were long ago discovered to mess things up. The previous designs for
2609 * dealing with these involved assigning a special node for them. This
2610 * approach doesn't work, as evidenced by this example:
2611 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2612 * Both these fold to "sss", but if the pattern is parsed to create a node that
2613 * would match just the \xDF, it won't be able to handle the case where a
2614 * successful match would have to cross the node's boundary. The new approach
2615 * that hopefully generally solves the problem generates an EXACTFU_SS node
2618 * It turns out that there are problems with all multi-character folds, and not
2619 * just these three. Now the code is general, for all such cases, but the
2620 * three still have some special handling. The approach taken is:
2621 * 1) This routine examines each EXACTFish node that could contain multi-
2622 * character fold sequences. It returns in *min_subtract how much to
2623 * subtract from the the actual length of the string to get a real minimum
2624 * match length; it is 0 if there are no multi-char folds. This delta is
2625 * used by the caller to adjust the min length of the match, and the delta
2626 * between min and max, so that the optimizer doesn't reject these
2627 * possibilities based on size constraints.
2628 * 2) Certain of these sequences require special handling by the trie code,
2629 * so, if found, this code changes the joined node type to special ops:
2630 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2631 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2632 * is used for an EXACTFU node that contains at least one "ss" sequence in
2633 * it. For non-UTF-8 patterns and strings, this is the only case where
2634 * there is a possible fold length change. That means that a regular
2635 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2636 * with length changes, and so can be processed faster. regexec.c takes
2637 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2638 * pre-folded by regcomp.c. This saves effort in regex matching.
2639 * However, the pre-folding isn't done for non-UTF8 patterns because the
2640 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2641 * down by forcing the pattern into UTF8 unless necessary. Also what
2642 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2643 * possibilities for the non-UTF8 patterns are quite simple, except for
2644 * the sharp s. All the ones that don't involve a UTF-8 target string are
2645 * members of a fold-pair, and arrays are set up for all of them so that
2646 * the other member of the pair can be found quickly. Code elsewhere in
2647 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2648 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2649 * described in the next item.
2650 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2651 * 'ss' or not is not knowable at compile time. It will match iff the
2652 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2653 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2654 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2655 * described in item 3). An assumption that the optimizer part of
2656 * regexec.c (probably unwittingly) makes is that a character in the
2657 * pattern corresponds to at most a single character in the target string.
2658 * (And I do mean character, and not byte here, unlike other parts of the
2659 * documentation that have never been updated to account for multibyte
2660 * Unicode.) This assumption is wrong only in this case, as all other
2661 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2662 * virtue of having this file pre-fold UTF-8 patterns. I'm
2663 * reluctant to try to change this assumption, so instead the code punts.
2664 * This routine examines EXACTF nodes for the sharp s, and returns a
2665 * boolean indicating whether or not the node is an EXACTF node that
2666 * contains a sharp s. When it is true, the caller sets a flag that later
2667 * causes the optimizer in this file to not set values for the floating
2668 * and fixed string lengths, and thus avoids the optimizer code in
2669 * regexec.c that makes the invalid assumption. Thus, there is no
2670 * optimization based on string lengths for EXACTF nodes that contain the
2671 * sharp s. This only happens for /id rules (which means the pattern
2675 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2676 if (PL_regkind[OP(scan)] == EXACT) \
2677 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2680 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2681 /* Merge several consecutive EXACTish nodes into one. */
2682 regnode *n = regnext(scan);
2684 regnode *next = scan + NODE_SZ_STR(scan);
2688 regnode *stop = scan;
2689 GET_RE_DEBUG_FLAGS_DECL;
2691 PERL_UNUSED_ARG(depth);
2694 PERL_ARGS_ASSERT_JOIN_EXACT;
2695 #ifndef EXPERIMENTAL_INPLACESCAN
2696 PERL_UNUSED_ARG(flags);
2697 PERL_UNUSED_ARG(val);
2699 DEBUG_PEEP("join",scan,depth);
2701 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2702 * EXACT ones that are mergeable to the current one. */
2704 && (PL_regkind[OP(n)] == NOTHING
2705 || (stringok && OP(n) == OP(scan)))
2707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2710 if (OP(n) == TAIL || n > next)
2712 if (PL_regkind[OP(n)] == NOTHING) {
2713 DEBUG_PEEP("skip:",n,depth);
2714 NEXT_OFF(scan) += NEXT_OFF(n);
2715 next = n + NODE_STEP_REGNODE;
2722 else if (stringok) {
2723 const unsigned int oldl = STR_LEN(scan);
2724 regnode * const nnext = regnext(n);
2726 /* XXX I (khw) kind of doubt that this works on platforms where
2727 * U8_MAX is above 255 because of lots of other assumptions */
2728 if (oldl + STR_LEN(n) > U8_MAX)
2731 DEBUG_PEEP("merg",n,depth);
2734 NEXT_OFF(scan) += NEXT_OFF(n);
2735 STR_LEN(scan) += STR_LEN(n);
2736 next = n + NODE_SZ_STR(n);
2737 /* Now we can overwrite *n : */
2738 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2746 #ifdef EXPERIMENTAL_INPLACESCAN
2747 if (flags && !NEXT_OFF(n)) {
2748 DEBUG_PEEP("atch", val, depth);
2749 if (reg_off_by_arg[OP(n)]) {
2750 ARG_SET(n, val - n);
2753 NEXT_OFF(n) = val - n;
2761 *has_exactf_sharp_s = FALSE;
2763 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2764 * can now analyze for sequences of problematic code points. (Prior to
2765 * this final joining, sequences could have been split over boundaries, and
2766 * hence missed). The sequences only happen in folding, hence for any
2767 * non-EXACT EXACTish node */
2768 if (OP(scan) != EXACT) {
2769 const U8 * const s0 = (U8*) STRING(scan);
2771 const U8 * const s_end = s0 + STR_LEN(scan);
2773 /* One pass is made over the node's string looking for all the
2774 * possibilities. to avoid some tests in the loop, there are two main
2775 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2779 /* Examine the string for a multi-character fold sequence. UTF-8
2780 * patterns have all characters pre-folded by the time this code is
2782 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2783 length sequence we are looking for is 2 */
2786 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2787 if (! len) { /* Not a multi-char fold: get next char */
2792 /* Nodes with 'ss' require special handling, except for EXACTFL
2793 * and EXACTFA for which there is no multi-char fold to this */
2794 if (len == 2 && *s == 's' && *(s+1) == 's'
2795 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2798 OP(scan) = EXACTFU_SS;
2801 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2802 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2803 COMBINING_DIAERESIS_UTF8
2804 COMBINING_ACUTE_ACCENT_UTF8,
2806 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2807 COMBINING_DIAERESIS_UTF8
2808 COMBINING_ACUTE_ACCENT_UTF8,
2813 /* These two folds require special handling by trie's, so
2814 * change the node type to indicate this. If EXACTFA and
2815 * EXACTFL were ever to be handled by trie's, this would
2816 * have to be changed. If this node has already been
2817 * changed to EXACTFU_SS in this loop, leave it as is. (I
2818 * (khw) think it doesn't matter in regexec.c for UTF
2819 * patterns, but no need to change it */
2820 if (OP(scan) == EXACTFU) {
2821 OP(scan) = EXACTFU_TRICKYFOLD;
2825 else { /* Here is a generic multi-char fold. */
2826 const U8* multi_end = s + len;
2828 /* Count how many characters in it. In the case of /l and
2829 * /aa, no folds which contain ASCII code points are
2830 * allowed, so check for those, and skip if found. (In
2831 * EXACTFL, no folds are allowed to any Latin1 code point,
2832 * not just ASCII. But there aren't any of these
2833 * currently, nor ever likely, so don't take the time to
2834 * test for them. The code that generates the
2835 * is_MULTI_foo() macros croaks should one actually get put
2836 * into Unicode .) */
2837 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2838 count = utf8_length(s, multi_end);
2842 while (s < multi_end) {
2845 goto next_iteration;
2855 /* The delta is how long the sequence is minus 1 (1 is how long
2856 * the character that folds to the sequence is) */
2857 *min_subtract += count - 1;
2861 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2863 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2864 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2865 * nodes can't have multi-char folds to this range (and there are
2866 * no existing ones in the upper latin1 range). In the EXACTF
2867 * case we look also for the sharp s, which can be in the final
2868 * position. Otherwise we can stop looking 1 byte earlier because
2869 * have to find at least two characters for a multi-fold */
2870 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2872 /* The below is perhaps overboard, but this allows us to save a
2873 * test each time through the loop at the expense of a mask. This
2874 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2875 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2876 * are 64. This uses an exclusive 'or' to find that bit and then
2877 * inverts it to form a mask, with just a single 0, in the bit
2878 * position where 'S' and 's' differ. */
2879 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2880 const U8 s_masked = 's' & S_or_s_mask;
2883 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2884 if (! len) { /* Not a multi-char fold. */
2885 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2887 *has_exactf_sharp_s = TRUE;
2894 && ((*s & S_or_s_mask) == s_masked)
2895 && ((*(s+1) & S_or_s_mask) == s_masked))
2898 /* EXACTF nodes need to know that the minimum length
2899 * changed so that a sharp s in the string can match this
2900 * ss in the pattern, but they remain EXACTF nodes, as they
2901 * won't match this unless the target string is is UTF-8,
2902 * which we don't know until runtime */
2903 if (OP(scan) != EXACTF) {
2904 OP(scan) = EXACTFU_SS;
2908 *min_subtract += len - 1;
2915 /* Allow dumping but overwriting the collection of skipped
2916 * ops and/or strings with fake optimized ops */
2917 n = scan + NODE_SZ_STR(scan);
2925 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2929 /* REx optimizer. Converts nodes into quicker variants "in place".
2930 Finds fixed substrings. */
2932 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2933 to the position after last scanned or to NULL. */
2935 #define INIT_AND_WITHP \
2936 assert(!and_withp); \
2937 Newx(and_withp,1,struct regnode_charclass_class); \
2938 SAVEFREEPV(and_withp)
2940 /* this is a chain of data about sub patterns we are processing that
2941 need to be handled separately/specially in study_chunk. Its so
2942 we can simulate recursion without losing state. */
2944 typedef struct scan_frame {
2945 regnode *last; /* last node to process in this frame */
2946 regnode *next; /* next node to process when last is reached */
2947 struct scan_frame *prev; /*previous frame*/
2948 I32 stop; /* what stopparen do we use */
2952 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2954 #define CASE_SYNST_FNC(nAmE) \
2956 if (flags & SCF_DO_STCLASS_AND) { \
2957 for (value = 0; value < 256; value++) \
2958 if (!is_ ## nAmE ## _cp(value)) \
2959 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2962 for (value = 0; value < 256; value++) \
2963 if (is_ ## nAmE ## _cp(value)) \
2964 ANYOF_BITMAP_SET(data->start_class, value); \
2968 if (flags & SCF_DO_STCLASS_AND) { \
2969 for (value = 0; value < 256; value++) \
2970 if (is_ ## nAmE ## _cp(value)) \
2971 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2974 for (value = 0; value < 256; value++) \
2975 if (!is_ ## nAmE ## _cp(value)) \
2976 ANYOF_BITMAP_SET(data->start_class, value); \
2983 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2984 I32 *minlenp, I32 *deltap,
2989 struct regnode_charclass_class *and_withp,
2990 U32 flags, U32 depth)
2991 /* scanp: Start here (read-write). */
2992 /* deltap: Write maxlen-minlen here. */
2993 /* last: Stop before this one. */
2994 /* data: string data about the pattern */
2995 /* stopparen: treat close N as END */
2996 /* recursed: which subroutines have we recursed into */
2997 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3000 I32 min = 0; /* There must be at least this number of characters to match */
3002 regnode *scan = *scanp, *next;
3004 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3005 int is_inf_internal = 0; /* The studied chunk is infinite */
3006 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3007 scan_data_t data_fake;
3008 SV *re_trie_maxbuff = NULL;
3009 regnode *first_non_open = scan;
3010 I32 stopmin = I32_MAX;
3011 scan_frame *frame = NULL;
3012 GET_RE_DEBUG_FLAGS_DECL;
3014 PERL_ARGS_ASSERT_STUDY_CHUNK;
3017 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3021 while (first_non_open && OP(first_non_open) == OPEN)
3022 first_non_open=regnext(first_non_open);
3027 while ( scan && OP(scan) != END && scan < last ){
3028 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3029 node length to get a real minimum (because
3030 the folded version may be shorter) */
3031 bool has_exactf_sharp_s = FALSE;
3032 /* Peephole optimizer: */
3033 DEBUG_STUDYDATA("Peep:", data,depth);
3034 DEBUG_PEEP("Peep",scan,depth);
3036 /* Its not clear to khw or hv why this is done here, and not in the
3037 * clauses that deal with EXACT nodes. khw's guess is that it's
3038 * because of a previous design */
3039 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3041 /* Follow the next-chain of the current node and optimize
3042 away all the NOTHINGs from it. */
3043 if (OP(scan) != CURLYX) {
3044 const int max = (reg_off_by_arg[OP(scan)]
3046 /* I32 may be smaller than U16 on CRAYs! */
3047 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3048 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3052 /* Skip NOTHING and LONGJMP. */
3053 while ((n = regnext(n))
3054 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3055 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3056 && off + noff < max)
3058 if (reg_off_by_arg[OP(scan)])
3061 NEXT_OFF(scan) = off;
3066 /* The principal pseudo-switch. Cannot be a switch, since we
3067 look into several different things. */
3068 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3069 || OP(scan) == IFTHEN) {
3070 next = regnext(scan);
3072 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3074 if (OP(next) == code || code == IFTHEN) {
3075 /* NOTE - There is similar code to this block below for handling
3076 TRIE nodes on a re-study. If you change stuff here check there
3078 I32 max1 = 0, min1 = I32_MAX, num = 0;
3079 struct regnode_charclass_class accum;
3080 regnode * const startbranch=scan;
3082 if (flags & SCF_DO_SUBSTR)
3083 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3084 if (flags & SCF_DO_STCLASS)
3085 cl_init_zero(pRExC_state, &accum);
3087 while (OP(scan) == code) {
3088 I32 deltanext, minnext, f = 0, fake;
3089 struct regnode_charclass_class this_class;
3092 data_fake.flags = 0;
3094 data_fake.whilem_c = data->whilem_c;
3095 data_fake.last_closep = data->last_closep;
3098 data_fake.last_closep = &fake;
3100 data_fake.pos_delta = delta;
3101 next = regnext(scan);
3102 scan = NEXTOPER(scan);
3104 scan = NEXTOPER(scan);
3105 if (flags & SCF_DO_STCLASS) {
3106 cl_init(pRExC_state, &this_class);
3107 data_fake.start_class = &this_class;
3108 f = SCF_DO_STCLASS_AND;
3110 if (flags & SCF_WHILEM_VISITED_POS)
3111 f |= SCF_WHILEM_VISITED_POS;
3113 /* we suppose the run is continuous, last=next...*/
3114 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3116 stopparen, recursed, NULL, f,depth+1);
3119 if (max1 < minnext + deltanext)
3120 max1 = minnext + deltanext;
3121 if (deltanext == I32_MAX)
3122 is_inf = is_inf_internal = 1;
3124 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3126 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3127 if ( stopmin > minnext)
3128 stopmin = min + min1;
3129 flags &= ~SCF_DO_SUBSTR;
3131 data->flags |= SCF_SEEN_ACCEPT;
3134 if (data_fake.flags & SF_HAS_EVAL)
3135 data->flags |= SF_HAS_EVAL;
3136 data->whilem_c = data_fake.whilem_c;
3138 if (flags & SCF_DO_STCLASS)
3139 cl_or(pRExC_state, &accum, &this_class);
3141 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3143 if (flags & SCF_DO_SUBSTR) {
3144 data->pos_min += min1;
3145 data->pos_delta += max1 - min1;
3146 if (max1 != min1 || is_inf)
3147 data->longest = &(data->longest_float);
3150 delta += max1 - min1;
3151 if (flags & SCF_DO_STCLASS_OR) {
3152 cl_or(pRExC_state, data->start_class, &accum);
3154 cl_and(data->start_class, and_withp);
3155 flags &= ~SCF_DO_STCLASS;
3158 else if (flags & SCF_DO_STCLASS_AND) {
3160 cl_and(data->start_class, &accum);
3161 flags &= ~SCF_DO_STCLASS;
3164 /* Switch to OR mode: cache the old value of
3165 * data->start_class */
3167 StructCopy(data->start_class, and_withp,
3168 struct regnode_charclass_class);
3169 flags &= ~SCF_DO_STCLASS_AND;
3170 StructCopy(&accum, data->start_class,
3171 struct regnode_charclass_class);
3172 flags |= SCF_DO_STCLASS_OR;
3173 data->start_class->flags |= ANYOF_EOS;
3177 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3180 Assuming this was/is a branch we are dealing with: 'scan' now
3181 points at the item that follows the branch sequence, whatever
3182 it is. We now start at the beginning of the sequence and look
3189 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3191 If we can find such a subsequence we need to turn the first
3192 element into a trie and then add the subsequent branch exact
3193 strings to the trie.
3197 1. patterns where the whole set of branches can be converted.
3199 2. patterns where only a subset can be converted.
3201 In case 1 we can replace the whole set with a single regop
3202 for the trie. In case 2 we need to keep the start and end
3205 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3206 becomes BRANCH TRIE; BRANCH X;
3208 There is an additional case, that being where there is a
3209 common prefix, which gets split out into an EXACT like node
3210 preceding the TRIE node.
3212 If x(1..n)==tail then we can do a simple trie, if not we make
3213 a "jump" trie, such that when we match the appropriate word
3214 we "jump" to the appropriate tail node. Essentially we turn
3215 a nested if into a case structure of sorts.
3220 if (!re_trie_maxbuff) {
3221 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3222 if (!SvIOK(re_trie_maxbuff))
3223 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3225 if ( SvIV(re_trie_maxbuff)>=0 ) {
3227 regnode *first = (regnode *)NULL;
3228 regnode *last = (regnode *)NULL;
3229 regnode *tail = scan;
3234 SV * const mysv = sv_newmortal(); /* for dumping */
3236 /* var tail is used because there may be a TAIL
3237 regop in the way. Ie, the exacts will point to the
3238 thing following the TAIL, but the last branch will
3239 point at the TAIL. So we advance tail. If we
3240 have nested (?:) we may have to move through several
3244 while ( OP( tail ) == TAIL ) {
3245 /* this is the TAIL generated by (?:) */
3246 tail = regnext( tail );
3250 DEBUG_TRIE_COMPILE_r({
3251 regprop(RExC_rx, mysv, tail );
3252 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3253 (int)depth * 2 + 2, "",
3254 "Looking for TRIE'able sequences. Tail node is: ",
3255 SvPV_nolen_const( mysv )
3261 Step through the branches
3262 cur represents each branch,
3263 noper is the first thing to be matched as part of that branch
3264 noper_next is the regnext() of that node.
3266 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3267 via a "jump trie" but we also support building with NOJUMPTRIE,
3268 which restricts the trie logic to structures like /FOO|BAR/.
3270 If noper is a trieable nodetype then the branch is a possible optimization
3271 target. If we are building under NOJUMPTRIE then we require that noper_next
3272 is the same as scan (our current position in the regex program).
3274 Once we have two or more consecutive such branches we can create a
3275 trie of the EXACT's contents and stitch it in place into the program.
3277 If the sequence represents all of the branches in the alternation we
3278 replace the entire thing with a single TRIE node.
3280 Otherwise when it is a subsequence we need to stitch it in place and
3281 replace only the relevant branches. This means the first branch has
3282 to remain as it is used by the alternation logic, and its next pointer,
3283 and needs to be repointed at the item on the branch chain following
3284 the last branch we have optimized away.
3286 This could be either a BRANCH, in which case the subsequence is internal,
3287 or it could be the item following the branch sequence in which case the
3288 subsequence is at the end (which does not necessarily mean the first node
3289 is the start of the alternation).
3291 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3294 ----------------+-----------
3298 EXACTFU_SS | EXACTFU
3299 EXACTFU_TRICKYFOLD | EXACTFU
3304 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3305 ( EXACT == (X) ) ? EXACT : \
3306 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3309 /* dont use tail as the end marker for this traverse */
3310 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3311 regnode * const noper = NEXTOPER( cur );
3312 U8 noper_type = OP( noper );
3313 U8 noper_trietype = TRIE_TYPE( noper_type );
3314 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3315 regnode * const noper_next = regnext( noper );
3316 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3317 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3320 DEBUG_TRIE_COMPILE_r({
3321 regprop(RExC_rx, mysv, cur);
3322 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3323 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3325 regprop(RExC_rx, mysv, noper);
3326 PerlIO_printf( Perl_debug_log, " -> %s",
3327 SvPV_nolen_const(mysv));
3330 regprop(RExC_rx, mysv, noper_next );
3331 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3332 SvPV_nolen_const(mysv));
3334 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3335 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3336 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3340 /* Is noper a trieable nodetype that can be merged with the
3341 * current trie (if there is one)? */
3345 ( noper_trietype == NOTHING)
3346 || ( trietype == NOTHING )
3347 || ( trietype == noper_trietype )
3350 && noper_next == tail
3354 /* Handle mergable triable node
3355 * Either we are the first node in a new trieable sequence,
3356 * in which case we do some bookkeeping, otherwise we update
3357 * the end pointer. */
3360 if ( noper_trietype == NOTHING ) {
3361 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3362 regnode * const noper_next = regnext( noper );
3363 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3364 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3367 if ( noper_next_trietype ) {
3368 trietype = noper_next_trietype;
3369 } else if (noper_next_type) {
3370 /* a NOTHING regop is 1 regop wide. We need at least two
3371 * for a trie so we can't merge this in */
3375 trietype = noper_trietype;
3378 if ( trietype == NOTHING )
3379 trietype = noper_trietype;
3384 } /* end handle mergable triable node */
3386 /* handle unmergable node -
3387 * noper may either be a triable node which can not be tried
3388 * together with the current trie, or a non triable node */
3390 /* If last is set and trietype is not NOTHING then we have found
3391 * at least two triable branch sequences in a row of a similar
3392 * trietype so we can turn them into a trie. If/when we
3393 * allow NOTHING to start a trie sequence this condition will be
3394 * required, and it isn't expensive so we leave it in for now. */
3395 if ( trietype && trietype != NOTHING )
3396 make_trie( pRExC_state,
3397 startbranch, first, cur, tail, count,
3398 trietype, depth+1 );
3399 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3403 && noper_next == tail
3406 /* noper is triable, so we can start a new trie sequence */
3409 trietype = noper_trietype;
3411 /* if we already saw a first but the current node is not triable then we have
3412 * to reset the first information. */
3417 } /* end handle unmergable node */
3418 } /* loop over branches */
3419 DEBUG_TRIE_COMPILE_r({
3420 regprop(RExC_rx, mysv, cur);
3421 PerlIO_printf( Perl_debug_log,
3422 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3423 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3426 if ( last && trietype ) {
3427 if ( trietype != NOTHING ) {
3428 /* the last branch of the sequence was part of a trie,
3429 * so we have to construct it here outside of the loop
3431 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3432 #ifdef TRIE_STUDY_OPT
3433 if ( ((made == MADE_EXACT_TRIE &&
3434 startbranch == first)
3435 || ( first_non_open == first )) &&
3437 flags |= SCF_TRIE_RESTUDY;
3438 if ( startbranch == first
3441 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3446 /* at this point we know whatever we have is a NOTHING sequence/branch
3447 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3449 if ( startbranch == first ) {
3451 /* the entire thing is a NOTHING sequence, something like this:
3452 * (?:|) So we can turn it into a plain NOTHING op. */
3453 DEBUG_TRIE_COMPILE_r({
3454 regprop(RExC_rx, mysv, cur);
3455 PerlIO_printf( Perl_debug_log,
3456 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3457 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3460 OP(startbranch)= NOTHING;
3461 NEXT_OFF(startbranch)= tail - startbranch;
3462 for ( opt= startbranch + 1; opt < tail ; opt++ )
3466 } /* end if ( last) */
3467 } /* TRIE_MAXBUF is non zero */
3472 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3473 scan = NEXTOPER(NEXTOPER(scan));
3474 } else /* single branch is optimized. */
3475 scan = NEXTOPER(scan);
3477 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3478 scan_frame *newframe = NULL;
3483 if (OP(scan) != SUSPEND) {
3484 /* set the pointer */
3485 if (OP(scan) == GOSUB) {
3487 RExC_recurse[ARG2L(scan)] = scan;
3488 start = RExC_open_parens[paren-1];
3489 end = RExC_close_parens[paren-1];
3492 start = RExC_rxi->program + 1;
3496 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3497 SAVEFREEPV(recursed);
3499 if (!PAREN_TEST(recursed,paren+1)) {
3500 PAREN_SET(recursed,paren+1);
3501 Newx(newframe,1,scan_frame);
3503 if (flags & SCF_DO_SUBSTR) {
3504 SCAN_COMMIT(pRExC_state,data,minlenp);
3505 data->longest = &(data->longest_float);
3507 is_inf = is_inf_internal = 1;
3508 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3509 cl_anything(pRExC_state, data->start_class);
3510 flags &= ~SCF_DO_STCLASS;
3513 Newx(newframe,1,scan_frame);
3516 end = regnext(scan);
3521 SAVEFREEPV(newframe);
3522 newframe->next = regnext(scan);
3523 newframe->last = last;
3524 newframe->stop = stopparen;
3525 newframe->prev = frame;
3535 else if (OP(scan) == EXACT) {
3536 I32 l = STR_LEN(scan);
3539 const U8 * const s = (U8*)STRING(scan);
3540 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3541 l = utf8_length(s, s + l);
3543 uc = *((U8*)STRING(scan));
3546 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3547 /* The code below prefers earlier match for fixed
3548 offset, later match for variable offset. */
3549 if (data->last_end == -1) { /* Update the start info. */
3550 data->last_start_min = data->pos_min;
3551 data->last_start_max = is_inf
3552 ? I32_MAX : data->pos_min + data->pos_delta;
3554 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3556 SvUTF8_on(data->last_found);
3558 SV * const sv = data->last_found;
3559 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3560 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3561 if (mg && mg->mg_len >= 0)
3562 mg->mg_len += utf8_length((U8*)STRING(scan),
3563 (U8*)STRING(scan)+STR_LEN(scan));
3565 data->last_end = data->pos_min + l;
3566 data->pos_min += l; /* As in the first entry. */
3567 data->flags &= ~SF_BEFORE_EOL;
3569 if (flags & SCF_DO_STCLASS_AND) {
3570 /* Check whether it is compatible with what we know already! */
3574 /* If compatible, we or it in below. It is compatible if is
3575 * in the bitmp and either 1) its bit or its fold is set, or 2)
3576 * it's for a locale. Even if there isn't unicode semantics
3577 * here, at runtime there may be because of matching against a
3578 * utf8 string, so accept a possible false positive for
3579 * latin1-range folds */
3581 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3582 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3583 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3584 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3589 ANYOF_CLASS_ZERO(data->start_class);
3590 ANYOF_BITMAP_ZERO(data->start_class);
3592 ANYOF_BITMAP_SET(data->start_class, uc);
3593 else if (uc >= 0x100) {
3596 /* Some Unicode code points fold to the Latin1 range; as
3597 * XXX temporary code, instead of figuring out if this is
3598 * one, just assume it is and set all the start class bits
3599 * that could be some such above 255 code point's fold
3600 * which will generate fals positives. As the code
3601 * elsewhere that does compute the fold settles down, it
3602 * can be extracted out and re-used here */
3603 for (i = 0; i < 256; i++){
3604 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3605 ANYOF_BITMAP_SET(data->start_class, i);
3609 data->start_class->flags &= ~ANYOF_EOS;
3611 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3613 else if (flags & SCF_DO_STCLASS_OR) {
3614 /* false positive possible if the class is case-folded */
3616 ANYOF_BITMAP_SET(data->start_class, uc);
3618 data->start_class->flags |= ANYOF_UNICODE_ALL;
3619 data->start_class->flags &= ~ANYOF_EOS;
3620 cl_and(data->start_class, and_withp);
3622 flags &= ~SCF_DO_STCLASS;
3624 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3625 I32 l = STR_LEN(scan);
3626 UV uc = *((U8*)STRING(scan));
3628 /* Search for fixed substrings supports EXACT only. */
3629 if (flags & SCF_DO_SUBSTR) {
3631 SCAN_COMMIT(pRExC_state, data, minlenp);
3634 const U8 * const s = (U8 *)STRING(scan);
3635 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3636 l = utf8_length(s, s + l);
3638 if (has_exactf_sharp_s) {
3639 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3641 min += l - min_subtract;
3643 delta += min_subtract;
3644 if (flags & SCF_DO_SUBSTR) {
3645 data->pos_min += l - min_subtract;
3646 if (data->pos_min < 0) {
3649 data->pos_delta += min_subtract;
3651 data->longest = &(data->longest_float);
3654 if (flags & SCF_DO_STCLASS_AND) {
3655 /* Check whether it is compatible with what we know already! */
3658 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3659 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3660 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3664 ANYOF_CLASS_ZERO(data->start_class);
3665 ANYOF_BITMAP_ZERO(data->start_class);
3667 ANYOF_BITMAP_SET(data->start_class, uc);
3668 data->start_class->flags &= ~ANYOF_EOS;
3669 if (OP(scan) == EXACTFL) {
3670 /* XXX This set is probably no longer necessary, and
3671 * probably wrong as LOCALE now is on in the initial
3673 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3677 /* Also set the other member of the fold pair. In case
3678 * that unicode semantics is called for at runtime, use
3679 * the full latin1 fold. (Can't do this for locale,
3680 * because not known until runtime) */
3681 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3683 /* All other (EXACTFL handled above) folds except under
3684 * /iaa that include s, S, and sharp_s also may include
3686 if (OP(scan) != EXACTFA) {
3687 if (uc == 's' || uc == 'S') {
3688 ANYOF_BITMAP_SET(data->start_class,
3689 LATIN_SMALL_LETTER_SHARP_S);
3691 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3692 ANYOF_BITMAP_SET(data->start_class, 's');
3693 ANYOF_BITMAP_SET(data->start_class, 'S');
3698 else if (uc >= 0x100) {
3700 for (i = 0; i < 256; i++){
3701 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3702 ANYOF_BITMAP_SET(data->start_class, i);
3707 else if (flags & SCF_DO_STCLASS_OR) {
3708 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3709 /* false positive possible if the class is case-folded.
3710 Assume that the locale settings are the same... */
3712 ANYOF_BITMAP_SET(data->start_class, uc);
3713 if (OP(scan) != EXACTFL) {
3715 /* And set the other member of the fold pair, but
3716 * can't do that in locale because not known until
3718 ANYOF_BITMAP_SET(data->start_class,
3719 PL_fold_latin1[uc]);
3721 /* All folds except under /iaa that include s, S,
3722 * and sharp_s also may include the others */
3723 if (OP(scan) != EXACTFA) {
3724 if (uc == 's' || uc == 'S') {
3725 ANYOF_BITMAP_SET(data->start_class,
3726 LATIN_SMALL_LETTER_SHARP_S);
3728 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3729 ANYOF_BITMAP_SET(data->start_class, 's');
3730 ANYOF_BITMAP_SET(data->start_class, 'S');
3735 data->start_class->flags &= ~ANYOF_EOS;
3737 cl_and(data->start_class, and_withp);
3739 flags &= ~SCF_DO_STCLASS;
3741 else if (REGNODE_VARIES(OP(scan))) {
3742 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3743 I32 f = flags, pos_before = 0;
3744 regnode * const oscan = scan;
3745 struct regnode_charclass_class this_class;
3746 struct regnode_charclass_class *oclass = NULL;
3747 I32 next_is_eval = 0;
3749 switch (PL_regkind[OP(scan)]) {
3750 case WHILEM: /* End of (?:...)* . */
3751 scan = NEXTOPER(scan);
3754 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3755 next = NEXTOPER(scan);
3756 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3758 maxcount = REG_INFTY;
3759 next = regnext(scan);
3760 scan = NEXTOPER(scan);
3764 if (flags & SCF_DO_SUBSTR)
3769 if (flags & SCF_DO_STCLASS) {
3771 maxcount = REG_INFTY;
3772 next = regnext(scan);
3773 scan = NEXTOPER(scan);
3776 is_inf = is_inf_internal = 1;
3777 scan = regnext(scan);
3778 if (flags & SCF_DO_SUBSTR) {
3779 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3780 data->longest = &(data->longest_float);
3782 goto optimize_curly_tail;
3784 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3785 && (scan->flags == stopparen))
3790 mincount = ARG1(scan);
3791 maxcount = ARG2(scan);
3793 next = regnext(scan);
3794 if (OP(scan) == CURLYX) {
3795 I32 lp = (data ? *(data->last_closep) : 0);
3796 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3798 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3799 next_is_eval = (OP(scan) == EVAL);
3801 if (flags & SCF_DO_SUBSTR) {
3802 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3803 pos_before = data->pos_min;
3807 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3809 data->flags |= SF_IS_INF;
3811 if (flags & SCF_DO_STCLASS) {
3812 cl_init(pRExC_state, &this_class);
3813 oclass = data->start_class;
3814 data->start_class = &this_class;
3815 f |= SCF_DO_STCLASS_AND;
3816 f &= ~SCF_DO_STCLASS_OR;
3818 /* Exclude from super-linear cache processing any {n,m}
3819 regops for which the combination of input pos and regex
3820 pos is not enough information to determine if a match
3823 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3824 regex pos at the \s*, the prospects for a match depend not
3825 only on the input position but also on how many (bar\s*)
3826 repeats into the {4,8} we are. */
3827 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3828 f &= ~SCF_WHILEM_VISITED_POS;
3830 /* This will finish on WHILEM, setting scan, or on NULL: */
3831 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3832 last, data, stopparen, recursed, NULL,
3834 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3836 if (flags & SCF_DO_STCLASS)
3837 data->start_class = oclass;
3838 if (mincount == 0 || minnext == 0) {
3839 if (flags & SCF_DO_STCLASS_OR) {
3840 cl_or(pRExC_state, data->start_class, &this_class);
3842 else if (flags & SCF_DO_STCLASS_AND) {
3843 /* Switch to OR mode: cache the old value of
3844 * data->start_class */
3846 StructCopy(data->start_class, and_withp,
3847 struct regnode_charclass_class);
3848 flags &= ~SCF_DO_STCLASS_AND;
3849 StructCopy(&this_class, data->start_class,
3850 struct regnode_charclass_class);
3851 flags |= SCF_DO_STCLASS_OR;
3852 data->start_class->flags |= ANYOF_EOS;
3854 } else { /* Non-zero len */
3855 if (flags & SCF_DO_STCLASS_OR) {
3856 cl_or(pRExC_state, data->start_class, &this_class);
3857 cl_and(data->start_class, and_withp);
3859 else if (flags & SCF_DO_STCLASS_AND)
3860 cl_and(data->start_class, &this_class);
3861 flags &= ~SCF_DO_STCLASS;
3863 if (!scan) /* It was not CURLYX, but CURLY. */
3865 if ( /* ? quantifier ok, except for (?{ ... }) */
3866 (next_is_eval || !(mincount == 0 && maxcount == 1))
3867 && (minnext == 0) && (deltanext == 0)
3868 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3869 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3871 /* Fatal warnings may leak the regexp without this: */
3872 SAVEFREESV(RExC_rx_sv);
3873 ckWARNreg(RExC_parse,
3874 "Quantifier unexpected on zero-length expression");
3875 (void)ReREFCNT_inc(RExC_rx_sv);
3878 min += minnext * mincount;
3879 is_inf_internal |= ((maxcount == REG_INFTY
3880 && (minnext + deltanext) > 0)
3881 || deltanext == I32_MAX);
3882 is_inf |= is_inf_internal;
3883 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3885 /* Try powerful optimization CURLYX => CURLYN. */
3886 if ( OP(oscan) == CURLYX && data
3887 && data->flags & SF_IN_PAR
3888 && !(data->flags & SF_HAS_EVAL)
3889 && !deltanext && minnext == 1 ) {
3890 /* Try to optimize to CURLYN. */
3891 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3892 regnode * const nxt1 = nxt;
3899 if (!REGNODE_SIMPLE(OP(nxt))
3900 && !(PL_regkind[OP(nxt)] == EXACT
3901 && STR_LEN(nxt) == 1))
3907 if (OP(nxt) != CLOSE)
3909 if (RExC_open_parens) {
3910 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3911 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3913 /* Now we know that nxt2 is the only contents: */
3914 oscan->flags = (U8)ARG(nxt);
3916 OP(nxt1) = NOTHING; /* was OPEN. */
3919 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3920 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3921 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3922 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3923 OP(nxt + 1) = OPTIMIZED; /* was count. */
3924 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3929 /* Try optimization CURLYX => CURLYM. */
3930 if ( OP(oscan) == CURLYX && data
3931 && !(data->flags & SF_HAS_PAR)
3932 && !(data->flags & SF_HAS_EVAL)
3933 && !deltanext /* atom is fixed width */
3934 && minnext != 0 /* CURLYM can't handle zero width */
3935 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3937 /* XXXX How to optimize if data == 0? */
3938 /* Optimize to a simpler form. */
3939 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3943 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3944 && (OP(nxt2) != WHILEM))
3946 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3947 /* Need to optimize away parenths. */
3948 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3949 /* Set the parenth number. */
3950 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3952 oscan->flags = (U8)ARG(nxt);
3953 if (RExC_open_parens) {
3954 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3955 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3957 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3958 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3961 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3962 OP(nxt + 1) = OPTIMIZED; /* was count. */
3963 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3964 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3967 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3968 regnode *nnxt = regnext(nxt1);
3970 if (reg_off_by_arg[OP(nxt1)])
3971 ARG_SET(nxt1, nxt2 - nxt1);
3972 else if (nxt2 - nxt1 < U16_MAX)
3973 NEXT_OFF(nxt1) = nxt2 - nxt1;
3975 OP(nxt) = NOTHING; /* Cannot beautify */
3980 /* Optimize again: */
3981 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3982 NULL, stopparen, recursed, NULL, 0,depth+1);
3987 else if ((OP(oscan) == CURLYX)
3988 && (flags & SCF_WHILEM_VISITED_POS)
3989 /* See the comment on a similar expression above.
3990 However, this time it's not a subexpression
3991 we care about, but the expression itself. */
3992 && (maxcount == REG_INFTY)
3993 && data && ++data->whilem_c < 16) {
3994 /* This stays as CURLYX, we can put the count/of pair. */
3995 /* Find WHILEM (as in regexec.c) */
3996 regnode *nxt = oscan + NEXT_OFF(oscan);
3998 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4000 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4001 | (RExC_whilem_seen << 4)); /* On WHILEM */
4003 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4005 if (flags & SCF_DO_SUBSTR) {
4006 SV *last_str = NULL;
4007 int counted = mincount != 0;
4009 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4010 #if defined(SPARC64_GCC_WORKAROUND)
4013 const char *s = NULL;
4016 if (pos_before >= data->last_start_min)
4019 b = data->last_start_min;
4022 s = SvPV_const(data->last_found, l);
4023 old = b - data->last_start_min;
4026 I32 b = pos_before >= data->last_start_min
4027 ? pos_before : data->last_start_min;
4029 const char * const s = SvPV_const(data->last_found, l);
4030 I32 old = b - data->last_start_min;
4034 old = utf8_hop((U8*)s, old) - (U8*)s;
4036 /* Get the added string: */
4037 last_str = newSVpvn_utf8(s + old, l, UTF);
4038 if (deltanext == 0 && pos_before == b) {
4039 /* What was added is a constant string */
4041 SvGROW(last_str, (mincount * l) + 1);
4042 repeatcpy(SvPVX(last_str) + l,
4043 SvPVX_const(last_str), l, mincount - 1);
4044 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4045 /* Add additional parts. */
4046 SvCUR_set(data->last_found,
4047 SvCUR(data->last_found) - l);
4048 sv_catsv(data->last_found, last_str);
4050 SV * sv = data->last_found;
4052 SvUTF8(sv) && SvMAGICAL(sv) ?
4053 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4054 if (mg && mg->mg_len >= 0)
4055 mg->mg_len += CHR_SVLEN(last_str) - l;
4057 data->last_end += l * (mincount - 1);
4060 /* start offset must point into the last copy */
4061 data->last_start_min += minnext * (mincount - 1);
4062 data->last_start_max += is_inf ? I32_MAX
4063 : (maxcount - 1) * (minnext + data->pos_delta);
4066 /* It is counted once already... */
4067 data->pos_min += minnext * (mincount - counted);
4068 data->pos_delta += - counted * deltanext +
4069 (minnext + deltanext) * maxcount - minnext * mincount;
4070 if (mincount != maxcount) {
4071 /* Cannot extend fixed substrings found inside
4073 SCAN_COMMIT(pRExC_state,data,minlenp);
4074 if (mincount && last_str) {
4075 SV * const sv = data->last_found;
4076 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4077 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4081 sv_setsv(sv, last_str);
4082 data->last_end = data->pos_min;
4083 data->last_start_min =
4084 data->pos_min - CHR_SVLEN(last_str);
4085 data->last_start_max = is_inf
4087 : data->pos_min + data->pos_delta
4088 - CHR_SVLEN(last_str);
4090 data->longest = &(data->longest_float);
4092 SvREFCNT_dec(last_str);
4094 if (data && (fl & SF_HAS_EVAL))
4095 data->flags |= SF_HAS_EVAL;
4096 optimize_curly_tail:
4097 if (OP(oscan) != CURLYX) {
4098 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4100 NEXT_OFF(oscan) += NEXT_OFF(next);
4103 default: /* REF, ANYOFV, and CLUMP only? */
4104 if (flags & SCF_DO_SUBSTR) {
4105 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4106 data->longest = &(data->longest_float);
4108 is_inf = is_inf_internal = 1;
4109 if (flags & SCF_DO_STCLASS_OR)
4110 cl_anything(pRExC_state, data->start_class);
4111 flags &= ~SCF_DO_STCLASS;
4115 else if (OP(scan) == LNBREAK) {
4116 if (flags & SCF_DO_STCLASS) {
4118 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4119 if (flags & SCF_DO_STCLASS_AND) {
4120 for (value = 0; value < 256; value++)
4121 if (!is_VERTWS_cp(value))
4122 ANYOF_BITMAP_CLEAR(data->start_class, value);
4125 for (value = 0; value < 256; value++)
4126 if (is_VERTWS_cp(value))
4127 ANYOF_BITMAP_SET(data->start_class, value);
4129 if (flags & SCF_DO_STCLASS_OR)
4130 cl_and(data->start_class, and_withp);
4131 flags &= ~SCF_DO_STCLASS;
4134 delta++; /* Because of the 2 char string cr-lf */
4135 if (flags & SCF_DO_SUBSTR) {
4136 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4138 data->pos_delta += 1;
4139 data->longest = &(data->longest_float);
4142 else if (REGNODE_SIMPLE(OP(scan))) {
4145 if (flags & SCF_DO_SUBSTR) {
4146 SCAN_COMMIT(pRExC_state,data,minlenp);
4150 if (flags & SCF_DO_STCLASS) {
4151 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4153 /* Some of the logic below assumes that switching
4154 locale on will only add false positives. */
4155 switch (PL_regkind[OP(scan)]) {
4159 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4160 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4161 cl_anything(pRExC_state, data->start_class);
4164 if (OP(scan) == SANY)
4166 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4167 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4168 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4169 cl_anything(pRExC_state, data->start_class);
4171 if (flags & SCF_DO_STCLASS_AND || !value)
4172 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4175 if (flags & SCF_DO_STCLASS_AND)
4176 cl_and(data->start_class,
4177 (struct regnode_charclass_class*)scan);
4179 cl_or(pRExC_state, data->start_class,
4180 (struct regnode_charclass_class*)scan);
4183 if (flags & SCF_DO_STCLASS_AND) {
4184 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4185 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4186 if (OP(scan) == ALNUMU) {
4187 for (value = 0; value < 256; value++) {
4188 if (!isWORDCHAR_L1(value)) {
4189 ANYOF_BITMAP_CLEAR(data->start_class, value);
4193 for (value = 0; value < 256; value++) {
4194 if (!isALNUM(value)) {
4195 ANYOF_BITMAP_CLEAR(data->start_class, value);
4202 if (data->start_class->flags & ANYOF_LOCALE)
4203 ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4205 /* Even if under locale, set the bits for non-locale
4206 * in case it isn't a true locale-node. This will
4207 * create false positives if it truly is locale */
4208 if (OP(scan) == ALNUMU) {
4209 for (value = 0; value < 256; value++) {
4210 if (isWORDCHAR_L1(value)) {
4211 ANYOF_BITMAP_SET(data->start_class, value);
4215 for (value = 0; value < 256; value++) {
4216 if (isALNUM(value)) {
4217 ANYOF_BITMAP_SET(data->start_class, value);
4224 if (flags & SCF_DO_STCLASS_AND) {
4225 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4226 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4227 if (OP(scan) == NALNUMU) {
4228 for (value = 0; value < 256; value++) {
4229 if (isWORDCHAR_L1(value)) {
4230 ANYOF_BITMAP_CLEAR(data->start_class, value);
4234 for (value = 0; value < 256; value++) {
4235 if (isALNUM(value)) {
4236 ANYOF_BITMAP_CLEAR(data->start_class, value);
4243 if (data->start_class->flags & ANYOF_LOCALE)
4244 ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4246 /* Even if under locale, set the bits for non-locale in
4247 * case it isn't a true locale-node. This will create
4248 * false positives if it truly is locale */
4249 if (OP(scan) == NALNUMU) {
4250 for (value = 0; value < 256; value++) {
4251 if (! isWORDCHAR_L1(value)) {
4252 ANYOF_BITMAP_SET(data->start_class, value);
4256 for (value = 0; value < 256; value++) {
4257 if (! isALNUM(value)) {
4258 ANYOF_BITMAP_SET(data->start_class, value);
4265 if (flags & SCF_DO_STCLASS_AND) {
4266 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4267 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4268 if (OP(scan) == SPACEU) {
4269 for (value = 0; value < 256; value++) {
4270 if (!isSPACE_L1(value)) {
4271 ANYOF_BITMAP_CLEAR(data->start_class, value);
4275 for (value = 0; value < 256; value++) {
4276 if (!isSPACE(value)) {
4277 ANYOF_BITMAP_CLEAR(data->start_class, value);
4284 if (data->start_class->flags & ANYOF_LOCALE) {
4285 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4287 if (OP(scan) == SPACEU) {
4288 for (value = 0; value < 256; value++) {
4289 if (isSPACE_L1(value)) {
4290 ANYOF_BITMAP_SET(data->start_class, value);
4294 for (value = 0; value < 256; value++) {
4295 if (isSPACE(value)) {
4296 ANYOF_BITMAP_SET(data->start_class, value);
4303 if (flags & SCF_DO_STCLASS_AND) {
4304 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4305 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4306 if (OP(scan) == NSPACEU) {
4307 for (value = 0; value < 256; value++) {
4308 if (isSPACE_L1(value)) {
4309 ANYOF_BITMAP_CLEAR(data->start_class, value);
4313 for (value = 0; value < 256; value++) {
4314 if (isSPACE(value)) {
4315 ANYOF_BITMAP_CLEAR(data->start_class, value);
4322 if (data->start_class->flags & ANYOF_LOCALE)
4323 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4324 if (OP(scan) == NSPACEU) {
4325 for (value = 0; value < 256; value++) {
4326 if (!isSPACE_L1(value)) {
4327 ANYOF_BITMAP_SET(data->start_class, value);
4332 for (value = 0; value < 256; value++) {
4333 if (!isSPACE(value)) {
4334 ANYOF_BITMAP_SET(data->start_class, value);
4341 if (flags & SCF_DO_STCLASS_AND) {
4342 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4343 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4344 for (value = 0; value < 256; value++)
4345 if (!isDIGIT(value))
4346 ANYOF_BITMAP_CLEAR(data->start_class, value);
4350 if (data->start_class->flags & ANYOF_LOCALE)
4351 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4352 for (value = 0; value < 256; value++)
4354 ANYOF_BITMAP_SET(data->start_class, value);
4358 if (flags & SCF_DO_STCLASS_AND) {
4359 if (!(data->start_class->flags & ANYOF_LOCALE))
4360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4361 for (value = 0; value < 256; value++)
4363 ANYOF_BITMAP_CLEAR(data->start_class, value);
4366 if (data->start_class->flags & ANYOF_LOCALE)
4367 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4368 for (value = 0; value < 256; value++)
4369 if (!isDIGIT(value))
4370 ANYOF_BITMAP_SET(data->start_class, value);
4373 CASE_SYNST_FNC(VERTWS);
4374 CASE_SYNST_FNC(HORIZWS);
4377 if (flags & SCF_DO_STCLASS_OR)
4378 cl_and(data->start_class, and_withp);
4379 flags &= ~SCF_DO_STCLASS;
4382 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4383 data->flags |= (OP(scan) == MEOL
4386 SCAN_COMMIT(pRExC_state, data, minlenp);
4389 else if ( PL_regkind[OP(scan)] == BRANCHJ
4390 /* Lookbehind, or need to calculate parens/evals/stclass: */
4391 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4392 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4393 if ( OP(scan) == UNLESSM &&
4395 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4396 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4399 regnode *upto= regnext(scan);
4401 SV * const mysv_val=sv_newmortal();
4402 DEBUG_STUDYDATA("OPFAIL",data,depth);
4404 /*DEBUG_PARSE_MSG("opfail");*/
4405 regprop(RExC_rx, mysv_val, upto);
4406 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4407 SvPV_nolen_const(mysv_val),
4408 (IV)REG_NODE_NUM(upto),
4413 NEXT_OFF(scan) = upto - scan;
4414 for (opt= scan + 1; opt < upto ; opt++)
4415 OP(opt) = OPTIMIZED;
4419 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4420 || OP(scan) == UNLESSM )
4422 /* Negative Lookahead/lookbehind
4423 In this case we can't do fixed string optimisation.
4426 I32 deltanext, minnext, fake = 0;
4428 struct regnode_charclass_class intrnl;
4431 data_fake.flags = 0;
4433 data_fake.whilem_c = data->whilem_c;
4434 data_fake.last_closep = data->last_closep;
4437 data_fake.last_closep = &fake;
4438 data_fake.pos_delta = delta;
4439 if ( flags & SCF_DO_STCLASS && !scan->flags
4440 && OP(scan) == IFMATCH ) { /* Lookahead */
4441 cl_init(pRExC_state, &intrnl);
4442 data_fake.start_class = &intrnl;
4443 f |= SCF_DO_STCLASS_AND;
4445 if (flags & SCF_WHILEM_VISITED_POS)
4446 f |= SCF_WHILEM_VISITED_POS;
4447 next = regnext(scan);
4448 nscan = NEXTOPER(NEXTOPER(scan));
4449 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4450 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4453 FAIL("Variable length lookbehind not implemented");
4455 else if (minnext > (I32)U8_MAX) {
4456 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4458 scan->flags = (U8)minnext;
4461 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4463 if (data_fake.flags & SF_HAS_EVAL)
4464 data->flags |= SF_HAS_EVAL;
4465 data->whilem_c = data_fake.whilem_c;
4467 if (f & SCF_DO_STCLASS_AND) {
4468 if (flags & SCF_DO_STCLASS_OR) {
4469 /* OR before, AND after: ideally we would recurse with
4470 * data_fake to get the AND applied by study of the
4471 * remainder of the pattern, and then derecurse;
4472 * *** HACK *** for now just treat as "no information".
4473 * See [perl #56690].
4475 cl_init(pRExC_state, data->start_class);
4477 /* AND before and after: combine and continue */
4478 const int was = (data->start_class->flags & ANYOF_EOS);
4480 cl_and(data->start_class, &intrnl);
4482 data->start_class->flags |= ANYOF_EOS;
4486 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4488 /* Positive Lookahead/lookbehind
4489 In this case we can do fixed string optimisation,
4490 but we must be careful about it. Note in the case of
4491 lookbehind the positions will be offset by the minimum
4492 length of the pattern, something we won't know about
4493 until after the recurse.
4495 I32 deltanext, fake = 0;
4497 struct regnode_charclass_class intrnl;
4499 /* We use SAVEFREEPV so that when the full compile
4500 is finished perl will clean up the allocated
4501 minlens when it's all done. This way we don't
4502 have to worry about freeing them when we know
4503 they wont be used, which would be a pain.
4506 Newx( minnextp, 1, I32 );
4507 SAVEFREEPV(minnextp);
4510 StructCopy(data, &data_fake, scan_data_t);
4511 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4514 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4515 data_fake.last_found=newSVsv(data->last_found);
4519 data_fake.last_closep = &fake;
4520 data_fake.flags = 0;
4521 data_fake.pos_delta = delta;
4523 data_fake.flags |= SF_IS_INF;
4524 if ( flags & SCF_DO_STCLASS && !scan->flags
4525 && OP(scan) == IFMATCH ) { /* Lookahead */
4526 cl_init(pRExC_state, &intrnl);
4527 data_fake.start_class = &intrnl;
4528 f |= SCF_DO_STCLASS_AND;
4530 if (flags & SCF_WHILEM_VISITED_POS)
4531 f |= SCF_WHILEM_VISITED_POS;
4532 next = regnext(scan);
4533 nscan = NEXTOPER(NEXTOPER(scan));
4535 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4536 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4539 FAIL("Variable length lookbehind not implemented");
4541 else if (*minnextp > (I32)U8_MAX) {
4542 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4544 scan->flags = (U8)*minnextp;
4549 if (f & SCF_DO_STCLASS_AND) {
4550 const int was = (data->start_class->flags & ANYOF_EOS);
4552 cl_and(data->start_class, &intrnl);
4554 data->start_class->flags |= ANYOF_EOS;
4557 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4559 if (data_fake.flags & SF_HAS_EVAL)
4560 data->flags |= SF_HAS_EVAL;
4561 data->whilem_c = data_fake.whilem_c;
4562 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4563 if (RExC_rx->minlen<*minnextp)
4564 RExC_rx->minlen=*minnextp;
4565 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4566 SvREFCNT_dec(data_fake.last_found);
4568 if ( data_fake.minlen_fixed != minlenp )
4570 data->offset_fixed= data_fake.offset_fixed;
4571 data->minlen_fixed= data_fake.minlen_fixed;
4572 data->lookbehind_fixed+= scan->flags;
4574 if ( data_fake.minlen_float != minlenp )
4576 data->minlen_float= data_fake.minlen_float;
4577 data->offset_float_min=data_fake.offset_float_min;
4578 data->offset_float_max=data_fake.offset_float_max;
4579 data->lookbehind_float+= scan->flags;
4586 else if (OP(scan) == OPEN) {
4587 if (stopparen != (I32)ARG(scan))
4590 else if (OP(scan) == CLOSE) {
4591 if (stopparen == (I32)ARG(scan)) {
4594 if ((I32)ARG(scan) == is_par) {
4595 next = regnext(scan);
4597 if ( next && (OP(next) != WHILEM) && next < last)
4598 is_par = 0; /* Disable optimization */
4601 *(data->last_closep) = ARG(scan);
4603 else if (OP(scan) == EVAL) {
4605 data->flags |= SF_HAS_EVAL;
4607 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4608 if (flags & SCF_DO_SUBSTR) {
4609 SCAN_COMMIT(pRExC_state,data,minlenp);
4610 flags &= ~SCF_DO_SUBSTR;
4612 if (data && OP(scan)==ACCEPT) {
4613 data->flags |= SCF_SEEN_ACCEPT;
4618 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4620 if (flags & SCF_DO_SUBSTR) {
4621 SCAN_COMMIT(pRExC_state,data,minlenp);
4622 data->longest = &(data->longest_float);
4624 is_inf = is_inf_internal = 1;
4625 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4626 cl_anything(pRExC_state, data->start_class);
4627 flags &= ~SCF_DO_STCLASS;
4629 else if (OP(scan) == GPOS) {
4630 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4631 !(delta || is_inf || (data && data->pos_delta)))
4633 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4634 RExC_rx->extflags |= RXf_ANCH_GPOS;
4635 if (RExC_rx->gofs < (U32)min)
4636 RExC_rx->gofs = min;
4638 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4642 #ifdef TRIE_STUDY_OPT
4643 #ifdef FULL_TRIE_STUDY
4644 else if (PL_regkind[OP(scan)] == TRIE) {
4645 /* NOTE - There is similar code to this block above for handling
4646 BRANCH nodes on the initial study. If you change stuff here
4648 regnode *trie_node= scan;
4649 regnode *tail= regnext(scan);
4650 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4651 I32 max1 = 0, min1 = I32_MAX;
4652 struct regnode_charclass_class accum;
4654 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4655 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4656 if (flags & SCF_DO_STCLASS)
4657 cl_init_zero(pRExC_state, &accum);
4663 const regnode *nextbranch= NULL;
4666 for ( word=1 ; word <= trie->wordcount ; word++)
4668 I32 deltanext=0, minnext=0, f = 0, fake;
4669 struct regnode_charclass_class this_class;
4671 data_fake.flags = 0;
4673 data_fake.whilem_c = data->whilem_c;
4674 data_fake.last_closep = data->last_closep;
4677 data_fake.last_closep = &fake;
4678 data_fake.pos_delta = delta;
4679 if (flags & SCF_DO_STCLASS) {
4680 cl_init(pRExC_state, &this_class);
4681 data_fake.start_class = &this_class;
4682 f = SCF_DO_STCLASS_AND;
4684 if (flags & SCF_WHILEM_VISITED_POS)
4685 f |= SCF_WHILEM_VISITED_POS;
4687 if (trie->jump[word]) {
4689 nextbranch = trie_node + trie->jump[0];
4690 scan= trie_node + trie->jump[word];
4691 /* We go from the jump point to the branch that follows
4692 it. Note this means we need the vestigal unused branches
4693 even though they arent otherwise used.
4695 minnext = study_chunk(pRExC_state, &scan, minlenp,
4696 &deltanext, (regnode *)nextbranch, &data_fake,
4697 stopparen, recursed, NULL, f,depth+1);
4699 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4700 nextbranch= regnext((regnode*)nextbranch);
4702 if (min1 > (I32)(minnext + trie->minlen))
4703 min1 = minnext + trie->minlen;
4704 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4705 max1 = minnext + deltanext + trie->maxlen;
4706 if (deltanext == I32_MAX)
4707 is_inf = is_inf_internal = 1;
4709 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4711 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4712 if ( stopmin > min + min1)
4713 stopmin = min + min1;
4714 flags &= ~SCF_DO_SUBSTR;
4716 data->flags |= SCF_SEEN_ACCEPT;
4719 if (data_fake.flags & SF_HAS_EVAL)
4720 data->flags |= SF_HAS_EVAL;
4721 data->whilem_c = data_fake.whilem_c;
4723 if (flags & SCF_DO_STCLASS)
4724 cl_or(pRExC_state, &accum, &this_class);
4727 if (flags & SCF_DO_SUBSTR) {
4728 data->pos_min += min1;
4729 data->pos_delta += max1 - min1;
4730 if (max1 != min1 || is_inf)
4731 data->longest = &(data->longest_float);
4734 delta += max1 - min1;
4735 if (flags & SCF_DO_STCLASS_OR) {
4736 cl_or(pRExC_state, data->start_class, &accum);
4738 cl_and(data->start_class, and_withp);
4739 flags &= ~SCF_DO_STCLASS;
4742 else if (flags & SCF_DO_STCLASS_AND) {
4744 cl_and(data->start_class, &accum);
4745 flags &= ~SCF_DO_STCLASS;
4748 /* Switch to OR mode: cache the old value of
4749 * data->start_class */
4751 StructCopy(data->start_class, and_withp,
4752 struct regnode_charclass_class);
4753 flags &= ~SCF_DO_STCLASS_AND;
4754 StructCopy(&accum, data->start_class,
4755 struct regnode_charclass_class);
4756 flags |= SCF_DO_STCLASS_OR;
4757 data->start_class->flags |= ANYOF_EOS;
4764 else if (PL_regkind[OP(scan)] == TRIE) {
4765 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4768 min += trie->minlen;
4769 delta += (trie->maxlen - trie->minlen);
4770 flags &= ~SCF_DO_STCLASS; /* xxx */
4771 if (flags & SCF_DO_SUBSTR) {
4772 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4773 data->pos_min += trie->minlen;
4774 data->pos_delta += (trie->maxlen - trie->minlen);
4775 if (trie->maxlen != trie->minlen)
4776 data->longest = &(data->longest_float);
4778 if (trie->jump) /* no more substrings -- for now /grr*/
4779 flags &= ~SCF_DO_SUBSTR;
4781 #endif /* old or new */
4782 #endif /* TRIE_STUDY_OPT */
4784 /* Else: zero-length, ignore. */
4785 scan = regnext(scan);
4790 stopparen = frame->stop;
4791 frame = frame->prev;
4792 goto fake_study_recurse;
4797 DEBUG_STUDYDATA("pre-fin:",data,depth);
4800 *deltap = is_inf_internal ? I32_MAX : delta;
4801 if (flags & SCF_DO_SUBSTR && is_inf)
4802 data->pos_delta = I32_MAX - data->pos_min;
4803 if (is_par > (I32)U8_MAX)
4805 if (is_par && pars==1 && data) {
4806 data->flags |= SF_IN_PAR;
4807 data->flags &= ~SF_HAS_PAR;
4809 else if (pars && data) {
4810 data->flags |= SF_HAS_PAR;
4811 data->flags &= ~SF_IN_PAR;
4813 if (flags & SCF_DO_STCLASS_OR)
4814 cl_and(data->start_class, and_withp);
4815 if (flags & SCF_TRIE_RESTUDY)
4816 data->flags |= SCF_TRIE_RESTUDY;
4818 DEBUG_STUDYDATA("post-fin:",data,depth);
4820 return min < stopmin ? min : stopmin;
4824 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4826 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4828 PERL_ARGS_ASSERT_ADD_DATA;
4830 Renewc(RExC_rxi->data,
4831 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4832 char, struct reg_data);
4834 Renew(RExC_rxi->data->what, count + n, U8);
4836 Newx(RExC_rxi->data->what, n, U8);
4837 RExC_rxi->data->count = count + n;
4838 Copy(s, RExC_rxi->data->what + count, n, U8);
4842 /*XXX: todo make this not included in a non debugging perl */
4843 #ifndef PERL_IN_XSUB_RE
4845 Perl_reginitcolors(pTHX)
4848 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4850 char *t = savepv(s);
4854 t = strchr(t, '\t');
4860 PL_colors[i] = t = (char *)"";
4865 PL_colors[i++] = (char *)"";
4872 #ifdef TRIE_STUDY_OPT
4873 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4876 (data.flags & SCF_TRIE_RESTUDY) \
4884 #define CHECK_RESTUDY_GOTO_butfirst
4888 * pregcomp - compile a regular expression into internal code
4890 * Decides which engine's compiler to call based on the hint currently in
4894 #ifndef PERL_IN_XSUB_RE
4896 /* return the currently in-scope regex engine (or the default if none) */
4898 regexp_engine const *
4899 Perl_current_re_engine(pTHX)
4903 if (IN_PERL_COMPILETIME) {
4904 HV * const table = GvHV(PL_hintgv);
4908 return &reh_regexp_engine;
4909 ptr = hv_fetchs(table, "regcomp", FALSE);
4910 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4911 return &reh_regexp_engine;
4912 return INT2PTR(regexp_engine*,SvIV(*ptr));
4916 if (!PL_curcop->cop_hints_hash)
4917 return &reh_regexp_engine;
4918 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4919 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4920 return &reh_regexp_engine;
4921 return INT2PTR(regexp_engine*,SvIV(ptr));
4927 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4930 regexp_engine const *eng = current_re_engine();
4931 GET_RE_DEBUG_FLAGS_DECL;
4933 PERL_ARGS_ASSERT_PREGCOMP;
4935 /* Dispatch a request to compile a regexp to correct regexp engine. */
4937 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4940 return CALLREGCOMP_ENG(eng, pattern, flags);
4944 /* public(ish) entry point for the perl core's own regex compiling code.
4945 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4946 * pattern rather than a list of OPs, and uses the internal engine rather
4947 * than the current one */
4950 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4952 SV *pat = pattern; /* defeat constness! */
4953 PERL_ARGS_ASSERT_RE_COMPILE;
4954 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4955 #ifdef PERL_IN_XSUB_RE
4960 NULL, NULL, rx_flags, 0);
4963 /* see if there are any run-time code blocks in the pattern.
4964 * False positives are allowed */
4967 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4968 U32 pm_flags, char *pat, STRLEN plen)
4973 /* avoid infinitely recursing when we recompile the pattern parcelled up
4974 * as qr'...'. A single constant qr// string can't have have any
4975 * run-time component in it, and thus, no runtime code. (A non-qr
4976 * string, however, can, e.g. $x =~ '(?{})') */
4977 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4980 for (s = 0; s < plen; s++) {
4981 if (n < pRExC_state->num_code_blocks
4982 && s == pRExC_state->code_blocks[n].start)
4984 s = pRExC_state->code_blocks[n].end;
4988 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4990 if (pat[s] == '(' && pat[s+1] == '?' &&
4991 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4998 /* Handle run-time code blocks. We will already have compiled any direct
4999 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5000 * copy of it, but with any literal code blocks blanked out and
5001 * appropriate chars escaped; then feed it into
5003 * eval "qr'modified_pattern'"
5007 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5011 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5013 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5014 * and merge them with any code blocks of the original regexp.
5016 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5017 * instead, just save the qr and return FALSE; this tells our caller that
5018 * the original pattern needs upgrading to utf8.
5022 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5023 char *pat, STRLEN plen)
5027 GET_RE_DEBUG_FLAGS_DECL;
5029 if (pRExC_state->runtime_code_qr) {
5030 /* this is the second time we've been called; this should
5031 * only happen if the main pattern got upgraded to utf8
5032 * during compilation; re-use the qr we compiled first time
5033 * round (which should be utf8 too)
5035 qr = pRExC_state->runtime_code_qr;
5036 pRExC_state->runtime_code_qr = NULL;
5037 assert(RExC_utf8 && SvUTF8(qr));
5043 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5047 /* determine how many extra chars we need for ' and \ escaping */
5048 for (s = 0; s < plen; s++) {
5049 if (pat[s] == '\'' || pat[s] == '\\')
5053 Newx(newpat, newlen, char);
5055 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5057 for (s = 0; s < plen; s++) {
5058 if (n < pRExC_state->num_code_blocks
5059 && s == pRExC_state->code_blocks[n].start)
5061 /* blank out literal code block */
5062 assert(pat[s] == '(');
5063 while (s <= pRExC_state->code_blocks[n].end) {
5071 if (pat[s] == '\'' || pat[s] == '\\')
5076 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5080 PerlIO_printf(Perl_debug_log,
5081 "%sre-parsing pattern for runtime code:%s %s\n",
5082 PL_colors[4],PL_colors[5],newpat);
5085 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5091 PUSHSTACKi(PERLSI_REQUIRE);
5092 /* this causes the toker to collapse \\ into \ when parsing
5093 * qr''; normally only q'' does this. It also alters hints
5095 PL_reg_state.re_reparsing = TRUE;
5096 eval_sv(sv, G_SCALAR);
5102 SV * const errsv = ERRSV;
5103 if (SvTRUE_NN(errsv))
5105 Safefree(pRExC_state->code_blocks);
5106 /* use croak_sv ? */
5107 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5110 assert(SvROK(qr_ref));
5112 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5113 /* the leaving below frees the tmp qr_ref.
5114 * Give qr a life of its own */
5122 if (!RExC_utf8 && SvUTF8(qr)) {
5123 /* first time through; the pattern got upgraded; save the
5124 * qr for the next time through */
5125 assert(!pRExC_state->runtime_code_qr);
5126 pRExC_state->runtime_code_qr = qr;
5131 /* extract any code blocks within the returned qr// */
5134 /* merge the main (r1) and run-time (r2) code blocks into one */
5136 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5137 struct reg_code_block *new_block, *dst;
5138 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5141 if (!r2->num_code_blocks) /* we guessed wrong */
5148 r1->num_code_blocks + r2->num_code_blocks,
5149 struct reg_code_block);
5152 while ( i1 < r1->num_code_blocks
5153 || i2 < r2->num_code_blocks)
5155 struct reg_code_block *src;
5158 if (i1 == r1->num_code_blocks) {
5159 src = &r2->code_blocks[i2++];
5162 else if (i2 == r2->num_code_blocks)
5163 src = &r1->code_blocks[i1++];
5164 else if ( r1->code_blocks[i1].start
5165 < r2->code_blocks[i2].start)
5167 src = &r1->code_blocks[i1++];
5168 assert(src->end < r2->code_blocks[i2].start);
5171 assert( r1->code_blocks[i1].start
5172 > r2->code_blocks[i2].start);
5173 src = &r2->code_blocks[i2++];
5175 assert(src->end < r1->code_blocks[i1].start);
5178 assert(pat[src->start] == '(');
5179 assert(pat[src->end] == ')');
5180 dst->start = src->start;
5181 dst->end = src->end;
5182 dst->block = src->block;
5183 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5187 r1->num_code_blocks += r2->num_code_blocks;
5188 Safefree(r1->code_blocks);
5189 r1->code_blocks = new_block;
5198 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5200 /* This is the common code for setting up the floating and fixed length
5201 * string data extracted from Perlre_op_compile() below. Returns a boolean
5202 * as to whether succeeded or not */
5206 if (! (longest_length
5207 || (eol /* Can't have SEOL and MULTI */
5208 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5210 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5211 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5216 /* copy the information about the longest from the reg_scan_data
5217 over to the program. */
5218 if (SvUTF8(sv_longest)) {
5219 *rx_utf8 = sv_longest;
5222 *rx_substr = sv_longest;
5225 /* end_shift is how many chars that must be matched that
5226 follow this item. We calculate it ahead of time as once the
5227 lookbehind offset is added in we lose the ability to correctly
5229 ml = minlen ? *(minlen) : (I32)longest_length;
5230 *rx_end_shift = ml - offset
5231 - longest_length + (SvTAIL(sv_longest) != 0)
5234 t = (eol/* Can't have SEOL and MULTI */
5235 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5236 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5242 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5243 * regular expression into internal code.
5244 * The pattern may be passed either as:
5245 * a list of SVs (patternp plus pat_count)
5246 * a list of OPs (expr)
5247 * If both are passed, the SV list is used, but the OP list indicates
5248 * which SVs are actually pre-compiled code blocks
5250 * The SVs in the list have magic and qr overloading applied to them (and
5251 * the list may be modified in-place with replacement SVs in the latter
5254 * If the pattern hasn't changed from old_re, then old_re will be
5257 * eng is the current engine. If that engine has an op_comp method, then
5258 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5259 * do the initial concatenation of arguments and pass on to the external
5262 * If is_bare_re is not null, set it to a boolean indicating whether the
5263 * arg list reduced (after overloading) to a single bare regex which has
5264 * been returned (i.e. /$qr/).
5266 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5268 * pm_flags contains the PMf_* flags, typically based on those from the
5269 * pm_flags field of the related PMOP. Currently we're only interested in
5270 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5272 * We can't allocate space until we know how big the compiled form will be,
5273 * but we can't compile it (and thus know how big it is) until we've got a
5274 * place to put the code. So we cheat: we compile it twice, once with code
5275 * generation turned off and size counting turned on, and once "for real".
5276 * This also means that we don't allocate space until we are sure that the
5277 * thing really will compile successfully, and we never have to move the
5278 * code and thus invalidate pointers into it. (Note that it has to be in
5279 * one piece because free() must be able to free it all.) [NB: not true in perl]
5281 * Beware that the optimization-preparation code in here knows about some
5282 * of the structure of the compiled regexp. [I'll say.]
5286 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5287 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5288 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5293 regexp_internal *ri;
5302 SV * VOL code_blocksv = NULL;
5304 /* these are all flags - maybe they should be turned
5305 * into a single int with different bit masks */
5306 I32 sawlookahead = 0;
5309 bool used_setjump = FALSE;
5310 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5311 bool code_is_utf8 = 0;
5312 bool VOL recompile = 0;
5313 bool runtime_code = 0;
5317 RExC_state_t RExC_state;
5318 RExC_state_t * const pRExC_state = &RExC_state;
5319 #ifdef TRIE_STUDY_OPT
5321 RExC_state_t copyRExC_state;
5323 GET_RE_DEBUG_FLAGS_DECL;
5325 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5327 DEBUG_r(if (!PL_colorset) reginitcolors());
5329 #ifndef PERL_IN_XSUB_RE
5330 /* Initialize these here instead of as-needed, as is quick and avoids
5331 * having to test them each time otherwise */
5332 if (! PL_AboveLatin1) {
5333 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5334 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5335 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5337 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5338 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5340 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5341 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5343 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5344 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5346 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5348 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5349 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5351 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5353 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5354 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5356 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5357 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5359 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5360 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5362 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5363 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5365 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5366 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5368 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5369 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5371 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5372 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5374 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5376 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5377 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5379 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5380 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5382 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5386 pRExC_state->code_blocks = NULL;
5387 pRExC_state->num_code_blocks = 0;
5390 *is_bare_re = FALSE;
5392 if (expr && (expr->op_type == OP_LIST ||
5393 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5395 /* is the source UTF8, and how many code blocks are there? */
5399 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5400 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5402 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5403 /* count of DO blocks */
5407 pRExC_state->num_code_blocks = ncode;
5408 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5413 /* handle a list of SVs */
5417 /* apply magic and RE overloading to each arg */
5418 for (svp = patternp; svp < patternp + pat_count; svp++) {
5421 if (SvROK(rx) && SvAMAGIC(rx)) {
5422 SV *sv = AMG_CALLunary(rx, regexp_amg);
5426 if (SvTYPE(sv) != SVt_REGEXP)
5427 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5433 if (pat_count > 1) {
5434 /* concat multiple args and find any code block indexes */
5439 STRLEN orig_patlen = 0;
5441 if (pRExC_state->num_code_blocks) {
5442 o = cLISTOPx(expr)->op_first;
5443 assert( o->op_type == OP_PUSHMARK
5444 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5445 || o->op_type == OP_PADRANGE);
5449 pat = newSVpvn("", 0);
5452 /* determine if the pattern is going to be utf8 (needed
5453 * in advance to align code block indices correctly).
5454 * XXX This could fail to be detected for an arg with
5455 * overloading but not concat overloading; but the main effect
5456 * in this obscure case is to need a 'use re eval' for a
5457 * literal code block */
5458 for (svp = patternp; svp < patternp + pat_count; svp++) {
5465 for (svp = patternp; svp < patternp + pat_count; svp++) {
5466 SV *sv, *msv = *svp;
5469 /* we make the assumption here that each op in the list of
5470 * op_siblings maps to one SV pushed onto the stack,
5471 * except for code blocks, with have both an OP_NULL and
5473 * This allows us to match up the list of SVs against the
5474 * list of OPs to find the next code block.
5476 * Note that PUSHMARK PADSV PADSV ..
5478 * PADRANGE NULL NULL ..
5479 * so the alignment still works. */
5481 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5482 assert(n < pRExC_state->num_code_blocks);
5483 pRExC_state->code_blocks[n].start = SvCUR(pat);
5484 pRExC_state->code_blocks[n].block = o;
5485 pRExC_state->code_blocks[n].src_regex = NULL;
5488 o = o->op_sibling; /* skip CONST */
5494 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5495 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5498 /* overloading involved: all bets are off over literal
5499 * code. Pretend we haven't seen it */
5500 pRExC_state->num_code_blocks -= n;
5506 while (SvAMAGIC(msv)
5507 && (sv = AMG_CALLunary(msv, string_amg))
5511 && SvRV(msv) == SvRV(sv))
5516 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5518 orig_patlen = SvCUR(pat);
5519 sv_catsv_nomg(pat, msv);
5522 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5525 /* extract any code blocks within any embedded qr//'s */
5526 if (rx && SvTYPE(rx) == SVt_REGEXP
5527 && RX_ENGINE((REGEXP*)rx)->op_comp)
5530 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5531 if (ri->num_code_blocks) {
5533 /* the presence of an embedded qr// with code means
5534 * we should always recompile: the text of the
5535 * qr// may not have changed, but it may be a
5536 * different closure than last time */
5538 Renew(pRExC_state->code_blocks,
5539 pRExC_state->num_code_blocks + ri->num_code_blocks,
5540 struct reg_code_block);
5541 pRExC_state->num_code_blocks += ri->num_code_blocks;
5542 for (i=0; i < ri->num_code_blocks; i++) {
5543 struct reg_code_block *src, *dst;
5544 STRLEN offset = orig_patlen
5545 + ReANY((REGEXP *)rx)->pre_prefix;
5546 assert(n < pRExC_state->num_code_blocks);
5547 src = &ri->code_blocks[i];
5548 dst = &pRExC_state->code_blocks[n];
5549 dst->start = src->start + offset;
5550 dst->end = src->end + offset;
5551 dst->block = src->block;
5552 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5566 while (SvAMAGIC(pat)
5567 && (sv = AMG_CALLunary(pat, string_amg))
5575 /* handle bare regex: foo =~ $re */
5580 if (SvTYPE(re) == SVt_REGEXP) {
5584 Safefree(pRExC_state->code_blocks);
5590 /* not a list of SVs, so must be a list of OPs */
5592 if (expr->op_type == OP_LIST) {
5597 pat = newSVpvn("", 0);
5602 /* given a list of CONSTs and DO blocks in expr, append all
5603 * the CONSTs to pat, and record the start and end of each
5604 * code block in code_blocks[] (each DO{} op is followed by an
5605 * OP_CONST containing the corresponding literal '(?{...})
5608 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5609 if (o->op_type == OP_CONST) {
5610 sv_catsv(pat, cSVOPo_sv);
5612 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5616 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5617 assert(i+1 < pRExC_state->num_code_blocks);
5618 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5619 pRExC_state->code_blocks[i].block = o;
5620 pRExC_state->code_blocks[i].src_regex = NULL;
5626 assert(expr->op_type == OP_CONST);
5627 pat = cSVOPx_sv(expr);
5631 exp = SvPV_nomg(pat, plen);
5633 if (!eng->op_comp) {
5634 if ((SvUTF8(pat) && IN_BYTES)
5635 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5637 /* make a temporary copy; either to convert to bytes,
5638 * or to avoid repeating get-magic / overloaded stringify */
5639 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5640 (IN_BYTES ? 0 : SvUTF8(pat)));
5642 Safefree(pRExC_state->code_blocks);
5643 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5646 /* ignore the utf8ness if the pattern is 0 length */
5647 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5648 RExC_uni_semantics = 0;
5649 RExC_contains_locale = 0;
5650 pRExC_state->runtime_code_qr = NULL;
5652 /****************** LONG JUMP TARGET HERE***********************/
5653 /* Longjmp back to here if have to switch in midstream to utf8 */
5654 if (! RExC_orig_utf8) {
5655 JMPENV_PUSH(jump_ret);
5656 used_setjump = TRUE;
5659 if (jump_ret == 0) { /* First time through */
5663 SV *dsv= sv_newmortal();
5664 RE_PV_QUOTED_DECL(s, RExC_utf8,
5665 dsv, exp, plen, 60);
5666 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5667 PL_colors[4],PL_colors[5],s);
5670 else { /* longjumped back */
5673 STRLEN s = 0, d = 0;
5676 /* If the cause for the longjmp was other than changing to utf8, pop
5677 * our own setjmp, and longjmp to the correct handler */
5678 if (jump_ret != UTF8_LONGJMP) {
5680 JMPENV_JUMP(jump_ret);
5685 /* It's possible to write a regexp in ascii that represents Unicode
5686 codepoints outside of the byte range, such as via \x{100}. If we
5687 detect such a sequence we have to convert the entire pattern to utf8
5688 and then recompile, as our sizing calculation will have been based
5689 on 1 byte == 1 character, but we will need to use utf8 to encode
5690 at least some part of the pattern, and therefore must convert the whole
5693 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5694 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5696 /* upgrade pattern to UTF8, and if there are code blocks,
5697 * recalculate the indices.
5698 * This is essentially an unrolled Perl_bytes_to_utf8() */
5700 src = (U8*)SvPV_nomg(pat, plen);
5701 Newx(dst, plen * 2 + 1, U8);
5704 const UV uv = NATIVE_TO_ASCII(src[s]);
5705 if (UNI_IS_INVARIANT(uv))
5706 dst[d] = (U8)UTF_TO_NATIVE(uv);
5708 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5709 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5711 if (n < pRExC_state->num_code_blocks) {
5712 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5713 pRExC_state->code_blocks[n].start = d;
5714 assert(dst[d] == '(');
5717 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5718 pRExC_state->code_blocks[n].end = d;
5719 assert(dst[d] == ')');
5732 RExC_orig_utf8 = RExC_utf8 = 1;
5735 /* return old regex if pattern hasn't changed */
5739 && !!RX_UTF8(old_re) == !!RExC_utf8
5740 && RX_PRECOMP(old_re)
5741 && RX_PRELEN(old_re) == plen
5742 && memEQ(RX_PRECOMP(old_re), exp, plen))
5744 /* with runtime code, always recompile */
5745 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5747 if (!runtime_code) {
5751 Safefree(pRExC_state->code_blocks);
5755 else if ((pm_flags & PMf_USE_RE_EVAL)
5756 /* this second condition covers the non-regex literal case,
5757 * i.e. $foo =~ '(?{})'. */
5758 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5759 && (PL_hints & HINT_RE_EVAL))
5761 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5764 #ifdef TRIE_STUDY_OPT
5768 rx_flags = orig_rx_flags;
5770 if (initial_charset == REGEX_LOCALE_CHARSET) {
5771 RExC_contains_locale = 1;
5773 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5775 /* Set to use unicode semantics if the pattern is in utf8 and has the
5776 * 'depends' charset specified, as it means unicode when utf8 */
5777 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5781 RExC_flags = rx_flags;
5782 RExC_pm_flags = pm_flags;
5785 if (TAINTING_get && TAINT_get)
5786 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5788 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5789 /* whoops, we have a non-utf8 pattern, whilst run-time code
5790 * got compiled as utf8. Try again with a utf8 pattern */
5791 JMPENV_JUMP(UTF8_LONGJMP);
5794 assert(!pRExC_state->runtime_code_qr);
5799 RExC_in_lookbehind = 0;
5800 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5802 RExC_override_recoding = 0;
5803 RExC_in_multi_char_class = 0;
5805 /* First pass: determine size, legality. */
5813 RExC_emit = &PL_regdummy;
5814 RExC_whilem_seen = 0;
5815 RExC_open_parens = NULL;
5816 RExC_close_parens = NULL;
5818 RExC_paren_names = NULL;
5820 RExC_paren_name_list = NULL;
5822 RExC_recurse = NULL;
5823 RExC_recurse_count = 0;
5824 pRExC_state->code_index = 0;
5826 #if 0 /* REGC() is (currently) a NOP at the first pass.
5827 * Clever compilers notice this and complain. --jhi */
5828 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5831 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5833 RExC_lastparse=NULL;
5835 /* reg may croak on us, not giving us a chance to free
5836 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5837 need it to survive as long as the regexp (qr/(?{})/).
5838 We must check that code_blocksv is not already set, because we may
5839 have longjmped back. */
5840 if (pRExC_state->code_blocks && !code_blocksv) {
5841 code_blocksv = newSV_type(SVt_PV);
5842 SAVEFREESV(code_blocksv);
5843 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5844 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5846 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5847 RExC_precomp = NULL;
5851 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5853 /* Here, finished first pass. Get rid of any added setjmp */
5859 PerlIO_printf(Perl_debug_log,
5860 "Required size %"IVdf" nodes\n"
5861 "Starting second pass (creation)\n",
5864 RExC_lastparse=NULL;
5867 /* The first pass could have found things that force Unicode semantics */
5868 if ((RExC_utf8 || RExC_uni_semantics)
5869 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5871 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5874 /* Small enough for pointer-storage convention?
5875 If extralen==0, this means that we will not need long jumps. */
5876 if (RExC_size >= 0x10000L && RExC_extralen)
5877 RExC_size += RExC_extralen;
5880 if (RExC_whilem_seen > 15)
5881 RExC_whilem_seen = 15;
5883 /* Allocate space and zero-initialize. Note, the two step process
5884 of zeroing when in debug mode, thus anything assigned has to
5885 happen after that */
5886 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5888 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5889 char, regexp_internal);
5890 if ( r == NULL || ri == NULL )
5891 FAIL("Regexp out of space");
5893 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5894 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5896 /* bulk initialize base fields with 0. */
5897 Zero(ri, sizeof(regexp_internal), char);
5900 /* non-zero initialization begins here */
5903 r->extflags = rx_flags;
5904 if (pm_flags & PMf_IS_QR) {
5905 ri->code_blocks = pRExC_state->code_blocks;
5906 ri->num_code_blocks = pRExC_state->num_code_blocks;
5911 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5912 if (pRExC_state->code_blocks[n].src_regex)
5913 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5914 SAVEFREEPV(pRExC_state->code_blocks);
5918 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5919 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5921 /* The caret is output if there are any defaults: if not all the STD
5922 * flags are set, or if no character set specifier is needed */
5924 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5926 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5927 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5928 >> RXf_PMf_STD_PMMOD_SHIFT);
5929 const char *fptr = STD_PAT_MODS; /*"msix"*/
5931 /* Allocate for the worst case, which is all the std flags are turned
5932 * on. If more precision is desired, we could do a population count of
5933 * the flags set. This could be done with a small lookup table, or by
5934 * shifting, masking and adding, or even, when available, assembly
5935 * language for a machine-language population count.
5936 * We never output a minus, as all those are defaults, so are
5937 * covered by the caret */
5938 const STRLEN wraplen = plen + has_p + has_runon
5939 + has_default /* If needs a caret */
5941 /* If needs a character set specifier */
5942 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5943 + (sizeof(STD_PAT_MODS) - 1)
5944 + (sizeof("(?:)") - 1);
5946 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5947 r->xpv_len_u.xpvlenu_pv = p;
5949 SvFLAGS(rx) |= SVf_UTF8;
5952 /* If a default, cover it using the caret */
5954 *p++= DEFAULT_PAT_MOD;
5958 const char* const name = get_regex_charset_name(r->extflags, &len);
5959 Copy(name, p, len, char);
5963 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5966 while((ch = *fptr++)) {
5974 Copy(RExC_precomp, p, plen, char);
5975 assert ((RX_WRAPPED(rx) - p) < 16);
5976 r->pre_prefix = p - RX_WRAPPED(rx);
5982 SvCUR_set(rx, p - RX_WRAPPED(rx));
5986 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5988 if (RExC_seen & REG_SEEN_RECURSE) {
5989 Newxz(RExC_open_parens, RExC_npar,regnode *);
5990 SAVEFREEPV(RExC_open_parens);
5991 Newxz(RExC_close_parens,RExC_npar,regnode *);
5992 SAVEFREEPV(RExC_close_parens);
5995 /* Useful during FAIL. */
5996 #ifdef RE_TRACK_PATTERN_OFFSETS
5997 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5998 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5999 "%s %"UVuf" bytes for offset annotations.\n",
6000 ri->u.offsets ? "Got" : "Couldn't get",
6001 (UV)((2*RExC_size+1) * sizeof(U32))));
6003 SetProgLen(ri,RExC_size);
6007 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6009 /* Second pass: emit code. */
6010 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6011 RExC_pm_flags = pm_flags;
6016 RExC_emit_start = ri->program;
6017 RExC_emit = ri->program;
6018 RExC_emit_bound = ri->program + RExC_size + 1;
6019 pRExC_state->code_index = 0;
6021 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6022 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6026 /* XXXX To minimize changes to RE engine we always allocate
6027 3-units-long substrs field. */
6028 Newx(r->substrs, 1, struct reg_substr_data);
6029 if (RExC_recurse_count) {
6030 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6031 SAVEFREEPV(RExC_recurse);
6035 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6036 Zero(r->substrs, 1, struct reg_substr_data);
6038 #ifdef TRIE_STUDY_OPT
6040 StructCopy(&zero_scan_data, &data, scan_data_t);
6041 copyRExC_state = RExC_state;
6044 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6046 RExC_state = copyRExC_state;
6047 if (seen & REG_TOP_LEVEL_BRANCHES)
6048 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6050 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6051 StructCopy(&zero_scan_data, &data, scan_data_t);
6054 StructCopy(&zero_scan_data, &data, scan_data_t);
6057 /* Dig out information for optimizations. */
6058 r->extflags = RExC_flags; /* was pm_op */
6059 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6062 SvUTF8_on(rx); /* Unicode in it? */
6063 ri->regstclass = NULL;
6064 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6065 r->intflags |= PREGf_NAUGHTY;
6066 scan = ri->program + 1; /* First BRANCH. */
6068 /* testing for BRANCH here tells us whether there is "must appear"
6069 data in the pattern. If there is then we can use it for optimisations */
6070 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6072 STRLEN longest_float_length, longest_fixed_length;
6073 struct regnode_charclass_class ch_class; /* pointed to by data */
6075 I32 last_close = 0; /* pointed to by data */
6076 regnode *first= scan;
6077 regnode *first_next= regnext(first);
6079 * Skip introductions and multiplicators >= 1
6080 * so that we can extract the 'meat' of the pattern that must
6081 * match in the large if() sequence following.
6082 * NOTE that EXACT is NOT covered here, as it is normally
6083 * picked up by the optimiser separately.
6085 * This is unfortunate as the optimiser isnt handling lookahead
6086 * properly currently.
6089 while ((OP(first) == OPEN && (sawopen = 1)) ||
6090 /* An OR of *one* alternative - should not happen now. */
6091 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6092 /* for now we can't handle lookbehind IFMATCH*/
6093 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6094 (OP(first) == PLUS) ||
6095 (OP(first) == MINMOD) ||
6096 /* An {n,m} with n>0 */
6097 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6098 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6101 * the only op that could be a regnode is PLUS, all the rest
6102 * will be regnode_1 or regnode_2.
6105 if (OP(first) == PLUS)
6108 first += regarglen[OP(first)];
6110 first = NEXTOPER(first);
6111 first_next= regnext(first);
6114 /* Starting-point info. */
6116 DEBUG_PEEP("first:",first,0);
6117 /* Ignore EXACT as we deal with it later. */
6118 if (PL_regkind[OP(first)] == EXACT) {
6119 if (OP(first) == EXACT)
6120 NOOP; /* Empty, get anchored substr later. */
6122 ri->regstclass = first;
6125 else if (PL_regkind[OP(first)] == TRIE &&
6126 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6129 /* this can happen only on restudy */
6130 if ( OP(first) == TRIE ) {
6131 struct regnode_1 *trieop = (struct regnode_1 *)
6132 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6133 StructCopy(first,trieop,struct regnode_1);
6134 trie_op=(regnode *)trieop;
6136 struct regnode_charclass *trieop = (struct regnode_charclass *)
6137 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6138 StructCopy(first,trieop,struct regnode_charclass);
6139 trie_op=(regnode *)trieop;
6142 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6143 ri->regstclass = trie_op;
6146 else if (REGNODE_SIMPLE(OP(first)))
6147 ri->regstclass = first;
6148 else if (PL_regkind[OP(first)] == BOUND ||
6149 PL_regkind[OP(first)] == NBOUND)
6150 ri->regstclass = first;
6151 else if (PL_regkind[OP(first)] == BOL) {
6152 r->extflags |= (OP(first) == MBOL
6154 : (OP(first) == SBOL
6157 first = NEXTOPER(first);
6160 else if (OP(first) == GPOS) {
6161 r->extflags |= RXf_ANCH_GPOS;
6162 first = NEXTOPER(first);
6165 else if ((!sawopen || !RExC_sawback) &&
6166 (OP(first) == STAR &&
6167 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6168 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6170 /* turn .* into ^.* with an implied $*=1 */
6172 (OP(NEXTOPER(first)) == REG_ANY)
6175 r->extflags |= type;
6176 r->intflags |= PREGf_IMPLICIT;
6177 first = NEXTOPER(first);
6180 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6181 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6182 /* x+ must match at the 1st pos of run of x's */
6183 r->intflags |= PREGf_SKIP;
6185 /* Scan is after the zeroth branch, first is atomic matcher. */
6186 #ifdef TRIE_STUDY_OPT
6189 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6190 (IV)(first - scan + 1))
6194 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6195 (IV)(first - scan + 1))
6201 * If there's something expensive in the r.e., find the
6202 * longest literal string that must appear and make it the
6203 * regmust. Resolve ties in favor of later strings, since
6204 * the regstart check works with the beginning of the r.e.
6205 * and avoiding duplication strengthens checking. Not a
6206 * strong reason, but sufficient in the absence of others.
6207 * [Now we resolve ties in favor of the earlier string if
6208 * it happens that c_offset_min has been invalidated, since the
6209 * earlier string may buy us something the later one won't.]
6212 data.longest_fixed = newSVpvs("");
6213 data.longest_float = newSVpvs("");
6214 data.last_found = newSVpvs("");
6215 data.longest = &(data.longest_fixed);
6216 ENTER_with_name("study_chunk");
6217 SAVEFREESV(data.longest_fixed);
6218 SAVEFREESV(data.longest_float);
6219 SAVEFREESV(data.last_found);
6221 if (!ri->regstclass) {
6222 cl_init(pRExC_state, &ch_class);
6223 data.start_class = &ch_class;
6224 stclass_flag = SCF_DO_STCLASS_AND;
6225 } else /* XXXX Check for BOUND? */
6227 data.last_closep = &last_close;
6229 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6230 &data, -1, NULL, NULL,
6231 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6234 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6237 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6238 && data.last_start_min == 0 && data.last_end > 0
6239 && !RExC_seen_zerolen
6240 && !(RExC_seen & REG_SEEN_VERBARG)
6241 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6242 r->extflags |= RXf_CHECK_ALL;
6243 scan_commit(pRExC_state, &data,&minlen,0);
6245 longest_float_length = CHR_SVLEN(data.longest_float);
6247 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6248 && data.offset_fixed == data.offset_float_min
6249 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6250 && S_setup_longest (aTHX_ pRExC_state,
6254 &(r->float_end_shift),
6255 data.lookbehind_float,
6256 data.offset_float_min,
6258 longest_float_length,
6259 data.flags & SF_FL_BEFORE_EOL,
6260 data.flags & SF_FL_BEFORE_MEOL))
6262 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6263 r->float_max_offset = data.offset_float_max;
6264 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6265 r->float_max_offset -= data.lookbehind_float;
6266 SvREFCNT_inc_simple_void_NN(data.longest_float);
6269 r->float_substr = r->float_utf8 = NULL;
6270 longest_float_length = 0;
6273 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6275 if (S_setup_longest (aTHX_ pRExC_state,
6277 &(r->anchored_utf8),
6278 &(r->anchored_substr),
6279 &(r->anchored_end_shift),
6280 data.lookbehind_fixed,
6283 longest_fixed_length,
6284 data.flags & SF_FIX_BEFORE_EOL,
6285 data.flags & SF_FIX_BEFORE_MEOL))
6287 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6288 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6291 r->anchored_substr = r->anchored_utf8 = NULL;
6292 longest_fixed_length = 0;
6294 LEAVE_with_name("study_chunk");
6297 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6298 ri->regstclass = NULL;
6300 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6302 && !(data.start_class->flags & ANYOF_EOS)
6303 && !cl_is_anything(data.start_class))
6305 const U32 n = add_data(pRExC_state, 1, "f");
6306 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6308 Newx(RExC_rxi->data->data[n], 1,
6309 struct regnode_charclass_class);
6310 StructCopy(data.start_class,
6311 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6312 struct regnode_charclass_class);
6313 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6314 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6315 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6316 regprop(r, sv, (regnode*)data.start_class);
6317 PerlIO_printf(Perl_debug_log,
6318 "synthetic stclass \"%s\".\n",
6319 SvPVX_const(sv));});
6322 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6323 if (longest_fixed_length > longest_float_length) {
6324 r->check_end_shift = r->anchored_end_shift;
6325 r->check_substr = r->anchored_substr;
6326 r->check_utf8 = r->anchored_utf8;
6327 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6328 if (r->extflags & RXf_ANCH_SINGLE)
6329 r->extflags |= RXf_NOSCAN;
6332 r->check_end_shift = r->float_end_shift;
6333 r->check_substr = r->float_substr;
6334 r->check_utf8 = r->float_utf8;
6335 r->check_offset_min = r->float_min_offset;
6336 r->check_offset_max = r->float_max_offset;
6338 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6339 This should be changed ASAP! */
6340 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6341 r->extflags |= RXf_USE_INTUIT;
6342 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6343 r->extflags |= RXf_INTUIT_TAIL;
6345 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6346 if ( (STRLEN)minlen < longest_float_length )
6347 minlen= longest_float_length;
6348 if ( (STRLEN)minlen < longest_fixed_length )
6349 minlen= longest_fixed_length;
6353 /* Several toplevels. Best we can is to set minlen. */
6355 struct regnode_charclass_class ch_class;
6358 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6360 scan = ri->program + 1;
6361 cl_init(pRExC_state, &ch_class);
6362 data.start_class = &ch_class;
6363 data.last_closep = &last_close;
6366 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6367 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6369 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6371 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6372 = r->float_substr = r->float_utf8 = NULL;
6374 if (!(data.start_class->flags & ANYOF_EOS)
6375 && !cl_is_anything(data.start_class))
6377 const U32 n = add_data(pRExC_state, 1, "f");
6378 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6380 Newx(RExC_rxi->data->data[n], 1,
6381 struct regnode_charclass_class);
6382 StructCopy(data.start_class,
6383 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6384 struct regnode_charclass_class);
6385 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6386 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6387 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6388 regprop(r, sv, (regnode*)data.start_class);
6389 PerlIO_printf(Perl_debug_log,
6390 "synthetic stclass \"%s\".\n",
6391 SvPVX_const(sv));});
6395 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6396 the "real" pattern. */
6398 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6399 (IV)minlen, (IV)r->minlen);
6401 r->minlenret = minlen;
6402 if (r->minlen < minlen)
6405 if (RExC_seen & REG_SEEN_GPOS)
6406 r->extflags |= RXf_GPOS_SEEN;
6407 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6408 r->extflags |= RXf_LOOKBEHIND_SEEN;
6409 if (pRExC_state->num_code_blocks)
6410 r->extflags |= RXf_EVAL_SEEN;
6411 if (RExC_seen & REG_SEEN_CANY)
6412 r->extflags |= RXf_CANY_SEEN;
6413 if (RExC_seen & REG_SEEN_VERBARG)
6415 r->intflags |= PREGf_VERBARG_SEEN;
6416 r->extflags |= RXf_MODIFIES_VARS;
6418 if (RExC_seen & REG_SEEN_CUTGROUP)
6419 r->intflags |= PREGf_CUTGROUP_SEEN;
6420 if (pm_flags & PMf_USE_RE_EVAL)
6421 r->intflags |= PREGf_USE_RE_EVAL;
6422 if (RExC_paren_names)
6423 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6425 RXp_PAREN_NAMES(r) = NULL;
6427 #ifdef STUPID_PATTERN_CHECKS
6428 if (RX_PRELEN(rx) == 0)
6429 r->extflags |= RXf_NULL;
6430 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6431 r->extflags |= RXf_WHITE;
6432 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6433 r->extflags |= RXf_START_ONLY;
6436 regnode *first = ri->program + 1;
6439 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6440 r->extflags |= RXf_NULL;
6441 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6442 r->extflags |= RXf_START_ONLY;
6443 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6444 && OP(regnext(first)) == END)
6445 r->extflags |= RXf_WHITE;
6449 if (RExC_paren_names) {
6450 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6451 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6454 ri->name_list_idx = 0;
6456 if (RExC_recurse_count) {
6457 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6458 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6459 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6462 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6463 /* assume we don't need to swap parens around before we match */
6466 PerlIO_printf(Perl_debug_log,"Final program:\n");
6469 #ifdef RE_TRACK_PATTERN_OFFSETS
6470 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6471 const U32 len = ri->u.offsets[0];
6473 GET_RE_DEBUG_FLAGS_DECL;
6474 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6475 for (i = 1; i <= len; i++) {
6476 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6477 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6478 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6480 PerlIO_printf(Perl_debug_log, "\n");
6488 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6491 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6493 PERL_UNUSED_ARG(value);
6495 if (flags & RXapif_FETCH) {
6496 return reg_named_buff_fetch(rx, key, flags);
6497 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6498 Perl_croak_no_modify();
6500 } else if (flags & RXapif_EXISTS) {
6501 return reg_named_buff_exists(rx, key, flags)
6504 } else if (flags & RXapif_REGNAMES) {
6505 return reg_named_buff_all(rx, flags);
6506 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6507 return reg_named_buff_scalar(rx, flags);
6509 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6515 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6518 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6519 PERL_UNUSED_ARG(lastkey);
6521 if (flags & RXapif_FIRSTKEY)
6522 return reg_named_buff_firstkey(rx, flags);
6523 else if (flags & RXapif_NEXTKEY)
6524 return reg_named_buff_nextkey(rx, flags);
6526 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6532 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6535 AV *retarray = NULL;
6537 struct regexp *const rx = ReANY(r);
6539 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6541 if (flags & RXapif_ALL)
6544 if (rx && RXp_PAREN_NAMES(rx)) {
6545 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6548 SV* sv_dat=HeVAL(he_str);
6549 I32 *nums=(I32*)SvPVX(sv_dat);
6550 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6551 if ((I32)(rx->nparens) >= nums[i]
6552 && rx->offs[nums[i]].start != -1
6553 && rx->offs[nums[i]].end != -1)
6556 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6561 ret = newSVsv(&PL_sv_undef);
6564 av_push(retarray, ret);
6567 return newRV_noinc(MUTABLE_SV(retarray));
6574 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6577 struct regexp *const rx = ReANY(r);
6579 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6581 if (rx && RXp_PAREN_NAMES(rx)) {
6582 if (flags & RXapif_ALL) {
6583 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6585 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6599 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6601 struct regexp *const rx = ReANY(r);
6603 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6605 if ( rx && RXp_PAREN_NAMES(rx) ) {
6606 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6608 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6615 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6617 struct regexp *const rx = ReANY(r);
6618 GET_RE_DEBUG_FLAGS_DECL;
6620 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6622 if (rx && RXp_PAREN_NAMES(rx)) {
6623 HV *hv = RXp_PAREN_NAMES(rx);
6625 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6628 SV* sv_dat = HeVAL(temphe);
6629 I32 *nums = (I32*)SvPVX(sv_dat);
6630 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6631 if ((I32)(rx->lastparen) >= nums[i] &&
6632 rx->offs[nums[i]].start != -1 &&
6633 rx->offs[nums[i]].end != -1)
6639 if (parno || flags & RXapif_ALL) {
6640 return newSVhek(HeKEY_hek(temphe));
6648 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6653 struct regexp *const rx = ReANY(r);
6655 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6657 if (rx && RXp_PAREN_NAMES(rx)) {
6658 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6659 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6660 } else if (flags & RXapif_ONE) {
6661 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6662 av = MUTABLE_AV(SvRV(ret));
6663 length = av_len(av);
6665 return newSViv(length + 1);
6667 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6671 return &PL_sv_undef;
6675 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6677 struct regexp *const rx = ReANY(r);
6680 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6682 if (rx && RXp_PAREN_NAMES(rx)) {
6683 HV *hv= RXp_PAREN_NAMES(rx);
6685 (void)hv_iterinit(hv);
6686 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6689 SV* sv_dat = HeVAL(temphe);
6690 I32 *nums = (I32*)SvPVX(sv_dat);
6691 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6692 if ((I32)(rx->lastparen) >= nums[i] &&
6693 rx->offs[nums[i]].start != -1 &&
6694 rx->offs[nums[i]].end != -1)
6700 if (parno || flags & RXapif_ALL) {
6701 av_push(av, newSVhek(HeKEY_hek(temphe)));
6706 return newRV_noinc(MUTABLE_SV(av));
6710 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6713 struct regexp *const rx = ReANY(r);
6719 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6721 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6722 || n == RX_BUFF_IDX_CARET_FULLMATCH
6723 || n == RX_BUFF_IDX_CARET_POSTMATCH
6725 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6732 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6733 /* no need to distinguish between them any more */
6734 n = RX_BUFF_IDX_FULLMATCH;
6736 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6737 && rx->offs[0].start != -1)
6739 /* $`, ${^PREMATCH} */
6740 i = rx->offs[0].start;
6744 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6745 && rx->offs[0].end != -1)
6747 /* $', ${^POSTMATCH} */
6748 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6749 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6752 if ( 0 <= n && n <= (I32)rx->nparens &&
6753 (s1 = rx->offs[n].start) != -1 &&
6754 (t1 = rx->offs[n].end) != -1)
6756 /* $&, ${^MATCH}, $1 ... */
6758 s = rx->subbeg + s1 - rx->suboffset;
6763 assert(s >= rx->subbeg);
6764 assert(rx->sublen >= (s - rx->subbeg) + i );
6766 #if NO_TAINT_SUPPORT
6767 sv_setpvn(sv, s, i);
6769 const int oldtainted = TAINT_get;
6771 sv_setpvn(sv, s, i);
6772 TAINT_set(oldtainted);
6774 if ( (rx->extflags & RXf_CANY_SEEN)
6775 ? (RXp_MATCH_UTF8(rx)
6776 && (!i || is_utf8_string((U8*)s, i)))
6777 : (RXp_MATCH_UTF8(rx)) )
6784 if (RXp_MATCH_TAINTED(rx)) {
6785 if (SvTYPE(sv) >= SVt_PVMG) {
6786 MAGIC* const mg = SvMAGIC(sv);
6789 SvMAGIC_set(sv, mg->mg_moremagic);
6791 if ((mgt = SvMAGIC(sv))) {
6792 mg->mg_moremagic = mgt;
6793 SvMAGIC_set(sv, mg);
6804 sv_setsv(sv,&PL_sv_undef);
6810 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6811 SV const * const value)
6813 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6815 PERL_UNUSED_ARG(rx);
6816 PERL_UNUSED_ARG(paren);
6817 PERL_UNUSED_ARG(value);
6820 Perl_croak_no_modify();
6824 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6827 struct regexp *const rx = ReANY(r);
6831 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6833 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6835 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6836 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6840 case RX_BUFF_IDX_PREMATCH: /* $` */
6841 if (rx->offs[0].start != -1) {
6842 i = rx->offs[0].start;
6851 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6852 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6854 case RX_BUFF_IDX_POSTMATCH: /* $' */
6855 if (rx->offs[0].end != -1) {
6856 i = rx->sublen - rx->offs[0].end;
6858 s1 = rx->offs[0].end;
6865 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6866 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6870 /* $& / ${^MATCH}, $1, $2, ... */
6872 if (paren <= (I32)rx->nparens &&
6873 (s1 = rx->offs[paren].start) != -1 &&
6874 (t1 = rx->offs[paren].end) != -1)
6880 if (ckWARN(WARN_UNINITIALIZED))
6881 report_uninit((const SV *)sv);
6886 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6887 const char * const s = rx->subbeg - rx->suboffset + s1;
6892 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6899 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6901 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6902 PERL_UNUSED_ARG(rx);
6906 return newSVpvs("Regexp");
6909 /* Scans the name of a named buffer from the pattern.
6910 * If flags is REG_RSN_RETURN_NULL returns null.
6911 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6912 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6913 * to the parsed name as looked up in the RExC_paren_names hash.
6914 * If there is an error throws a vFAIL().. type exception.
6917 #define REG_RSN_RETURN_NULL 0
6918 #define REG_RSN_RETURN_NAME 1
6919 #define REG_RSN_RETURN_DATA 2
6922 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6924 char *name_start = RExC_parse;
6926 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6928 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6929 /* skip IDFIRST by using do...while */
6932 RExC_parse += UTF8SKIP(RExC_parse);
6933 } while (isALNUM_utf8((U8*)RExC_parse));
6937 } while (isALNUM(*RExC_parse));
6939 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6940 vFAIL("Group name must start with a non-digit word character");
6944 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6945 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6946 if ( flags == REG_RSN_RETURN_NAME)
6948 else if (flags==REG_RSN_RETURN_DATA) {
6951 if ( ! sv_name ) /* should not happen*/
6952 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6953 if (RExC_paren_names)
6954 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6956 sv_dat = HeVAL(he_str);
6958 vFAIL("Reference to nonexistent named group");
6962 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6963 (unsigned long) flags);
6965 assert(0); /* NOT REACHED */
6970 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6971 int rem=(int)(RExC_end - RExC_parse); \
6980 if (RExC_lastparse!=RExC_parse) \
6981 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6984 iscut ? "..." : "<" \
6987 PerlIO_printf(Perl_debug_log,"%16s",""); \
6990 num = RExC_size + 1; \
6992 num=REG_NODE_NUM(RExC_emit); \
6993 if (RExC_lastnum!=num) \
6994 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6996 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6997 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6998 (int)((depth*2)), "", \
7002 RExC_lastparse=RExC_parse; \
7007 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7008 DEBUG_PARSE_MSG((funcname)); \
7009 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7011 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7012 DEBUG_PARSE_MSG((funcname)); \
7013 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7016 /* This section of code defines the inversion list object and its methods. The
7017 * interfaces are highly subject to change, so as much as possible is static to
7018 * this file. An inversion list is here implemented as a malloc'd C UV array
7019 * with some added info that is placed as UVs at the beginning in a header
7020 * portion. An inversion list for Unicode is an array of code points, sorted
7021 * by ordinal number. The zeroth element is the first code point in the list.
7022 * The 1th element is the first element beyond that not in the list. In other
7023 * words, the first range is
7024 * invlist[0]..(invlist[1]-1)
7025 * The other ranges follow. Thus every element whose index is divisible by two
7026 * marks the beginning of a range that is in the list, and every element not
7027 * divisible by two marks the beginning of a range not in the list. A single
7028 * element inversion list that contains the single code point N generally
7029 * consists of two elements
7032 * (The exception is when N is the highest representable value on the
7033 * machine, in which case the list containing just it would be a single
7034 * element, itself. By extension, if the last range in the list extends to
7035 * infinity, then the first element of that range will be in the inversion list
7036 * at a position that is divisible by two, and is the final element in the
7038 * Taking the complement (inverting) an inversion list is quite simple, if the
7039 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7040 * This implementation reserves an element at the beginning of each inversion
7041 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
7042 * actual beginning of the list is either that element if 0, or the next one if
7045 * More about inversion lists can be found in "Unicode Demystified"
7046 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7047 * More will be coming when functionality is added later.
7049 * The inversion list data structure is currently implemented as an SV pointing
7050 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7051 * array of UV whose memory management is automatically handled by the existing
7052 * facilities for SV's.
7054 * Some of the methods should always be private to the implementation, and some
7055 * should eventually be made public */
7057 /* The header definitions are in F<inline_invlist.c> */
7059 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7060 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7062 #define INVLIST_INITIAL_LEN 10
7064 PERL_STATIC_INLINE UV*
7065 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7067 /* Returns a pointer to the first element in the inversion list's array.
7068 * This is called upon initialization of an inversion list. Where the
7069 * array begins depends on whether the list has the code point U+0000
7070 * in it or not. The other parameter tells it whether the code that
7071 * follows this call is about to put a 0 in the inversion list or not.
7072 * The first element is either the element with 0, if 0, or the next one,
7075 UV* zero = get_invlist_zero_addr(invlist);
7077 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7080 assert(! *_get_invlist_len_addr(invlist));
7082 /* 1^1 = 0; 1^0 = 1 */
7083 *zero = 1 ^ will_have_0;
7084 return zero + *zero;
7087 PERL_STATIC_INLINE UV*
7088 S_invlist_array(pTHX_ SV* const invlist)
7090 /* Returns the pointer to the inversion list's array. Every time the
7091 * length changes, this needs to be called in case malloc or realloc moved
7094 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7096 /* Must not be empty. If these fail, you probably didn't check for <len>
7097 * being non-zero before trying to get the array */
7098 assert(*_get_invlist_len_addr(invlist));
7099 assert(*get_invlist_zero_addr(invlist) == 0
7100 || *get_invlist_zero_addr(invlist) == 1);
7102 /* The array begins either at the element reserved for zero if the
7103 * list contains 0 (that element will be set to 0), or otherwise the next
7104 * element (in which case the reserved element will be set to 1). */
7105 return (UV *) (get_invlist_zero_addr(invlist)
7106 + *get_invlist_zero_addr(invlist));
7109 PERL_STATIC_INLINE void
7110 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7112 /* Sets the current number of elements stored in the inversion list */
7114 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7116 *_get_invlist_len_addr(invlist) = len;
7118 assert(len <= SvLEN(invlist));
7120 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7121 /* If the list contains U+0000, that element is part of the header,
7122 * and should not be counted as part of the array. It will contain
7123 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7125 * SvCUR_set(invlist,
7126 * TO_INTERNAL_SIZE(len
7127 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7128 * But, this is only valid if len is not 0. The consequences of not doing
7129 * this is that the memory allocation code may think that 1 more UV is
7130 * being used than actually is, and so might do an unnecessary grow. That
7131 * seems worth not bothering to make this the precise amount.
7133 * Note that when inverting, SvCUR shouldn't change */
7136 PERL_STATIC_INLINE IV*
7137 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7139 /* Return the address of the UV that is reserved to hold the cached index
7142 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7144 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7147 PERL_STATIC_INLINE IV
7148 S_invlist_previous_index(pTHX_ SV* const invlist)
7150 /* Returns cached index of previous search */
7152 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7154 return *get_invlist_previous_index_addr(invlist);
7157 PERL_STATIC_INLINE void
7158 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7160 /* Caches <index> for later retrieval */
7162 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7164 assert(index == 0 || index < (int) _invlist_len(invlist));
7166 *get_invlist_previous_index_addr(invlist) = index;
7169 PERL_STATIC_INLINE UV
7170 S_invlist_max(pTHX_ SV* const invlist)
7172 /* Returns the maximum number of elements storable in the inversion list's
7173 * array, without having to realloc() */
7175 PERL_ARGS_ASSERT_INVLIST_MAX;
7177 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7180 PERL_STATIC_INLINE UV*
7181 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7183 /* Return the address of the UV that is reserved to hold 0 if the inversion
7184 * list contains 0. This has to be the last element of the heading, as the
7185 * list proper starts with either it if 0, or the next element if not.
7186 * (But we force it to contain either 0 or 1) */
7188 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7190 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7193 #ifndef PERL_IN_XSUB_RE
7195 Perl__new_invlist(pTHX_ IV initial_size)
7198 /* Return a pointer to a newly constructed inversion list, with enough
7199 * space to store 'initial_size' elements. If that number is negative, a
7200 * system default is used instead */
7204 if (initial_size < 0) {
7205 initial_size = INVLIST_INITIAL_LEN;
7208 /* Allocate the initial space */
7209 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7210 invlist_set_len(new_list, 0);
7212 /* Force iterinit() to be used to get iteration to work */
7213 *get_invlist_iter_addr(new_list) = UV_MAX;
7215 /* This should force a segfault if a method doesn't initialize this
7217 *get_invlist_zero_addr(new_list) = UV_MAX;
7219 *get_invlist_previous_index_addr(new_list) = 0;
7220 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7221 #if HEADER_LENGTH != 5
7222 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7230 S__new_invlist_C_array(pTHX_ UV* list)
7232 /* Return a pointer to a newly constructed inversion list, initialized to
7233 * point to <list>, which has to be in the exact correct inversion list
7234 * form, including internal fields. Thus this is a dangerous routine that
7235 * should not be used in the wrong hands */
7237 SV* invlist = newSV_type(SVt_PV);
7239 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7241 SvPV_set(invlist, (char *) list);
7242 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7243 shouldn't touch it */
7244 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7246 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7247 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7254 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7256 /* Grow the maximum size of an inversion list */
7258 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7260 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7263 PERL_STATIC_INLINE void
7264 S_invlist_trim(pTHX_ SV* const invlist)
7266 PERL_ARGS_ASSERT_INVLIST_TRIM;
7268 /* Change the length of the inversion list to how many entries it currently
7271 SvPV_shrink_to_cur((SV *) invlist);
7274 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7277 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7279 /* Subject to change or removal. Append the range from 'start' to 'end' at
7280 * the end of the inversion list. The range must be above any existing
7284 UV max = invlist_max(invlist);
7285 UV len = _invlist_len(invlist);
7287 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7289 if (len == 0) { /* Empty lists must be initialized */
7290 array = _invlist_array_init(invlist, start == 0);
7293 /* Here, the existing list is non-empty. The current max entry in the
7294 * list is generally the first value not in the set, except when the
7295 * set extends to the end of permissible values, in which case it is
7296 * the first entry in that final set, and so this call is an attempt to
7297 * append out-of-order */
7299 UV final_element = len - 1;
7300 array = invlist_array(invlist);
7301 if (array[final_element] > start
7302 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7304 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7305 array[final_element], start,
7306 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7309 /* Here, it is a legal append. If the new range begins with the first
7310 * value not in the set, it is extending the set, so the new first
7311 * value not in the set is one greater than the newly extended range.
7313 if (array[final_element] == start) {
7314 if (end != UV_MAX) {
7315 array[final_element] = end + 1;
7318 /* But if the end is the maximum representable on the machine,
7319 * just let the range that this would extend to have no end */
7320 invlist_set_len(invlist, len - 1);
7326 /* Here the new range doesn't extend any existing set. Add it */
7328 len += 2; /* Includes an element each for the start and end of range */
7330 /* If overflows the existing space, extend, which may cause the array to be
7333 invlist_extend(invlist, len);
7334 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7335 failure in invlist_array() */
7336 array = invlist_array(invlist);
7339 invlist_set_len(invlist, len);
7342 /* The next item on the list starts the range, the one after that is
7343 * one past the new range. */
7344 array[len - 2] = start;
7345 if (end != UV_MAX) {
7346 array[len - 1] = end + 1;
7349 /* But if the end is the maximum representable on the machine, just let
7350 * the range have no end */
7351 invlist_set_len(invlist, len - 1);
7355 #ifndef PERL_IN_XSUB_RE
7358 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7360 /* Searches the inversion list for the entry that contains the input code
7361 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7362 * return value is the index into the list's array of the range that
7367 IV high = _invlist_len(invlist);
7368 const IV highest_element = high - 1;
7371 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7373 /* If list is empty, return failure. */
7378 /* If the code point is before the first element, return failure. (We
7379 * can't combine this with the test above, because we can't get the array
7380 * unless we know the list is non-empty) */
7381 array = invlist_array(invlist);
7383 mid = invlist_previous_index(invlist);
7384 assert(mid >=0 && mid <= highest_element);
7386 /* <mid> contains the cache of the result of the previous call to this
7387 * function (0 the first time). See if this call is for the same result,
7388 * or if it is for mid-1. This is under the theory that calls to this
7389 * function will often be for related code points that are near each other.
7390 * And benchmarks show that caching gives better results. We also test
7391 * here if the code point is within the bounds of the list. These tests
7392 * replace others that would have had to be made anyway to make sure that
7393 * the array bounds were not exceeded, and these give us extra information
7394 * at the same time */
7395 if (cp >= array[mid]) {
7396 if (cp >= array[highest_element]) {
7397 return highest_element;
7400 /* Here, array[mid] <= cp < array[highest_element]. This means that
7401 * the final element is not the answer, so can exclude it; it also
7402 * means that <mid> is not the final element, so can refer to 'mid + 1'
7404 if (cp < array[mid + 1]) {
7410 else { /* cp < aray[mid] */
7411 if (cp < array[0]) { /* Fail if outside the array */
7415 if (cp >= array[mid - 1]) {
7420 /* Binary search. What we are looking for is <i> such that
7421 * array[i] <= cp < array[i+1]
7422 * The loop below converges on the i+1. Note that there may not be an
7423 * (i+1)th element in the array, and things work nonetheless */
7424 while (low < high) {
7425 mid = (low + high) / 2;
7426 assert(mid <= highest_element);
7427 if (array[mid] <= cp) { /* cp >= array[mid] */
7430 /* We could do this extra test to exit the loop early.
7431 if (cp < array[low]) {
7436 else { /* cp < array[mid] */
7443 invlist_set_previous_index(invlist, high);
7448 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7450 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7451 * but is used when the swash has an inversion list. This makes this much
7452 * faster, as it uses a binary search instead of a linear one. This is
7453 * intimately tied to that function, and perhaps should be in utf8.c,
7454 * except it is intimately tied to inversion lists as well. It assumes
7455 * that <swatch> is all 0's on input */
7458 const IV len = _invlist_len(invlist);
7462 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7464 if (len == 0) { /* Empty inversion list */
7468 array = invlist_array(invlist);
7470 /* Find which element it is */
7471 i = _invlist_search(invlist, start);
7473 /* We populate from <start> to <end> */
7474 while (current < end) {
7477 /* The inversion list gives the results for every possible code point
7478 * after the first one in the list. Only those ranges whose index is
7479 * even are ones that the inversion list matches. For the odd ones,
7480 * and if the initial code point is not in the list, we have to skip
7481 * forward to the next element */
7482 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7484 if (i >= len) { /* Finished if beyond the end of the array */
7488 if (current >= end) { /* Finished if beyond the end of what we
7490 if (LIKELY(end < UV_MAX)) {
7494 /* We get here when the upper bound is the maximum
7495 * representable on the machine, and we are looking for just
7496 * that code point. Have to special case it */
7498 goto join_end_of_list;
7501 assert(current >= start);
7503 /* The current range ends one below the next one, except don't go past
7506 upper = (i < len && array[i] < end) ? array[i] : end;
7508 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7509 * for each code point in it */
7510 for (; current < upper; current++) {
7511 const STRLEN offset = (STRLEN)(current - start);
7512 swatch[offset >> 3] |= 1 << (offset & 7);
7517 /* Quit if at the end of the list */
7520 /* But first, have to deal with the highest possible code point on
7521 * the platform. The previous code assumes that <end> is one
7522 * beyond where we want to populate, but that is impossible at the
7523 * platform's infinity, so have to handle it specially */
7524 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7526 const STRLEN offset = (STRLEN)(end - start);
7527 swatch[offset >> 3] |= 1 << (offset & 7);
7532 /* Advance to the next range, which will be for code points not in the
7541 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7543 /* Take the union of two inversion lists and point <output> to it. *output
7544 * should be defined upon input, and if it points to one of the two lists,
7545 * the reference count to that list will be decremented. The first list,
7546 * <a>, may be NULL, in which case a copy of the second list is returned.
7547 * If <complement_b> is TRUE, the union is taken of the complement
7548 * (inversion) of <b> instead of b itself.
7550 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7551 * Richard Gillam, published by Addison-Wesley, and explained at some
7552 * length there. The preface says to incorporate its examples into your
7553 * code at your own risk.
7555 * The algorithm is like a merge sort.
7557 * XXX A potential performance improvement is to keep track as we go along
7558 * if only one of the inputs contributes to the result, meaning the other
7559 * is a subset of that one. In that case, we can skip the final copy and
7560 * return the larger of the input lists, but then outside code might need
7561 * to keep track of whether to free the input list or not */
7563 UV* array_a; /* a's array */
7565 UV len_a; /* length of a's array */
7568 SV* u; /* the resulting union */
7572 UV i_a = 0; /* current index into a's array */
7576 /* running count, as explained in the algorithm source book; items are
7577 * stopped accumulating and are output when the count changes to/from 0.
7578 * The count is incremented when we start a range that's in the set, and
7579 * decremented when we start a range that's not in the set. So its range
7580 * is 0 to 2. Only when the count is zero is something not in the set.
7584 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7587 /* If either one is empty, the union is the other one */
7588 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7595 *output = invlist_clone(b);
7597 _invlist_invert(*output);
7599 } /* else *output already = b; */
7602 else if ((len_b = _invlist_len(b)) == 0) {
7607 /* The complement of an empty list is a list that has everything in it,
7608 * so the union with <a> includes everything too */
7613 *output = _new_invlist(1);
7614 _append_range_to_invlist(*output, 0, UV_MAX);
7616 else if (*output != a) {
7617 *output = invlist_clone(a);
7619 /* else *output already = a; */
7623 /* Here both lists exist and are non-empty */
7624 array_a = invlist_array(a);
7625 array_b = invlist_array(b);
7627 /* If are to take the union of 'a' with the complement of b, set it
7628 * up so are looking at b's complement. */
7631 /* To complement, we invert: if the first element is 0, remove it. To
7632 * do this, we just pretend the array starts one later, and clear the
7633 * flag as we don't have to do anything else later */
7634 if (array_b[0] == 0) {
7637 complement_b = FALSE;
7641 /* But if the first element is not zero, we unshift a 0 before the
7642 * array. The data structure reserves a space for that 0 (which
7643 * should be a '1' right now), so physical shifting is unneeded,
7644 * but temporarily change that element to 0. Before exiting the
7645 * routine, we must restore the element to '1' */
7652 /* Size the union for the worst case: that the sets are completely
7654 u = _new_invlist(len_a + len_b);
7656 /* Will contain U+0000 if either component does */
7657 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7658 || (len_b > 0 && array_b[0] == 0));
7660 /* Go through each list item by item, stopping when exhausted one of
7662 while (i_a < len_a && i_b < len_b) {
7663 UV cp; /* The element to potentially add to the union's array */
7664 bool cp_in_set; /* is it in the the input list's set or not */
7666 /* We need to take one or the other of the two inputs for the union.
7667 * Since we are merging two sorted lists, we take the smaller of the
7668 * next items. In case of a tie, we take the one that is in its set
7669 * first. If we took one not in the set first, it would decrement the
7670 * count, possibly to 0 which would cause it to be output as ending the
7671 * range, and the next time through we would take the same number, and
7672 * output it again as beginning the next range. By doing it the
7673 * opposite way, there is no possibility that the count will be
7674 * momentarily decremented to 0, and thus the two adjoining ranges will
7675 * be seamlessly merged. (In a tie and both are in the set or both not
7676 * in the set, it doesn't matter which we take first.) */
7677 if (array_a[i_a] < array_b[i_b]
7678 || (array_a[i_a] == array_b[i_b]
7679 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7681 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7685 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7689 /* Here, have chosen which of the two inputs to look at. Only output
7690 * if the running count changes to/from 0, which marks the
7691 * beginning/end of a range in that's in the set */
7694 array_u[i_u++] = cp;
7701 array_u[i_u++] = cp;
7706 /* Here, we are finished going through at least one of the lists, which
7707 * means there is something remaining in at most one. We check if the list
7708 * that hasn't been exhausted is positioned such that we are in the middle
7709 * of a range in its set or not. (i_a and i_b point to the element beyond
7710 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7711 * is potentially more to output.
7712 * There are four cases:
7713 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7714 * in the union is entirely from the non-exhausted set.
7715 * 2) Both were in their sets, count is 2. Nothing further should
7716 * be output, as everything that remains will be in the exhausted
7717 * list's set, hence in the union; decrementing to 1 but not 0 insures
7719 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7720 * Nothing further should be output because the union includes
7721 * everything from the exhausted set. Not decrementing ensures that.
7722 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7723 * decrementing to 0 insures that we look at the remainder of the
7724 * non-exhausted set */
7725 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7726 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7731 /* The final length is what we've output so far, plus what else is about to
7732 * be output. (If 'count' is non-zero, then the input list we exhausted
7733 * has everything remaining up to the machine's limit in its set, and hence
7734 * in the union, so there will be no further output. */
7737 /* At most one of the subexpressions will be non-zero */
7738 len_u += (len_a - i_a) + (len_b - i_b);
7741 /* Set result to final length, which can change the pointer to array_u, so
7743 if (len_u != _invlist_len(u)) {
7744 invlist_set_len(u, len_u);
7746 array_u = invlist_array(u);
7749 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7750 * the other) ended with everything above it not in its set. That means
7751 * that the remaining part of the union is precisely the same as the
7752 * non-exhausted list, so can just copy it unchanged. (If both list were
7753 * exhausted at the same time, then the operations below will be both 0.)
7756 IV copy_count; /* At most one will have a non-zero copy count */
7757 if ((copy_count = len_a - i_a) > 0) {
7758 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7760 else if ((copy_count = len_b - i_b) > 0) {
7761 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7765 /* We may be removing a reference to one of the inputs */
7766 if (a == *output || b == *output) {
7767 SvREFCNT_dec(*output);
7770 /* If we've changed b, restore it */
7780 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7782 /* Take the intersection of two inversion lists and point <i> to it. *i
7783 * should be defined upon input, and if it points to one of the two lists,
7784 * the reference count to that list will be decremented.
7785 * If <complement_b> is TRUE, the result will be the intersection of <a>
7786 * and the complement (or inversion) of <b> instead of <b> directly.
7788 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7789 * Richard Gillam, published by Addison-Wesley, and explained at some
7790 * length there. The preface says to incorporate its examples into your
7791 * code at your own risk. In fact, it had bugs
7793 * The algorithm is like a merge sort, and is essentially the same as the
7797 UV* array_a; /* a's array */
7799 UV len_a; /* length of a's array */
7802 SV* r; /* the resulting intersection */
7806 UV i_a = 0; /* current index into a's array */
7810 /* running count, as explained in the algorithm source book; items are
7811 * stopped accumulating and are output when the count changes to/from 2.
7812 * The count is incremented when we start a range that's in the set, and
7813 * decremented when we start a range that's not in the set. So its range
7814 * is 0 to 2. Only when the count is 2 is something in the intersection.
7818 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7821 /* Special case if either one is empty */
7822 len_a = _invlist_len(a);
7823 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7825 if (len_a != 0 && complement_b) {
7827 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7828 * be empty. Here, also we are using 'b's complement, which hence
7829 * must be every possible code point. Thus the intersection is
7832 *i = invlist_clone(a);
7838 /* else *i is already 'a' */
7842 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7843 * intersection must be empty */
7850 *i = _new_invlist(0);
7854 /* Here both lists exist and are non-empty */
7855 array_a = invlist_array(a);
7856 array_b = invlist_array(b);
7858 /* If are to take the intersection of 'a' with the complement of b, set it
7859 * up so are looking at b's complement. */
7862 /* To complement, we invert: if the first element is 0, remove it. To
7863 * do this, we just pretend the array starts one later, and clear the
7864 * flag as we don't have to do anything else later */
7865 if (array_b[0] == 0) {
7868 complement_b = FALSE;
7872 /* But if the first element is not zero, we unshift a 0 before the
7873 * array. The data structure reserves a space for that 0 (which
7874 * should be a '1' right now), so physical shifting is unneeded,
7875 * but temporarily change that element to 0. Before exiting the
7876 * routine, we must restore the element to '1' */
7883 /* Size the intersection for the worst case: that the intersection ends up
7884 * fragmenting everything to be completely disjoint */
7885 r= _new_invlist(len_a + len_b);
7887 /* Will contain U+0000 iff both components do */
7888 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7889 && len_b > 0 && array_b[0] == 0);
7891 /* Go through each list item by item, stopping when exhausted one of
7893 while (i_a < len_a && i_b < len_b) {
7894 UV cp; /* The element to potentially add to the intersection's
7896 bool cp_in_set; /* Is it in the input list's set or not */
7898 /* We need to take one or the other of the two inputs for the
7899 * intersection. Since we are merging two sorted lists, we take the
7900 * smaller of the next items. In case of a tie, we take the one that
7901 * is not in its set first (a difference from the union algorithm). If
7902 * we took one in the set first, it would increment the count, possibly
7903 * to 2 which would cause it to be output as starting a range in the
7904 * intersection, and the next time through we would take that same
7905 * number, and output it again as ending the set. By doing it the
7906 * opposite of this, there is no possibility that the count will be
7907 * momentarily incremented to 2. (In a tie and both are in the set or
7908 * both not in the set, it doesn't matter which we take first.) */
7909 if (array_a[i_a] < array_b[i_b]
7910 || (array_a[i_a] == array_b[i_b]
7911 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7913 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7917 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7921 /* Here, have chosen which of the two inputs to look at. Only output
7922 * if the running count changes to/from 2, which marks the
7923 * beginning/end of a range that's in the intersection */
7927 array_r[i_r++] = cp;
7932 array_r[i_r++] = cp;
7938 /* Here, we are finished going through at least one of the lists, which
7939 * means there is something remaining in at most one. We check if the list
7940 * that has been exhausted is positioned such that we are in the middle
7941 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7942 * the ones we care about.) There are four cases:
7943 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7944 * nothing left in the intersection.
7945 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7946 * above 2. What should be output is exactly that which is in the
7947 * non-exhausted set, as everything it has is also in the intersection
7948 * set, and everything it doesn't have can't be in the intersection
7949 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7950 * gets incremented to 2. Like the previous case, the intersection is
7951 * everything that remains in the non-exhausted set.
7952 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7953 * remains 1. And the intersection has nothing more. */
7954 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7955 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7960 /* The final length is what we've output so far plus what else is in the
7961 * intersection. At most one of the subexpressions below will be non-zero */
7964 len_r += (len_a - i_a) + (len_b - i_b);
7967 /* Set result to final length, which can change the pointer to array_r, so
7969 if (len_r != _invlist_len(r)) {
7970 invlist_set_len(r, len_r);
7972 array_r = invlist_array(r);
7975 /* Finish outputting any remaining */
7976 if (count >= 2) { /* At most one will have a non-zero copy count */
7978 if ((copy_count = len_a - i_a) > 0) {
7979 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7981 else if ((copy_count = len_b - i_b) > 0) {
7982 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7986 /* We may be removing a reference to one of the inputs */
7987 if (a == *i || b == *i) {
7991 /* If we've changed b, restore it */
8001 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8003 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8004 * set. A pointer to the inversion list is returned. This may actually be
8005 * a new list, in which case the passed in one has been destroyed. The
8006 * passed in inversion list can be NULL, in which case a new one is created
8007 * with just the one range in it */
8012 if (invlist == NULL) {
8013 invlist = _new_invlist(2);
8017 len = _invlist_len(invlist);
8020 /* If comes after the final entry, can just append it to the end */
8022 || start >= invlist_array(invlist)
8023 [_invlist_len(invlist) - 1])
8025 _append_range_to_invlist(invlist, start, end);
8029 /* Here, can't just append things, create and return a new inversion list
8030 * which is the union of this range and the existing inversion list */
8031 range_invlist = _new_invlist(2);
8032 _append_range_to_invlist(range_invlist, start, end);
8034 _invlist_union(invlist, range_invlist, &invlist);
8036 /* The temporary can be freed */
8037 SvREFCNT_dec(range_invlist);
8044 PERL_STATIC_INLINE SV*
8045 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8046 return _add_range_to_invlist(invlist, cp, cp);
8049 #ifndef PERL_IN_XSUB_RE
8051 Perl__invlist_invert(pTHX_ SV* const invlist)
8053 /* Complement the input inversion list. This adds a 0 if the list didn't
8054 * have a zero; removes it otherwise. As described above, the data
8055 * structure is set up so that this is very efficient */
8057 UV* len_pos = _get_invlist_len_addr(invlist);
8059 PERL_ARGS_ASSERT__INVLIST_INVERT;
8061 /* The inverse of matching nothing is matching everything */
8062 if (*len_pos == 0) {
8063 _append_range_to_invlist(invlist, 0, UV_MAX);
8067 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8068 * zero element was a 0, so it is being removed, so the length decrements
8069 * by 1; and vice-versa. SvCUR is unaffected */
8070 if (*get_invlist_zero_addr(invlist) ^= 1) {
8079 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8081 /* Complement the input inversion list (which must be a Unicode property,
8082 * all of which don't match above the Unicode maximum code point.) And
8083 * Perl has chosen to not have the inversion match above that either. This
8084 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8090 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8092 _invlist_invert(invlist);
8094 len = _invlist_len(invlist);
8096 if (len != 0) { /* If empty do nothing */
8097 array = invlist_array(invlist);
8098 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8099 /* Add 0x110000. First, grow if necessary */
8101 if (invlist_max(invlist) < len) {
8102 invlist_extend(invlist, len);
8103 array = invlist_array(invlist);
8105 invlist_set_len(invlist, len);
8106 array[len - 1] = PERL_UNICODE_MAX + 1;
8108 else { /* Remove the 0x110000 */
8109 invlist_set_len(invlist, len - 1);
8117 PERL_STATIC_INLINE SV*
8118 S_invlist_clone(pTHX_ SV* const invlist)
8121 /* Return a new inversion list that is a copy of the input one, which is
8124 /* Need to allocate extra space to accommodate Perl's addition of a
8125 * trailing NUL to SvPV's, since it thinks they are always strings */
8126 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8127 STRLEN length = SvCUR(invlist);
8129 PERL_ARGS_ASSERT_INVLIST_CLONE;
8131 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8132 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8137 PERL_STATIC_INLINE UV*
8138 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8140 /* Return the address of the UV that contains the current iteration
8143 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8145 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8148 PERL_STATIC_INLINE UV*
8149 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8151 /* Return the address of the UV that contains the version id. */
8153 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8155 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8158 PERL_STATIC_INLINE void
8159 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8161 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8163 *get_invlist_iter_addr(invlist) = 0;
8167 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8169 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8170 * This call sets in <*start> and <*end>, the next range in <invlist>.
8171 * Returns <TRUE> if successful and the next call will return the next
8172 * range; <FALSE> if was already at the end of the list. If the latter,
8173 * <*start> and <*end> are unchanged, and the next call to this function
8174 * will start over at the beginning of the list */
8176 UV* pos = get_invlist_iter_addr(invlist);
8177 UV len = _invlist_len(invlist);
8180 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8183 *pos = UV_MAX; /* Force iternit() to be required next time */
8187 array = invlist_array(invlist);
8189 *start = array[(*pos)++];
8195 *end = array[(*pos)++] - 1;
8201 PERL_STATIC_INLINE UV
8202 S_invlist_highest(pTHX_ SV* const invlist)
8204 /* Returns the highest code point that matches an inversion list. This API
8205 * has an ambiguity, as it returns 0 under either the highest is actually
8206 * 0, or if the list is empty. If this distinction matters to you, check
8207 * for emptiness before calling this function */
8209 UV len = _invlist_len(invlist);
8212 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8218 array = invlist_array(invlist);
8220 /* The last element in the array in the inversion list always starts a
8221 * range that goes to infinity. That range may be for code points that are
8222 * matched in the inversion list, or it may be for ones that aren't
8223 * matched. In the latter case, the highest code point in the set is one
8224 * less than the beginning of this range; otherwise it is the final element
8225 * of this range: infinity */
8226 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8228 : array[len - 1] - 1;
8231 #ifndef PERL_IN_XSUB_RE
8233 Perl__invlist_contents(pTHX_ SV* const invlist)
8235 /* Get the contents of an inversion list into a string SV so that they can
8236 * be printed out. It uses the format traditionally done for debug tracing
8240 SV* output = newSVpvs("\n");
8242 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8244 invlist_iterinit(invlist);
8245 while (invlist_iternext(invlist, &start, &end)) {
8246 if (end == UV_MAX) {
8247 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8249 else if (end != start) {
8250 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8254 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8262 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8264 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8266 /* Dumps out the ranges in an inversion list. The string 'header'
8267 * if present is output on a line before the first range */
8271 PERL_ARGS_ASSERT__INVLIST_DUMP;
8273 if (header && strlen(header)) {
8274 PerlIO_printf(Perl_debug_log, "%s\n", header);
8276 invlist_iterinit(invlist);
8277 while (invlist_iternext(invlist, &start, &end)) {
8278 if (end == UV_MAX) {
8279 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8281 else if (end != start) {
8282 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8286 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8294 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8296 /* Return a boolean as to if the two passed in inversion lists are
8297 * identical. The final argument, if TRUE, says to take the complement of
8298 * the second inversion list before doing the comparison */
8300 UV* array_a = invlist_array(a);
8301 UV* array_b = invlist_array(b);
8302 UV len_a = _invlist_len(a);
8303 UV len_b = _invlist_len(b);
8305 UV i = 0; /* current index into the arrays */
8306 bool retval = TRUE; /* Assume are identical until proven otherwise */
8308 PERL_ARGS_ASSERT__INVLISTEQ;
8310 /* If are to compare 'a' with the complement of b, set it
8311 * up so are looking at b's complement. */
8314 /* The complement of nothing is everything, so <a> would have to have
8315 * just one element, starting at zero (ending at infinity) */
8317 return (len_a == 1 && array_a[0] == 0);
8319 else if (array_b[0] == 0) {
8321 /* Otherwise, to complement, we invert. Here, the first element is
8322 * 0, just remove it. To do this, we just pretend the array starts
8323 * one later, and clear the flag as we don't have to do anything
8328 complement_b = FALSE;
8332 /* But if the first element is not zero, we unshift a 0 before the
8333 * array. The data structure reserves a space for that 0 (which
8334 * should be a '1' right now), so physical shifting is unneeded,
8335 * but temporarily change that element to 0. Before exiting the
8336 * routine, we must restore the element to '1' */
8343 /* Make sure that the lengths are the same, as well as the final element
8344 * before looping through the remainder. (Thus we test the length, final,
8345 * and first elements right off the bat) */
8346 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8349 else for (i = 0; i < len_a - 1; i++) {
8350 if (array_a[i] != array_b[i]) {
8363 #undef HEADER_LENGTH
8364 #undef INVLIST_INITIAL_LENGTH
8365 #undef TO_INTERNAL_SIZE
8366 #undef FROM_INTERNAL_SIZE
8367 #undef INVLIST_LEN_OFFSET
8368 #undef INVLIST_ZERO_OFFSET
8369 #undef INVLIST_ITER_OFFSET
8370 #undef INVLIST_VERSION_ID
8372 /* End of inversion list object */
8375 - reg - regular expression, i.e. main body or parenthesized thing
8377 * Caller must absorb opening parenthesis.
8379 * Combining parenthesis handling with the base level of regular expression
8380 * is a trifle forced, but the need to tie the tails of the branches to what
8381 * follows makes it hard to avoid.
8383 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8385 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8387 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8391 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8392 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8395 regnode *ret; /* Will be the head of the group. */
8398 regnode *ender = NULL;
8401 U32 oregflags = RExC_flags;
8402 bool have_branch = 0;
8404 I32 freeze_paren = 0;
8405 I32 after_freeze = 0;
8407 /* for (?g), (?gc), and (?o) warnings; warning
8408 about (?c) will warn about (?g) -- japhy */
8410 #define WASTED_O 0x01
8411 #define WASTED_G 0x02
8412 #define WASTED_C 0x04
8413 #define WASTED_GC (0x02|0x04)
8414 I32 wastedflags = 0x00;
8416 char * parse_start = RExC_parse; /* MJD */
8417 char * const oregcomp_parse = RExC_parse;
8419 GET_RE_DEBUG_FLAGS_DECL;
8421 PERL_ARGS_ASSERT_REG;
8422 DEBUG_PARSE("reg ");
8424 *flagp = 0; /* Tentatively. */
8427 /* Make an OPEN node, if parenthesized. */
8429 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8430 char *start_verb = RExC_parse;
8431 STRLEN verb_len = 0;
8432 char *start_arg = NULL;
8433 unsigned char op = 0;
8435 int internal_argval = 0; /* internal_argval is only useful if !argok */
8436 while ( *RExC_parse && *RExC_parse != ')' ) {
8437 if ( *RExC_parse == ':' ) {
8438 start_arg = RExC_parse + 1;
8444 verb_len = RExC_parse - start_verb;
8447 while ( *RExC_parse && *RExC_parse != ')' )
8449 if ( *RExC_parse != ')' )
8450 vFAIL("Unterminated verb pattern argument");
8451 if ( RExC_parse == start_arg )
8454 if ( *RExC_parse != ')' )
8455 vFAIL("Unterminated verb pattern");
8458 switch ( *start_verb ) {
8459 case 'A': /* (*ACCEPT) */
8460 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8462 internal_argval = RExC_nestroot;
8465 case 'C': /* (*COMMIT) */
8466 if ( memEQs(start_verb,verb_len,"COMMIT") )
8469 case 'F': /* (*FAIL) */
8470 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8475 case ':': /* (*:NAME) */
8476 case 'M': /* (*MARK:NAME) */
8477 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8482 case 'P': /* (*PRUNE) */
8483 if ( memEQs(start_verb,verb_len,"PRUNE") )
8486 case 'S': /* (*SKIP) */
8487 if ( memEQs(start_verb,verb_len,"SKIP") )
8490 case 'T': /* (*THEN) */
8491 /* [19:06] <TimToady> :: is then */
8492 if ( memEQs(start_verb,verb_len,"THEN") ) {
8494 RExC_seen |= REG_SEEN_CUTGROUP;
8500 vFAIL3("Unknown verb pattern '%.*s'",
8501 verb_len, start_verb);
8504 if ( start_arg && internal_argval ) {
8505 vFAIL3("Verb pattern '%.*s' may not have an argument",
8506 verb_len, start_verb);
8507 } else if ( argok < 0 && !start_arg ) {
8508 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8509 verb_len, start_verb);
8511 ret = reganode(pRExC_state, op, internal_argval);
8512 if ( ! internal_argval && ! SIZE_ONLY ) {
8514 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8515 ARG(ret) = add_data( pRExC_state, 1, "S" );
8516 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8523 if (!internal_argval)
8524 RExC_seen |= REG_SEEN_VERBARG;
8525 } else if ( start_arg ) {
8526 vFAIL3("Verb pattern '%.*s' may not have an argument",
8527 verb_len, start_verb);
8529 ret = reg_node(pRExC_state, op);
8531 nextchar(pRExC_state);
8534 if (*RExC_parse == '?') { /* (?...) */
8535 bool is_logical = 0;
8536 const char * const seqstart = RExC_parse;
8537 bool has_use_defaults = FALSE;
8540 paren = *RExC_parse++;
8541 ret = NULL; /* For look-ahead/behind. */
8544 case 'P': /* (?P...) variants for those used to PCRE/Python */
8545 paren = *RExC_parse++;
8546 if ( paren == '<') /* (?P<...>) named capture */
8548 else if (paren == '>') { /* (?P>name) named recursion */
8549 goto named_recursion;
8551 else if (paren == '=') { /* (?P=...) named backref */
8552 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8553 you change this make sure you change that */
8554 char* name_start = RExC_parse;
8556 SV *sv_dat = reg_scan_name(pRExC_state,
8557 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8558 if (RExC_parse == name_start || *RExC_parse != ')')
8559 vFAIL2("Sequence %.3s... not terminated",parse_start);
8562 num = add_data( pRExC_state, 1, "S" );
8563 RExC_rxi->data->data[num]=(void*)sv_dat;
8564 SvREFCNT_inc_simple_void(sv_dat);
8567 ret = reganode(pRExC_state,
8570 : (ASCII_FOLD_RESTRICTED)
8572 : (AT_LEAST_UNI_SEMANTICS)
8580 Set_Node_Offset(ret, parse_start+1);
8581 Set_Node_Cur_Length(ret); /* MJD */
8583 nextchar(pRExC_state);
8587 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8589 case '<': /* (?<...) */
8590 if (*RExC_parse == '!')
8592 else if (*RExC_parse != '=')
8598 case '\'': /* (?'...') */
8599 name_start= RExC_parse;
8600 svname = reg_scan_name(pRExC_state,
8601 SIZE_ONLY ? /* reverse test from the others */
8602 REG_RSN_RETURN_NAME :
8603 REG_RSN_RETURN_NULL);
8604 if (RExC_parse == name_start) {
8606 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8609 if (*RExC_parse != paren)
8610 vFAIL2("Sequence (?%c... not terminated",
8611 paren=='>' ? '<' : paren);
8615 if (!svname) /* shouldn't happen */
8617 "panic: reg_scan_name returned NULL");
8618 if (!RExC_paren_names) {
8619 RExC_paren_names= newHV();
8620 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8622 RExC_paren_name_list= newAV();
8623 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8626 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8628 sv_dat = HeVAL(he_str);
8630 /* croak baby croak */
8632 "panic: paren_name hash element allocation failed");
8633 } else if ( SvPOK(sv_dat) ) {
8634 /* (?|...) can mean we have dupes so scan to check
8635 its already been stored. Maybe a flag indicating
8636 we are inside such a construct would be useful,
8637 but the arrays are likely to be quite small, so
8638 for now we punt -- dmq */
8639 IV count = SvIV(sv_dat);
8640 I32 *pv = (I32*)SvPVX(sv_dat);
8642 for ( i = 0 ; i < count ; i++ ) {
8643 if ( pv[i] == RExC_npar ) {
8649 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8650 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8651 pv[count] = RExC_npar;
8652 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8655 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8656 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8658 SvIV_set(sv_dat, 1);
8661 /* Yes this does cause a memory leak in debugging Perls */
8662 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8663 SvREFCNT_dec(svname);
8666 /*sv_dump(sv_dat);*/
8668 nextchar(pRExC_state);
8670 goto capturing_parens;
8672 RExC_seen |= REG_SEEN_LOOKBEHIND;
8673 RExC_in_lookbehind++;
8675 case '=': /* (?=...) */
8676 RExC_seen_zerolen++;
8678 case '!': /* (?!...) */
8679 RExC_seen_zerolen++;
8680 if (*RExC_parse == ')') {
8681 ret=reg_node(pRExC_state, OPFAIL);
8682 nextchar(pRExC_state);
8686 case '|': /* (?|...) */
8687 /* branch reset, behave like a (?:...) except that
8688 buffers in alternations share the same numbers */
8690 after_freeze = freeze_paren = RExC_npar;
8692 case ':': /* (?:...) */
8693 case '>': /* (?>...) */
8695 case '$': /* (?$...) */
8696 case '@': /* (?@...) */
8697 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8699 case '#': /* (?#...) */
8700 while (*RExC_parse && *RExC_parse != ')')
8702 if (*RExC_parse != ')')
8703 FAIL("Sequence (?#... not terminated");
8704 nextchar(pRExC_state);
8707 case '0' : /* (?0) */
8708 case 'R' : /* (?R) */
8709 if (*RExC_parse != ')')
8710 FAIL("Sequence (?R) not terminated");
8711 ret = reg_node(pRExC_state, GOSTART);
8712 *flagp |= POSTPONED;
8713 nextchar(pRExC_state);
8716 { /* named and numeric backreferences */
8718 case '&': /* (?&NAME) */
8719 parse_start = RExC_parse - 1;
8722 SV *sv_dat = reg_scan_name(pRExC_state,
8723 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8724 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8726 goto gen_recurse_regop;
8727 assert(0); /* NOT REACHED */
8729 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8731 vFAIL("Illegal pattern");
8733 goto parse_recursion;
8735 case '-': /* (?-1) */
8736 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8737 RExC_parse--; /* rewind to let it be handled later */
8741 case '1': case '2': case '3': case '4': /* (?1) */
8742 case '5': case '6': case '7': case '8': case '9':
8745 num = atoi(RExC_parse);
8746 parse_start = RExC_parse - 1; /* MJD */
8747 if (*RExC_parse == '-')
8749 while (isDIGIT(*RExC_parse))
8751 if (*RExC_parse!=')')
8752 vFAIL("Expecting close bracket");
8755 if ( paren == '-' ) {
8757 Diagram of capture buffer numbering.
8758 Top line is the normal capture buffer numbers
8759 Bottom line is the negative indexing as from
8763 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8767 num = RExC_npar + num;
8770 vFAIL("Reference to nonexistent group");
8772 } else if ( paren == '+' ) {
8773 num = RExC_npar + num - 1;
8776 ret = reganode(pRExC_state, GOSUB, num);
8778 if (num > (I32)RExC_rx->nparens) {
8780 vFAIL("Reference to nonexistent group");
8782 ARG2L_SET( ret, RExC_recurse_count++);
8784 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8785 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8789 RExC_seen |= REG_SEEN_RECURSE;
8790 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8791 Set_Node_Offset(ret, parse_start); /* MJD */
8793 *flagp |= POSTPONED;
8794 nextchar(pRExC_state);
8796 } /* named and numeric backreferences */
8797 assert(0); /* NOT REACHED */
8799 case '?': /* (??...) */
8801 if (*RExC_parse != '{') {
8803 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8806 *flagp |= POSTPONED;
8807 paren = *RExC_parse++;
8809 case '{': /* (?{...}) */
8812 struct reg_code_block *cb;
8814 RExC_seen_zerolen++;
8816 if ( !pRExC_state->num_code_blocks
8817 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8818 || pRExC_state->code_blocks[pRExC_state->code_index].start
8819 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8822 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8823 FAIL("panic: Sequence (?{...}): no code block found\n");
8824 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8826 /* this is a pre-compiled code block (?{...}) */
8827 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8828 RExC_parse = RExC_start + cb->end;
8831 if (cb->src_regex) {
8832 n = add_data(pRExC_state, 2, "rl");
8833 RExC_rxi->data->data[n] =
8834 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8835 RExC_rxi->data->data[n+1] = (void*)o;
8838 n = add_data(pRExC_state, 1,
8839 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8840 RExC_rxi->data->data[n] = (void*)o;
8843 pRExC_state->code_index++;
8844 nextchar(pRExC_state);
8848 ret = reg_node(pRExC_state, LOGICAL);
8849 eval = reganode(pRExC_state, EVAL, n);
8852 /* for later propagation into (??{}) return value */
8853 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8855 REGTAIL(pRExC_state, ret, eval);
8856 /* deal with the length of this later - MJD */
8859 ret = reganode(pRExC_state, EVAL, n);
8860 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8861 Set_Node_Offset(ret, parse_start);
8864 case '(': /* (?(?{...})...) and (?(?=...)...) */
8867 if (RExC_parse[0] == '?') { /* (?(?...)) */
8868 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8869 || RExC_parse[1] == '<'
8870 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8873 ret = reg_node(pRExC_state, LOGICAL);
8876 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8880 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8881 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8883 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8884 char *name_start= RExC_parse++;
8886 SV *sv_dat=reg_scan_name(pRExC_state,
8887 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8888 if (RExC_parse == name_start || *RExC_parse != ch)
8889 vFAIL2("Sequence (?(%c... not terminated",
8890 (ch == '>' ? '<' : ch));
8893 num = add_data( pRExC_state, 1, "S" );
8894 RExC_rxi->data->data[num]=(void*)sv_dat;
8895 SvREFCNT_inc_simple_void(sv_dat);
8897 ret = reganode(pRExC_state,NGROUPP,num);
8898 goto insert_if_check_paren;
8900 else if (RExC_parse[0] == 'D' &&
8901 RExC_parse[1] == 'E' &&
8902 RExC_parse[2] == 'F' &&
8903 RExC_parse[3] == 'I' &&
8904 RExC_parse[4] == 'N' &&
8905 RExC_parse[5] == 'E')
8907 ret = reganode(pRExC_state,DEFINEP,0);
8910 goto insert_if_check_paren;
8912 else if (RExC_parse[0] == 'R') {
8915 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8916 parno = atoi(RExC_parse++);
8917 while (isDIGIT(*RExC_parse))
8919 } else if (RExC_parse[0] == '&') {
8922 sv_dat = reg_scan_name(pRExC_state,
8923 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8924 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8926 ret = reganode(pRExC_state,INSUBP,parno);
8927 goto insert_if_check_paren;
8929 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8932 parno = atoi(RExC_parse++);
8934 while (isDIGIT(*RExC_parse))
8936 ret = reganode(pRExC_state, GROUPP, parno);
8938 insert_if_check_paren:
8939 if ((c = *nextchar(pRExC_state)) != ')')
8940 vFAIL("Switch condition not recognized");
8942 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8943 br = regbranch(pRExC_state, &flags, 1,depth+1);
8945 br = reganode(pRExC_state, LONGJMP, 0);
8947 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8948 c = *nextchar(pRExC_state);
8953 vFAIL("(?(DEFINE)....) does not allow branches");
8954 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8955 regbranch(pRExC_state, &flags, 1,depth+1);
8956 REGTAIL(pRExC_state, ret, lastbr);
8959 c = *nextchar(pRExC_state);
8964 vFAIL("Switch (?(condition)... contains too many branches");
8965 ender = reg_node(pRExC_state, TAIL);
8966 REGTAIL(pRExC_state, br, ender);
8968 REGTAIL(pRExC_state, lastbr, ender);
8969 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8972 REGTAIL(pRExC_state, ret, ender);
8973 RExC_size++; /* XXX WHY do we need this?!!
8974 For large programs it seems to be required
8975 but I can't figure out why. -- dmq*/
8979 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8983 RExC_parse--; /* for vFAIL to print correctly */
8984 vFAIL("Sequence (? incomplete");
8986 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8988 has_use_defaults = TRUE;
8989 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8990 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8991 ? REGEX_UNICODE_CHARSET
8992 : REGEX_DEPENDS_CHARSET);
8996 parse_flags: /* (?i) */
8998 U32 posflags = 0, negflags = 0;
8999 U32 *flagsp = &posflags;
9000 char has_charset_modifier = '\0';
9001 regex_charset cs = get_regex_charset(RExC_flags);
9002 if (cs == REGEX_DEPENDS_CHARSET
9003 && (RExC_utf8 || RExC_uni_semantics))
9005 cs = REGEX_UNICODE_CHARSET;
9008 while (*RExC_parse) {
9009 /* && strchr("iogcmsx", *RExC_parse) */
9010 /* (?g), (?gc) and (?o) are useless here
9011 and must be globally applied -- japhy */
9012 switch (*RExC_parse) {
9013 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9014 case LOCALE_PAT_MOD:
9015 if (has_charset_modifier) {
9016 goto excess_modifier;
9018 else if (flagsp == &negflags) {
9021 cs = REGEX_LOCALE_CHARSET;
9022 has_charset_modifier = LOCALE_PAT_MOD;
9023 RExC_contains_locale = 1;
9025 case UNICODE_PAT_MOD:
9026 if (has_charset_modifier) {
9027 goto excess_modifier;
9029 else if (flagsp == &negflags) {
9032 cs = REGEX_UNICODE_CHARSET;
9033 has_charset_modifier = UNICODE_PAT_MOD;
9035 case ASCII_RESTRICT_PAT_MOD:
9036 if (flagsp == &negflags) {
9039 if (has_charset_modifier) {
9040 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9041 goto excess_modifier;
9043 /* Doubled modifier implies more restricted */
9044 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9047 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9049 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9051 case DEPENDS_PAT_MOD:
9052 if (has_use_defaults) {
9053 goto fail_modifiers;
9055 else if (flagsp == &negflags) {
9058 else if (has_charset_modifier) {
9059 goto excess_modifier;
9062 /* The dual charset means unicode semantics if the
9063 * pattern (or target, not known until runtime) are
9064 * utf8, or something in the pattern indicates unicode
9066 cs = (RExC_utf8 || RExC_uni_semantics)
9067 ? REGEX_UNICODE_CHARSET
9068 : REGEX_DEPENDS_CHARSET;
9069 has_charset_modifier = DEPENDS_PAT_MOD;
9073 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9074 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9076 else if (has_charset_modifier == *(RExC_parse - 1)) {
9077 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9080 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9085 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9087 case ONCE_PAT_MOD: /* 'o' */
9088 case GLOBAL_PAT_MOD: /* 'g' */
9089 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9090 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9091 if (! (wastedflags & wflagbit) ) {
9092 wastedflags |= wflagbit;
9095 "Useless (%s%c) - %suse /%c modifier",
9096 flagsp == &negflags ? "?-" : "?",
9098 flagsp == &negflags ? "don't " : "",
9105 case CONTINUE_PAT_MOD: /* 'c' */
9106 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9107 if (! (wastedflags & WASTED_C) ) {
9108 wastedflags |= WASTED_GC;
9111 "Useless (%sc) - %suse /gc modifier",
9112 flagsp == &negflags ? "?-" : "?",
9113 flagsp == &negflags ? "don't " : ""
9118 case KEEPCOPY_PAT_MOD: /* 'p' */
9119 if (flagsp == &negflags) {
9121 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9123 *flagsp |= RXf_PMf_KEEPCOPY;
9127 /* A flag is a default iff it is following a minus, so
9128 * if there is a minus, it means will be trying to
9129 * re-specify a default which is an error */
9130 if (has_use_defaults || flagsp == &negflags) {
9133 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9137 wastedflags = 0; /* reset so (?g-c) warns twice */
9143 RExC_flags |= posflags;
9144 RExC_flags &= ~negflags;
9145 set_regex_charset(&RExC_flags, cs);
9147 oregflags |= posflags;
9148 oregflags &= ~negflags;
9149 set_regex_charset(&oregflags, cs);
9151 nextchar(pRExC_state);
9162 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9167 }} /* one for the default block, one for the switch */
9174 ret = reganode(pRExC_state, OPEN, parno);
9177 RExC_nestroot = parno;
9178 if (RExC_seen & REG_SEEN_RECURSE
9179 && !RExC_open_parens[parno-1])
9181 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9182 "Setting open paren #%"IVdf" to %d\n",
9183 (IV)parno, REG_NODE_NUM(ret)));
9184 RExC_open_parens[parno-1]= ret;
9187 Set_Node_Length(ret, 1); /* MJD */
9188 Set_Node_Offset(ret, RExC_parse); /* MJD */
9196 /* Pick up the branches, linking them together. */
9197 parse_start = RExC_parse; /* MJD */
9198 br = regbranch(pRExC_state, &flags, 1,depth+1);
9200 /* branch_len = (paren != 0); */
9204 if (*RExC_parse == '|') {
9205 if (!SIZE_ONLY && RExC_extralen) {
9206 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9209 reginsert(pRExC_state, BRANCH, br, depth+1);
9210 Set_Node_Length(br, paren != 0);
9211 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9215 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9217 else if (paren == ':') {
9218 *flagp |= flags&SIMPLE;
9220 if (is_open) { /* Starts with OPEN. */
9221 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9223 else if (paren != '?') /* Not Conditional */
9225 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9227 while (*RExC_parse == '|') {
9228 if (!SIZE_ONLY && RExC_extralen) {
9229 ender = reganode(pRExC_state, LONGJMP,0);
9230 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9233 RExC_extralen += 2; /* Account for LONGJMP. */
9234 nextchar(pRExC_state);
9236 if (RExC_npar > after_freeze)
9237 after_freeze = RExC_npar;
9238 RExC_npar = freeze_paren;
9240 br = regbranch(pRExC_state, &flags, 0, depth+1);
9244 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9246 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9249 if (have_branch || paren != ':') {
9250 /* Make a closing node, and hook it on the end. */
9253 ender = reg_node(pRExC_state, TAIL);
9256 ender = reganode(pRExC_state, CLOSE, parno);
9257 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9258 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9259 "Setting close paren #%"IVdf" to %d\n",
9260 (IV)parno, REG_NODE_NUM(ender)));
9261 RExC_close_parens[parno-1]= ender;
9262 if (RExC_nestroot == parno)
9265 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9266 Set_Node_Length(ender,1); /* MJD */
9272 *flagp &= ~HASWIDTH;
9275 ender = reg_node(pRExC_state, SUCCEED);
9278 ender = reg_node(pRExC_state, END);
9280 assert(!RExC_opend); /* there can only be one! */
9285 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9286 SV * const mysv_val1=sv_newmortal();
9287 SV * const mysv_val2=sv_newmortal();
9288 DEBUG_PARSE_MSG("lsbr");
9289 regprop(RExC_rx, mysv_val1, lastbr);
9290 regprop(RExC_rx, mysv_val2, ender);
9291 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9292 SvPV_nolen_const(mysv_val1),
9293 (IV)REG_NODE_NUM(lastbr),
9294 SvPV_nolen_const(mysv_val2),
9295 (IV)REG_NODE_NUM(ender),
9296 (IV)(ender - lastbr)
9299 REGTAIL(pRExC_state, lastbr, ender);
9301 if (have_branch && !SIZE_ONLY) {
9304 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9306 /* Hook the tails of the branches to the closing node. */
9307 for (br = ret; br; br = regnext(br)) {
9308 const U8 op = PL_regkind[OP(br)];
9310 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9311 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9314 else if (op == BRANCHJ) {
9315 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9316 /* for now we always disable this optimisation * /
9317 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9323 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9324 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9325 SV * const mysv_val1=sv_newmortal();
9326 SV * const mysv_val2=sv_newmortal();
9327 DEBUG_PARSE_MSG("NADA");
9328 regprop(RExC_rx, mysv_val1, ret);
9329 regprop(RExC_rx, mysv_val2, ender);
9330 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9331 SvPV_nolen_const(mysv_val1),
9332 (IV)REG_NODE_NUM(ret),
9333 SvPV_nolen_const(mysv_val2),
9334 (IV)REG_NODE_NUM(ender),
9339 if (OP(ender) == TAIL) {
9344 for ( opt= br + 1; opt < ender ; opt++ )
9346 NEXT_OFF(br)= ender - br;
9354 static const char parens[] = "=!<,>";
9356 if (paren && (p = strchr(parens, paren))) {
9357 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9358 int flag = (p - parens) > 1;
9361 node = SUSPEND, flag = 0;
9362 reginsert(pRExC_state, node,ret, depth+1);
9363 Set_Node_Cur_Length(ret);
9364 Set_Node_Offset(ret, parse_start + 1);
9366 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9370 /* Check for proper termination. */
9372 RExC_flags = oregflags;
9373 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9374 RExC_parse = oregcomp_parse;
9375 vFAIL("Unmatched (");
9378 else if (!paren && RExC_parse < RExC_end) {
9379 if (*RExC_parse == ')') {
9381 vFAIL("Unmatched )");
9384 FAIL("Junk on end of regexp"); /* "Can't happen". */
9385 assert(0); /* NOTREACHED */
9388 if (RExC_in_lookbehind) {
9389 RExC_in_lookbehind--;
9391 if (after_freeze > RExC_npar)
9392 RExC_npar = after_freeze;
9397 - regbranch - one alternative of an | operator
9399 * Implements the concatenation operator.
9402 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9406 regnode *chain = NULL;
9408 I32 flags = 0, c = 0;
9409 GET_RE_DEBUG_FLAGS_DECL;
9411 PERL_ARGS_ASSERT_REGBRANCH;
9413 DEBUG_PARSE("brnc");
9418 if (!SIZE_ONLY && RExC_extralen)
9419 ret = reganode(pRExC_state, BRANCHJ,0);
9421 ret = reg_node(pRExC_state, BRANCH);
9422 Set_Node_Length(ret, 1);
9426 if (!first && SIZE_ONLY)
9427 RExC_extralen += 1; /* BRANCHJ */
9429 *flagp = WORST; /* Tentatively. */
9432 nextchar(pRExC_state);
9433 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9435 latest = regpiece(pRExC_state, &flags,depth+1);
9436 if (latest == NULL) {
9437 if (flags & TRYAGAIN)
9441 else if (ret == NULL)
9443 *flagp |= flags&(HASWIDTH|POSTPONED);
9444 if (chain == NULL) /* First piece. */
9445 *flagp |= flags&SPSTART;
9448 REGTAIL(pRExC_state, chain, latest);
9453 if (chain == NULL) { /* Loop ran zero times. */
9454 chain = reg_node(pRExC_state, NOTHING);
9459 *flagp |= flags&SIMPLE;
9466 - regpiece - something followed by possible [*+?]
9468 * Note that the branching code sequences used for ? and the general cases
9469 * of * and + are somewhat optimized: they use the same NOTHING node as
9470 * both the endmarker for their branch list and the body of the last branch.
9471 * It might seem that this node could be dispensed with entirely, but the
9472 * endmarker role is not redundant.
9475 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9482 const char * const origparse = RExC_parse;
9484 I32 max = REG_INFTY;
9485 #ifdef RE_TRACK_PATTERN_OFFSETS
9488 const char *maxpos = NULL;
9490 /* Save the original in case we change the emitted regop to a FAIL. */
9491 regnode * const orig_emit = RExC_emit;
9493 GET_RE_DEBUG_FLAGS_DECL;
9495 PERL_ARGS_ASSERT_REGPIECE;
9497 DEBUG_PARSE("piec");
9499 ret = regatom(pRExC_state, &flags,depth+1);
9501 if (flags & TRYAGAIN)
9508 if (op == '{' && regcurly(RExC_parse)) {
9510 #ifdef RE_TRACK_PATTERN_OFFSETS
9511 parse_start = RExC_parse; /* MJD */
9513 next = RExC_parse + 1;
9514 while (isDIGIT(*next) || *next == ',') {
9523 if (*next == '}') { /* got one */
9527 min = atoi(RExC_parse);
9531 maxpos = RExC_parse;
9533 if (!max && *maxpos != '0')
9534 max = REG_INFTY; /* meaning "infinity" */
9535 else if (max >= REG_INFTY)
9536 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9538 nextchar(pRExC_state);
9539 if (max < min) { /* If can't match, warn and optimize to fail
9542 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9544 /* We can't back off the size because we have to reserve
9545 * enough space for all the things we are about to throw
9546 * away, but we can shrink it by the ammount we are about
9548 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9551 RExC_emit = orig_emit;
9553 ret = reg_node(pRExC_state, OPFAIL);
9558 if ((flags&SIMPLE)) {
9559 RExC_naughty += 2 + RExC_naughty / 2;
9560 reginsert(pRExC_state, CURLY, ret, depth+1);
9561 Set_Node_Offset(ret, parse_start+1); /* MJD */
9562 Set_Node_Cur_Length(ret);
9565 regnode * const w = reg_node(pRExC_state, WHILEM);
9568 REGTAIL(pRExC_state, ret, w);
9569 if (!SIZE_ONLY && RExC_extralen) {
9570 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9571 reginsert(pRExC_state, NOTHING,ret, depth+1);
9572 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9574 reginsert(pRExC_state, CURLYX,ret, depth+1);
9576 Set_Node_Offset(ret, parse_start+1);
9577 Set_Node_Length(ret,
9578 op == '{' ? (RExC_parse - parse_start) : 1);
9580 if (!SIZE_ONLY && RExC_extralen)
9581 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9582 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9584 RExC_whilem_seen++, RExC_extralen += 3;
9585 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9594 ARG1_SET(ret, (U16)min);
9595 ARG2_SET(ret, (U16)max);
9607 #if 0 /* Now runtime fix should be reliable. */
9609 /* if this is reinstated, don't forget to put this back into perldiag:
9611 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9613 (F) The part of the regexp subject to either the * or + quantifier
9614 could match an empty string. The {#} shows in the regular
9615 expression about where the problem was discovered.
9619 if (!(flags&HASWIDTH) && op != '?')
9620 vFAIL("Regexp *+ operand could be empty");
9623 #ifdef RE_TRACK_PATTERN_OFFSETS
9624 parse_start = RExC_parse;
9626 nextchar(pRExC_state);
9628 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9630 if (op == '*' && (flags&SIMPLE)) {
9631 reginsert(pRExC_state, STAR, ret, depth+1);
9635 else if (op == '*') {
9639 else if (op == '+' && (flags&SIMPLE)) {
9640 reginsert(pRExC_state, PLUS, ret, depth+1);
9644 else if (op == '+') {
9648 else if (op == '?') {
9653 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9654 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9655 ckWARN3reg(RExC_parse,
9656 "%.*s matches null string many times",
9657 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9659 (void)ReREFCNT_inc(RExC_rx_sv);
9662 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9663 nextchar(pRExC_state);
9664 reginsert(pRExC_state, MINMOD, ret, depth+1);
9665 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9667 #ifndef REG_ALLOW_MINMOD_SUSPEND
9670 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9672 nextchar(pRExC_state);
9673 ender = reg_node(pRExC_state, SUCCEED);
9674 REGTAIL(pRExC_state, ret, ender);
9675 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9677 ender = reg_node(pRExC_state, TAIL);
9678 REGTAIL(pRExC_state, ret, ender);
9682 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9684 vFAIL("Nested quantifiers");
9691 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9694 /* This is expected to be called by a parser routine that has recognized '\N'
9695 and needs to handle the rest. RExC_parse is expected to point at the first
9696 char following the N at the time of the call. On successful return,
9697 RExC_parse has been updated to point to just after the sequence identified
9698 by this routine, and <*flagp> has been updated.
9700 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9703 \N may begin either a named sequence, or if outside a character class, mean
9704 to match a non-newline. For non single-quoted regexes, the tokenizer has
9705 attempted to decide which, and in the case of a named sequence, converted it
9706 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9707 where c1... are the characters in the sequence. For single-quoted regexes,
9708 the tokenizer passes the \N sequence through unchanged; this code will not
9709 attempt to determine this nor expand those, instead raising a syntax error.
9710 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9711 or there is no '}', it signals that this \N occurrence means to match a
9714 Only the \N{U+...} form should occur in a character class, for the same
9715 reason that '.' inside a character class means to just match a period: it
9716 just doesn't make sense.
9718 The function raises an error (via vFAIL), and doesn't return for various
9719 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9720 success; it returns FALSE otherwise.
9722 If <valuep> is non-null, it means the caller can accept an input sequence
9723 consisting of a just a single code point; <*valuep> is set to that value
9724 if the input is such.
9726 If <node_p> is non-null it signifies that the caller can accept any other
9727 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9729 1) \N means not-a-NL: points to a newly created REG_ANY node;
9730 2) \N{}: points to a new NOTHING node;
9731 3) otherwise: points to a new EXACT node containing the resolved
9733 Note that FALSE is returned for single code point sequences if <valuep> is
9737 char * endbrace; /* '}' following the name */
9739 char *endchar; /* Points to '.' or '}' ending cur char in the input
9741 bool has_multiple_chars; /* true if the input stream contains a sequence of
9742 more than one character */
9744 GET_RE_DEBUG_FLAGS_DECL;
9746 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9750 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9752 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9753 * modifier. The other meaning does not */
9754 p = (RExC_flags & RXf_PMf_EXTENDED)
9755 ? regwhite( pRExC_state, RExC_parse )
9758 /* Disambiguate between \N meaning a named character versus \N meaning
9759 * [^\n]. The former is assumed when it can't be the latter. */
9760 if (*p != '{' || regcurly(p)) {
9763 /* no bare \N in a charclass */
9764 if (in_char_class) {
9765 vFAIL("\\N in a character class must be a named character: \\N{...}");
9769 nextchar(pRExC_state);
9770 *node_p = reg_node(pRExC_state, REG_ANY);
9771 *flagp |= HASWIDTH|SIMPLE;
9774 Set_Node_Length(*node_p, 1); /* MJD */
9778 /* Here, we have decided it should be a named character or sequence */
9780 /* The test above made sure that the next real character is a '{', but
9781 * under the /x modifier, it could be separated by space (or a comment and
9782 * \n) and this is not allowed (for consistency with \x{...} and the
9783 * tokenizer handling of \N{NAME}). */
9784 if (*RExC_parse != '{') {
9785 vFAIL("Missing braces on \\N{}");
9788 RExC_parse++; /* Skip past the '{' */
9790 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9791 || ! (endbrace == RExC_parse /* nothing between the {} */
9792 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9793 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9795 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9796 vFAIL("\\N{NAME} must be resolved by the lexer");
9799 if (endbrace == RExC_parse) { /* empty: \N{} */
9802 *node_p = reg_node(pRExC_state,NOTHING);
9804 else if (in_char_class) {
9805 if (SIZE_ONLY && in_char_class) {
9806 ckWARNreg(RExC_parse,
9807 "Ignoring zero length \\N{} in character class"
9815 nextchar(pRExC_state);
9819 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9820 RExC_parse += 2; /* Skip past the 'U+' */
9822 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9824 /* Code points are separated by dots. If none, there is only one code
9825 * point, and is terminated by the brace */
9826 has_multiple_chars = (endchar < endbrace);
9828 if (valuep && (! has_multiple_chars || in_char_class)) {
9829 /* We only pay attention to the first char of
9830 multichar strings being returned in char classes. I kinda wonder
9831 if this makes sense as it does change the behaviour
9832 from earlier versions, OTOH that behaviour was broken
9833 as well. XXX Solution is to recharacterize as
9834 [rest-of-class]|multi1|multi2... */
9836 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9837 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9838 | PERL_SCAN_DISALLOW_PREFIX
9839 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9841 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9843 /* The tokenizer should have guaranteed validity, but it's possible to
9844 * bypass it by using single quoting, so check */
9845 if (length_of_hex == 0
9846 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9848 RExC_parse += length_of_hex; /* Includes all the valid */
9849 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9850 ? UTF8SKIP(RExC_parse)
9852 /* Guard against malformed utf8 */
9853 if (RExC_parse >= endchar) {
9854 RExC_parse = endchar;
9856 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9859 if (in_char_class && has_multiple_chars) {
9860 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9863 RExC_parse = endbrace + 1;
9865 else if (! node_p || ! has_multiple_chars) {
9867 /* Here, the input is legal, but not according to the caller's
9868 * options. We fail without advancing the parse, so that the
9869 * caller can try again */
9875 /* What is done here is to convert this to a sub-pattern of the form
9876 * (?:\x{char1}\x{char2}...)
9877 * and then call reg recursively. That way, it retains its atomicness,
9878 * while not having to worry about special handling that some code
9879 * points may have. toke.c has converted the original Unicode values
9880 * to native, so that we can just pass on the hex values unchanged. We
9881 * do have to set a flag to keep recoding from happening in the
9884 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9886 char *orig_end = RExC_end;
9889 while (RExC_parse < endbrace) {
9891 /* Convert to notation the rest of the code understands */
9892 sv_catpv(substitute_parse, "\\x{");
9893 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9894 sv_catpv(substitute_parse, "}");
9896 /* Point to the beginning of the next character in the sequence. */
9897 RExC_parse = endchar + 1;
9898 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9900 sv_catpv(substitute_parse, ")");
9902 RExC_parse = SvPV(substitute_parse, len);
9904 /* Don't allow empty number */
9906 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9908 RExC_end = RExC_parse + len;
9910 /* The values are Unicode, and therefore not subject to recoding */
9911 RExC_override_recoding = 1;
9913 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9914 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9916 RExC_parse = endbrace;
9917 RExC_end = orig_end;
9918 RExC_override_recoding = 0;
9920 nextchar(pRExC_state);
9930 * It returns the code point in utf8 for the value in *encp.
9931 * value: a code value in the source encoding
9932 * encp: a pointer to an Encode object
9934 * If the result from Encode is not a single character,
9935 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9938 S_reg_recode(pTHX_ const char value, SV **encp)
9941 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9942 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9943 const STRLEN newlen = SvCUR(sv);
9944 UV uv = UNICODE_REPLACEMENT;
9946 PERL_ARGS_ASSERT_REG_RECODE;
9950 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9953 if (!newlen || numlen != newlen) {
9954 uv = UNICODE_REPLACEMENT;
9960 PERL_STATIC_INLINE U8
9961 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9965 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9971 op = get_regex_charset(RExC_flags);
9972 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9973 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9974 been, so there is no hole */
9980 PERL_STATIC_INLINE void
9981 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9983 /* This knows the details about sizing an EXACTish node, setting flags for
9984 * it (by setting <*flagp>, and potentially populating it with a single
9987 * If <len> (the length in bytes) is non-zero, this function assumes that
9988 * the node has already been populated, and just does the sizing. In this
9989 * case <code_point> should be the final code point that has already been
9990 * placed into the node. This value will be ignored except that under some
9991 * circumstances <*flagp> is set based on it.
9993 * If <len> is zero, the function assumes that the node is to contain only
9994 * the single character given by <code_point> and calculates what <len>
9995 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9996 * additionally will populate the node's STRING with <code_point>, if <len>
9997 * is 0. In both cases <*flagp> is appropriately set
9999 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
10000 * folded (the latter only when the rules indicate it can match 'ss') */
10002 bool len_passed_in = cBOOL(len != 0);
10003 U8 character[UTF8_MAXBYTES_CASE+1];
10005 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10007 if (! len_passed_in) {
10010 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10013 uvchr_to_utf8( character, code_point);
10014 len = UTF8SKIP(character);
10018 || code_point != LATIN_SMALL_LETTER_SHARP_S
10019 || ASCII_FOLD_RESTRICTED
10020 || ! AT_LEAST_UNI_SEMANTICS)
10022 *character = (U8) code_point;
10027 *(character + 1) = 's';
10033 RExC_size += STR_SZ(len);
10036 RExC_emit += STR_SZ(len);
10037 STR_LEN(node) = len;
10038 if (! len_passed_in) {
10039 Copy((char *) character, STRING(node), len, char);
10043 *flagp |= HASWIDTH;
10045 /* A single character node is SIMPLE, except for the special-cased SHARP S
10047 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10048 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10049 || ! FOLD || ! DEPENDS_SEMANTICS))
10056 - regatom - the lowest level
10058 Try to identify anything special at the start of the pattern. If there
10059 is, then handle it as required. This may involve generating a single regop,
10060 such as for an assertion; or it may involve recursing, such as to
10061 handle a () structure.
10063 If the string doesn't start with something special then we gobble up
10064 as much literal text as we can.
10066 Once we have been able to handle whatever type of thing started the
10067 sequence, we return.
10069 Note: we have to be careful with escapes, as they can be both literal
10070 and special, and in the case of \10 and friends, context determines which.
10072 A summary of the code structure is:
10074 switch (first_byte) {
10075 cases for each special:
10076 handle this special;
10079 switch (2nd byte) {
10080 cases for each unambiguous special:
10081 handle this special;
10083 cases for each ambigous special/literal:
10085 if (special) handle here
10087 default: // unambiguously literal:
10090 default: // is a literal char
10093 create EXACTish node for literal;
10094 while (more input and node isn't full) {
10095 switch (input_byte) {
10096 cases for each special;
10097 make sure parse pointer is set so that the next call to
10098 regatom will see this special first
10099 goto loopdone; // EXACTish node terminated by prev. char
10101 append char to EXACTISH node;
10103 get next input byte;
10107 return the generated node;
10109 Specifically there are two separate switches for handling
10110 escape sequences, with the one for handling literal escapes requiring
10111 a dummy entry for all of the special escapes that are actually handled
10116 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10119 regnode *ret = NULL;
10121 char *parse_start = RExC_parse;
10123 GET_RE_DEBUG_FLAGS_DECL;
10124 DEBUG_PARSE("atom");
10125 *flagp = WORST; /* Tentatively. */
10127 PERL_ARGS_ASSERT_REGATOM;
10130 switch ((U8)*RExC_parse) {
10132 RExC_seen_zerolen++;
10133 nextchar(pRExC_state);
10134 if (RExC_flags & RXf_PMf_MULTILINE)
10135 ret = reg_node(pRExC_state, MBOL);
10136 else if (RExC_flags & RXf_PMf_SINGLELINE)
10137 ret = reg_node(pRExC_state, SBOL);
10139 ret = reg_node(pRExC_state, BOL);
10140 Set_Node_Length(ret, 1); /* MJD */
10143 nextchar(pRExC_state);
10145 RExC_seen_zerolen++;
10146 if (RExC_flags & RXf_PMf_MULTILINE)
10147 ret = reg_node(pRExC_state, MEOL);
10148 else if (RExC_flags & RXf_PMf_SINGLELINE)
10149 ret = reg_node(pRExC_state, SEOL);
10151 ret = reg_node(pRExC_state, EOL);
10152 Set_Node_Length(ret, 1); /* MJD */
10155 nextchar(pRExC_state);
10156 if (RExC_flags & RXf_PMf_SINGLELINE)
10157 ret = reg_node(pRExC_state, SANY);
10159 ret = reg_node(pRExC_state, REG_ANY);
10160 *flagp |= HASWIDTH|SIMPLE;
10162 Set_Node_Length(ret, 1); /* MJD */
10166 char * const oregcomp_parse = ++RExC_parse;
10167 ret = regclass(pRExC_state, flagp,depth+1);
10168 if (*RExC_parse != ']') {
10169 RExC_parse = oregcomp_parse;
10170 vFAIL("Unmatched [");
10172 nextchar(pRExC_state);
10173 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10177 nextchar(pRExC_state);
10178 ret = reg(pRExC_state, 1, &flags,depth+1);
10180 if (flags & TRYAGAIN) {
10181 if (RExC_parse == RExC_end) {
10182 /* Make parent create an empty node if needed. */
10183 *flagp |= TRYAGAIN;
10190 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10194 if (flags & TRYAGAIN) {
10195 *flagp |= TRYAGAIN;
10198 vFAIL("Internal urp");
10199 /* Supposed to be caught earlier. */
10205 vFAIL("Quantifier follows nothing");
10210 This switch handles escape sequences that resolve to some kind
10211 of special regop and not to literal text. Escape sequnces that
10212 resolve to literal text are handled below in the switch marked
10215 Every entry in this switch *must* have a corresponding entry
10216 in the literal escape switch. However, the opposite is not
10217 required, as the default for this switch is to jump to the
10218 literal text handling code.
10220 switch ((U8)*++RExC_parse) {
10221 /* Special Escapes */
10223 RExC_seen_zerolen++;
10224 ret = reg_node(pRExC_state, SBOL);
10226 goto finish_meta_pat;
10228 ret = reg_node(pRExC_state, GPOS);
10229 RExC_seen |= REG_SEEN_GPOS;
10231 goto finish_meta_pat;
10233 RExC_seen_zerolen++;
10234 ret = reg_node(pRExC_state, KEEPS);
10236 /* XXX:dmq : disabling in-place substitution seems to
10237 * be necessary here to avoid cases of memory corruption, as
10238 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10240 RExC_seen |= REG_SEEN_LOOKBEHIND;
10241 goto finish_meta_pat;
10243 ret = reg_node(pRExC_state, SEOL);
10245 RExC_seen_zerolen++; /* Do not optimize RE away */
10246 goto finish_meta_pat;
10248 ret = reg_node(pRExC_state, EOS);
10250 RExC_seen_zerolen++; /* Do not optimize RE away */
10251 goto finish_meta_pat;
10253 ret = reg_node(pRExC_state, CANY);
10254 RExC_seen |= REG_SEEN_CANY;
10255 *flagp |= HASWIDTH|SIMPLE;
10256 goto finish_meta_pat;
10258 ret = reg_node(pRExC_state, CLUMP);
10259 *flagp |= HASWIDTH;
10260 goto finish_meta_pat;
10262 op = ALNUM + get_regex_charset(RExC_flags);
10263 if (op > ALNUMA) { /* /aa is same as /a */
10266 ret = reg_node(pRExC_state, op);
10267 *flagp |= HASWIDTH|SIMPLE;
10268 goto finish_meta_pat;
10270 op = NALNUM + get_regex_charset(RExC_flags);
10271 if (op > NALNUMA) { /* /aa is same as /a */
10274 ret = reg_node(pRExC_state, op);
10275 *flagp |= HASWIDTH|SIMPLE;
10276 goto finish_meta_pat;
10278 RExC_seen_zerolen++;
10279 RExC_seen |= REG_SEEN_LOOKBEHIND;
10280 op = BOUND + get_regex_charset(RExC_flags);
10281 if (op > BOUNDA) { /* /aa is same as /a */
10284 ret = reg_node(pRExC_state, op);
10285 FLAGS(ret) = get_regex_charset(RExC_flags);
10287 goto finish_meta_pat;
10289 RExC_seen_zerolen++;
10290 RExC_seen |= REG_SEEN_LOOKBEHIND;
10291 op = NBOUND + get_regex_charset(RExC_flags);
10292 if (op > NBOUNDA) { /* /aa is same as /a */
10295 ret = reg_node(pRExC_state, op);
10296 FLAGS(ret) = get_regex_charset(RExC_flags);
10298 goto finish_meta_pat;
10300 op = SPACE + get_regex_charset(RExC_flags);
10301 if (op > SPACEA) { /* /aa is same as /a */
10304 ret = reg_node(pRExC_state, op);
10305 *flagp |= HASWIDTH|SIMPLE;
10306 goto finish_meta_pat;
10308 op = NSPACE + get_regex_charset(RExC_flags);
10309 if (op > NSPACEA) { /* /aa is same as /a */
10312 ret = reg_node(pRExC_state, op);
10313 *flagp |= HASWIDTH|SIMPLE;
10314 goto finish_meta_pat;
10322 U8 offset = get_regex_charset(RExC_flags);
10323 if (offset == REGEX_UNICODE_CHARSET) {
10324 offset = REGEX_DEPENDS_CHARSET;
10326 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10327 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10331 ret = reg_node(pRExC_state, op);
10332 *flagp |= HASWIDTH|SIMPLE;
10333 goto finish_meta_pat;
10335 ret = reg_node(pRExC_state, LNBREAK);
10336 *flagp |= HASWIDTH|SIMPLE;
10337 goto finish_meta_pat;
10339 ret = reg_node(pRExC_state, HORIZWS);
10340 *flagp |= HASWIDTH|SIMPLE;
10341 goto finish_meta_pat;
10343 ret = reg_node(pRExC_state, NHORIZWS);
10344 *flagp |= HASWIDTH|SIMPLE;
10345 goto finish_meta_pat;
10347 ret = reg_node(pRExC_state, VERTWS);
10348 *flagp |= HASWIDTH|SIMPLE;
10349 goto finish_meta_pat;
10351 ret = reg_node(pRExC_state, NVERTWS);
10352 *flagp |= HASWIDTH|SIMPLE;
10354 nextchar(pRExC_state);
10355 Set_Node_Length(ret, 2); /* MJD */
10360 char* const oldregxend = RExC_end;
10362 char* parse_start = RExC_parse - 2;
10365 if (RExC_parse[1] == '{') {
10366 /* a lovely hack--pretend we saw [\pX] instead */
10367 RExC_end = strchr(RExC_parse, '}');
10369 const U8 c = (U8)*RExC_parse;
10371 RExC_end = oldregxend;
10372 vFAIL2("Missing right brace on \\%c{}", c);
10377 RExC_end = RExC_parse + 2;
10378 if (RExC_end > oldregxend)
10379 RExC_end = oldregxend;
10383 ret = regclass(pRExC_state, flagp,depth+1);
10385 RExC_end = oldregxend;
10388 Set_Node_Offset(ret, parse_start + 2);
10389 Set_Node_Cur_Length(ret);
10390 nextchar(pRExC_state);
10394 /* Handle \N and \N{NAME} with multiple code points here and not
10395 * below because it can be multicharacter. join_exact() will join
10396 * them up later on. Also this makes sure that things like
10397 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10398 * The options to the grok function call causes it to fail if the
10399 * sequence is just a single code point. We then go treat it as
10400 * just another character in the current EXACT node, and hence it
10401 * gets uniform treatment with all the other characters. The
10402 * special treatment for quantifiers is not needed for such single
10403 * character sequences */
10405 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10410 case 'k': /* Handle \k<NAME> and \k'NAME' */
10413 char ch= RExC_parse[1];
10414 if (ch != '<' && ch != '\'' && ch != '{') {
10416 vFAIL2("Sequence %.2s... not terminated",parse_start);
10418 /* this pretty much dupes the code for (?P=...) in reg(), if
10419 you change this make sure you change that */
10420 char* name_start = (RExC_parse += 2);
10422 SV *sv_dat = reg_scan_name(pRExC_state,
10423 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10424 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10425 if (RExC_parse == name_start || *RExC_parse != ch)
10426 vFAIL2("Sequence %.3s... not terminated",parse_start);
10429 num = add_data( pRExC_state, 1, "S" );
10430 RExC_rxi->data->data[num]=(void*)sv_dat;
10431 SvREFCNT_inc_simple_void(sv_dat);
10435 ret = reganode(pRExC_state,
10438 : (ASCII_FOLD_RESTRICTED)
10440 : (AT_LEAST_UNI_SEMANTICS)
10446 *flagp |= HASWIDTH;
10448 /* override incorrect value set in reganode MJD */
10449 Set_Node_Offset(ret, parse_start+1);
10450 Set_Node_Cur_Length(ret); /* MJD */
10451 nextchar(pRExC_state);
10457 case '1': case '2': case '3': case '4':
10458 case '5': case '6': case '7': case '8': case '9':
10461 bool isg = *RExC_parse == 'g';
10466 if (*RExC_parse == '{') {
10470 if (*RExC_parse == '-') {
10474 if (hasbrace && !isDIGIT(*RExC_parse)) {
10475 if (isrel) RExC_parse--;
10477 goto parse_named_seq;
10479 num = atoi(RExC_parse);
10480 if (isg && num == 0)
10481 vFAIL("Reference to invalid group 0");
10483 num = RExC_npar - num;
10485 vFAIL("Reference to nonexistent or unclosed group");
10487 if (!isg && num > 9 && num >= RExC_npar)
10488 /* Probably a character specified in octal, e.g. \35 */
10491 char * const parse_start = RExC_parse - 1; /* MJD */
10492 while (isDIGIT(*RExC_parse))
10494 if (parse_start == RExC_parse - 1)
10495 vFAIL("Unterminated \\g... pattern");
10497 if (*RExC_parse != '}')
10498 vFAIL("Unterminated \\g{...} pattern");
10502 if (num > (I32)RExC_rx->nparens)
10503 vFAIL("Reference to nonexistent group");
10506 ret = reganode(pRExC_state,
10509 : (ASCII_FOLD_RESTRICTED)
10511 : (AT_LEAST_UNI_SEMANTICS)
10517 *flagp |= HASWIDTH;
10519 /* override incorrect value set in reganode MJD */
10520 Set_Node_Offset(ret, parse_start+1);
10521 Set_Node_Cur_Length(ret); /* MJD */
10523 nextchar(pRExC_state);
10528 if (RExC_parse >= RExC_end)
10529 FAIL("Trailing \\");
10532 /* Do not generate "unrecognized" warnings here, we fall
10533 back into the quick-grab loop below */
10540 if (RExC_flags & RXf_PMf_EXTENDED) {
10541 if ( reg_skipcomment( pRExC_state ) )
10548 parse_start = RExC_parse - 1;
10557 #define MAX_NODE_STRING_SIZE 127
10558 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10560 U8 upper_parse = MAX_NODE_STRING_SIZE;
10563 bool next_is_quantifier;
10564 char * oldp = NULL;
10566 /* If a folding node contains only code points that don't
10567 * participate in folds, it can be changed into an EXACT node,
10568 * which allows the optimizer more things to look for */
10572 node_type = compute_EXACTish(pRExC_state);
10573 ret = reg_node(pRExC_state, node_type);
10575 /* In pass1, folded, we use a temporary buffer instead of the
10576 * actual node, as the node doesn't exist yet */
10577 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10583 /* We do the EXACTFish to EXACT node only if folding, and not if in
10584 * locale, as whether a character folds or not isn't known until
10586 maybe_exact = FOLD && ! LOC;
10588 /* XXX The node can hold up to 255 bytes, yet this only goes to
10589 * 127. I (khw) do not know why. Keeping it somewhat less than
10590 * 255 allows us to not have to worry about overflow due to
10591 * converting to utf8 and fold expansion, but that value is
10592 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10593 * split up by this limit into a single one using the real max of
10594 * 255. Even at 127, this breaks under rare circumstances. If
10595 * folding, we do not want to split a node at a character that is a
10596 * non-final in a multi-char fold, as an input string could just
10597 * happen to want to match across the node boundary. The join
10598 * would solve that problem if the join actually happens. But a
10599 * series of more than two nodes in a row each of 127 would cause
10600 * the first join to succeed to get to 254, but then there wouldn't
10601 * be room for the next one, which could at be one of those split
10602 * multi-char folds. I don't know of any fool-proof solution. One
10603 * could back off to end with only a code point that isn't such a
10604 * non-final, but it is possible for there not to be any in the
10606 for (p = RExC_parse - 1;
10607 len < upper_parse && p < RExC_end;
10612 if (RExC_flags & RXf_PMf_EXTENDED)
10613 p = regwhite( pRExC_state, p );
10624 /* Literal Escapes Switch
10626 This switch is meant to handle escape sequences that
10627 resolve to a literal character.
10629 Every escape sequence that represents something
10630 else, like an assertion or a char class, is handled
10631 in the switch marked 'Special Escapes' above in this
10632 routine, but also has an entry here as anything that
10633 isn't explicitly mentioned here will be treated as
10634 an unescaped equivalent literal.
10637 switch ((U8)*++p) {
10638 /* These are all the special escapes. */
10639 case 'A': /* Start assertion */
10640 case 'b': case 'B': /* Word-boundary assertion*/
10641 case 'C': /* Single char !DANGEROUS! */
10642 case 'd': case 'D': /* digit class */
10643 case 'g': case 'G': /* generic-backref, pos assertion */
10644 case 'h': case 'H': /* HORIZWS */
10645 case 'k': case 'K': /* named backref, keep marker */
10646 case 'p': case 'P': /* Unicode property */
10647 case 'R': /* LNBREAK */
10648 case 's': case 'S': /* space class */
10649 case 'v': case 'V': /* VERTWS */
10650 case 'w': case 'W': /* word class */
10651 case 'X': /* eXtended Unicode "combining character sequence" */
10652 case 'z': case 'Z': /* End of line/string assertion */
10656 /* Anything after here is an escape that resolves to a
10657 literal. (Except digits, which may or may not)
10663 case 'N': /* Handle a single-code point named character. */
10664 /* The options cause it to fail if a multiple code
10665 * point sequence. Handle those in the switch() above
10667 RExC_parse = p + 1;
10668 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10669 flagp, depth, FALSE))
10671 RExC_parse = p = oldp;
10675 if (ender > 0xff) {
10692 ender = ASCII_TO_NATIVE('\033');
10696 ender = ASCII_TO_NATIVE('\007');
10701 STRLEN brace_len = len;
10703 const char* error_msg;
10705 bool valid = grok_bslash_o(p,
10712 RExC_parse = p; /* going to die anyway; point
10713 to exact spot of failure */
10720 if (PL_encoding && ender < 0x100) {
10721 goto recode_encoding;
10723 if (ender > 0xff) {
10730 STRLEN brace_len = len;
10732 const char* error_msg;
10734 bool valid = grok_bslash_x(p,
10741 RExC_parse = p; /* going to die anyway; point
10742 to exact spot of failure */
10748 if (PL_encoding && ender < 0x100) {
10749 goto recode_encoding;
10751 if (ender > 0xff) {
10758 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10760 case '0': case '1': case '2': case '3':case '4':
10761 case '5': case '6': case '7':
10763 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10765 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10767 ender = grok_oct(p, &numlen, &flags, NULL);
10768 if (ender > 0xff) {
10777 if (PL_encoding && ender < 0x100)
10778 goto recode_encoding;
10781 if (! RExC_override_recoding) {
10782 SV* enc = PL_encoding;
10783 ender = reg_recode((const char)(U8)ender, &enc);
10784 if (!enc && SIZE_ONLY)
10785 ckWARNreg(p, "Invalid escape in the specified encoding");
10791 FAIL("Trailing \\");
10794 if (!SIZE_ONLY&& isALNUMC(*p)) {
10795 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10797 goto normal_default;
10801 /* Currently we don't warn when the lbrace is at the start
10802 * of a construct. This catches it in the middle of a
10803 * literal string, or when its the first thing after
10804 * something like "\b" */
10806 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10808 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10813 if (UTF8_IS_START(*p) && UTF) {
10815 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10816 &numlen, UTF8_ALLOW_DEFAULT);
10822 } /* End of switch on the literal */
10824 /* Here, have looked at the literal character and <ender>
10825 * contains its ordinal, <p> points to the character after it
10828 if ( RExC_flags & RXf_PMf_EXTENDED)
10829 p = regwhite( pRExC_state, p );
10831 /* If the next thing is a quantifier, it applies to this
10832 * character only, which means that this character has to be in
10833 * its own node and can't just be appended to the string in an
10834 * existing node, so if there are already other characters in
10835 * the node, close the node with just them, and set up to do
10836 * this character again next time through, when it will be the
10837 * only thing in its new node */
10838 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10846 /* See comments for join_exact() as to why we fold
10847 * this non-UTF at compile time */
10848 || (node_type == EXACTFU
10849 && ender == LATIN_SMALL_LETTER_SHARP_S))
10853 /* Prime the casefolded buffer. Locale rules, which
10854 * apply only to code points < 256, aren't known until
10855 * execution, so for them, just output the original
10856 * character using utf8. If we start to fold non-UTF
10857 * patterns, be sure to update join_exact() */
10858 if (LOC && ender < 256) {
10859 if (UNI_IS_INVARIANT(ender)) {
10863 *s = UTF8_TWO_BYTE_HI(ender);
10864 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10869 UV folded = _to_uni_fold_flags(
10874 | ((LOC) ? FOLD_FLAGS_LOCALE
10875 : (ASCII_FOLD_RESTRICTED)
10876 ? FOLD_FLAGS_NOMIX_ASCII
10880 /* If this node only contains non-folding code
10881 * points so far, see if this new one is also
10884 if (folded != ender) {
10885 maybe_exact = FALSE;
10888 /* Here the fold is the original; we have
10889 * to check further to see if anything
10891 if (! PL_utf8_foldable) {
10892 SV* swash = swash_init("utf8",
10894 &PL_sv_undef, 1, 0);
10896 _get_swash_invlist(swash);
10897 SvREFCNT_dec(swash);
10899 if (_invlist_contains_cp(PL_utf8_foldable,
10902 maybe_exact = FALSE;
10910 /* The loop increments <len> each time, as all but this
10911 * path (and the one just below for UTF) through it add
10912 * a single byte to the EXACTish node. But this one
10913 * has changed len to be the correct final value, so
10914 * subtract one to cancel out the increment that
10916 len += foldlen - 1;
10920 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10924 const STRLEN unilen = reguni(pRExC_state, ender, s);
10930 /* See comment just above for - 1 */
10934 REGC((char)ender, s++);
10937 if (next_is_quantifier) {
10939 /* Here, the next input is a quantifier, and to get here,
10940 * the current character is the only one in the node.
10941 * Also, here <len> doesn't include the final byte for this
10947 } /* End of loop through literal characters */
10949 /* Here we have either exhausted the input or ran out of room in
10950 * the node. (If we encountered a character that can't be in the
10951 * node, transfer is made directly to <loopdone>, and so we
10952 * wouldn't have fallen off the end of the loop.) In the latter
10953 * case, we artificially have to split the node into two, because
10954 * we just don't have enough space to hold everything. This
10955 * creates a problem if the final character participates in a
10956 * multi-character fold in the non-final position, as a match that
10957 * should have occurred won't, due to the way nodes are matched,
10958 * and our artificial boundary. So back off until we find a non-
10959 * problematic character -- one that isn't at the beginning or
10960 * middle of such a fold. (Either it doesn't participate in any
10961 * folds, or appears only in the final position of all the folds it
10962 * does participate in.) A better solution with far fewer false
10963 * positives, and that would fill the nodes more completely, would
10964 * be to actually have available all the multi-character folds to
10965 * test against, and to back-off only far enough to be sure that
10966 * this node isn't ending with a partial one. <upper_parse> is set
10967 * further below (if we need to reparse the node) to include just
10968 * up through that final non-problematic character that this code
10969 * identifies, so when it is set to less than the full node, we can
10970 * skip the rest of this */
10971 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10973 const STRLEN full_len = len;
10975 assert(len >= MAX_NODE_STRING_SIZE);
10977 /* Here, <s> points to the final byte of the final character.
10978 * Look backwards through the string until find a non-
10979 * problematic character */
10983 /* These two have no multi-char folds to non-UTF characters
10985 if (ASCII_FOLD_RESTRICTED || LOC) {
10989 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10993 if (! PL_NonL1NonFinalFold) {
10994 PL_NonL1NonFinalFold = _new_invlist_C_array(
10995 NonL1_Perl_Non_Final_Folds_invlist);
10998 /* Point to the first byte of the final character */
10999 s = (char *) utf8_hop((U8 *) s, -1);
11001 while (s >= s0) { /* Search backwards until find
11002 non-problematic char */
11003 if (UTF8_IS_INVARIANT(*s)) {
11005 /* There are no ascii characters that participate
11006 * in multi-char folds under /aa. In EBCDIC, the
11007 * non-ascii invariants are all control characters,
11008 * so don't ever participate in any folds. */
11009 if (ASCII_FOLD_RESTRICTED
11010 || ! IS_NON_FINAL_FOLD(*s))
11015 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11017 /* No Latin1 characters participate in multi-char
11018 * folds under /l */
11020 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11026 else if (! _invlist_contains_cp(
11027 PL_NonL1NonFinalFold,
11028 valid_utf8_to_uvchr((U8 *) s, NULL)))
11033 /* Here, the current character is problematic in that
11034 * it does occur in the non-final position of some
11035 * fold, so try the character before it, but have to
11036 * special case the very first byte in the string, so
11037 * we don't read outside the string */
11038 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11039 } /* End of loop backwards through the string */
11041 /* If there were only problematic characters in the string,
11042 * <s> will point to before s0, in which case the length
11043 * should be 0, otherwise include the length of the
11044 * non-problematic character just found */
11045 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11048 /* Here, have found the final character, if any, that is
11049 * non-problematic as far as ending the node without splitting
11050 * it across a potential multi-char fold. <len> contains the
11051 * number of bytes in the node up-to and including that
11052 * character, or is 0 if there is no such character, meaning
11053 * the whole node contains only problematic characters. In
11054 * this case, give up and just take the node as-is. We can't
11060 /* Here, the node does contain some characters that aren't
11061 * problematic. If one such is the final character in the
11062 * node, we are done */
11063 if (len == full_len) {
11066 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11068 /* If the final character is problematic, but the
11069 * penultimate is not, back-off that last character to
11070 * later start a new node with it */
11075 /* Here, the final non-problematic character is earlier
11076 * in the input than the penultimate character. What we do
11077 * is reparse from the beginning, going up only as far as
11078 * this final ok one, thus guaranteeing that the node ends
11079 * in an acceptable character. The reason we reparse is
11080 * that we know how far in the character is, but we don't
11081 * know how to correlate its position with the input parse.
11082 * An alternate implementation would be to build that
11083 * correlation as we go along during the original parse,
11084 * but that would entail extra work for every node, whereas
11085 * this code gets executed only when the string is too
11086 * large for the node, and the final two characters are
11087 * problematic, an infrequent occurrence. Yet another
11088 * possible strategy would be to save the tail of the
11089 * string, and the next time regatom is called, initialize
11090 * with that. The problem with this is that unless you
11091 * back off one more character, you won't be guaranteed
11092 * regatom will get called again, unless regbranch,
11093 * regpiece ... are also changed. If you do back off that
11094 * extra character, so that there is input guaranteed to
11095 * force calling regatom, you can't handle the case where
11096 * just the first character in the node is acceptable. I
11097 * (khw) decided to try this method which doesn't have that
11098 * pitfall; if performance issues are found, we can do a
11099 * combination of the current approach plus that one */
11105 } /* End of verifying node ends with an appropriate char */
11107 loopdone: /* Jumped to when encounters something that shouldn't be in
11110 /* If 'maybe_exact' is still set here, means there are no
11111 * code points in the node that participate in folds */
11112 if (FOLD && maybe_exact) {
11116 /* I (khw) don't know if you can get here with zero length, but the
11117 * old code handled this situation by creating a zero-length EXACT
11118 * node. Might as well be NOTHING instead */
11123 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11126 RExC_parse = p - 1;
11127 Set_Node_Cur_Length(ret); /* MJD */
11128 nextchar(pRExC_state);
11130 /* len is STRLEN which is unsigned, need to copy to signed */
11133 vFAIL("Internal disaster");
11136 } /* End of label 'defchar:' */
11138 } /* End of giant switch on input character */
11144 S_regwhite( RExC_state_t *pRExC_state, char *p )
11146 const char *e = RExC_end;
11148 PERL_ARGS_ASSERT_REGWHITE;
11153 else if (*p == '#') {
11156 if (*p++ == '\n') {
11162 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11170 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11171 Character classes ([:foo:]) can also be negated ([:^foo:]).
11172 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11173 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11174 but trigger failures because they are currently unimplemented. */
11176 #define POSIXCC_DONE(c) ((c) == ':')
11177 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11178 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11180 PERL_STATIC_INLINE I32
11181 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
11184 I32 namedclass = OOB_NAMEDCLASS;
11186 PERL_ARGS_ASSERT_REGPPOSIXCC;
11188 if (value == '[' && RExC_parse + 1 < RExC_end &&
11189 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11190 POSIXCC(UCHARAT(RExC_parse))) {
11191 const char c = UCHARAT(RExC_parse);
11192 char* const s = RExC_parse++;
11194 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11196 if (RExC_parse == RExC_end)
11197 /* Grandfather lone [:, [=, [. */
11200 const char* const t = RExC_parse++; /* skip over the c */
11203 if (UCHARAT(RExC_parse) == ']') {
11204 const char *posixcc = s + 1;
11205 RExC_parse++; /* skip over the ending ] */
11208 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11209 const I32 skip = t - posixcc;
11211 /* Initially switch on the length of the name. */
11214 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11215 namedclass = ANYOF_WORDCHAR;
11218 /* Names all of length 5. */
11219 /* alnum alpha ascii blank cntrl digit graph lower
11220 print punct space upper */
11221 /* Offset 4 gives the best switch position. */
11222 switch (posixcc[4]) {
11224 if (memEQ(posixcc, "alph", 4)) /* alpha */
11225 namedclass = ANYOF_ALPHA;
11228 if (memEQ(posixcc, "spac", 4)) /* space */
11229 namedclass = ANYOF_PSXSPC;
11232 if (memEQ(posixcc, "grap", 4)) /* graph */
11233 namedclass = ANYOF_GRAPH;
11236 if (memEQ(posixcc, "asci", 4)) /* ascii */
11237 namedclass = ANYOF_ASCII;
11240 if (memEQ(posixcc, "blan", 4)) /* blank */
11241 namedclass = ANYOF_BLANK;
11244 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11245 namedclass = ANYOF_CNTRL;
11248 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11249 namedclass = ANYOF_ALNUMC;
11252 if (memEQ(posixcc, "lowe", 4)) /* lower */
11253 namedclass = ANYOF_LOWER;
11254 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11255 namedclass = ANYOF_UPPER;
11258 if (memEQ(posixcc, "digi", 4)) /* digit */
11259 namedclass = ANYOF_DIGIT;
11260 else if (memEQ(posixcc, "prin", 4)) /* print */
11261 namedclass = ANYOF_PRINT;
11262 else if (memEQ(posixcc, "punc", 4)) /* punct */
11263 namedclass = ANYOF_PUNCT;
11268 if (memEQ(posixcc, "xdigit", 6))
11269 namedclass = ANYOF_XDIGIT;
11273 if (namedclass == OOB_NAMEDCLASS)
11274 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11277 /* The #defines are structured so each complement is +1 to
11278 * the normal one */
11282 assert (posixcc[skip] == ':');
11283 assert (posixcc[skip+1] == ']');
11284 } else if (!SIZE_ONLY) {
11285 /* [[=foo=]] and [[.foo.]] are still future. */
11287 /* adjust RExC_parse so the warning shows after
11288 the class closes */
11289 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11291 SvREFCNT_dec(free_me);
11292 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11295 /* Maternal grandfather:
11296 * "[:" ending in ":" but not in ":]" */
11305 /* Generate the code to add a full posix character <class> to the bracketed
11306 * character class given by <node>. (<node> is needed only under locale rules)
11307 * destlist is the inversion list for non-locale rules that this class is
11309 * sourcelist is the ASCII-range inversion list to add under /a rules
11310 * Xsourcelist is the full Unicode range list to use otherwise. */
11311 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11313 SV* scratch_list = NULL; \
11315 /* Set this class in the node for runtime matching */ \
11316 ANYOF_CLASS_SET(node, class); \
11318 /* For above Latin1 code points, we use the full Unicode range */ \
11319 _invlist_intersection(PL_AboveLatin1, \
11322 /* And set the output to it, adding instead if there already is an \
11323 * output. Checking if <destlist> is NULL first saves an extra \
11324 * clone. Its reference count will be decremented at the next \
11325 * union, etc, or if this is the only instance, at the end of the \
11327 if (! destlist) { \
11328 destlist = scratch_list; \
11331 _invlist_union(destlist, scratch_list, &destlist); \
11332 SvREFCNT_dec(scratch_list); \
11336 /* For non-locale, just add it to any existing list */ \
11337 _invlist_union(destlist, \
11338 (AT_LEAST_ASCII_RESTRICTED) \
11344 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11346 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11348 SV* scratch_list = NULL; \
11349 ANYOF_CLASS_SET(node, class); \
11350 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11351 if (! destlist) { \
11352 destlist = scratch_list; \
11355 _invlist_union(destlist, scratch_list, &destlist); \
11356 SvREFCNT_dec(scratch_list); \
11360 _invlist_union_complement_2nd(destlist, \
11361 (AT_LEAST_ASCII_RESTRICTED) \
11365 /* Under /d, everything in the upper half of the Latin1 range \
11366 * matches this complement */ \
11367 if (DEPENDS_SEMANTICS) { \
11368 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11372 /* Generate the code to add a posix character <class> to the bracketed
11373 * character class given by <node>. (<node> is needed only under locale rules)
11374 * destlist is the inversion list for non-locale rules that this class is
11376 * sourcelist is the ASCII-range inversion list to add under /a rules
11377 * l1_sourcelist is the Latin1 range list to use otherwise.
11378 * Xpropertyname is the name to add to <run_time_list> of the property to
11379 * specify the code points above Latin1 that will have to be
11380 * determined at run-time
11381 * run_time_list is a SV* that contains text names of properties that are to
11382 * be computed at run time. This concatenates <Xpropertyname>
11383 * to it, appropriately
11384 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11386 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11387 l1_sourcelist, Xpropertyname, run_time_list) \
11388 /* First, resolve whether to use the ASCII-only list or the L1 \
11390 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11391 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11392 Xpropertyname, run_time_list)
11394 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11395 Xpropertyname, run_time_list) \
11396 /* If not /a matching, there are going to be code points we will have \
11397 * to defer to runtime to look-up */ \
11398 if (! AT_LEAST_ASCII_RESTRICTED) { \
11399 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11402 ANYOF_CLASS_SET(node, class); \
11405 _invlist_union(destlist, sourcelist, &destlist); \
11408 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11409 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11411 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11412 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11413 if (AT_LEAST_ASCII_RESTRICTED) { \
11414 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11417 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11418 matches_above_unicode = TRUE; \
11420 ANYOF_CLASS_SET(node, namedclass); \
11423 SV* scratch_list = NULL; \
11424 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11425 if (! destlist) { \
11426 destlist = scratch_list; \
11429 _invlist_union(destlist, scratch_list, &destlist); \
11430 SvREFCNT_dec(scratch_list); \
11432 if (DEPENDS_SEMANTICS) { \
11433 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11438 /* The names of properties whose definitions are not known at compile time are
11439 * stored in this SV, after a constant heading. So if the length has been
11440 * changed since initialization, then there is a run-time definition. */
11441 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11443 /* This converts the named class defined in regcomp.h to its equivalent class
11444 * number defined in handy.h. */
11445 #define namedclass_to_classnum(class) ((class) / 2)
11448 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11450 /* parse a bracketed class specification. Most of these will produce an ANYOF node;
11451 * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11452 * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
11453 * multi-character folds: it will be rewritten following the paradigm of
11454 * this example, where the <multi-fold>s are characters which fold to
11455 * multiple character sequences:
11456 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11457 * gets effectively rewritten as:
11458 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11459 * reg() gets called (recursively) on the rewritten version, and this
11460 * function will return what it constructs. (Actually the <multi-fold>s
11461 * aren't physically removed from the [abcdefghi], it's just that they are
11462 * ignored in the recursion by means of a flag:
11463 * <RExC_in_multi_char_class>.)
11465 * ANYOF nodes contain a bit map for the first 256 characters, with the
11466 * corresponding bit set if that character is in the list. For characters
11467 * above 255, a range list or swash is used. There are extra bits for \w,
11468 * etc. in locale ANYOFs, as what these match is not determinable at
11473 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11475 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11478 IV namedclass = OOB_NAMEDCLASS;
11479 char *rangebegin = NULL;
11480 bool need_class = 0;
11482 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11483 than just initialized. */
11484 SV* properties = NULL; /* Code points that match \p{} \P{} */
11485 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11486 extended beyond the Latin1 range */
11487 UV element_count = 0; /* Number of distinct elements in the class.
11488 Optimizations may be possible if this is tiny */
11489 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11490 character; used under /i */
11493 /* Unicode properties are stored in a swash; this holds the current one
11494 * being parsed. If this swash is the only above-latin1 component of the
11495 * character class, an optimization is to pass it directly on to the
11496 * execution engine. Otherwise, it is set to NULL to indicate that there
11497 * are other things in the class that have to be dealt with at execution
11499 SV* swash = NULL; /* Code points that match \p{} \P{} */
11501 /* Set if a component of this character class is user-defined; just passed
11502 * on to the engine */
11503 bool has_user_defined_property = FALSE;
11505 /* inversion list of code points this node matches only when the target
11506 * string is in UTF-8. (Because is under /d) */
11507 SV* depends_list = NULL;
11509 /* inversion list of code points this node matches. For much of the
11510 * function, it includes only those that match regardless of the utf8ness
11511 * of the target string */
11512 SV* cp_list = NULL;
11515 /* In a range, counts how many 0-2 of the ends of it came from literals,
11516 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11517 UV literal_endpoint = 0;
11519 bool invert = FALSE; /* Is this class to be complemented */
11521 /* Is there any thing like \W or [:^digit:] that matches above the legal
11522 * Unicode range? */
11523 bool runtime_posix_matches_above_Unicode = FALSE;
11525 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11526 case we need to change the emitted regop to an EXACT. */
11527 const char * orig_parse = RExC_parse;
11528 const I32 orig_size = RExC_size;
11529 GET_RE_DEBUG_FLAGS_DECL;
11531 PERL_ARGS_ASSERT_REGCLASS;
11533 PERL_UNUSED_ARG(depth);
11536 DEBUG_PARSE("clas");
11538 /* Assume we are going to generate an ANYOF node. */
11539 ret = reganode(pRExC_state, ANYOF, 0);
11542 ANYOF_FLAGS(ret) = 0;
11545 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11552 RExC_size += ANYOF_SKIP;
11553 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11556 RExC_emit += ANYOF_SKIP;
11558 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11560 listsv = newSVpvs("# comment\n");
11561 initial_listsv_len = SvCUR(listsv);
11564 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11566 if (!SIZE_ONLY && POSIXCC(nextvalue))
11568 const char *s = RExC_parse;
11569 const char c = *s++;
11571 while (isALNUM(*s))
11573 if (*s && c == *s && s[1] == ']') {
11574 SAVEFREESV(RExC_rx_sv);
11575 SAVEFREESV(listsv);
11577 "POSIX syntax [%c %c] belongs inside character classes",
11579 (void)ReREFCNT_inc(RExC_rx_sv);
11580 SvREFCNT_inc_simple_void_NN(listsv);
11584 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11585 if (UCHARAT(RExC_parse) == ']')
11586 goto charclassloop;
11589 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11593 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11594 save_value = value;
11595 save_prevvalue = prevvalue;
11598 rangebegin = RExC_parse;
11602 value = utf8n_to_uvchr((U8*)RExC_parse,
11603 RExC_end - RExC_parse,
11604 &numlen, UTF8_ALLOW_DEFAULT);
11605 RExC_parse += numlen;
11608 value = UCHARAT(RExC_parse++);
11610 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11611 if (value == '[' && POSIXCC(nextvalue))
11612 namedclass = regpposixcc(pRExC_state, value, listsv);
11613 else if (value == '\\') {
11615 value = utf8n_to_uvchr((U8*)RExC_parse,
11616 RExC_end - RExC_parse,
11617 &numlen, UTF8_ALLOW_DEFAULT);
11618 RExC_parse += numlen;
11621 value = UCHARAT(RExC_parse++);
11622 /* Some compilers cannot handle switching on 64-bit integer
11623 * values, therefore value cannot be an UV. Yes, this will
11624 * be a problem later if we want switch on Unicode.
11625 * A similar issue a little bit later when switching on
11626 * namedclass. --jhi */
11627 switch ((I32)value) {
11628 case 'w': namedclass = ANYOF_WORDCHAR; break;
11629 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11630 case 's': namedclass = ANYOF_SPACE; break;
11631 case 'S': namedclass = ANYOF_NSPACE; break;
11632 case 'd': namedclass = ANYOF_DIGIT; break;
11633 case 'D': namedclass = ANYOF_NDIGIT; break;
11634 case 'v': namedclass = ANYOF_VERTWS; break;
11635 case 'V': namedclass = ANYOF_NVERTWS; break;
11636 case 'h': namedclass = ANYOF_HORIZWS; break;
11637 case 'H': namedclass = ANYOF_NHORIZWS; break;
11638 case 'N': /* Handle \N{NAME} in class */
11640 /* We only pay attention to the first char of
11641 multichar strings being returned. I kinda wonder
11642 if this makes sense as it does change the behaviour
11643 from earlier versions, OTOH that behaviour was broken
11645 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11646 TRUE /* => charclass */))
11657 /* This routine will handle any undefined properties */
11658 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11660 if (RExC_parse >= RExC_end)
11661 vFAIL2("Empty \\%c{}", (U8)value);
11662 if (*RExC_parse == '{') {
11663 const U8 c = (U8)value;
11664 e = strchr(RExC_parse++, '}');
11666 vFAIL2("Missing right brace on \\%c{}", c);
11667 while (isSPACE(UCHARAT(RExC_parse)))
11669 if (e == RExC_parse)
11670 vFAIL2("Empty \\%c{}", c);
11671 n = e - RExC_parse;
11672 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11683 if (UCHARAT(RExC_parse) == '^') {
11686 value = value == 'p' ? 'P' : 'p'; /* toggle */
11687 while (isSPACE(UCHARAT(RExC_parse))) {
11692 /* Try to get the definition of the property into
11693 * <invlist>. If /i is in effect, the effective property
11694 * will have its name be <__NAME_i>. The design is
11695 * discussed in commit
11696 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11697 Newx(name, n + sizeof("_i__\n"), char);
11699 sprintf(name, "%s%.*s%s\n",
11700 (FOLD) ? "__" : "",
11706 /* Look up the property name, and get its swash and
11707 * inversion list, if the property is found */
11709 SvREFCNT_dec(swash);
11711 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11714 NULL, /* No inversion list */
11717 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11719 SvREFCNT_dec(swash);
11723 /* Here didn't find it. It could be a user-defined
11724 * property that will be available at run-time. Add it
11725 * to the list to look up then */
11726 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11727 (value == 'p' ? '+' : '!'),
11729 has_user_defined_property = TRUE;
11731 /* We don't know yet, so have to assume that the
11732 * property could match something in the Latin1 range,
11733 * hence something that isn't utf8. Note that this
11734 * would cause things in <depends_list> to match
11735 * inappropriately, except that any \p{}, including
11736 * this one forces Unicode semantics, which means there
11737 * is <no depends_list> */
11738 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11742 /* Here, did get the swash and its inversion list. If
11743 * the swash is from a user-defined property, then this
11744 * whole character class should be regarded as such */
11745 has_user_defined_property =
11747 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11749 /* Invert if asking for the complement */
11750 if (value == 'P') {
11751 _invlist_union_complement_2nd(properties,
11755 /* The swash can't be used as-is, because we've
11756 * inverted things; delay removing it to here after
11757 * have copied its invlist above */
11758 SvREFCNT_dec(swash);
11762 _invlist_union(properties, invlist, &properties);
11767 RExC_parse = e + 1;
11768 namedclass = ANYOF_UNIPROP; /* no official name, but it's named */
11770 /* \p means they want Unicode semantics */
11771 RExC_uni_semantics = 1;
11774 case 'n': value = '\n'; break;
11775 case 'r': value = '\r'; break;
11776 case 't': value = '\t'; break;
11777 case 'f': value = '\f'; break;
11778 case 'b': value = '\b'; break;
11779 case 'e': value = ASCII_TO_NATIVE('\033');break;
11780 case 'a': value = ASCII_TO_NATIVE('\007');break;
11782 RExC_parse--; /* function expects to be pointed at the 'o' */
11784 const char* error_msg;
11785 bool valid = grok_bslash_o(RExC_parse,
11790 RExC_parse += numlen;
11795 if (PL_encoding && value < 0x100) {
11796 goto recode_encoding;
11800 RExC_parse--; /* function expects to be pointed at the 'x' */
11802 const char* error_msg;
11803 bool valid = grok_bslash_x(RExC_parse,
11808 RExC_parse += numlen;
11813 if (PL_encoding && value < 0x100)
11814 goto recode_encoding;
11817 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11819 case '0': case '1': case '2': case '3': case '4':
11820 case '5': case '6': case '7':
11822 /* Take 1-3 octal digits */
11823 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11825 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11826 RExC_parse += numlen;
11827 if (PL_encoding && value < 0x100)
11828 goto recode_encoding;
11832 if (! RExC_override_recoding) {
11833 SV* enc = PL_encoding;
11834 value = reg_recode((const char)(U8)value, &enc);
11835 if (!enc && SIZE_ONLY)
11836 ckWARNreg(RExC_parse,
11837 "Invalid escape in the specified encoding");
11841 /* Allow \_ to not give an error */
11842 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11843 SAVEFREESV(RExC_rx_sv);
11844 SAVEFREESV(listsv);
11845 ckWARN2reg(RExC_parse,
11846 "Unrecognized escape \\%c in character class passed through",
11848 (void)ReREFCNT_inc(RExC_rx_sv);
11849 SvREFCNT_inc_simple_void_NN(listsv);
11853 } /* end of \blah */
11856 literal_endpoint++;
11859 /* What matches in a locale is not known until runtime. This
11860 * includes what the Posix classes (like \w, [:space:]) match.
11861 * Room must be reserved (one time per class) to store such
11862 * classes, either if Perl is compiled so that locale nodes always
11863 * should have this space, or if there is such class info to be
11864 * stored. The space will contain a bit for each named class that
11865 * is to be matched against. This isn't needed for \p{} and
11866 * pseudo-classes, as they are not affected by locale, and hence
11867 * are dealt with separately */
11870 && (ANYOF_LOCALE == ANYOF_CLASS
11871 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11875 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11878 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11879 ANYOF_CLASS_ZERO(ret);
11881 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11884 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11886 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11887 * literal, as is the character that began the false range, i.e.
11888 * the 'a' in the examples */
11892 RExC_parse >= rangebegin ?
11893 RExC_parse - rangebegin : 0;
11894 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11895 SAVEFREESV(listsv);
11896 ckWARN4reg(RExC_parse,
11897 "False [] range \"%*.*s\"",
11899 (void)ReREFCNT_inc(RExC_rx_sv);
11900 SvREFCNT_inc_simple_void_NN(listsv);
11901 cp_list = add_cp_to_invlist(cp_list, '-');
11902 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11905 range = 0; /* this was not a true range */
11906 element_count += 2; /* So counts for three values */
11910 switch ((I32)namedclass) {
11912 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11913 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11914 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11916 case ANYOF_NALNUMC:
11917 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11918 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11919 runtime_posix_matches_above_Unicode);
11922 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11923 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11926 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11927 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11928 runtime_posix_matches_above_Unicode);
11933 ANYOF_CLASS_SET(ret, namedclass);
11936 #endif /* Not isascii(); just use the hard-coded definition for it */
11937 _invlist_union(posixes, PL_ASCII, &posixes);
11942 ANYOF_CLASS_SET(ret, namedclass);
11946 _invlist_union_complement_2nd(posixes,
11947 PL_ASCII, &posixes);
11948 if (DEPENDS_SEMANTICS) {
11949 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11956 if (hasISBLANK || ! LOC) {
11957 DO_POSIX(ret, namedclass, posixes,
11958 PL_PosixBlank, PL_XPosixBlank);
11960 else { /* There is no isblank() and we are in locale: We
11961 use the ASCII range and the above-Latin1 range
11963 SV* scratch_list = NULL;
11965 /* Include all above-Latin1 blanks */
11966 _invlist_intersection(PL_AboveLatin1,
11969 /* Add it to the running total of posix classes */
11971 posixes = scratch_list;
11974 _invlist_union(posixes, scratch_list, &posixes);
11975 SvREFCNT_dec(scratch_list);
11977 /* Add the ASCII-range blanks to the running total. */
11978 _invlist_union(posixes, PL_PosixBlank, &posixes);
11982 if (hasISBLANK || ! LOC) {
11983 DO_N_POSIX(ret, namedclass, posixes,
11984 PL_PosixBlank, PL_XPosixBlank);
11986 else { /* There is no isblank() and we are in locale */
11987 SV* scratch_list = NULL;
11989 /* Include all above-Latin1 non-blanks */
11990 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11993 /* Add them to the running total of posix classes */
11994 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11997 posixes = scratch_list;
12000 _invlist_union(posixes, scratch_list, &posixes);
12001 SvREFCNT_dec(scratch_list);
12004 /* Get the list of all non-ASCII-blanks in Latin 1, and
12005 * add them to the running total */
12006 _invlist_subtract(PL_Latin1, PL_PosixBlank,
12008 _invlist_union(posixes, scratch_list, &posixes);
12009 SvREFCNT_dec(scratch_list);
12013 DO_POSIX(ret, namedclass, posixes,
12014 PL_PosixCntrl, PL_XPosixCntrl);
12017 DO_N_POSIX(ret, namedclass, posixes,
12018 PL_PosixCntrl, PL_XPosixCntrl);
12021 /* There are no digits in the Latin1 range outside of
12022 * ASCII, so call the macro that doesn't have to resolve
12024 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
12025 PL_PosixDigit, "XPosixDigit", listsv);
12028 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12029 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
12030 runtime_posix_matches_above_Unicode);
12033 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12034 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
12037 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12038 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
12039 runtime_posix_matches_above_Unicode);
12041 case ANYOF_HORIZWS:
12042 /* For these, we use the cp_list, as /d doesn't make a
12043 * difference in what these match. There would be problems
12044 * if these characters had folds other than themselves, as
12045 * cp_list is subject to folding. It turns out that \h
12046 * is just a synonym for XPosixBlank */
12047 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12049 case ANYOF_NHORIZWS:
12050 _invlist_union_complement_2nd(cp_list,
12051 PL_XPosixBlank, &cp_list);
12055 { /* These require special handling, as they differ under
12056 folding, matching Cased there (which in the ASCII range
12057 is the same as Alpha */
12063 if (FOLD && ! LOC) {
12064 ascii_source = PL_PosixAlpha;
12065 l1_source = PL_L1Cased;
12069 ascii_source = PL_PosixLower;
12070 l1_source = PL_L1PosixLower;
12071 Xname = "XPosixLower";
12073 if (namedclass == ANYOF_LOWER) {
12074 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12075 ascii_source, l1_source, Xname, listsv);
12078 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12079 posixes, ascii_source, l1_source, Xname, listsv,
12080 runtime_posix_matches_above_Unicode);
12085 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12086 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12089 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12090 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12091 runtime_posix_matches_above_Unicode);
12094 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12095 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12098 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12099 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12100 runtime_posix_matches_above_Unicode);
12103 DO_POSIX(ret, namedclass, posixes,
12104 PL_PosixSpace, PL_XPosixSpace);
12106 case ANYOF_NPSXSPC:
12107 DO_N_POSIX(ret, namedclass, posixes,
12108 PL_PosixSpace, PL_XPosixSpace);
12111 DO_POSIX(ret, namedclass, posixes,
12112 PL_PerlSpace, PL_XPerlSpace);
12115 DO_N_POSIX(ret, namedclass, posixes,
12116 PL_PerlSpace, PL_XPerlSpace);
12118 case ANYOF_UPPER: /* Same as LOWER, above */
12125 if (FOLD && ! LOC) {
12126 ascii_source = PL_PosixAlpha;
12127 l1_source = PL_L1Cased;
12131 ascii_source = PL_PosixUpper;
12132 l1_source = PL_L1PosixUpper;
12133 Xname = "XPosixUpper";
12135 if (namedclass == ANYOF_UPPER) {
12136 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12137 ascii_source, l1_source, Xname, listsv);
12140 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12141 posixes, ascii_source, l1_source, Xname, listsv,
12142 runtime_posix_matches_above_Unicode);
12146 case ANYOF_WORDCHAR:
12147 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12148 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12150 case ANYOF_NWORDCHAR:
12151 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12152 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12153 runtime_posix_matches_above_Unicode);
12156 /* For these, we use the cp_list, as /d doesn't make a
12157 * difference in what these match. There would be problems
12158 * if these characters had folds other than themselves, as
12159 * cp_list is subject to folding */
12160 _invlist_union(cp_list, PL_VertSpace, &cp_list);
12162 case ANYOF_NVERTWS:
12163 _invlist_union_complement_2nd(cp_list,
12164 PL_VertSpace, &cp_list);
12167 DO_POSIX(ret, namedclass, posixes,
12168 PL_PosixXDigit, PL_XPosixXDigit);
12170 case ANYOF_NXDIGIT:
12171 DO_N_POSIX(ret, namedclass, posixes,
12172 PL_PosixXDigit, PL_XPosixXDigit);
12174 case ANYOF_UNIPROP: /* this is to handle \p and \P */
12177 vFAIL("Invalid [::] class");
12181 continue; /* Go get next character */
12183 } /* end of namedclass \blah */
12186 if (prevvalue > value) /* b-a */ {
12187 const int w = RExC_parse - rangebegin;
12188 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12189 range = 0; /* not a valid range */
12193 prevvalue = value; /* save the beginning of the potential range */
12194 if (RExC_parse+1 < RExC_end
12195 && *RExC_parse == '-'
12196 && RExC_parse[1] != ']')
12200 /* a bad range like \w-, [:word:]- ? */
12201 if (namedclass > OOB_NAMEDCLASS) {
12202 if (ckWARN(WARN_REGEXP)) {
12204 RExC_parse >= rangebegin ?
12205 RExC_parse - rangebegin : 0;
12207 "False [] range \"%*.*s\"",
12211 cp_list = add_cp_to_invlist(cp_list, '-');
12215 range = 1; /* yeah, it's a range! */
12216 continue; /* but do it the next time */
12220 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12223 /* non-Latin1 code point implies unicode semantics. Must be set in
12224 * pass1 so is there for the whole of pass 2 */
12226 RExC_uni_semantics = 1;
12229 /* Ready to process either the single value, or the completed range.
12230 * For single-valued non-inverted ranges, we consider the possibility
12231 * of multi-char folds. (We made a conscious decision to not do this
12232 * for the other cases because it can often lead to non-intuitive
12233 * results. For example, you have the peculiar case that:
12234 * "s s" =~ /^[^\xDF]+$/i => Y
12235 * "ss" =~ /^[^\xDF]+$/i => N
12237 * See [perl #89750] */
12238 if (FOLD && ! invert && value == prevvalue) {
12239 if (value == LATIN_SMALL_LETTER_SHARP_S
12240 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12243 /* Here <value> is indeed a multi-char fold. Get what it is */
12245 U8 foldbuf[UTF8_MAXBYTES_CASE];
12248 UV folded = _to_uni_fold_flags(
12253 | ((LOC) ? FOLD_FLAGS_LOCALE
12254 : (ASCII_FOLD_RESTRICTED)
12255 ? FOLD_FLAGS_NOMIX_ASCII
12259 /* Here, <folded> should be the first character of the
12260 * multi-char fold of <value>, with <foldbuf> containing the
12261 * whole thing. But, if this fold is not allowed (because of
12262 * the flags), <fold> will be the same as <value>, and should
12263 * be processed like any other character, so skip the special
12265 if (folded != value) {
12267 /* Skip if we are recursed, currently parsing the class
12268 * again. Otherwise add this character to the list of
12269 * multi-char folds. */
12270 if (! RExC_in_multi_char_class) {
12271 AV** this_array_ptr;
12273 STRLEN cp_count = utf8_length(foldbuf,
12274 foldbuf + foldlen);
12275 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12277 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12280 if (! multi_char_matches) {
12281 multi_char_matches = newAV();
12284 /* <multi_char_matches> is actually an array of arrays.
12285 * There will be one or two top-level elements: [2],
12286 * and/or [3]. The [2] element is an array, each
12287 * element thereof is a character which folds to two
12288 * characters; likewise for [3]. (Unicode guarantees a
12289 * maximum of 3 characters in any fold.) When we
12290 * rewrite the character class below, we will do so
12291 * such that the longest folds are written first, so
12292 * that it prefers the longest matching strings first.
12293 * This is done even if it turns out that any
12294 * quantifier is non-greedy, out of programmer
12295 * laziness. Tom Christiansen has agreed that this is
12296 * ok. This makes the test for the ligature 'ffi' come
12297 * before the test for 'ff' */
12298 if (av_exists(multi_char_matches, cp_count)) {
12299 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12301 this_array = *this_array_ptr;
12304 this_array = newAV();
12305 av_store(multi_char_matches, cp_count,
12308 av_push(this_array, multi_fold);
12311 /* This element should not be processed further in this
12314 value = save_value;
12315 prevvalue = save_prevvalue;
12321 /* Deal with this element of the class */
12324 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12326 UV* this_range = _new_invlist(1);
12327 _append_range_to_invlist(this_range, prevvalue, value);
12329 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12330 * If this range was specified using something like 'i-j', we want
12331 * to include only the 'i' and the 'j', and not anything in
12332 * between, so exclude non-ASCII, non-alphabetics from it.
12333 * However, if the range was specified with something like
12334 * [\x89-\x91] or [\x89-j], all code points within it should be
12335 * included. literal_endpoint==2 means both ends of the range used
12336 * a literal character, not \x{foo} */
12337 if (literal_endpoint == 2
12338 && (prevvalue >= 'a' && value <= 'z')
12339 || (prevvalue >= 'A' && value <= 'Z'))
12341 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12342 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12344 _invlist_union(cp_list, this_range, &cp_list);
12345 literal_endpoint = 0;
12349 range = 0; /* this range (if it was one) is done now */
12350 } /* End of loop through all the text within the brackets */
12352 /* If anything in the class expands to more than one character, we have to
12353 * deal with them by building up a substitute parse string, and recursively
12354 * calling reg() on it, instead of proceeding */
12355 if (multi_char_matches) {
12356 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12359 char *save_end = RExC_end;
12360 char *save_parse = RExC_parse;
12361 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12366 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12367 because too confusing */
12369 sv_catpv(substitute_parse, "(?:");
12373 /* Look at the longest folds first */
12374 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12376 if (av_exists(multi_char_matches, cp_count)) {
12377 AV** this_array_ptr;
12380 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12382 while ((this_sequence = av_pop(*this_array_ptr)) !=
12385 if (! first_time) {
12386 sv_catpv(substitute_parse, "|");
12388 first_time = FALSE;
12390 sv_catpv(substitute_parse, SvPVX(this_sequence));
12395 /* If the character class contains anything else besides these
12396 * multi-character folds, have to include it in recursive parsing */
12397 if (element_count) {
12398 sv_catpv(substitute_parse, "|[");
12399 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12400 sv_catpv(substitute_parse, "]");
12403 sv_catpv(substitute_parse, ")");
12406 /* This is a way to get the parse to skip forward a whole named
12407 * sequence instead of matching the 2nd character when it fails the
12409 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12413 RExC_parse = SvPV(substitute_parse, len);
12414 RExC_end = RExC_parse + len;
12415 RExC_in_multi_char_class = 1;
12416 RExC_emit = (regnode *)orig_emit;
12418 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12420 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12422 RExC_parse = save_parse;
12423 RExC_end = save_end;
12424 RExC_in_multi_char_class = 0;
12425 SvREFCNT_dec(multi_char_matches);
12426 SvREFCNT_dec(listsv);
12430 /* If the character class contains only a single element, it may be
12431 * optimizable into another node type which is smaller and runs faster.
12432 * Check if this is the case for this class */
12433 if (element_count == 1) {
12437 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12438 [:digit:] or \p{foo} */
12440 /* Certain named classes have equivalents that can appear outside a
12441 * character class, e.g. \w, \H. We use these instead of a
12442 * character class. */
12443 switch ((I32)namedclass) {
12446 /* The first group is for node types that depend on the charset
12447 * modifier to the regex. We first calculate the base node
12448 * type, and if it should be inverted */
12450 case ANYOF_NWORDCHAR:
12453 case ANYOF_WORDCHAR:
12455 goto join_charset_classes;
12462 goto join_charset_classes;
12470 join_charset_classes:
12472 /* Now that we have the base node type, we take advantage
12473 * of the enum ordering of the charset modifiers to get the
12474 * exact node type, For example the base SPACE also has
12475 * SPACEL, SPACEU, and SPACEA */
12477 offset = get_regex_charset(RExC_flags);
12479 /* /aa is the same as /a for these */
12480 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12481 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12483 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12484 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12489 /* The number of varieties of each of these is the same,
12490 * hence, so is the delta between the normal and
12491 * complemented nodes */
12493 op += NALNUM - ALNUM;
12495 *flagp |= HASWIDTH|SIMPLE;
12498 /* The second group doesn't depend of the charset modifiers.
12499 * We just have normal and complemented */
12500 case ANYOF_NHORIZWS:
12503 case ANYOF_HORIZWS:
12505 op = (invert) ? NHORIZWS : HORIZWS;
12506 *flagp |= HASWIDTH|SIMPLE;
12509 case ANYOF_NVERTWS:
12513 op = (invert) ? NVERTWS : VERTWS;
12514 *flagp |= HASWIDTH|SIMPLE;
12517 case ANYOF_UNIPROP:
12524 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12529 /* A generic posix class. All the /a ones can be handled
12530 * by the POSIXA opcode. And all are closed under folding
12531 * in the ASCII range, so FOLD doesn't matter */
12532 if (AT_LEAST_ASCII_RESTRICTED
12533 || (! LOC && namedclass == ANYOF_ASCII))
12535 /* The odd numbered ones are the complements of the
12536 * next-lower even number one */
12537 if (namedclass % 2 == 1) {
12541 arg = namedclass_to_classnum(namedclass);
12542 op = (invert) ? NPOSIXA : POSIXA;
12547 else if (value == prevvalue) {
12549 /* Here, the class consists of just a single code point */
12552 if (! LOC && value == '\n') {
12553 op = REG_ANY; /* Optimize [^\n] */
12554 *flagp |= HASWIDTH|SIMPLE;
12558 else if (value < 256 || UTF) {
12560 /* Optimize a single value into an EXACTish node, but not if it
12561 * would require converting the pattern to UTF-8. */
12562 op = compute_EXACTish(pRExC_state);
12564 } /* Otherwise is a range */
12565 else if (! LOC) { /* locale could vary these */
12566 if (prevvalue == '0') {
12567 if (value == '9') {
12568 op = (invert) ? NDIGITA : DIGITA;
12569 *flagp |= HASWIDTH|SIMPLE;
12574 /* Here, we have changed <op> away from its initial value iff we found
12575 * an optimization */
12578 /* Throw away this ANYOF regnode, and emit the calculated one,
12579 * which should correspond to the beginning, not current, state of
12581 const char * cur_parse = RExC_parse;
12582 RExC_parse = (char *)orig_parse;
12586 /* To get locale nodes to not use the full ANYOF size would
12587 * require moving the code above that writes the portions
12588 * of it that aren't in other nodes to after this point.
12589 * e.g. ANYOF_CLASS_SET */
12590 RExC_size = orig_size;
12594 RExC_emit = (regnode *)orig_emit;
12597 ret = reg_node(pRExC_state, op);
12599 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
12603 *flagp |= HASWIDTH|SIMPLE;
12605 else if (PL_regkind[op] == EXACT) {
12606 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12609 RExC_parse = (char *) cur_parse;
12611 SvREFCNT_dec(posixes);
12612 SvREFCNT_dec(listsv);
12613 SvREFCNT_dec(cp_list);
12620 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12622 /* If folding, we calculate all characters that could fold to or from the
12623 * ones already on the list */
12624 if (FOLD && cp_list) {
12625 UV start, end; /* End points of code point ranges */
12627 SV* fold_intersection = NULL;
12629 /* If the highest code point is within Latin1, we can use the
12630 * compiled-in Alphas list, and not have to go out to disk. This
12631 * yields two false positives, the masculine and feminine ordinal
12632 * indicators, which are weeded out below using the
12633 * IS_IN_SOME_FOLD_L1() macro */
12634 if (invlist_highest(cp_list) < 256) {
12635 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12639 /* Here, there are non-Latin1 code points, so we will have to go
12640 * fetch the list of all the characters that participate in folds
12642 if (! PL_utf8_foldable) {
12643 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12644 &PL_sv_undef, 1, 0);
12645 PL_utf8_foldable = _get_swash_invlist(swash);
12646 SvREFCNT_dec(swash);
12649 /* This is a hash that for a particular fold gives all characters
12650 * that are involved in it */
12651 if (! PL_utf8_foldclosures) {
12653 /* If we were unable to find any folds, then we likely won't be
12654 * able to find the closures. So just create an empty list.
12655 * Folding will effectively be restricted to the non-Unicode
12656 * rules hard-coded into Perl. (This case happens legitimately
12657 * during compilation of Perl itself before the Unicode tables
12658 * are generated) */
12659 if (_invlist_len(PL_utf8_foldable) == 0) {
12660 PL_utf8_foldclosures = newHV();
12663 /* If the folds haven't been read in, call a fold function
12665 if (! PL_utf8_tofold) {
12666 U8 dummy[UTF8_MAXBYTES+1];
12668 /* This string is just a short named one above \xff */
12669 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12670 assert(PL_utf8_tofold); /* Verify that worked */
12672 PL_utf8_foldclosures =
12673 _swash_inversion_hash(PL_utf8_tofold);
12677 /* Only the characters in this class that participate in folds need
12678 * be checked. Get the intersection of this class and all the
12679 * possible characters that are foldable. This can quickly narrow
12680 * down a large class */
12681 _invlist_intersection(PL_utf8_foldable, cp_list,
12682 &fold_intersection);
12685 /* Now look at the foldable characters in this class individually */
12686 invlist_iterinit(fold_intersection);
12687 while (invlist_iternext(fold_intersection, &start, &end)) {
12690 /* Locale folding for Latin1 characters is deferred until runtime */
12691 if (LOC && start < 256) {
12695 /* Look at every character in the range */
12696 for (j = start; j <= end; j++) {
12698 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12704 /* We have the latin1 folding rules hard-coded here so that
12705 * an innocent-looking character class, like /[ks]/i won't
12706 * have to go out to disk to find the possible matches.
12707 * XXX It would be better to generate these via regen, in
12708 * case a new version of the Unicode standard adds new
12709 * mappings, though that is not really likely, and may be
12710 * caught by the default: case of the switch below. */
12712 if (IS_IN_SOME_FOLD_L1(j)) {
12714 /* ASCII is always matched; non-ASCII is matched only
12715 * under Unicode rules */
12716 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12718 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12722 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12726 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12727 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12729 /* Certain Latin1 characters have matches outside
12730 * Latin1. To get here, <j> is one of those
12731 * characters. None of these matches is valid for
12732 * ASCII characters under /aa, which is why the 'if'
12733 * just above excludes those. These matches only
12734 * happen when the target string is utf8. The code
12735 * below adds the single fold closures for <j> to the
12736 * inversion list. */
12741 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12745 cp_list = add_cp_to_invlist(cp_list,
12746 LATIN_SMALL_LETTER_LONG_S);
12749 cp_list = add_cp_to_invlist(cp_list,
12750 GREEK_CAPITAL_LETTER_MU);
12751 cp_list = add_cp_to_invlist(cp_list,
12752 GREEK_SMALL_LETTER_MU);
12754 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12755 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12757 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12759 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12760 cp_list = add_cp_to_invlist(cp_list,
12761 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12763 case LATIN_SMALL_LETTER_SHARP_S:
12764 cp_list = add_cp_to_invlist(cp_list,
12765 LATIN_CAPITAL_LETTER_SHARP_S);
12767 case 'F': case 'f':
12768 case 'I': case 'i':
12769 case 'L': case 'l':
12770 case 'T': case 't':
12771 case 'A': case 'a':
12772 case 'H': case 'h':
12773 case 'J': case 'j':
12774 case 'N': case 'n':
12775 case 'W': case 'w':
12776 case 'Y': case 'y':
12777 /* These all are targets of multi-character
12778 * folds from code points that require UTF8 to
12779 * express, so they can't match unless the
12780 * target string is in UTF-8, so no action here
12781 * is necessary, as regexec.c properly handles
12782 * the general case for UTF-8 matching and
12783 * multi-char folds */
12786 /* Use deprecated warning to increase the
12787 * chances of this being output */
12788 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12795 /* Here is an above Latin1 character. We don't have the rules
12796 * hard-coded for it. First, get its fold. This is the simple
12797 * fold, as the multi-character folds have been handled earlier
12798 * and separated out */
12799 _to_uni_fold_flags(j, foldbuf, &foldlen,
12801 ? FOLD_FLAGS_LOCALE
12802 : (ASCII_FOLD_RESTRICTED)
12803 ? FOLD_FLAGS_NOMIX_ASCII
12806 /* Single character fold of above Latin1. Add everything in
12807 * its fold closure to the list that this node should match.
12808 * The fold closures data structure is a hash with the keys
12809 * being the UTF-8 of every character that is folded to, like
12810 * 'k', and the values each an array of all code points that
12811 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
12812 * Multi-character folds are not included */
12813 if ((listp = hv_fetch(PL_utf8_foldclosures,
12814 (char *) foldbuf, foldlen, FALSE)))
12816 AV* list = (AV*) *listp;
12818 for (k = 0; k <= av_len(list); k++) {
12819 SV** c_p = av_fetch(list, k, FALSE);
12822 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12826 /* /aa doesn't allow folds between ASCII and non-; /l
12827 * doesn't allow them between above and below 256 */
12828 if ((ASCII_FOLD_RESTRICTED
12829 && (isASCII(c) != isASCII(j)))
12830 || (LOC && ((c < 256) != (j < 256))))
12835 /* Folds involving non-ascii Latin1 characters
12836 * under /d are added to a separate list */
12837 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12839 cp_list = add_cp_to_invlist(cp_list, c);
12842 depends_list = add_cp_to_invlist(depends_list, c);
12848 SvREFCNT_dec(fold_intersection);
12851 /* And combine the result (if any) with any inversion list from posix
12852 * classes. The lists are kept separate up to now because we don't want to
12853 * fold the classes (folding of those is automatically handled by the swash
12854 * fetching code) */
12856 if (! DEPENDS_SEMANTICS) {
12858 _invlist_union(cp_list, posixes, &cp_list);
12859 SvREFCNT_dec(posixes);
12866 /* Under /d, we put into a separate list the Latin1 things that
12867 * match only when the target string is utf8 */
12868 SV* nonascii_but_latin1_properties = NULL;
12869 _invlist_intersection(posixes, PL_Latin1,
12870 &nonascii_but_latin1_properties);
12871 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12872 &nonascii_but_latin1_properties);
12873 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12876 _invlist_union(cp_list, posixes, &cp_list);
12877 SvREFCNT_dec(posixes);
12883 if (depends_list) {
12884 _invlist_union(depends_list, nonascii_but_latin1_properties,
12886 SvREFCNT_dec(nonascii_but_latin1_properties);
12889 depends_list = nonascii_but_latin1_properties;
12894 /* And combine the result (if any) with any inversion list from properties.
12895 * The lists are kept separate up to now so that we can distinguish the two
12896 * in regards to matching above-Unicode. A run-time warning is generated
12897 * if a Unicode property is matched against a non-Unicode code point. But,
12898 * we allow user-defined properties to match anything, without any warning,
12899 * and we also suppress the warning if there is a portion of the character
12900 * class that isn't a Unicode property, and which matches above Unicode, \W
12901 * or [\x{110000}] for example.
12902 * (Note that in this case, unlike the Posix one above, there is no
12903 * <depends_list>, because having a Unicode property forces Unicode
12906 bool warn_super = ! has_user_defined_property;
12909 /* If it matters to the final outcome, see if a non-property
12910 * component of the class matches above Unicode. If so, the
12911 * warning gets suppressed. This is true even if just a single
12912 * such code point is specified, as though not strictly correct if
12913 * another such code point is matched against, the fact that they
12914 * are using above-Unicode code points indicates they should know
12915 * the issues involved */
12917 bool non_prop_matches_above_Unicode =
12918 runtime_posix_matches_above_Unicode
12919 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12921 non_prop_matches_above_Unicode =
12922 ! non_prop_matches_above_Unicode;
12924 warn_super = ! non_prop_matches_above_Unicode;
12927 _invlist_union(properties, cp_list, &cp_list);
12928 SvREFCNT_dec(properties);
12931 cp_list = properties;
12935 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12939 /* Here, we have calculated what code points should be in the character
12942 * Now we can see about various optimizations. Fold calculation (which we
12943 * did above) needs to take place before inversion. Otherwise /[^k]/i
12944 * would invert to include K, which under /i would match k, which it
12945 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12946 * folded until runtime */
12948 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12949 * at compile time. Besides not inverting folded locale now, we can't
12950 * invert if there are things such as \w, which aren't known until runtime
12953 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12955 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12957 _invlist_invert(cp_list);
12959 /* Any swash can't be used as-is, because we've inverted things */
12961 SvREFCNT_dec(swash);
12965 /* Clear the invert flag since have just done it here */
12969 /* If we didn't do folding, it's because some information isn't available
12970 * until runtime; set the run-time fold flag for these. (We don't have to
12971 * worry about properties folding, as that is taken care of by the swash
12975 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12978 /* Some character classes are equivalent to other nodes. Such nodes take
12979 * up less room and generally fewer operations to execute than ANYOF nodes.
12980 * Above, we checked for and optimized into some such equivalents for
12981 * certain common classes that are easy to test. Getting to this point in
12982 * the code means that the class didn't get optimized there. Since this
12983 * code is only executed in Pass 2, it is too late to save space--it has
12984 * been allocated in Pass 1, and currently isn't given back. But turning
12985 * things into an EXACTish node can allow the optimizer to join it to any
12986 * adjacent such nodes. And if the class is equivalent to things like /./,
12987 * expensive run-time swashes can be avoided. Now that we have more
12988 * complete information, we can find things necessarily missed by the
12989 * earlier code. I (khw) am not sure how much to look for here. It would
12990 * be easy, but perhaps too slow, to check any candidates against all the
12991 * node types they could possibly match using _invlistEQ(). */
12996 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12997 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13000 U8 op = END; /* The optimzation node-type */
13001 const char * cur_parse= RExC_parse;
13003 invlist_iterinit(cp_list);
13004 if (! invlist_iternext(cp_list, &start, &end)) {
13006 /* Here, the list is empty. This happens, for example, when a
13007 * Unicode property is the only thing in the character class, and
13008 * it doesn't match anything. (perluniprops.pod notes such
13011 *flagp |= HASWIDTH|SIMPLE;
13013 else if (start == end) { /* The range is a single code point */
13014 if (! invlist_iternext(cp_list, &start, &end)
13016 /* Don't do this optimization if it would require changing
13017 * the pattern to UTF-8 */
13018 && (start < 256 || UTF))
13020 /* Here, the list contains a single code point. Can optimize
13021 * into an EXACT node */
13030 /* A locale node under folding with one code point can be
13031 * an EXACTFL, as its fold won't be calculated until
13037 /* Here, we are generally folding, but there is only one
13038 * code point to match. If we have to, we use an EXACT
13039 * node, but it would be better for joining with adjacent
13040 * nodes in the optimization pass if we used the same
13041 * EXACTFish node that any such are likely to be. We can
13042 * do this iff the code point doesn't participate in any
13043 * folds. For example, an EXACTF of a colon is the same as
13044 * an EXACT one, since nothing folds to or from a colon. */
13046 if (IS_IN_SOME_FOLD_L1(value)) {
13051 if (! PL_utf8_foldable) {
13052 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13053 &PL_sv_undef, 1, 0);
13054 PL_utf8_foldable = _get_swash_invlist(swash);
13055 SvREFCNT_dec(swash);
13057 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13062 /* If we haven't found the node type, above, it means we
13063 * can use the prevailing one */
13065 op = compute_EXACTish(pRExC_state);
13070 else if (start == 0) {
13071 if (end == UV_MAX) {
13073 *flagp |= HASWIDTH|SIMPLE;
13076 else if (end == '\n' - 1
13077 && invlist_iternext(cp_list, &start, &end)
13078 && start == '\n' + 1 && end == UV_MAX)
13081 *flagp |= HASWIDTH|SIMPLE;
13087 RExC_parse = (char *)orig_parse;
13088 RExC_emit = (regnode *)orig_emit;
13090 ret = reg_node(pRExC_state, op);
13092 RExC_parse = (char *)cur_parse;
13094 if (PL_regkind[op] == EXACT) {
13095 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13098 SvREFCNT_dec(cp_list);
13099 SvREFCNT_dec(listsv);
13104 /* Here, <cp_list> contains all the code points we can determine at
13105 * compile time that match under all conditions. Go through it, and
13106 * for things that belong in the bitmap, put them there, and delete from
13107 * <cp_list>. While we are at it, see if everything above 255 is in the
13108 * list, and if so, set a flag to speed up execution */
13109 ANYOF_BITMAP_ZERO(ret);
13112 /* This gets set if we actually need to modify things */
13113 bool change_invlist = FALSE;
13117 /* Start looking through <cp_list> */
13118 invlist_iterinit(cp_list);
13119 while (invlist_iternext(cp_list, &start, &end)) {
13123 if (end == UV_MAX && start <= 256) {
13124 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13127 /* Quit if are above what we should change */
13132 change_invlist = TRUE;
13134 /* Set all the bits in the range, up to the max that we are doing */
13135 high = (end < 255) ? end : 255;
13136 for (i = start; i <= (int) high; i++) {
13137 if (! ANYOF_BITMAP_TEST(ret, i)) {
13138 ANYOF_BITMAP_SET(ret, i);
13145 /* Done with loop; remove any code points that are in the bitmap from
13147 if (change_invlist) {
13148 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13151 /* If have completely emptied it, remove it completely */
13152 if (_invlist_len(cp_list) == 0) {
13153 SvREFCNT_dec(cp_list);
13159 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13162 /* Here, the bitmap has been populated with all the Latin1 code points that
13163 * always match. Can now add to the overall list those that match only
13164 * when the target string is UTF-8 (<depends_list>). */
13165 if (depends_list) {
13167 _invlist_union(cp_list, depends_list, &cp_list);
13168 SvREFCNT_dec(depends_list);
13171 cp_list = depends_list;
13175 /* If there is a swash and more than one element, we can't use the swash in
13176 * the optimization below. */
13177 if (swash && element_count > 1) {
13178 SvREFCNT_dec(swash);
13183 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13185 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13186 SvREFCNT_dec(listsv);
13189 /* av[0] stores the character class description in its textual form:
13190 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13191 * appropriate swash, and is also useful for dumping the regnode.
13192 * av[1] if NULL, is a placeholder to later contain the swash computed
13193 * from av[0]. But if no further computation need be done, the
13194 * swash is stored there now.
13195 * av[2] stores the cp_list inversion list for use in addition or
13196 * instead of av[0]; used only if av[1] is NULL
13197 * av[3] is set if any component of the class is from a user-defined
13198 * property; used only if av[1] is NULL */
13199 AV * const av = newAV();
13202 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13204 : (SvREFCNT_dec(listsv), &PL_sv_undef));
13206 av_store(av, 1, swash);
13207 SvREFCNT_dec(cp_list);
13210 av_store(av, 1, NULL);
13212 av_store(av, 2, cp_list);
13213 av_store(av, 3, newSVuv(has_user_defined_property));
13217 rv = newRV_noinc(MUTABLE_SV(av));
13218 n = add_data(pRExC_state, 1, "s");
13219 RExC_rxi->data->data[n] = (void*)rv;
13223 *flagp |= HASWIDTH|SIMPLE;
13226 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13229 /* reg_skipcomment()
13231 Absorbs an /x style # comments from the input stream.
13232 Returns true if there is more text remaining in the stream.
13233 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13234 terminates the pattern without including a newline.
13236 Note its the callers responsibility to ensure that we are
13237 actually in /x mode
13242 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13246 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13248 while (RExC_parse < RExC_end)
13249 if (*RExC_parse++ == '\n') {
13254 /* we ran off the end of the pattern without ending
13255 the comment, so we have to add an \n when wrapping */
13256 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13264 Advances the parse position, and optionally absorbs
13265 "whitespace" from the inputstream.
13267 Without /x "whitespace" means (?#...) style comments only,
13268 with /x this means (?#...) and # comments and whitespace proper.
13270 Returns the RExC_parse point from BEFORE the scan occurs.
13272 This is the /x friendly way of saying RExC_parse++.
13276 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13278 char* const retval = RExC_parse++;
13280 PERL_ARGS_ASSERT_NEXTCHAR;
13283 if (RExC_end - RExC_parse >= 3
13284 && *RExC_parse == '('
13285 && RExC_parse[1] == '?'
13286 && RExC_parse[2] == '#')
13288 while (*RExC_parse != ')') {
13289 if (RExC_parse == RExC_end)
13290 FAIL("Sequence (?#... not terminated");
13296 if (RExC_flags & RXf_PMf_EXTENDED) {
13297 if (isSPACE(*RExC_parse)) {
13301 else if (*RExC_parse == '#') {
13302 if ( reg_skipcomment( pRExC_state ) )
13311 - reg_node - emit a node
13313 STATIC regnode * /* Location. */
13314 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13318 regnode * const ret = RExC_emit;
13319 GET_RE_DEBUG_FLAGS_DECL;
13321 PERL_ARGS_ASSERT_REG_NODE;
13324 SIZE_ALIGN(RExC_size);
13328 if (RExC_emit >= RExC_emit_bound)
13329 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13330 op, RExC_emit, RExC_emit_bound);
13332 NODE_ALIGN_FILL(ret);
13334 FILL_ADVANCE_NODE(ptr, op);
13335 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13336 #ifdef RE_TRACK_PATTERN_OFFSETS
13337 if (RExC_offsets) { /* MJD */
13338 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13339 "reg_node", __LINE__,
13341 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13342 ? "Overwriting end of array!\n" : "OK",
13343 (UV)(RExC_emit - RExC_emit_start),
13344 (UV)(RExC_parse - RExC_start),
13345 (UV)RExC_offsets[0]));
13346 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13354 - reganode - emit a node with an argument
13356 STATIC regnode * /* Location. */
13357 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13361 regnode * const ret = RExC_emit;
13362 GET_RE_DEBUG_FLAGS_DECL;
13364 PERL_ARGS_ASSERT_REGANODE;
13367 SIZE_ALIGN(RExC_size);
13372 assert(2==regarglen[op]+1);
13374 Anything larger than this has to allocate the extra amount.
13375 If we changed this to be:
13377 RExC_size += (1 + regarglen[op]);
13379 then it wouldn't matter. Its not clear what side effect
13380 might come from that so its not done so far.
13385 if (RExC_emit >= RExC_emit_bound)
13386 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13387 op, RExC_emit, RExC_emit_bound);
13389 NODE_ALIGN_FILL(ret);
13391 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13392 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13393 #ifdef RE_TRACK_PATTERN_OFFSETS
13394 if (RExC_offsets) { /* MJD */
13395 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13399 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13400 "Overwriting end of array!\n" : "OK",
13401 (UV)(RExC_emit - RExC_emit_start),
13402 (UV)(RExC_parse - RExC_start),
13403 (UV)RExC_offsets[0]));
13404 Set_Cur_Node_Offset;
13412 - reguni - emit (if appropriate) a Unicode character
13415 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13419 PERL_ARGS_ASSERT_REGUNI;
13421 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13425 - reginsert - insert an operator in front of already-emitted operand
13427 * Means relocating the operand.
13430 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13436 const int offset = regarglen[(U8)op];
13437 const int size = NODE_STEP_REGNODE + offset;
13438 GET_RE_DEBUG_FLAGS_DECL;
13440 PERL_ARGS_ASSERT_REGINSERT;
13441 PERL_UNUSED_ARG(depth);
13442 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13443 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13452 if (RExC_open_parens) {
13454 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13455 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13456 if ( RExC_open_parens[paren] >= opnd ) {
13457 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13458 RExC_open_parens[paren] += size;
13460 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13462 if ( RExC_close_parens[paren] >= opnd ) {
13463 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13464 RExC_close_parens[paren] += size;
13466 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13471 while (src > opnd) {
13472 StructCopy(--src, --dst, regnode);
13473 #ifdef RE_TRACK_PATTERN_OFFSETS
13474 if (RExC_offsets) { /* MJD 20010112 */
13475 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13479 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13480 ? "Overwriting end of array!\n" : "OK",
13481 (UV)(src - RExC_emit_start),
13482 (UV)(dst - RExC_emit_start),
13483 (UV)RExC_offsets[0]));
13484 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13485 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13491 place = opnd; /* Op node, where operand used to be. */
13492 #ifdef RE_TRACK_PATTERN_OFFSETS
13493 if (RExC_offsets) { /* MJD */
13494 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13498 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13499 ? "Overwriting end of array!\n" : "OK",
13500 (UV)(place - RExC_emit_start),
13501 (UV)(RExC_parse - RExC_start),
13502 (UV)RExC_offsets[0]));
13503 Set_Node_Offset(place, RExC_parse);
13504 Set_Node_Length(place, 1);
13507 src = NEXTOPER(place);
13508 FILL_ADVANCE_NODE(place, op);
13509 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13510 Zero(src, offset, regnode);
13514 - regtail - set the next-pointer at the end of a node chain of p to val.
13515 - SEE ALSO: regtail_study
13517 /* TODO: All three parms should be const */
13519 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13523 GET_RE_DEBUG_FLAGS_DECL;
13525 PERL_ARGS_ASSERT_REGTAIL;
13527 PERL_UNUSED_ARG(depth);
13533 /* Find last node. */
13536 regnode * const temp = regnext(scan);
13538 SV * const mysv=sv_newmortal();
13539 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13540 regprop(RExC_rx, mysv, scan);
13541 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13542 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13543 (temp == NULL ? "->" : ""),
13544 (temp == NULL ? PL_reg_name[OP(val)] : "")
13552 if (reg_off_by_arg[OP(scan)]) {
13553 ARG_SET(scan, val - scan);
13556 NEXT_OFF(scan) = val - scan;
13562 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13563 - Look for optimizable sequences at the same time.
13564 - currently only looks for EXACT chains.
13566 This is experimental code. The idea is to use this routine to perform
13567 in place optimizations on branches and groups as they are constructed,
13568 with the long term intention of removing optimization from study_chunk so
13569 that it is purely analytical.
13571 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13572 to control which is which.
13575 /* TODO: All four parms should be const */
13578 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13583 #ifdef EXPERIMENTAL_INPLACESCAN
13586 GET_RE_DEBUG_FLAGS_DECL;
13588 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13594 /* Find last node. */
13598 regnode * const temp = regnext(scan);
13599 #ifdef EXPERIMENTAL_INPLACESCAN
13600 if (PL_regkind[OP(scan)] == EXACT) {
13601 bool has_exactf_sharp_s; /* Unexamined in this routine */
13602 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13607 switch (OP(scan)) {
13613 case EXACTFU_TRICKYFOLD:
13615 if( exact == PSEUDO )
13617 else if ( exact != OP(scan) )
13626 SV * const mysv=sv_newmortal();
13627 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13628 regprop(RExC_rx, mysv, scan);
13629 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13630 SvPV_nolen_const(mysv),
13631 REG_NODE_NUM(scan),
13632 PL_reg_name[exact]);
13639 SV * const mysv_val=sv_newmortal();
13640 DEBUG_PARSE_MSG("");
13641 regprop(RExC_rx, mysv_val, val);
13642 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13643 SvPV_nolen_const(mysv_val),
13644 (IV)REG_NODE_NUM(val),
13648 if (reg_off_by_arg[OP(scan)]) {
13649 ARG_SET(scan, val - scan);
13652 NEXT_OFF(scan) = val - scan;
13660 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13664 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13670 for (bit=0; bit<32; bit++) {
13671 if (flags & (1<<bit)) {
13672 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13675 if (!set++ && lead)
13676 PerlIO_printf(Perl_debug_log, "%s",lead);
13677 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13680 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13681 if (!set++ && lead) {
13682 PerlIO_printf(Perl_debug_log, "%s",lead);
13685 case REGEX_UNICODE_CHARSET:
13686 PerlIO_printf(Perl_debug_log, "UNICODE");
13688 case REGEX_LOCALE_CHARSET:
13689 PerlIO_printf(Perl_debug_log, "LOCALE");
13691 case REGEX_ASCII_RESTRICTED_CHARSET:
13692 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13694 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13695 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13698 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13704 PerlIO_printf(Perl_debug_log, "\n");
13706 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13712 Perl_regdump(pTHX_ const regexp *r)
13716 SV * const sv = sv_newmortal();
13717 SV *dsv= sv_newmortal();
13718 RXi_GET_DECL(r,ri);
13719 GET_RE_DEBUG_FLAGS_DECL;
13721 PERL_ARGS_ASSERT_REGDUMP;
13723 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13725 /* Header fields of interest. */
13726 if (r->anchored_substr) {
13727 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13728 RE_SV_DUMPLEN(r->anchored_substr), 30);
13729 PerlIO_printf(Perl_debug_log,
13730 "anchored %s%s at %"IVdf" ",
13731 s, RE_SV_TAIL(r->anchored_substr),
13732 (IV)r->anchored_offset);
13733 } else if (r->anchored_utf8) {
13734 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13735 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13736 PerlIO_printf(Perl_debug_log,
13737 "anchored utf8 %s%s at %"IVdf" ",
13738 s, RE_SV_TAIL(r->anchored_utf8),
13739 (IV)r->anchored_offset);
13741 if (r->float_substr) {
13742 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13743 RE_SV_DUMPLEN(r->float_substr), 30);
13744 PerlIO_printf(Perl_debug_log,
13745 "floating %s%s at %"IVdf"..%"UVuf" ",
13746 s, RE_SV_TAIL(r->float_substr),
13747 (IV)r->float_min_offset, (UV)r->float_max_offset);
13748 } else if (r->float_utf8) {
13749 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13750 RE_SV_DUMPLEN(r->float_utf8), 30);
13751 PerlIO_printf(Perl_debug_log,
13752 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13753 s, RE_SV_TAIL(r->float_utf8),
13754 (IV)r->float_min_offset, (UV)r->float_max_offset);
13756 if (r->check_substr || r->check_utf8)
13757 PerlIO_printf(Perl_debug_log,
13759 (r->check_substr == r->float_substr
13760 && r->check_utf8 == r->float_utf8
13761 ? "(checking floating" : "(checking anchored"));
13762 if (r->extflags & RXf_NOSCAN)
13763 PerlIO_printf(Perl_debug_log, " noscan");
13764 if (r->extflags & RXf_CHECK_ALL)
13765 PerlIO_printf(Perl_debug_log, " isall");
13766 if (r->check_substr || r->check_utf8)
13767 PerlIO_printf(Perl_debug_log, ") ");
13769 if (ri->regstclass) {
13770 regprop(r, sv, ri->regstclass);
13771 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13773 if (r->extflags & RXf_ANCH) {
13774 PerlIO_printf(Perl_debug_log, "anchored");
13775 if (r->extflags & RXf_ANCH_BOL)
13776 PerlIO_printf(Perl_debug_log, "(BOL)");
13777 if (r->extflags & RXf_ANCH_MBOL)
13778 PerlIO_printf(Perl_debug_log, "(MBOL)");
13779 if (r->extflags & RXf_ANCH_SBOL)
13780 PerlIO_printf(Perl_debug_log, "(SBOL)");
13781 if (r->extflags & RXf_ANCH_GPOS)
13782 PerlIO_printf(Perl_debug_log, "(GPOS)");
13783 PerlIO_putc(Perl_debug_log, ' ');
13785 if (r->extflags & RXf_GPOS_SEEN)
13786 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13787 if (r->intflags & PREGf_SKIP)
13788 PerlIO_printf(Perl_debug_log, "plus ");
13789 if (r->intflags & PREGf_IMPLICIT)
13790 PerlIO_printf(Perl_debug_log, "implicit ");
13791 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13792 if (r->extflags & RXf_EVAL_SEEN)
13793 PerlIO_printf(Perl_debug_log, "with eval ");
13794 PerlIO_printf(Perl_debug_log, "\n");
13795 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13797 PERL_ARGS_ASSERT_REGDUMP;
13798 PERL_UNUSED_CONTEXT;
13799 PERL_UNUSED_ARG(r);
13800 #endif /* DEBUGGING */
13804 - regprop - printable representation of opcode
13806 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13809 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13810 if (flags & ANYOF_INVERT) \
13811 /*make sure the invert info is in each */ \
13812 sv_catpvs(sv, "^"); \
13818 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13824 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13825 static const char * const anyofs[] = {
13826 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
13827 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 || _CC_ALNUMC != 7 \
13828 || _CC_GRAPH != 8 || _CC_SPACE != 9 || _CC_BLANK != 10 \
13829 || _CC_XDIGIT != 11 || _CC_PSXSPC != 12 || _CC_CNTRL != 13 \
13830 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
13831 #error Need to adjust order of anyofs[]
13866 RXi_GET_DECL(prog,progi);
13867 GET_RE_DEBUG_FLAGS_DECL;
13869 PERL_ARGS_ASSERT_REGPROP;
13873 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13874 /* It would be nice to FAIL() here, but this may be called from
13875 regexec.c, and it would be hard to supply pRExC_state. */
13876 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13877 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13879 k = PL_regkind[OP(o)];
13882 sv_catpvs(sv, " ");
13883 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13884 * is a crude hack but it may be the best for now since
13885 * we have no flag "this EXACTish node was UTF-8"
13887 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13888 PERL_PV_ESCAPE_UNI_DETECT |
13889 PERL_PV_ESCAPE_NONASCII |
13890 PERL_PV_PRETTY_ELLIPSES |
13891 PERL_PV_PRETTY_LTGT |
13892 PERL_PV_PRETTY_NOCLEAR
13894 } else if (k == TRIE) {
13895 /* print the details of the trie in dumpuntil instead, as
13896 * progi->data isn't available here */
13897 const char op = OP(o);
13898 const U32 n = ARG(o);
13899 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13900 (reg_ac_data *)progi->data->data[n] :
13902 const reg_trie_data * const trie
13903 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13905 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13906 DEBUG_TRIE_COMPILE_r(
13907 Perl_sv_catpvf(aTHX_ sv,
13908 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13909 (UV)trie->startstate,
13910 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13911 (UV)trie->wordcount,
13914 (UV)TRIE_CHARCOUNT(trie),
13915 (UV)trie->uniquecharcount
13918 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13920 int rangestart = -1;
13921 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13922 sv_catpvs(sv, "[");
13923 for (i = 0; i <= 256; i++) {
13924 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13925 if (rangestart == -1)
13927 } else if (rangestart != -1) {
13928 if (i <= rangestart + 3)
13929 for (; rangestart < i; rangestart++)
13930 put_byte(sv, rangestart);
13932 put_byte(sv, rangestart);
13933 sv_catpvs(sv, "-");
13934 put_byte(sv, i - 1);
13939 sv_catpvs(sv, "]");
13942 } else if (k == CURLY) {
13943 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13944 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13945 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13947 else if (k == WHILEM && o->flags) /* Ordinal/of */
13948 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13949 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13950 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13951 if ( RXp_PAREN_NAMES(prog) ) {
13952 if ( k != REF || (OP(o) < NREF)) {
13953 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13954 SV **name= av_fetch(list, ARG(o), 0 );
13956 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13959 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13960 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13961 I32 *nums=(I32*)SvPVX(sv_dat);
13962 SV **name= av_fetch(list, nums[0], 0 );
13965 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13966 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13967 (n ? "," : ""), (IV)nums[n]);
13969 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13973 } else if (k == GOSUB)
13974 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13975 else if (k == VERB) {
13977 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13978 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13979 } else if (k == LOGICAL)
13980 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13981 else if (k == ANYOF) {
13982 int i, rangestart = -1;
13983 const U8 flags = ANYOF_FLAGS(o);
13987 if (flags & ANYOF_LOCALE)
13988 sv_catpvs(sv, "{loc}");
13989 if (flags & ANYOF_LOC_FOLD)
13990 sv_catpvs(sv, "{i}");
13991 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13992 if (flags & ANYOF_INVERT)
13993 sv_catpvs(sv, "^");
13995 /* output what the standard cp 0-255 bitmap matches */
13996 for (i = 0; i <= 256; i++) {
13997 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13998 if (rangestart == -1)
14000 } else if (rangestart != -1) {
14001 if (i <= rangestart + 3)
14002 for (; rangestart < i; rangestart++)
14003 put_byte(sv, rangestart);
14005 put_byte(sv, rangestart);
14006 sv_catpvs(sv, "-");
14007 put_byte(sv, i - 1);
14014 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14015 /* output any special charclass tests (used entirely under use locale) */
14016 if (ANYOF_CLASS_TEST_ANY_SET(o))
14017 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14018 if (ANYOF_CLASS_TEST(o,i)) {
14019 sv_catpv(sv, anyofs[i]);
14023 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14025 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14026 sv_catpvs(sv, "{non-utf8-latin1-all}");
14029 /* output information about the unicode matching */
14030 if (flags & ANYOF_UNICODE_ALL)
14031 sv_catpvs(sv, "{unicode_all}");
14032 else if (ANYOF_NONBITMAP(o))
14033 sv_catpvs(sv, "{unicode}");
14034 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14035 sv_catpvs(sv, "{outside bitmap}");
14037 if (ANYOF_NONBITMAP(o)) {
14038 SV *lv; /* Set if there is something outside the bit map */
14039 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14040 bool byte_output = FALSE; /* If something in the bitmap has been
14043 if (lv && lv != &PL_sv_undef) {
14045 U8 s[UTF8_MAXBYTES_CASE+1];
14047 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14048 uvchr_to_utf8(s, i);
14051 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14055 && swash_fetch(sw, s, TRUE))
14057 if (rangestart == -1)
14059 } else if (rangestart != -1) {
14060 byte_output = TRUE;
14061 if (i <= rangestart + 3)
14062 for (; rangestart < i; rangestart++) {
14063 put_byte(sv, rangestart);
14066 put_byte(sv, rangestart);
14067 sv_catpvs(sv, "-");
14076 char *s = savesvpv(lv);
14077 char * const origs = s;
14079 while (*s && *s != '\n')
14083 const char * const t = ++s;
14086 sv_catpvs(sv, " ");
14092 /* Truncate very long output */
14093 if (s - origs > 256) {
14094 Perl_sv_catpvf(aTHX_ sv,
14096 (int) (s - origs - 1),
14102 else if (*s == '\t') {
14121 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14123 else if (k == POSIXD || k == NPOSIXD) {
14124 U8 index = FLAGS(o) * 2;
14125 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14126 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14129 sv_catpv(sv, anyofs[index]);
14132 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14133 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14135 PERL_UNUSED_CONTEXT;
14136 PERL_UNUSED_ARG(sv);
14137 PERL_UNUSED_ARG(o);
14138 PERL_UNUSED_ARG(prog);
14139 #endif /* DEBUGGING */
14143 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14144 { /* Assume that RE_INTUIT is set */
14146 struct regexp *const prog = ReANY(r);
14147 GET_RE_DEBUG_FLAGS_DECL;
14149 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14150 PERL_UNUSED_CONTEXT;
14154 const char * const s = SvPV_nolen_const(prog->check_substr
14155 ? prog->check_substr : prog->check_utf8);
14157 if (!PL_colorset) reginitcolors();
14158 PerlIO_printf(Perl_debug_log,
14159 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14161 prog->check_substr ? "" : "utf8 ",
14162 PL_colors[5],PL_colors[0],
14165 (strlen(s) > 60 ? "..." : ""));
14168 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14174 handles refcounting and freeing the perl core regexp structure. When
14175 it is necessary to actually free the structure the first thing it
14176 does is call the 'free' method of the regexp_engine associated to
14177 the regexp, allowing the handling of the void *pprivate; member
14178 first. (This routine is not overridable by extensions, which is why
14179 the extensions free is called first.)
14181 See regdupe and regdupe_internal if you change anything here.
14183 #ifndef PERL_IN_XSUB_RE
14185 Perl_pregfree(pTHX_ REGEXP *r)
14191 Perl_pregfree2(pTHX_ REGEXP *rx)
14194 struct regexp *const r = ReANY(rx);
14195 GET_RE_DEBUG_FLAGS_DECL;
14197 PERL_ARGS_ASSERT_PREGFREE2;
14199 if (r->mother_re) {
14200 ReREFCNT_dec(r->mother_re);
14202 CALLREGFREE_PVT(rx); /* free the private data */
14203 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14204 Safefree(r->xpv_len_u.xpvlenu_pv);
14207 SvREFCNT_dec(r->anchored_substr);
14208 SvREFCNT_dec(r->anchored_utf8);
14209 SvREFCNT_dec(r->float_substr);
14210 SvREFCNT_dec(r->float_utf8);
14211 Safefree(r->substrs);
14213 RX_MATCH_COPY_FREE(rx);
14214 #ifdef PERL_ANY_COW
14215 SvREFCNT_dec(r->saved_copy);
14218 SvREFCNT_dec(r->qr_anoncv);
14219 rx->sv_u.svu_rx = 0;
14224 This is a hacky workaround to the structural issue of match results
14225 being stored in the regexp structure which is in turn stored in
14226 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14227 could be PL_curpm in multiple contexts, and could require multiple
14228 result sets being associated with the pattern simultaneously, such
14229 as when doing a recursive match with (??{$qr})
14231 The solution is to make a lightweight copy of the regexp structure
14232 when a qr// is returned from the code executed by (??{$qr}) this
14233 lightweight copy doesn't actually own any of its data except for
14234 the starp/end and the actual regexp structure itself.
14240 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14242 struct regexp *ret;
14243 struct regexp *const r = ReANY(rx);
14244 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14246 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14249 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14251 SvOK_off((SV *)ret_x);
14253 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14254 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14255 made both spots point to the same regexp body.) */
14256 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14257 assert(!SvPVX(ret_x));
14258 ret_x->sv_u.svu_rx = temp->sv_any;
14259 temp->sv_any = NULL;
14260 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14261 SvREFCNT_dec(temp);
14262 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14263 ing below will not set it. */
14264 SvCUR_set(ret_x, SvCUR(rx));
14267 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14268 sv_force_normal(sv) is called. */
14270 ret = ReANY(ret_x);
14272 SvFLAGS(ret_x) |= SvUTF8(rx);
14273 /* We share the same string buffer as the original regexp, on which we
14274 hold a reference count, incremented when mother_re is set below.
14275 The string pointer is copied here, being part of the regexp struct.
14277 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14278 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14280 const I32 npar = r->nparens+1;
14281 Newx(ret->offs, npar, regexp_paren_pair);
14282 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14285 Newx(ret->substrs, 1, struct reg_substr_data);
14286 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14288 SvREFCNT_inc_void(ret->anchored_substr);
14289 SvREFCNT_inc_void(ret->anchored_utf8);
14290 SvREFCNT_inc_void(ret->float_substr);
14291 SvREFCNT_inc_void(ret->float_utf8);
14293 /* check_substr and check_utf8, if non-NULL, point to either their
14294 anchored or float namesakes, and don't hold a second reference. */
14296 RX_MATCH_COPIED_off(ret_x);
14297 #ifdef PERL_ANY_COW
14298 ret->saved_copy = NULL;
14300 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14301 SvREFCNT_inc_void(ret->qr_anoncv);
14307 /* regfree_internal()
14309 Free the private data in a regexp. This is overloadable by
14310 extensions. Perl takes care of the regexp structure in pregfree(),
14311 this covers the *pprivate pointer which technically perl doesn't
14312 know about, however of course we have to handle the
14313 regexp_internal structure when no extension is in use.
14315 Note this is called before freeing anything in the regexp
14320 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14323 struct regexp *const r = ReANY(rx);
14324 RXi_GET_DECL(r,ri);
14325 GET_RE_DEBUG_FLAGS_DECL;
14327 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14333 SV *dsv= sv_newmortal();
14334 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14335 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14336 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14337 PL_colors[4],PL_colors[5],s);
14340 #ifdef RE_TRACK_PATTERN_OFFSETS
14342 Safefree(ri->u.offsets); /* 20010421 MJD */
14344 if (ri->code_blocks) {
14346 for (n = 0; n < ri->num_code_blocks; n++)
14347 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14348 Safefree(ri->code_blocks);
14352 int n = ri->data->count;
14355 /* If you add a ->what type here, update the comment in regcomp.h */
14356 switch (ri->data->what[n]) {
14362 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14365 Safefree(ri->data->data[n]);
14371 { /* Aho Corasick add-on structure for a trie node.
14372 Used in stclass optimization only */
14374 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14376 refcount = --aho->refcount;
14379 PerlMemShared_free(aho->states);
14380 PerlMemShared_free(aho->fail);
14381 /* do this last!!!! */
14382 PerlMemShared_free(ri->data->data[n]);
14383 PerlMemShared_free(ri->regstclass);
14389 /* trie structure. */
14391 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14393 refcount = --trie->refcount;
14396 PerlMemShared_free(trie->charmap);
14397 PerlMemShared_free(trie->states);
14398 PerlMemShared_free(trie->trans);
14400 PerlMemShared_free(trie->bitmap);
14402 PerlMemShared_free(trie->jump);
14403 PerlMemShared_free(trie->wordinfo);
14404 /* do this last!!!! */
14405 PerlMemShared_free(ri->data->data[n]);
14410 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14413 Safefree(ri->data->what);
14414 Safefree(ri->data);
14420 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14421 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14422 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14425 re_dup - duplicate a regexp.
14427 This routine is expected to clone a given regexp structure. It is only
14428 compiled under USE_ITHREADS.
14430 After all of the core data stored in struct regexp is duplicated
14431 the regexp_engine.dupe method is used to copy any private data
14432 stored in the *pprivate pointer. This allows extensions to handle
14433 any duplication it needs to do.
14435 See pregfree() and regfree_internal() if you change anything here.
14437 #if defined(USE_ITHREADS)
14438 #ifndef PERL_IN_XSUB_RE
14440 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14444 const struct regexp *r = ReANY(sstr);
14445 struct regexp *ret = ReANY(dstr);
14447 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14449 npar = r->nparens+1;
14450 Newx(ret->offs, npar, regexp_paren_pair);
14451 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14453 /* no need to copy these */
14454 Newx(ret->swap, npar, regexp_paren_pair);
14457 if (ret->substrs) {
14458 /* Do it this way to avoid reading from *r after the StructCopy().
14459 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14460 cache, it doesn't matter. */
14461 const bool anchored = r->check_substr
14462 ? r->check_substr == r->anchored_substr
14463 : r->check_utf8 == r->anchored_utf8;
14464 Newx(ret->substrs, 1, struct reg_substr_data);
14465 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14467 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14468 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14469 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14470 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14472 /* check_substr and check_utf8, if non-NULL, point to either their
14473 anchored or float namesakes, and don't hold a second reference. */
14475 if (ret->check_substr) {
14477 assert(r->check_utf8 == r->anchored_utf8);
14478 ret->check_substr = ret->anchored_substr;
14479 ret->check_utf8 = ret->anchored_utf8;
14481 assert(r->check_substr == r->float_substr);
14482 assert(r->check_utf8 == r->float_utf8);
14483 ret->check_substr = ret->float_substr;
14484 ret->check_utf8 = ret->float_utf8;
14486 } else if (ret->check_utf8) {
14488 ret->check_utf8 = ret->anchored_utf8;
14490 ret->check_utf8 = ret->float_utf8;
14495 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14496 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14499 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14501 if (RX_MATCH_COPIED(dstr))
14502 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14504 ret->subbeg = NULL;
14505 #ifdef PERL_ANY_COW
14506 ret->saved_copy = NULL;
14509 /* Whether mother_re be set or no, we need to copy the string. We
14510 cannot refrain from copying it when the storage points directly to
14511 our mother regexp, because that's
14512 1: a buffer in a different thread
14513 2: something we no longer hold a reference on
14514 so we need to copy it locally. */
14515 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14516 ret->mother_re = NULL;
14519 #endif /* PERL_IN_XSUB_RE */
14524 This is the internal complement to regdupe() which is used to copy
14525 the structure pointed to by the *pprivate pointer in the regexp.
14526 This is the core version of the extension overridable cloning hook.
14527 The regexp structure being duplicated will be copied by perl prior
14528 to this and will be provided as the regexp *r argument, however
14529 with the /old/ structures pprivate pointer value. Thus this routine
14530 may override any copying normally done by perl.
14532 It returns a pointer to the new regexp_internal structure.
14536 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14539 struct regexp *const r = ReANY(rx);
14540 regexp_internal *reti;
14542 RXi_GET_DECL(r,ri);
14544 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14548 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14549 Copy(ri->program, reti->program, len+1, regnode);
14551 reti->num_code_blocks = ri->num_code_blocks;
14552 if (ri->code_blocks) {
14554 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14555 struct reg_code_block);
14556 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14557 struct reg_code_block);
14558 for (n = 0; n < ri->num_code_blocks; n++)
14559 reti->code_blocks[n].src_regex = (REGEXP*)
14560 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14563 reti->code_blocks = NULL;
14565 reti->regstclass = NULL;
14568 struct reg_data *d;
14569 const int count = ri->data->count;
14572 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14573 char, struct reg_data);
14574 Newx(d->what, count, U8);
14577 for (i = 0; i < count; i++) {
14578 d->what[i] = ri->data->what[i];
14579 switch (d->what[i]) {
14580 /* see also regcomp.h and regfree_internal() */
14581 case 'a': /* actually an AV, but the dup function is identical. */
14585 case 'u': /* actually an HV, but the dup function is identical. */
14586 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14589 /* This is cheating. */
14590 Newx(d->data[i], 1, struct regnode_charclass_class);
14591 StructCopy(ri->data->data[i], d->data[i],
14592 struct regnode_charclass_class);
14593 reti->regstclass = (regnode*)d->data[i];
14596 /* Trie stclasses are readonly and can thus be shared
14597 * without duplication. We free the stclass in pregfree
14598 * when the corresponding reg_ac_data struct is freed.
14600 reti->regstclass= ri->regstclass;
14604 ((reg_trie_data*)ri->data->data[i])->refcount++;
14609 d->data[i] = ri->data->data[i];
14612 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14621 reti->name_list_idx = ri->name_list_idx;
14623 #ifdef RE_TRACK_PATTERN_OFFSETS
14624 if (ri->u.offsets) {
14625 Newx(reti->u.offsets, 2*len+1, U32);
14626 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14629 SetProgLen(reti,len);
14632 return (void*)reti;
14635 #endif /* USE_ITHREADS */
14637 #ifndef PERL_IN_XSUB_RE
14640 - regnext - dig the "next" pointer out of a node
14643 Perl_regnext(pTHX_ regnode *p)
14651 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14652 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14655 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14664 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14667 STRLEN l1 = strlen(pat1);
14668 STRLEN l2 = strlen(pat2);
14671 const char *message;
14673 PERL_ARGS_ASSERT_RE_CROAK2;
14679 Copy(pat1, buf, l1 , char);
14680 Copy(pat2, buf + l1, l2 , char);
14681 buf[l1 + l2] = '\n';
14682 buf[l1 + l2 + 1] = '\0';
14684 /* ANSI variant takes additional second argument */
14685 va_start(args, pat2);
14689 msv = vmess(buf, &args);
14691 message = SvPV_const(msv,l1);
14694 Copy(message, buf, l1 , char);
14695 buf[l1-1] = '\0'; /* Overwrite \n */
14696 Perl_croak(aTHX_ "%s", buf);
14699 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14701 #ifndef PERL_IN_XSUB_RE
14703 Perl_save_re_context(pTHX)
14707 struct re_save_state *state;
14709 SAVEVPTR(PL_curcop);
14710 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14712 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14713 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14714 SSPUSHUV(SAVEt_RE_STATE);
14716 Copy(&PL_reg_state, state, 1, struct re_save_state);
14718 PL_reg_oldsaved = NULL;
14719 PL_reg_oldsavedlen = 0;
14720 PL_reg_oldsavedoffset = 0;
14721 PL_reg_oldsavedcoffset = 0;
14722 PL_reg_maxiter = 0;
14723 PL_reg_leftiter = 0;
14724 PL_reg_poscache = NULL;
14725 PL_reg_poscache_size = 0;
14726 #ifdef PERL_ANY_COW
14730 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14732 const REGEXP * const rx = PM_GETRE(PL_curpm);
14735 for (i = 1; i <= RX_NPARENS(rx); i++) {
14736 char digits[TYPE_CHARS(long)];
14737 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14738 GV *const *const gvp
14739 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14742 GV * const gv = *gvp;
14743 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14755 S_put_byte(pTHX_ SV *sv, int c)
14757 PERL_ARGS_ASSERT_PUT_BYTE;
14759 /* Our definition of isPRINT() ignores locales, so only bytes that are
14760 not part of UTF-8 are considered printable. I assume that the same
14761 holds for UTF-EBCDIC.
14762 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14763 which Wikipedia says:
14765 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14766 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14767 identical, to the ASCII delete (DEL) or rubout control character.
14768 ) So the old condition can be simplified to !isPRINT(c) */
14771 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14774 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14778 const char string = c;
14779 if (c == '-' || c == ']' || c == '\\' || c == '^')
14780 sv_catpvs(sv, "\\");
14781 sv_catpvn(sv, &string, 1);
14786 #define CLEAR_OPTSTART \
14787 if (optstart) STMT_START { \
14788 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14792 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14794 STATIC const regnode *
14795 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14796 const regnode *last, const regnode *plast,
14797 SV* sv, I32 indent, U32 depth)
14800 U8 op = PSEUDO; /* Arbitrary non-END op. */
14801 const regnode *next;
14802 const regnode *optstart= NULL;
14804 RXi_GET_DECL(r,ri);
14805 GET_RE_DEBUG_FLAGS_DECL;
14807 PERL_ARGS_ASSERT_DUMPUNTIL;
14809 #ifdef DEBUG_DUMPUNTIL
14810 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14811 last ? last-start : 0,plast ? plast-start : 0);
14814 if (plast && plast < last)
14817 while (PL_regkind[op] != END && (!last || node < last)) {
14818 /* While that wasn't END last time... */
14821 if (op == CLOSE || op == WHILEM)
14823 next = regnext((regnode *)node);
14826 if (OP(node) == OPTIMIZED) {
14827 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14834 regprop(r, sv, node);
14835 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14836 (int)(2*indent + 1), "", SvPVX_const(sv));
14838 if (OP(node) != OPTIMIZED) {
14839 if (next == NULL) /* Next ptr. */
14840 PerlIO_printf(Perl_debug_log, " (0)");
14841 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14842 PerlIO_printf(Perl_debug_log, " (FAIL)");
14844 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14845 (void)PerlIO_putc(Perl_debug_log, '\n');
14849 if (PL_regkind[(U8)op] == BRANCHJ) {
14852 const regnode *nnode = (OP(next) == LONGJMP
14853 ? regnext((regnode *)next)
14855 if (last && nnode > last)
14857 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14860 else if (PL_regkind[(U8)op] == BRANCH) {
14862 DUMPUNTIL(NEXTOPER(node), next);
14864 else if ( PL_regkind[(U8)op] == TRIE ) {
14865 const regnode *this_trie = node;
14866 const char op = OP(node);
14867 const U32 n = ARG(node);
14868 const reg_ac_data * const ac = op>=AHOCORASICK ?
14869 (reg_ac_data *)ri->data->data[n] :
14871 const reg_trie_data * const trie =
14872 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14874 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14876 const regnode *nextbranch= NULL;
14879 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14880 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14882 PerlIO_printf(Perl_debug_log, "%*s%s ",
14883 (int)(2*(indent+3)), "",
14884 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14885 PL_colors[0], PL_colors[1],
14886 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14887 PERL_PV_PRETTY_ELLIPSES |
14888 PERL_PV_PRETTY_LTGT
14893 U16 dist= trie->jump[word_idx+1];
14894 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14895 (UV)((dist ? this_trie + dist : next) - start));
14898 nextbranch= this_trie + trie->jump[0];
14899 DUMPUNTIL(this_trie + dist, nextbranch);
14901 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14902 nextbranch= regnext((regnode *)nextbranch);
14904 PerlIO_printf(Perl_debug_log, "\n");
14907 if (last && next > last)
14912 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14913 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14914 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14916 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14918 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14920 else if ( op == PLUS || op == STAR) {
14921 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14923 else if (PL_regkind[(U8)op] == ANYOF) {
14924 /* arglen 1 + class block */
14925 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14926 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14927 node = NEXTOPER(node);
14929 else if (PL_regkind[(U8)op] == EXACT) {
14930 /* Literal string, where present. */
14931 node += NODE_SZ_STR(node) - 1;
14932 node = NEXTOPER(node);
14935 node = NEXTOPER(node);
14936 node += regarglen[(U8)op];
14938 if (op == CURLYX || op == OPEN)
14942 #ifdef DEBUG_DUMPUNTIL
14943 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14948 #endif /* DEBUGGING */
14952 * c-indentation-style: bsd
14953 * c-basic-offset: 4
14954 * indent-tabs-mode: nil
14957 * ex: set ts=8 sts=4 sw=4 et: