5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
89 #ifndef PERL_IN_XSUB_RE
90 # include "charclass_invlists.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
100 # if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102 # pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104 # pragma optimize("w",on )
105 # endif /* BUGGY_MSC6 */
109 #define STATIC static
113 typedef struct RExC_state_t {
114 U32 flags; /* RXf_* are we folding, multilining? */
115 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
116 char *precomp; /* uncompiled string. */
117 REGEXP *rx_sv; /* The SV that is the regexp. */
118 regexp *rx; /* perl core regexp structure */
119 regexp_internal *rxi; /* internal data for regexp object pprivate field */
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit_bound; /* First regnode outside of the allocated space */
126 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 I32 size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN). */
132 I32 cpar; /* Capture buffer count, (CLOSE). */
133 I32 nestroot; /* root parens we are in - used by accept */
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
152 I32 override_recoding;
153 struct reg_code_block *code_blocks; /* positions of literal (?{})
155 int num_code_blocks; /* size of code_blocks[] */
156 int code_index; /* next code_blocks[] slot */
158 char *starttry; /* -Dr: where regtry was called. */
159 #define RExC_starttry (pRExC_state->starttry)
161 SV *runtime_code_qr; /* qr with the runtime code blocks */
163 const char *lastparse;
165 AV *paren_name_list; /* idx -> name */
166 #define RExC_lastparse (pRExC_state->lastparse)
167 #define RExC_lastnum (pRExC_state->lastnum)
168 #define RExC_paren_name_list (pRExC_state->paren_name_list)
172 #define RExC_flags (pRExC_state->flags)
173 #define RExC_pm_flags (pRExC_state->pm_flags)
174 #define RExC_precomp (pRExC_state->precomp)
175 #define RExC_rx_sv (pRExC_state->rx_sv)
176 #define RExC_rx (pRExC_state->rx)
177 #define RExC_rxi (pRExC_state->rxi)
178 #define RExC_start (pRExC_state->start)
179 #define RExC_end (pRExC_state->end)
180 #define RExC_parse (pRExC_state->parse)
181 #define RExC_whilem_seen (pRExC_state->whilem_seen)
182 #ifdef RE_TRACK_PATTERN_OFFSETS
183 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
185 #define RExC_emit (pRExC_state->emit)
186 #define RExC_emit_start (pRExC_state->emit_start)
187 #define RExC_emit_bound (pRExC_state->emit_bound)
188 #define RExC_naughty (pRExC_state->naughty)
189 #define RExC_sawback (pRExC_state->sawback)
190 #define RExC_seen (pRExC_state->seen)
191 #define RExC_size (pRExC_state->size)
192 #define RExC_npar (pRExC_state->npar)
193 #define RExC_nestroot (pRExC_state->nestroot)
194 #define RExC_extralen (pRExC_state->extralen)
195 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
196 #define RExC_utf8 (pRExC_state->utf8)
197 #define RExC_uni_semantics (pRExC_state->uni_semantics)
198 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
199 #define RExC_open_parens (pRExC_state->open_parens)
200 #define RExC_close_parens (pRExC_state->close_parens)
201 #define RExC_opend (pRExC_state->opend)
202 #define RExC_paren_names (pRExC_state->paren_names)
203 #define RExC_recurse (pRExC_state->recurse)
204 #define RExC_recurse_count (pRExC_state->recurse_count)
205 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
206 #define RExC_contains_locale (pRExC_state->contains_locale)
207 #define RExC_override_recoding (pRExC_state->override_recoding)
210 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
212 ((*s) == '{' && regcurly(s)))
215 #undef SPSTART /* dratted cpp namespace... */
218 * Flags to be passed up and down.
220 #define WORST 0 /* Worst case. */
221 #define HASWIDTH 0x01 /* Known to match non-null strings. */
223 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
224 * character, and if utf8, must be invariant. Note that this is not the same
225 * thing as REGNODE_SIMPLE */
227 #define SPSTART 0x04 /* Starts with * or +. */
228 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
229 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
231 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
233 /* whether trie related optimizations are enabled */
234 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
235 #define TRIE_STUDY_OPT
236 #define FULL_TRIE_STUDY
242 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
243 #define PBITVAL(paren) (1 << ((paren) & 7))
244 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
245 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
246 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
248 /* If not already in utf8, do a longjmp back to the beginning */
249 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
250 #define REQUIRE_UTF8 STMT_START { \
251 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
254 /* About scan_data_t.
256 During optimisation we recurse through the regexp program performing
257 various inplace (keyhole style) optimisations. In addition study_chunk
258 and scan_commit populate this data structure with information about
259 what strings MUST appear in the pattern. We look for the longest
260 string that must appear at a fixed location, and we look for the
261 longest string that may appear at a floating location. So for instance
266 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
267 strings (because they follow a .* construct). study_chunk will identify
268 both FOO and BAR as being the longest fixed and floating strings respectively.
270 The strings can be composites, for instance
274 will result in a composite fixed substring 'foo'.
276 For each string some basic information is maintained:
278 - offset or min_offset
279 This is the position the string must appear at, or not before.
280 It also implicitly (when combined with minlenp) tells us how many
281 characters must match before the string we are searching for.
282 Likewise when combined with minlenp and the length of the string it
283 tells us how many characters must appear after the string we have
287 Only used for floating strings. This is the rightmost point that
288 the string can appear at. If set to I32 max it indicates that the
289 string can occur infinitely far to the right.
292 A pointer to the minimum length of the pattern that the string
293 was found inside. This is important as in the case of positive
294 lookahead or positive lookbehind we can have multiple patterns
299 The minimum length of the pattern overall is 3, the minimum length
300 of the lookahead part is 3, but the minimum length of the part that
301 will actually match is 1. So 'FOO's minimum length is 3, but the
302 minimum length for the F is 1. This is important as the minimum length
303 is used to determine offsets in front of and behind the string being
304 looked for. Since strings can be composites this is the length of the
305 pattern at the time it was committed with a scan_commit. Note that
306 the length is calculated by study_chunk, so that the minimum lengths
307 are not known until the full pattern has been compiled, thus the
308 pointer to the value.
312 In the case of lookbehind the string being searched for can be
313 offset past the start point of the final matching string.
314 If this value was just blithely removed from the min_offset it would
315 invalidate some of the calculations for how many chars must match
316 before or after (as they are derived from min_offset and minlen and
317 the length of the string being searched for).
318 When the final pattern is compiled and the data is moved from the
319 scan_data_t structure into the regexp structure the information
320 about lookbehind is factored in, with the information that would
321 have been lost precalculated in the end_shift field for the
324 The fields pos_min and pos_delta are used to store the minimum offset
325 and the delta to the maximum offset at the current point in the pattern.
329 typedef struct scan_data_t {
330 /*I32 len_min; unused */
331 /*I32 len_delta; unused */
335 I32 last_end; /* min value, <0 unless valid. */
338 SV **longest; /* Either &l_fixed, or &l_float. */
339 SV *longest_fixed; /* longest fixed string found in pattern */
340 I32 offset_fixed; /* offset where it starts */
341 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
342 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
343 SV *longest_float; /* longest floating string found in pattern */
344 I32 offset_float_min; /* earliest point in string it can appear */
345 I32 offset_float_max; /* latest point in string it can appear */
346 I32 *minlen_float; /* pointer to the minlen relevant to the string */
347 I32 lookbehind_float; /* is the position of the string modified by LB */
351 struct regnode_charclass_class *start_class;
355 * Forward declarations for pregcomp()'s friends.
358 static const scan_data_t zero_scan_data =
359 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
361 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
362 #define SF_BEFORE_SEOL 0x0001
363 #define SF_BEFORE_MEOL 0x0002
364 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
365 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
368 # define SF_FIX_SHIFT_EOL (0+2)
369 # define SF_FL_SHIFT_EOL (0+4)
371 # define SF_FIX_SHIFT_EOL (+2)
372 # define SF_FL_SHIFT_EOL (+4)
375 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
376 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
378 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
379 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
380 #define SF_IS_INF 0x0040
381 #define SF_HAS_PAR 0x0080
382 #define SF_IN_PAR 0x0100
383 #define SF_HAS_EVAL 0x0200
384 #define SCF_DO_SUBSTR 0x0400
385 #define SCF_DO_STCLASS_AND 0x0800
386 #define SCF_DO_STCLASS_OR 0x1000
387 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
388 #define SCF_WHILEM_VISITED_POS 0x2000
390 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
391 #define SCF_SEEN_ACCEPT 0x8000
393 #define UTF cBOOL(RExC_utf8)
395 /* The enums for all these are ordered so things work out correctly */
396 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
397 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
398 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
399 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
400 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
401 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
402 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
404 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
406 #define OOB_UNICODE 12345678
407 #define OOB_NAMEDCLASS -1
409 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
410 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
413 /* length of regex to show in messages that don't mark a position within */
414 #define RegexLengthToShowInErrorMessages 127
417 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
418 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
419 * op/pragma/warn/regcomp.
421 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
422 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
424 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
427 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
428 * arg. Show regex, up to a maximum length. If it's too long, chop and add
431 #define _FAIL(code) STMT_START { \
432 const char *ellipses = ""; \
433 IV len = RExC_end - RExC_precomp; \
436 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
437 if (len > RegexLengthToShowInErrorMessages) { \
438 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
439 len = RegexLengthToShowInErrorMessages - 10; \
445 #define FAIL(msg) _FAIL( \
446 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
447 msg, (int)len, RExC_precomp, ellipses))
449 #define FAIL2(msg,arg) _FAIL( \
450 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
451 arg, (int)len, RExC_precomp, ellipses))
454 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
456 #define Simple_vFAIL(m) STMT_START { \
457 const IV offset = RExC_parse - RExC_precomp; \
458 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
459 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
465 #define vFAIL(m) STMT_START { \
467 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
472 * Like Simple_vFAIL(), but accepts two arguments.
474 #define Simple_vFAIL2(m,a1) STMT_START { \
475 const IV offset = RExC_parse - RExC_precomp; \
476 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
477 (int)offset, RExC_precomp, RExC_precomp + offset); \
481 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
483 #define vFAIL2(m,a1) STMT_START { \
485 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
486 Simple_vFAIL2(m, a1); \
491 * Like Simple_vFAIL(), but accepts three arguments.
493 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
494 const IV offset = RExC_parse - RExC_precomp; \
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
502 #define vFAIL3(m,a1,a2) STMT_START { \
504 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
505 Simple_vFAIL3(m, a1, a2); \
509 * Like Simple_vFAIL(), but accepts four arguments.
511 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
512 const IV offset = RExC_parse - RExC_precomp; \
513 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
517 #define ckWARNreg(loc,m) STMT_START { \
518 const IV offset = loc - RExC_precomp; \
519 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
520 (int)offset, RExC_precomp, RExC_precomp + offset); \
523 #define ckWARNregdep(loc,m) STMT_START { \
524 const IV offset = loc - RExC_precomp; \
525 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
527 (int)offset, RExC_precomp, RExC_precomp + offset); \
530 #define ckWARN2regdep(loc,m, a1) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
534 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 #define ckWARN2reg(loc, m, a1) STMT_START { \
538 const IV offset = loc - RExC_precomp; \
539 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
540 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
543 #define vWARN3(loc, m, a1, a2) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
549 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
555 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
556 const IV offset = loc - RExC_precomp; \
557 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
558 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
561 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
564 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
567 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
568 const IV offset = loc - RExC_precomp; \
569 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
570 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
574 /* Allow for side effects in s */
575 #define REGC(c,s) STMT_START { \
576 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
579 /* Macros for recording node offsets. 20001227 mjd@plover.com
580 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
581 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
582 * Element 0 holds the number n.
583 * Position is 1 indexed.
585 #ifndef RE_TRACK_PATTERN_OFFSETS
586 #define Set_Node_Offset_To_R(node,byte)
587 #define Set_Node_Offset(node,byte)
588 #define Set_Cur_Node_Offset
589 #define Set_Node_Length_To_R(node,len)
590 #define Set_Node_Length(node,len)
591 #define Set_Node_Cur_Length(node)
592 #define Node_Offset(n)
593 #define Node_Length(n)
594 #define Set_Node_Offset_Length(node,offset,len)
595 #define ProgLen(ri) ri->u.proglen
596 #define SetProgLen(ri,x) ri->u.proglen = x
598 #define ProgLen(ri) ri->u.offsets[0]
599 #define SetProgLen(ri,x) ri->u.offsets[0] = x
600 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
602 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
603 __LINE__, (int)(node), (int)(byte))); \
605 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
607 RExC_offsets[2*(node)-1] = (byte); \
612 #define Set_Node_Offset(node,byte) \
613 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
614 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
616 #define Set_Node_Length_To_R(node,len) STMT_START { \
618 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
619 __LINE__, (int)(node), (int)(len))); \
621 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
623 RExC_offsets[2*(node)] = (len); \
628 #define Set_Node_Length(node,len) \
629 Set_Node_Length_To_R((node)-RExC_emit_start, len)
630 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
631 #define Set_Node_Cur_Length(node) \
632 Set_Node_Length(node, RExC_parse - parse_start)
634 /* Get offsets and lengths */
635 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
636 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
638 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
639 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
640 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
644 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
645 #define EXPERIMENTAL_INPLACESCAN
646 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
648 #define DEBUG_STUDYDATA(str,data,depth) \
649 DEBUG_OPTIMISE_MORE_r(if(data){ \
650 PerlIO_printf(Perl_debug_log, \
651 "%*s" str "Pos:%"IVdf"/%"IVdf \
652 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
653 (int)(depth)*2, "", \
654 (IV)((data)->pos_min), \
655 (IV)((data)->pos_delta), \
656 (UV)((data)->flags), \
657 (IV)((data)->whilem_c), \
658 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
659 is_inf ? "INF " : "" \
661 if ((data)->last_found) \
662 PerlIO_printf(Perl_debug_log, \
663 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
664 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
665 SvPVX_const((data)->last_found), \
666 (IV)((data)->last_end), \
667 (IV)((data)->last_start_min), \
668 (IV)((data)->last_start_max), \
669 ((data)->longest && \
670 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
671 SvPVX_const((data)->longest_fixed), \
672 (IV)((data)->offset_fixed), \
673 ((data)->longest && \
674 (data)->longest==&((data)->longest_float)) ? "*" : "", \
675 SvPVX_const((data)->longest_float), \
676 (IV)((data)->offset_float_min), \
677 (IV)((data)->offset_float_max) \
679 PerlIO_printf(Perl_debug_log,"\n"); \
682 static void clear_re(pTHX_ void *r);
684 /* Mark that we cannot extend a found fixed substring at this point.
685 Update the longest found anchored substring and the longest found
686 floating substrings if needed. */
689 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
691 const STRLEN l = CHR_SVLEN(data->last_found);
692 const STRLEN old_l = CHR_SVLEN(*data->longest);
693 GET_RE_DEBUG_FLAGS_DECL;
695 PERL_ARGS_ASSERT_SCAN_COMMIT;
697 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
698 SvSetMagicSV(*data->longest, data->last_found);
699 if (*data->longest == data->longest_fixed) {
700 data->offset_fixed = l ? data->last_start_min : data->pos_min;
701 if (data->flags & SF_BEFORE_EOL)
703 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
705 data->flags &= ~SF_FIX_BEFORE_EOL;
706 data->minlen_fixed=minlenp;
707 data->lookbehind_fixed=0;
709 else { /* *data->longest == data->longest_float */
710 data->offset_float_min = l ? data->last_start_min : data->pos_min;
711 data->offset_float_max = (l
712 ? data->last_start_max
713 : data->pos_min + data->pos_delta);
714 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
715 data->offset_float_max = I32_MAX;
716 if (data->flags & SF_BEFORE_EOL)
718 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
720 data->flags &= ~SF_FL_BEFORE_EOL;
721 data->minlen_float=minlenp;
722 data->lookbehind_float=0;
725 SvCUR_set(data->last_found, 0);
727 SV * const sv = data->last_found;
728 if (SvUTF8(sv) && SvMAGICAL(sv)) {
729 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
735 data->flags &= ~SF_BEFORE_EOL;
736 DEBUG_STUDYDATA("commit: ",data,0);
739 /* Can match anything (initialization) */
741 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
743 PERL_ARGS_ASSERT_CL_ANYTHING;
745 ANYOF_BITMAP_SETALL(cl);
746 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
747 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
749 /* If any portion of the regex is to operate under locale rules,
750 * initialization includes it. The reason this isn't done for all regexes
751 * is that the optimizer was written under the assumption that locale was
752 * all-or-nothing. Given the complexity and lack of documentation in the
753 * optimizer, and that there are inadequate test cases for locale, so many
754 * parts of it may not work properly, it is safest to avoid locale unless
756 if (RExC_contains_locale) {
757 ANYOF_CLASS_SETALL(cl); /* /l uses class */
758 cl->flags |= ANYOF_LOCALE;
761 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
765 /* Can match anything (initialization) */
767 S_cl_is_anything(const struct regnode_charclass_class *cl)
771 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
773 for (value = 0; value <= ANYOF_MAX; value += 2)
774 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
776 if (!(cl->flags & ANYOF_UNICODE_ALL))
778 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
783 /* Can match anything (initialization) */
785 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
787 PERL_ARGS_ASSERT_CL_INIT;
789 Zero(cl, 1, struct regnode_charclass_class);
791 cl_anything(pRExC_state, cl);
792 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
795 /* These two functions currently do the exact same thing */
796 #define cl_init_zero S_cl_init
798 /* 'AND' a given class with another one. Can create false positives. 'cl'
799 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
800 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
802 S_cl_and(struct regnode_charclass_class *cl,
803 const struct regnode_charclass_class *and_with)
805 PERL_ARGS_ASSERT_CL_AND;
807 assert(and_with->type == ANYOF);
809 /* I (khw) am not sure all these restrictions are necessary XXX */
810 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
811 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
812 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
813 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
814 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
817 if (and_with->flags & ANYOF_INVERT)
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] &= ~and_with->bitmap[i];
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] &= and_with->bitmap[i];
823 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
825 if (and_with->flags & ANYOF_INVERT) {
827 /* Here, the and'ed node is inverted. Get the AND of the flags that
828 * aren't affected by the inversion. Those that are affected are
829 * handled individually below */
830 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
831 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
832 cl->flags |= affected_flags;
834 /* We currently don't know how to deal with things that aren't in the
835 * bitmap, but we know that the intersection is no greater than what
836 * is already in cl, so let there be false positives that get sorted
837 * out after the synthetic start class succeeds, and the node is
838 * matched for real. */
840 /* The inversion of these two flags indicate that the resulting
841 * intersection doesn't have them */
842 if (and_with->flags & ANYOF_UNICODE_ALL) {
843 cl->flags &= ~ANYOF_UNICODE_ALL;
845 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
846 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
849 else { /* and'd node is not inverted */
850 U8 outside_bitmap_but_not_utf8; /* Temp variable */
852 if (! ANYOF_NONBITMAP(and_with)) {
854 /* Here 'and_with' doesn't match anything outside the bitmap
855 * (except possibly ANYOF_UNICODE_ALL), which means the
856 * intersection can't either, except for ANYOF_UNICODE_ALL, in
857 * which case we don't know what the intersection is, but it's no
858 * greater than what cl already has, so can just leave it alone,
859 * with possible false positives */
860 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
861 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
862 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
865 else if (! ANYOF_NONBITMAP(cl)) {
867 /* Here, 'and_with' does match something outside the bitmap, and cl
868 * doesn't have a list of things to match outside the bitmap. If
869 * cl can match all code points above 255, the intersection will
870 * be those above-255 code points that 'and_with' matches. If cl
871 * can't match all Unicode code points, it means that it can't
872 * match anything outside the bitmap (since the 'if' that got us
873 * into this block tested for that), so we leave the bitmap empty.
875 if (cl->flags & ANYOF_UNICODE_ALL) {
876 ARG_SET(cl, ARG(and_with));
878 /* and_with's ARG may match things that don't require UTF8.
879 * And now cl's will too, in spite of this being an 'and'. See
880 * the comments below about the kludge */
881 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
885 /* Here, both 'and_with' and cl match something outside the
886 * bitmap. Currently we do not do the intersection, so just match
887 * whatever cl had at the beginning. */
891 /* Take the intersection of the two sets of flags. However, the
892 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
893 * kludge around the fact that this flag is not treated like the others
894 * which are initialized in cl_anything(). The way the optimizer works
895 * is that the synthetic start class (SSC) is initialized to match
896 * anything, and then the first time a real node is encountered, its
897 * values are AND'd with the SSC's with the result being the values of
898 * the real node. However, there are paths through the optimizer where
899 * the AND never gets called, so those initialized bits are set
900 * inappropriately, which is not usually a big deal, as they just cause
901 * false positives in the SSC, which will just mean a probably
902 * imperceptible slow down in execution. However this bit has a
903 * higher false positive consequence in that it can cause utf8.pm,
904 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
905 * bigger slowdown and also causes significant extra memory to be used.
906 * In order to prevent this, the code now takes a different tack. The
907 * bit isn't set unless some part of the regular expression needs it,
908 * but once set it won't get cleared. This means that these extra
909 * modules won't get loaded unless there was some path through the
910 * pattern that would have required them anyway, and so any false
911 * positives that occur by not ANDing them out when they could be
912 * aren't as severe as they would be if we treated this bit like all
914 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
915 & ANYOF_NONBITMAP_NON_UTF8;
916 cl->flags &= and_with->flags;
917 cl->flags |= outside_bitmap_but_not_utf8;
921 /* 'OR' a given class with another one. Can create false positives. 'cl'
922 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
923 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
925 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
927 PERL_ARGS_ASSERT_CL_OR;
929 if (or_with->flags & ANYOF_INVERT) {
931 /* Here, the or'd node is to be inverted. This means we take the
932 * complement of everything not in the bitmap, but currently we don't
933 * know what that is, so give up and match anything */
934 if (ANYOF_NONBITMAP(or_with)) {
935 cl_anything(pRExC_state, cl);
938 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
939 * <= (B1 | !B2) | (CL1 | !CL2)
940 * which is wasteful if CL2 is small, but we ignore CL2:
941 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
942 * XXXX Can we handle case-fold? Unclear:
943 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
944 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
946 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
947 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
948 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
951 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
952 cl->bitmap[i] |= ~or_with->bitmap[i];
953 } /* XXXX: logic is complicated otherwise */
955 cl_anything(pRExC_state, cl);
958 /* And, we can just take the union of the flags that aren't affected
959 * by the inversion */
960 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
962 /* For the remaining flags:
963 ANYOF_UNICODE_ALL and inverted means to not match anything above
964 255, which means that the union with cl should just be
965 what cl has in it, so can ignore this flag
966 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
967 is 127-255 to match them, but then invert that, so the
968 union with cl should just be what cl has in it, so can
971 } else { /* 'or_with' is not inverted */
972 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
973 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
974 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
975 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
978 /* OR char bitmap and class bitmap separately */
979 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
980 cl->bitmap[i] |= or_with->bitmap[i];
981 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
982 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
983 cl->classflags[i] |= or_with->classflags[i];
984 cl->flags |= ANYOF_CLASS;
987 else { /* XXXX: logic is complicated, leave it along for a moment. */
988 cl_anything(pRExC_state, cl);
991 if (ANYOF_NONBITMAP(or_with)) {
993 /* Use the added node's outside-the-bit-map match if there isn't a
994 * conflict. If there is a conflict (both nodes match something
995 * outside the bitmap, but what they match outside is not the same
996 * pointer, and hence not easily compared until XXX we extend
997 * inversion lists this far), give up and allow the start class to
998 * match everything outside the bitmap. If that stuff is all above
999 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1000 if (! ANYOF_NONBITMAP(cl)) {
1001 ARG_SET(cl, ARG(or_with));
1003 else if (ARG(cl) != ARG(or_with)) {
1005 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1006 cl_anything(pRExC_state, cl);
1009 cl->flags |= ANYOF_UNICODE_ALL;
1014 /* Take the union */
1015 cl->flags |= or_with->flags;
1019 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1020 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1021 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1022 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1027 dump_trie(trie,widecharmap,revcharmap)
1028 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1029 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1031 These routines dump out a trie in a somewhat readable format.
1032 The _interim_ variants are used for debugging the interim
1033 tables that are used to generate the final compressed
1034 representation which is what dump_trie expects.
1036 Part of the reason for their existence is to provide a form
1037 of documentation as to how the different representations function.
1042 Dumps the final compressed table form of the trie to Perl_debug_log.
1043 Used for debugging make_trie().
1047 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1048 AV *revcharmap, U32 depth)
1051 SV *sv=sv_newmortal();
1052 int colwidth= widecharmap ? 6 : 4;
1054 GET_RE_DEBUG_FLAGS_DECL;
1056 PERL_ARGS_ASSERT_DUMP_TRIE;
1058 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1059 (int)depth * 2 + 2,"",
1060 "Match","Base","Ofs" );
1062 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1063 SV ** const tmp = av_fetch( revcharmap, state, 0);
1065 PerlIO_printf( Perl_debug_log, "%*s",
1067 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1068 PL_colors[0], PL_colors[1],
1069 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1070 PERL_PV_ESCAPE_FIRSTCHAR
1075 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1076 (int)depth * 2 + 2,"");
1078 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1079 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1080 PerlIO_printf( Perl_debug_log, "\n");
1082 for( state = 1 ; state < trie->statecount ; state++ ) {
1083 const U32 base = trie->states[ state ].trans.base;
1085 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1087 if ( trie->states[ state ].wordnum ) {
1088 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1090 PerlIO_printf( Perl_debug_log, "%6s", "" );
1093 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1098 while( ( base + ofs < trie->uniquecharcount ) ||
1099 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1100 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1103 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1105 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1106 if ( ( base + ofs >= trie->uniquecharcount ) &&
1107 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1108 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1110 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1112 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1114 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1118 PerlIO_printf( Perl_debug_log, "]");
1121 PerlIO_printf( Perl_debug_log, "\n" );
1123 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1124 for (word=1; word <= trie->wordcount; word++) {
1125 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1126 (int)word, (int)(trie->wordinfo[word].prev),
1127 (int)(trie->wordinfo[word].len));
1129 PerlIO_printf(Perl_debug_log, "\n" );
1132 Dumps a fully constructed but uncompressed trie in list form.
1133 List tries normally only are used for construction when the number of
1134 possible chars (trie->uniquecharcount) is very high.
1135 Used for debugging make_trie().
1138 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1139 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1143 SV *sv=sv_newmortal();
1144 int colwidth= widecharmap ? 6 : 4;
1145 GET_RE_DEBUG_FLAGS_DECL;
1147 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1149 /* print out the table precompression. */
1150 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1151 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1152 "------:-----+-----------------\n" );
1154 for( state=1 ; state < next_alloc ; state ++ ) {
1157 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1158 (int)depth * 2 + 2,"", (UV)state );
1159 if ( ! trie->states[ state ].wordnum ) {
1160 PerlIO_printf( Perl_debug_log, "%5s| ","");
1162 PerlIO_printf( Perl_debug_log, "W%4x| ",
1163 trie->states[ state ].wordnum
1166 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1167 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1169 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1171 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1172 PL_colors[0], PL_colors[1],
1173 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1174 PERL_PV_ESCAPE_FIRSTCHAR
1176 TRIE_LIST_ITEM(state,charid).forid,
1177 (UV)TRIE_LIST_ITEM(state,charid).newstate
1180 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1181 (int)((depth * 2) + 14), "");
1184 PerlIO_printf( Perl_debug_log, "\n");
1189 Dumps a fully constructed but uncompressed trie in table form.
1190 This is the normal DFA style state transition table, with a few
1191 twists to facilitate compression later.
1192 Used for debugging make_trie().
1195 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1196 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1201 SV *sv=sv_newmortal();
1202 int colwidth= widecharmap ? 6 : 4;
1203 GET_RE_DEBUG_FLAGS_DECL;
1205 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1208 print out the table precompression so that we can do a visual check
1209 that they are identical.
1212 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1214 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1215 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1217 PerlIO_printf( Perl_debug_log, "%*s",
1219 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1220 PL_colors[0], PL_colors[1],
1221 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1222 PERL_PV_ESCAPE_FIRSTCHAR
1228 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1230 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1231 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1234 PerlIO_printf( Perl_debug_log, "\n" );
1236 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1238 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1239 (int)depth * 2 + 2,"",
1240 (UV)TRIE_NODENUM( state ) );
1242 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1243 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1245 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1247 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1249 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1250 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1252 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1253 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1261 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1262 startbranch: the first branch in the whole branch sequence
1263 first : start branch of sequence of branch-exact nodes.
1264 May be the same as startbranch
1265 last : Thing following the last branch.
1266 May be the same as tail.
1267 tail : item following the branch sequence
1268 count : words in the sequence
1269 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1270 depth : indent depth
1272 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1274 A trie is an N'ary tree where the branches are determined by digital
1275 decomposition of the key. IE, at the root node you look up the 1st character and
1276 follow that branch repeat until you find the end of the branches. Nodes can be
1277 marked as "accepting" meaning they represent a complete word. Eg:
1281 would convert into the following structure. Numbers represent states, letters
1282 following numbers represent valid transitions on the letter from that state, if
1283 the number is in square brackets it represents an accepting state, otherwise it
1284 will be in parenthesis.
1286 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1290 (1) +-i->(6)-+-s->[7]
1292 +-s->(3)-+-h->(4)-+-e->[5]
1294 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1296 This shows that when matching against the string 'hers' we will begin at state 1
1297 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1298 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1299 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1300 single traverse. We store a mapping from accepting to state to which word was
1301 matched, and then when we have multiple possibilities we try to complete the
1302 rest of the regex in the order in which they occured in the alternation.
1304 The only prior NFA like behaviour that would be changed by the TRIE support is
1305 the silent ignoring of duplicate alternations which are of the form:
1307 / (DUPE|DUPE) X? (?{ ... }) Y /x
1309 Thus EVAL blocks following a trie may be called a different number of times with
1310 and without the optimisation. With the optimisations dupes will be silently
1311 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1312 the following demonstrates:
1314 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1316 which prints out 'word' three times, but
1318 'words'=~/(word|word|word)(?{ print $1 })S/
1320 which doesnt print it out at all. This is due to other optimisations kicking in.
1322 Example of what happens on a structural level:
1324 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1326 1: CURLYM[1] {1,32767}(18)
1337 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1338 and should turn into:
1340 1: CURLYM[1] {1,32767}(18)
1342 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1350 Cases where tail != last would be like /(?foo|bar)baz/:
1360 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1361 and would end up looking like:
1364 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1371 d = uvuni_to_utf8_flags(d, uv, 0);
1373 is the recommended Unicode-aware way of saying
1378 #define TRIE_STORE_REVCHAR(val) \
1381 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1382 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1383 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1384 SvCUR_set(zlopp, kapow - flrbbbbb); \
1387 av_push(revcharmap, zlopp); \
1389 char ooooff = (char)val; \
1390 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1394 #define TRIE_READ_CHAR STMT_START { \
1397 /* if it is UTF then it is either already folded, or does not need folding */ \
1398 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1400 else if (folder == PL_fold_latin1) { \
1401 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1402 if ( foldlen > 0 ) { \
1403 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1409 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1410 skiplen = UNISKIP(uvc); \
1411 foldlen -= skiplen; \
1412 scan = foldbuf + skiplen; \
1415 /* raw data, will be folded later if needed */ \
1423 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1424 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1425 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1426 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1428 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1429 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1430 TRIE_LIST_CUR( state )++; \
1433 #define TRIE_LIST_NEW(state) STMT_START { \
1434 Newxz( trie->states[ state ].trans.list, \
1435 4, reg_trie_trans_le ); \
1436 TRIE_LIST_CUR( state ) = 1; \
1437 TRIE_LIST_LEN( state ) = 4; \
1440 #define TRIE_HANDLE_WORD(state) STMT_START { \
1441 U16 dupe= trie->states[ state ].wordnum; \
1442 regnode * const noper_next = regnext( noper ); \
1445 /* store the word for dumping */ \
1447 if (OP(noper) != NOTHING) \
1448 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1450 tmp = newSVpvn_utf8( "", 0, UTF ); \
1451 av_push( trie_words, tmp ); \
1455 trie->wordinfo[curword].prev = 0; \
1456 trie->wordinfo[curword].len = wordlen; \
1457 trie->wordinfo[curword].accept = state; \
1459 if ( noper_next < tail ) { \
1461 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1462 trie->jump[curword] = (U16)(noper_next - convert); \
1464 jumper = noper_next; \
1466 nextbranch= regnext(cur); \
1470 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1471 /* chain, so that when the bits of chain are later */\
1472 /* linked together, the dups appear in the chain */\
1473 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1474 trie->wordinfo[dupe].prev = curword; \
1476 /* we haven't inserted this word yet. */ \
1477 trie->states[ state ].wordnum = curword; \
1482 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1483 ( ( base + charid >= ucharcount \
1484 && base + charid < ubound \
1485 && state == trie->trans[ base - ucharcount + charid ].check \
1486 && trie->trans[ base - ucharcount + charid ].next ) \
1487 ? trie->trans[ base - ucharcount + charid ].next \
1488 : ( state==1 ? special : 0 ) \
1492 #define MADE_JUMP_TRIE 2
1493 #define MADE_EXACT_TRIE 4
1496 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1499 /* first pass, loop through and scan words */
1500 reg_trie_data *trie;
1501 HV *widecharmap = NULL;
1502 AV *revcharmap = newAV();
1504 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1509 regnode *jumper = NULL;
1510 regnode *nextbranch = NULL;
1511 regnode *convert = NULL;
1512 U32 *prev_states; /* temp array mapping each state to previous one */
1513 /* we just use folder as a flag in utf8 */
1514 const U8 * folder = NULL;
1517 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1518 AV *trie_words = NULL;
1519 /* along with revcharmap, this only used during construction but both are
1520 * useful during debugging so we store them in the struct when debugging.
1523 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1524 STRLEN trie_charcount=0;
1526 SV *re_trie_maxbuff;
1527 GET_RE_DEBUG_FLAGS_DECL;
1529 PERL_ARGS_ASSERT_MAKE_TRIE;
1531 PERL_UNUSED_ARG(depth);
1538 case EXACTFU_TRICKYFOLD:
1539 case EXACTFU: folder = PL_fold_latin1; break;
1540 case EXACTF: folder = PL_fold; break;
1541 case EXACTFL: folder = PL_fold_locale; break;
1542 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1545 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1547 trie->startstate = 1;
1548 trie->wordcount = word_count;
1549 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1550 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1552 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1553 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1554 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1557 trie_words = newAV();
1560 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1561 if (!SvIOK(re_trie_maxbuff)) {
1562 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1564 DEBUG_TRIE_COMPILE_r({
1565 PerlIO_printf( Perl_debug_log,
1566 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1567 (int)depth * 2 + 2, "",
1568 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1569 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1573 /* Find the node we are going to overwrite */
1574 if ( first == startbranch && OP( last ) != BRANCH ) {
1575 /* whole branch chain */
1578 /* branch sub-chain */
1579 convert = NEXTOPER( first );
1582 /* -- First loop and Setup --
1584 We first traverse the branches and scan each word to determine if it
1585 contains widechars, and how many unique chars there are, this is
1586 important as we have to build a table with at least as many columns as we
1589 We use an array of integers to represent the character codes 0..255
1590 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1591 native representation of the character value as the key and IV's for the
1594 *TODO* If we keep track of how many times each character is used we can
1595 remap the columns so that the table compression later on is more
1596 efficient in terms of memory by ensuring the most common value is in the
1597 middle and the least common are on the outside. IMO this would be better
1598 than a most to least common mapping as theres a decent chance the most
1599 common letter will share a node with the least common, meaning the node
1600 will not be compressible. With a middle is most common approach the worst
1601 case is when we have the least common nodes twice.
1605 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1606 regnode *noper = NEXTOPER( cur );
1607 const U8 *uc = (U8*)STRING( noper );
1608 const U8 *e = uc + STR_LEN( noper );
1610 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1612 const U8 *scan = (U8*)NULL;
1613 U32 wordlen = 0; /* required init */
1615 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1617 if (OP(noper) == NOTHING) {
1618 regnode *noper_next= regnext(noper);
1619 if (noper_next != tail && OP(noper_next) == flags) {
1621 uc= (U8*)STRING(noper);
1622 e= uc + STR_LEN(noper);
1623 trie->minlen= STR_LEN(noper);
1630 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1631 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1632 regardless of encoding */
1633 if (OP( noper ) == EXACTFU_SS) {
1634 /* false positives are ok, so just set this */
1635 TRIE_BITMAP_SET(trie,0xDF);
1638 for ( ; uc < e ; uc += len ) {
1639 TRIE_CHARCOUNT(trie)++;
1644 U8 folded= folder[ (U8) uvc ];
1645 if ( !trie->charmap[ folded ] ) {
1646 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1647 TRIE_STORE_REVCHAR( folded );
1650 if ( !trie->charmap[ uvc ] ) {
1651 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1652 TRIE_STORE_REVCHAR( uvc );
1655 /* store the codepoint in the bitmap, and its folded
1657 TRIE_BITMAP_SET(trie, uvc);
1659 /* store the folded codepoint */
1660 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1663 /* store first byte of utf8 representation of
1664 variant codepoints */
1665 if (! UNI_IS_INVARIANT(uvc)) {
1666 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1669 set_bit = 0; /* We've done our bit :-) */
1674 widecharmap = newHV();
1676 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1679 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1681 if ( !SvTRUE( *svpp ) ) {
1682 sv_setiv( *svpp, ++trie->uniquecharcount );
1683 TRIE_STORE_REVCHAR(uvc);
1687 if( cur == first ) {
1688 trie->minlen = chars;
1689 trie->maxlen = chars;
1690 } else if (chars < trie->minlen) {
1691 trie->minlen = chars;
1692 } else if (chars > trie->maxlen) {
1693 trie->maxlen = chars;
1695 if (OP( noper ) == EXACTFU_SS) {
1696 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1697 if (trie->minlen > 1)
1700 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1701 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1702 * - We assume that any such sequence might match a 2 byte string */
1703 if (trie->minlen > 2 )
1707 } /* end first pass */
1708 DEBUG_TRIE_COMPILE_r(
1709 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1710 (int)depth * 2 + 2,"",
1711 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1712 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1713 (int)trie->minlen, (int)trie->maxlen )
1717 We now know what we are dealing with in terms of unique chars and
1718 string sizes so we can calculate how much memory a naive
1719 representation using a flat table will take. If it's over a reasonable
1720 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1721 conservative but potentially much slower representation using an array
1724 At the end we convert both representations into the same compressed
1725 form that will be used in regexec.c for matching with. The latter
1726 is a form that cannot be used to construct with but has memory
1727 properties similar to the list form and access properties similar
1728 to the table form making it both suitable for fast searches and
1729 small enough that its feasable to store for the duration of a program.
1731 See the comment in the code where the compressed table is produced
1732 inplace from the flat tabe representation for an explanation of how
1733 the compression works.
1738 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1741 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1743 Second Pass -- Array Of Lists Representation
1745 Each state will be represented by a list of charid:state records
1746 (reg_trie_trans_le) the first such element holds the CUR and LEN
1747 points of the allocated array. (See defines above).
1749 We build the initial structure using the lists, and then convert
1750 it into the compressed table form which allows faster lookups
1751 (but cant be modified once converted).
1754 STRLEN transcount = 1;
1756 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1757 "%*sCompiling trie using list compiler\n",
1758 (int)depth * 2 + 2, ""));
1760 trie->states = (reg_trie_state *)
1761 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1762 sizeof(reg_trie_state) );
1766 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768 regnode *noper = NEXTOPER( cur );
1769 U8 *uc = (U8*)STRING( noper );
1770 const U8 *e = uc + STR_LEN( noper );
1771 U32 state = 1; /* required init */
1772 U16 charid = 0; /* sanity init */
1773 U8 *scan = (U8*)NULL; /* sanity init */
1774 STRLEN foldlen = 0; /* required init */
1775 U32 wordlen = 0; /* required init */
1776 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1779 if (OP(noper) == NOTHING) {
1780 regnode *noper_next= regnext(noper);
1781 if (noper_next != tail && OP(noper_next) == flags) {
1783 uc= (U8*)STRING(noper);
1784 e= uc + STR_LEN(noper);
1788 if (OP(noper) != NOTHING) {
1789 for ( ; uc < e ; uc += len ) {
1794 charid = trie->charmap[ uvc ];
1796 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1800 charid=(U16)SvIV( *svpp );
1803 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1810 if ( !trie->states[ state ].trans.list ) {
1811 TRIE_LIST_NEW( state );
1813 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1814 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1815 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1820 newstate = next_alloc++;
1821 prev_states[newstate] = state;
1822 TRIE_LIST_PUSH( state, charid, newstate );
1827 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1831 TRIE_HANDLE_WORD(state);
1833 } /* end second pass */
1835 /* next alloc is the NEXT state to be allocated */
1836 trie->statecount = next_alloc;
1837 trie->states = (reg_trie_state *)
1838 PerlMemShared_realloc( trie->states,
1840 * sizeof(reg_trie_state) );
1842 /* and now dump it out before we compress it */
1843 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1844 revcharmap, next_alloc,
1848 trie->trans = (reg_trie_trans *)
1849 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1856 for( state=1 ; state < next_alloc ; state ++ ) {
1860 DEBUG_TRIE_COMPILE_MORE_r(
1861 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1865 if (trie->states[state].trans.list) {
1866 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1870 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1871 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1872 if ( forid < minid ) {
1874 } else if ( forid > maxid ) {
1878 if ( transcount < tp + maxid - minid + 1) {
1880 trie->trans = (reg_trie_trans *)
1881 PerlMemShared_realloc( trie->trans,
1883 * sizeof(reg_trie_trans) );
1884 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1886 base = trie->uniquecharcount + tp - minid;
1887 if ( maxid == minid ) {
1889 for ( ; zp < tp ; zp++ ) {
1890 if ( ! trie->trans[ zp ].next ) {
1891 base = trie->uniquecharcount + zp - minid;
1892 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1893 trie->trans[ zp ].check = state;
1899 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1900 trie->trans[ tp ].check = state;
1905 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1906 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1907 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1908 trie->trans[ tid ].check = state;
1910 tp += ( maxid - minid + 1 );
1912 Safefree(trie->states[ state ].trans.list);
1915 DEBUG_TRIE_COMPILE_MORE_r(
1916 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1919 trie->states[ state ].trans.base=base;
1921 trie->lasttrans = tp + 1;
1925 Second Pass -- Flat Table Representation.
1927 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1928 We know that we will need Charcount+1 trans at most to store the data
1929 (one row per char at worst case) So we preallocate both structures
1930 assuming worst case.
1932 We then construct the trie using only the .next slots of the entry
1935 We use the .check field of the first entry of the node temporarily to
1936 make compression both faster and easier by keeping track of how many non
1937 zero fields are in the node.
1939 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1942 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1943 number representing the first entry of the node, and state as a
1944 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1945 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1946 are 2 entrys per node. eg:
1954 The table is internally in the right hand, idx form. However as we also
1955 have to deal with the states array which is indexed by nodenum we have to
1956 use TRIE_NODENUM() to convert.
1959 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1960 "%*sCompiling trie using table compiler\n",
1961 (int)depth * 2 + 2, ""));
1963 trie->trans = (reg_trie_trans *)
1964 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1965 * trie->uniquecharcount + 1,
1966 sizeof(reg_trie_trans) );
1967 trie->states = (reg_trie_state *)
1968 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1969 sizeof(reg_trie_state) );
1970 next_alloc = trie->uniquecharcount + 1;
1973 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1975 regnode *noper = NEXTOPER( cur );
1976 const U8 *uc = (U8*)STRING( noper );
1977 const U8 *e = uc + STR_LEN( noper );
1979 U32 state = 1; /* required init */
1981 U16 charid = 0; /* sanity init */
1982 U32 accept_state = 0; /* sanity init */
1983 U8 *scan = (U8*)NULL; /* sanity init */
1985 STRLEN foldlen = 0; /* required init */
1986 U32 wordlen = 0; /* required init */
1988 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1990 if (OP(noper) == NOTHING) {
1991 regnode *noper_next= regnext(noper);
1992 if (noper_next != tail && OP(noper_next) == flags) {
1994 uc= (U8*)STRING(noper);
1995 e= uc + STR_LEN(noper);
1999 if ( OP(noper) != NOTHING ) {
2000 for ( ; uc < e ; uc += len ) {
2005 charid = trie->charmap[ uvc ];
2007 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2008 charid = svpp ? (U16)SvIV(*svpp) : 0;
2012 if ( !trie->trans[ state + charid ].next ) {
2013 trie->trans[ state + charid ].next = next_alloc;
2014 trie->trans[ state ].check++;
2015 prev_states[TRIE_NODENUM(next_alloc)]
2016 = TRIE_NODENUM(state);
2017 next_alloc += trie->uniquecharcount;
2019 state = trie->trans[ state + charid ].next;
2021 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2023 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2026 accept_state = TRIE_NODENUM( state );
2027 TRIE_HANDLE_WORD(accept_state);
2029 } /* end second pass */
2031 /* and now dump it out before we compress it */
2032 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2034 next_alloc, depth+1));
2038 * Inplace compress the table.*
2040 For sparse data sets the table constructed by the trie algorithm will
2041 be mostly 0/FAIL transitions or to put it another way mostly empty.
2042 (Note that leaf nodes will not contain any transitions.)
2044 This algorithm compresses the tables by eliminating most such
2045 transitions, at the cost of a modest bit of extra work during lookup:
2047 - Each states[] entry contains a .base field which indicates the
2048 index in the state[] array wheres its transition data is stored.
2050 - If .base is 0 there are no valid transitions from that node.
2052 - If .base is nonzero then charid is added to it to find an entry in
2055 -If trans[states[state].base+charid].check!=state then the
2056 transition is taken to be a 0/Fail transition. Thus if there are fail
2057 transitions at the front of the node then the .base offset will point
2058 somewhere inside the previous nodes data (or maybe even into a node
2059 even earlier), but the .check field determines if the transition is
2063 The following process inplace converts the table to the compressed
2064 table: We first do not compress the root node 1,and mark all its
2065 .check pointers as 1 and set its .base pointer as 1 as well. This
2066 allows us to do a DFA construction from the compressed table later,
2067 and ensures that any .base pointers we calculate later are greater
2070 - We set 'pos' to indicate the first entry of the second node.
2072 - We then iterate over the columns of the node, finding the first and
2073 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2074 and set the .check pointers accordingly, and advance pos
2075 appropriately and repreat for the next node. Note that when we copy
2076 the next pointers we have to convert them from the original
2077 NODEIDX form to NODENUM form as the former is not valid post
2080 - If a node has no transitions used we mark its base as 0 and do not
2081 advance the pos pointer.
2083 - If a node only has one transition we use a second pointer into the
2084 structure to fill in allocated fail transitions from other states.
2085 This pointer is independent of the main pointer and scans forward
2086 looking for null transitions that are allocated to a state. When it
2087 finds one it writes the single transition into the "hole". If the
2088 pointer doesnt find one the single transition is appended as normal.
2090 - Once compressed we can Renew/realloc the structures to release the
2093 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2094 specifically Fig 3.47 and the associated pseudocode.
2098 const U32 laststate = TRIE_NODENUM( next_alloc );
2101 trie->statecount = laststate;
2103 for ( state = 1 ; state < laststate ; state++ ) {
2105 const U32 stateidx = TRIE_NODEIDX( state );
2106 const U32 o_used = trie->trans[ stateidx ].check;
2107 U32 used = trie->trans[ stateidx ].check;
2108 trie->trans[ stateidx ].check = 0;
2110 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2111 if ( flag || trie->trans[ stateidx + charid ].next ) {
2112 if ( trie->trans[ stateidx + charid ].next ) {
2114 for ( ; zp < pos ; zp++ ) {
2115 if ( ! trie->trans[ zp ].next ) {
2119 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2120 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2121 trie->trans[ zp ].check = state;
2122 if ( ++zp > pos ) pos = zp;
2129 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2131 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2132 trie->trans[ pos ].check = state;
2137 trie->lasttrans = pos + 1;
2138 trie->states = (reg_trie_state *)
2139 PerlMemShared_realloc( trie->states, laststate
2140 * sizeof(reg_trie_state) );
2141 DEBUG_TRIE_COMPILE_MORE_r(
2142 PerlIO_printf( Perl_debug_log,
2143 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2144 (int)depth * 2 + 2,"",
2145 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2148 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2151 } /* end table compress */
2153 DEBUG_TRIE_COMPILE_MORE_r(
2154 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2155 (int)depth * 2 + 2, "",
2156 (UV)trie->statecount,
2157 (UV)trie->lasttrans)
2159 /* resize the trans array to remove unused space */
2160 trie->trans = (reg_trie_trans *)
2161 PerlMemShared_realloc( trie->trans, trie->lasttrans
2162 * sizeof(reg_trie_trans) );
2164 { /* Modify the program and insert the new TRIE node */
2165 U8 nodetype =(U8)(flags & 0xFF);
2169 regnode *optimize = NULL;
2170 #ifdef RE_TRACK_PATTERN_OFFSETS
2173 U32 mjd_nodelen = 0;
2174 #endif /* RE_TRACK_PATTERN_OFFSETS */
2175 #endif /* DEBUGGING */
2177 This means we convert either the first branch or the first Exact,
2178 depending on whether the thing following (in 'last') is a branch
2179 or not and whther first is the startbranch (ie is it a sub part of
2180 the alternation or is it the whole thing.)
2181 Assuming its a sub part we convert the EXACT otherwise we convert
2182 the whole branch sequence, including the first.
2184 /* Find the node we are going to overwrite */
2185 if ( first != startbranch || OP( last ) == BRANCH ) {
2186 /* branch sub-chain */
2187 NEXT_OFF( first ) = (U16)(last - first);
2188 #ifdef RE_TRACK_PATTERN_OFFSETS
2190 mjd_offset= Node_Offset((convert));
2191 mjd_nodelen= Node_Length((convert));
2194 /* whole branch chain */
2196 #ifdef RE_TRACK_PATTERN_OFFSETS
2199 const regnode *nop = NEXTOPER( convert );
2200 mjd_offset= Node_Offset((nop));
2201 mjd_nodelen= Node_Length((nop));
2205 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2206 (int)depth * 2 + 2, "",
2207 (UV)mjd_offset, (UV)mjd_nodelen)
2210 /* But first we check to see if there is a common prefix we can
2211 split out as an EXACT and put in front of the TRIE node. */
2212 trie->startstate= 1;
2213 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2215 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2219 const U32 base = trie->states[ state ].trans.base;
2221 if ( trie->states[state].wordnum )
2224 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2225 if ( ( base + ofs >= trie->uniquecharcount ) &&
2226 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2227 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2229 if ( ++count > 1 ) {
2230 SV **tmp = av_fetch( revcharmap, ofs, 0);
2231 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2232 if ( state == 1 ) break;
2234 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2236 PerlIO_printf(Perl_debug_log,
2237 "%*sNew Start State=%"UVuf" Class: [",
2238 (int)depth * 2 + 2, "",
2241 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2242 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2244 TRIE_BITMAP_SET(trie,*ch);
2246 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2248 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2252 TRIE_BITMAP_SET(trie,*ch);
2254 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2255 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2261 SV **tmp = av_fetch( revcharmap, idx, 0);
2263 char *ch = SvPV( *tmp, len );
2265 SV *sv=sv_newmortal();
2266 PerlIO_printf( Perl_debug_log,
2267 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2268 (int)depth * 2 + 2, "",
2270 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2271 PL_colors[0], PL_colors[1],
2272 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2273 PERL_PV_ESCAPE_FIRSTCHAR
2278 OP( convert ) = nodetype;
2279 str=STRING(convert);
2282 STR_LEN(convert) += len;
2288 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2293 trie->prefixlen = (state-1);
2295 regnode *n = convert+NODE_SZ_STR(convert);
2296 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2297 trie->startstate = state;
2298 trie->minlen -= (state - 1);
2299 trie->maxlen -= (state - 1);
2301 /* At least the UNICOS C compiler choked on this
2302 * being argument to DEBUG_r(), so let's just have
2305 #ifdef PERL_EXT_RE_BUILD
2311 regnode *fix = convert;
2312 U32 word = trie->wordcount;
2314 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2315 while( ++fix < n ) {
2316 Set_Node_Offset_Length(fix, 0, 0);
2319 SV ** const tmp = av_fetch( trie_words, word, 0 );
2321 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2322 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2324 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2332 NEXT_OFF(convert) = (U16)(tail - convert);
2333 DEBUG_r(optimize= n);
2339 if ( trie->maxlen ) {
2340 NEXT_OFF( convert ) = (U16)(tail - convert);
2341 ARG_SET( convert, data_slot );
2342 /* Store the offset to the first unabsorbed branch in
2343 jump[0], which is otherwise unused by the jump logic.
2344 We use this when dumping a trie and during optimisation. */
2346 trie->jump[0] = (U16)(nextbranch - convert);
2348 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2349 * and there is a bitmap
2350 * and the first "jump target" node we found leaves enough room
2351 * then convert the TRIE node into a TRIEC node, with the bitmap
2352 * embedded inline in the opcode - this is hypothetically faster.
2354 if ( !trie->states[trie->startstate].wordnum
2356 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2358 OP( convert ) = TRIEC;
2359 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2360 PerlMemShared_free(trie->bitmap);
2363 OP( convert ) = TRIE;
2365 /* store the type in the flags */
2366 convert->flags = nodetype;
2370 + regarglen[ OP( convert ) ];
2372 /* XXX We really should free up the resource in trie now,
2373 as we won't use them - (which resources?) dmq */
2375 /* needed for dumping*/
2376 DEBUG_r(if (optimize) {
2377 regnode *opt = convert;
2379 while ( ++opt < optimize) {
2380 Set_Node_Offset_Length(opt,0,0);
2383 Try to clean up some of the debris left after the
2386 while( optimize < jumper ) {
2387 mjd_nodelen += Node_Length((optimize));
2388 OP( optimize ) = OPTIMIZED;
2389 Set_Node_Offset_Length(optimize,0,0);
2392 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2394 } /* end node insert */
2395 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2397 /* Finish populating the prev field of the wordinfo array. Walk back
2398 * from each accept state until we find another accept state, and if
2399 * so, point the first word's .prev field at the second word. If the
2400 * second already has a .prev field set, stop now. This will be the
2401 * case either if we've already processed that word's accept state,
2402 * or that state had multiple words, and the overspill words were
2403 * already linked up earlier.
2410 for (word=1; word <= trie->wordcount; word++) {
2412 if (trie->wordinfo[word].prev)
2414 state = trie->wordinfo[word].accept;
2416 state = prev_states[state];
2419 prev = trie->states[state].wordnum;
2423 trie->wordinfo[word].prev = prev;
2425 Safefree(prev_states);
2429 /* and now dump out the compressed format */
2430 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2432 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2434 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2435 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2437 SvREFCNT_dec(revcharmap);
2441 : trie->startstate>1
2447 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2449 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2451 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2452 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2455 We find the fail state for each state in the trie, this state is the longest proper
2456 suffix of the current state's 'word' that is also a proper prefix of another word in our
2457 trie. State 1 represents the word '' and is thus the default fail state. This allows
2458 the DFA not to have to restart after its tried and failed a word at a given point, it
2459 simply continues as though it had been matching the other word in the first place.
2461 'abcdgu'=~/abcdefg|cdgu/
2462 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2463 fail, which would bring us to the state representing 'd' in the second word where we would
2464 try 'g' and succeed, proceeding to match 'cdgu'.
2466 /* add a fail transition */
2467 const U32 trie_offset = ARG(source);
2468 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2470 const U32 ucharcount = trie->uniquecharcount;
2471 const U32 numstates = trie->statecount;
2472 const U32 ubound = trie->lasttrans + ucharcount;
2476 U32 base = trie->states[ 1 ].trans.base;
2479 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2480 GET_RE_DEBUG_FLAGS_DECL;
2482 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2484 PERL_UNUSED_ARG(depth);
2488 ARG_SET( stclass, data_slot );
2489 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2490 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2491 aho->trie=trie_offset;
2492 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2493 Copy( trie->states, aho->states, numstates, reg_trie_state );
2494 Newxz( q, numstates, U32);
2495 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2498 /* initialize fail[0..1] to be 1 so that we always have
2499 a valid final fail state */
2500 fail[ 0 ] = fail[ 1 ] = 1;
2502 for ( charid = 0; charid < ucharcount ; charid++ ) {
2503 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2505 q[ q_write ] = newstate;
2506 /* set to point at the root */
2507 fail[ q[ q_write++ ] ]=1;
2510 while ( q_read < q_write) {
2511 const U32 cur = q[ q_read++ % numstates ];
2512 base = trie->states[ cur ].trans.base;
2514 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2515 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2517 U32 fail_state = cur;
2520 fail_state = fail[ fail_state ];
2521 fail_base = aho->states[ fail_state ].trans.base;
2522 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2524 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2525 fail[ ch_state ] = fail_state;
2526 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2528 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2530 q[ q_write++ % numstates] = ch_state;
2534 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2535 when we fail in state 1, this allows us to use the
2536 charclass scan to find a valid start char. This is based on the principle
2537 that theres a good chance the string being searched contains lots of stuff
2538 that cant be a start char.
2540 fail[ 0 ] = fail[ 1 ] = 0;
2541 DEBUG_TRIE_COMPILE_r({
2542 PerlIO_printf(Perl_debug_log,
2543 "%*sStclass Failtable (%"UVuf" states): 0",
2544 (int)(depth * 2), "", (UV)numstates
2546 for( q_read=1; q_read<numstates; q_read++ ) {
2547 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2549 PerlIO_printf(Perl_debug_log, "\n");
2552 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2557 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2558 * These need to be revisited when a newer toolchain becomes available.
2560 #if defined(__sparc64__) && defined(__GNUC__)
2561 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2562 # undef SPARC64_GCC_WORKAROUND
2563 # define SPARC64_GCC_WORKAROUND 1
2567 #define DEBUG_PEEP(str,scan,depth) \
2568 DEBUG_OPTIMISE_r({if (scan){ \
2569 SV * const mysv=sv_newmortal(); \
2570 regnode *Next = regnext(scan); \
2571 regprop(RExC_rx, mysv, scan); \
2572 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2573 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2574 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2578 /* The below joins as many adjacent EXACTish nodes as possible into a single
2579 * one, and looks for problematic sequences of characters whose folds vs.
2580 * non-folds have sufficiently different lengths, that the optimizer would be
2581 * fooled into rejecting legitimate matches of them, and the trie construction
2582 * code can't cope with them. The joining is only done if:
2583 * 1) there is room in the current conglomerated node to entirely contain the
2585 * 2) they are the exact same node type
2587 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2588 * these get optimized out
2590 * If there are problematic code sequences, *min_subtract is set to the delta
2591 * that the minimum size of the node can be less than its actual size. And,
2592 * the node type of the result is changed to reflect that it contains these
2595 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2596 * and contains LATIN SMALL LETTER SHARP S
2598 * This is as good a place as any to discuss the design of handling these
2599 * problematic sequences. It's been wrong in Perl for a very long time. There
2600 * are three code points in Unicode whose folded lengths differ so much from
2601 * the un-folded lengths that it causes problems for the optimizer and trie
2602 * construction. Why only these are problematic, and not others where lengths
2603 * also differ is something I (khw) do not understand. New versions of Unicode
2604 * might add more such code points. Hopefully the logic in fold_grind.t that
2605 * figures out what to test (in part by verifying that each size-combination
2606 * gets tested) will catch any that do come along, so they can be added to the
2607 * special handling below. The chances of new ones are actually rather small,
2608 * as most, if not all, of the world's scripts that have casefolding have
2609 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2610 * made to allow compatibility with pre-existing standards, and almost all of
2611 * those have already been dealt with. These would otherwise be the most
2612 * likely candidates for generating further tricky sequences. In other words,
2613 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2614 * with pre-existing standards, and there aren't many of those left.
2616 * The previous designs for dealing with these involved assigning a special
2617 * node for them. This approach doesn't work, as evidenced by this example:
2618 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2619 * Both these fold to "sss", but if the pattern is parsed to create a node of
2620 * that would match just the \xDF, it won't be able to handle the case where a
2621 * successful match would have to cross the node's boundary. The new approach
2622 * that hopefully generally solves the problem generates an EXACTFU_SS node
2625 * There are a number of components to the approach (a lot of work for just
2626 * three code points!):
2627 * 1) This routine examines each EXACTFish node that could contain the
2628 * problematic sequences. It returns in *min_subtract how much to
2629 * subtract from the the actual length of the string to get a real minimum
2630 * for one that could match it. This number is usually 0 except for the
2631 * problematic sequences. This delta is used by the caller to adjust the
2632 * min length of the match, and the delta between min and max, so that the
2633 * optimizer doesn't reject these possibilities based on size constraints.
2634 * 2) These sequences are not currently correctly handled by the trie code
2635 * either, so it changes the joined node type to ops that are not handled
2636 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2637 * 3) This is sufficient for the two Greek sequences (described below), but
2638 * the one involving the Sharp s (\xDF) needs more. The node type
2639 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2640 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2641 * case where there is a possible fold length change. That means that a
2642 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2643 * itself with length changes, and so can be processed faster. regexec.c
2644 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2645 * is pre-folded by regcomp.c. This saves effort in regex matching.
2646 * However, probably mostly for historical reasons, the pre-folding isn't
2647 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2648 * nodes, as what they fold to isn't known until runtime.) The fold
2649 * possibilities for the non-UTF8 patterns are quite simple, except for
2650 * the sharp s. All the ones that don't involve a UTF-8 target string
2651 * are members of a fold-pair, and arrays are set up for all of them
2652 * that quickly find the other member of the pair. It might actually
2653 * be faster to pre-fold these, but it isn't currently done, except for
2654 * the sharp s. Code elsewhere in this file makes sure that it gets
2655 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2656 * issues described in the next item.
2657 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2658 * 'ss' or not is not knowable at compile time. It will match iff the
2659 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2660 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2661 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2662 * described in item 3). An assumption that the optimizer part of
2663 * regexec.c (probably unwittingly) makes is that a character in the
2664 * pattern corresponds to at most a single character in the target string.
2665 * (And I do mean character, and not byte here, unlike other parts of the
2666 * documentation that have never been updated to account for multibyte
2667 * Unicode.) This assumption is wrong only in this case, as all other
2668 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2669 * virtue of having this file pre-fold UTF-8 patterns. I'm
2670 * reluctant to try to change this assumption, so instead the code punts.
2671 * This routine examines EXACTF nodes for the sharp s, and returns a
2672 * boolean indicating whether or not the node is an EXACTF node that
2673 * contains a sharp s. When it is true, the caller sets a flag that later
2674 * causes the optimizer in this file to not set values for the floating
2675 * and fixed string lengths, and thus avoids the optimizer code in
2676 * regexec.c that makes the invalid assumption. Thus, there is no
2677 * optimization based on string lengths for EXACTF nodes that contain the
2678 * sharp s. This only happens for /id rules (which means the pattern
2682 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2683 if (PL_regkind[OP(scan)] == EXACT) \
2684 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2687 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2688 /* Merge several consecutive EXACTish nodes into one. */
2689 regnode *n = regnext(scan);
2691 regnode *next = scan + NODE_SZ_STR(scan);
2695 regnode *stop = scan;
2696 GET_RE_DEBUG_FLAGS_DECL;
2698 PERL_UNUSED_ARG(depth);
2701 PERL_ARGS_ASSERT_JOIN_EXACT;
2702 #ifndef EXPERIMENTAL_INPLACESCAN
2703 PERL_UNUSED_ARG(flags);
2704 PERL_UNUSED_ARG(val);
2706 DEBUG_PEEP("join",scan,depth);
2708 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2709 * EXACT ones that are mergeable to the current one. */
2711 && (PL_regkind[OP(n)] == NOTHING
2712 || (stringok && OP(n) == OP(scan)))
2714 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2717 if (OP(n) == TAIL || n > next)
2719 if (PL_regkind[OP(n)] == NOTHING) {
2720 DEBUG_PEEP("skip:",n,depth);
2721 NEXT_OFF(scan) += NEXT_OFF(n);
2722 next = n + NODE_STEP_REGNODE;
2729 else if (stringok) {
2730 const unsigned int oldl = STR_LEN(scan);
2731 regnode * const nnext = regnext(n);
2733 if (oldl + STR_LEN(n) > U8_MAX)
2736 DEBUG_PEEP("merg",n,depth);
2739 NEXT_OFF(scan) += NEXT_OFF(n);
2740 STR_LEN(scan) += STR_LEN(n);
2741 next = n + NODE_SZ_STR(n);
2742 /* Now we can overwrite *n : */
2743 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2751 #ifdef EXPERIMENTAL_INPLACESCAN
2752 if (flags && !NEXT_OFF(n)) {
2753 DEBUG_PEEP("atch", val, depth);
2754 if (reg_off_by_arg[OP(n)]) {
2755 ARG_SET(n, val - n);
2758 NEXT_OFF(n) = val - n;
2766 *has_exactf_sharp_s = FALSE;
2768 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2769 * can now analyze for sequences of problematic code points. (Prior to
2770 * this final joining, sequences could have been split over boundaries, and
2771 * hence missed). The sequences only happen in folding, hence for any
2772 * non-EXACT EXACTish node */
2773 if (OP(scan) != EXACT) {
2775 U8 * s0 = (U8*) STRING(scan);
2776 U8 * const s_end = s0 + STR_LEN(scan);
2778 /* The below is perhaps overboard, but this allows us to save a test
2779 * each time through the loop at the expense of a mask. This is
2780 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2781 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2782 * This uses an exclusive 'or' to find that bit and then inverts it to
2783 * form a mask, with just a single 0, in the bit position where 'S' and
2785 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2786 const U8 s_masked = 's' & S_or_s_mask;
2788 /* One pass is made over the node's string looking for all the
2789 * possibilities. to avoid some tests in the loop, there are two main
2790 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2794 /* There are two problematic Greek code points in Unicode
2797 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2798 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2804 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2805 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2807 * This means that in case-insensitive matching (or "loose
2808 * matching", as Unicode calls it), an EXACTF of length six (the
2809 * UTF-8 encoded byte length of the above casefolded versions) can
2810 * match a target string of length two (the byte length of UTF-8
2811 * encoded U+0390 or U+03B0). This would rather mess up the
2812 * minimum length computation. (there are other code points that
2813 * also fold to these two sequences, but the delta is smaller)
2815 * If these sequences are found, the minimum length is decreased by
2816 * four (six minus two).
2818 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2819 * LETTER SHARP S. We decrease the min length by 1 for each
2820 * occurrence of 'ss' found */
2822 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2823 # define U390_first_byte 0xb4
2824 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2825 # define U3B0_first_byte 0xb5
2826 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2828 # define U390_first_byte 0xce
2829 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2830 # define U3B0_first_byte 0xcf
2831 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2833 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2834 yields a net of 0 */
2835 /* Examine the string for one of the problematic sequences */
2837 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2838 * sequence we are looking for is 2 */
2842 /* Look for the first byte in each problematic sequence */
2844 /* We don't have to worry about other things that fold to
2845 * 's' (such as the long s, U+017F), as all above-latin1
2846 * code points have been pre-folded */
2850 /* Current character is an 's' or 'S'. If next one is
2851 * as well, we have the dreaded sequence */
2852 if (((*(s+1) & S_or_s_mask) == s_masked)
2853 /* These two node types don't have special handling
2855 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2858 OP(scan) = EXACTFU_SS;
2859 s++; /* No need to look at this character again */
2863 case U390_first_byte:
2864 if (s_end - s >= len
2866 /* The 1's are because are skipping comparing the
2868 && memEQ(s + 1, U390_tail, len - 1))
2870 goto greek_sequence;
2874 case U3B0_first_byte:
2875 if (! (s_end - s >= len
2876 && memEQ(s + 1, U3B0_tail, len - 1)))
2883 /* This can't currently be handled by trie's, so change
2884 * the node type to indicate this. If EXACTFA and
2885 * EXACTFL were ever to be handled by trie's, this
2886 * would have to be changed. If this node has already
2887 * been changed to EXACTFU_SS in this loop, leave it as
2888 * is. (I (khw) think it doesn't matter in regexec.c
2889 * for UTF patterns, but no need to change it */
2890 if (OP(scan) == EXACTFU) {
2891 OP(scan) = EXACTFU_TRICKYFOLD;
2893 s += 6; /* We already know what this sequence is. Skip
2899 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2901 /* Here, the pattern is not UTF-8. We need to look only for the
2902 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2903 * in the final position. Otherwise we can stop looking 1 byte
2904 * earlier because have to find both the first and second 's' */
2905 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2907 for (s = s0; s < upper; s++) {
2912 && ((*(s+1) & S_or_s_mask) == s_masked))
2916 /* EXACTF nodes need to know that the minimum
2917 * length changed so that a sharp s in the string
2918 * can match this ss in the pattern, but they
2919 * remain EXACTF nodes, as they are not trie'able,
2920 * so don't have to invent a new node type to
2921 * exclude them from the trie code */
2922 if (OP(scan) != EXACTF) {
2923 OP(scan) = EXACTFU_SS;
2928 case LATIN_SMALL_LETTER_SHARP_S:
2929 if (OP(scan) == EXACTF) {
2930 *has_exactf_sharp_s = TRUE;
2939 /* Allow dumping but overwriting the collection of skipped
2940 * ops and/or strings with fake optimized ops */
2941 n = scan + NODE_SZ_STR(scan);
2949 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2953 /* REx optimizer. Converts nodes into quicker variants "in place".
2954 Finds fixed substrings. */
2956 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2957 to the position after last scanned or to NULL. */
2959 #define INIT_AND_WITHP \
2960 assert(!and_withp); \
2961 Newx(and_withp,1,struct regnode_charclass_class); \
2962 SAVEFREEPV(and_withp)
2964 /* this is a chain of data about sub patterns we are processing that
2965 need to be handled separately/specially in study_chunk. Its so
2966 we can simulate recursion without losing state. */
2968 typedef struct scan_frame {
2969 regnode *last; /* last node to process in this frame */
2970 regnode *next; /* next node to process when last is reached */
2971 struct scan_frame *prev; /*previous frame*/
2972 I32 stop; /* what stopparen do we use */
2976 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2978 #define CASE_SYNST_FNC(nAmE) \
2980 if (flags & SCF_DO_STCLASS_AND) { \
2981 for (value = 0; value < 256; value++) \
2982 if (!is_ ## nAmE ## _cp(value)) \
2983 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2986 for (value = 0; value < 256; value++) \
2987 if (is_ ## nAmE ## _cp(value)) \
2988 ANYOF_BITMAP_SET(data->start_class, value); \
2992 if (flags & SCF_DO_STCLASS_AND) { \
2993 for (value = 0; value < 256; value++) \
2994 if (is_ ## nAmE ## _cp(value)) \
2995 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2998 for (value = 0; value < 256; value++) \
2999 if (!is_ ## nAmE ## _cp(value)) \
3000 ANYOF_BITMAP_SET(data->start_class, value); \
3007 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3008 I32 *minlenp, I32 *deltap,
3013 struct regnode_charclass_class *and_withp,
3014 U32 flags, U32 depth)
3015 /* scanp: Start here (read-write). */
3016 /* deltap: Write maxlen-minlen here. */
3017 /* last: Stop before this one. */
3018 /* data: string data about the pattern */
3019 /* stopparen: treat close N as END */
3020 /* recursed: which subroutines have we recursed into */
3021 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3024 I32 min = 0, pars = 0, code;
3025 regnode *scan = *scanp, *next;
3027 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3028 int is_inf_internal = 0; /* The studied chunk is infinite */
3029 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3030 scan_data_t data_fake;
3031 SV *re_trie_maxbuff = NULL;
3032 regnode *first_non_open = scan;
3033 I32 stopmin = I32_MAX;
3034 scan_frame *frame = NULL;
3035 GET_RE_DEBUG_FLAGS_DECL;
3037 PERL_ARGS_ASSERT_STUDY_CHUNK;
3040 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3044 while (first_non_open && OP(first_non_open) == OPEN)
3045 first_non_open=regnext(first_non_open);
3050 while ( scan && OP(scan) != END && scan < last ){
3051 UV min_subtract = 0; /* How much to subtract from the minimum node
3052 length to get a real minimum (because the
3053 folded version may be shorter) */
3054 bool has_exactf_sharp_s = FALSE;
3055 /* Peephole optimizer: */
3056 DEBUG_STUDYDATA("Peep:", data,depth);
3057 DEBUG_PEEP("Peep",scan,depth);
3059 /* Its not clear to khw or hv why this is done here, and not in the
3060 * clauses that deal with EXACT nodes. khw's guess is that it's
3061 * because of a previous design */
3062 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3064 /* Follow the next-chain of the current node and optimize
3065 away all the NOTHINGs from it. */
3066 if (OP(scan) != CURLYX) {
3067 const int max = (reg_off_by_arg[OP(scan)]
3069 /* I32 may be smaller than U16 on CRAYs! */
3070 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3071 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3075 /* Skip NOTHING and LONGJMP. */
3076 while ((n = regnext(n))
3077 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3078 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3079 && off + noff < max)
3081 if (reg_off_by_arg[OP(scan)])
3084 NEXT_OFF(scan) = off;
3089 /* The principal pseudo-switch. Cannot be a switch, since we
3090 look into several different things. */
3091 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3092 || OP(scan) == IFTHEN) {
3093 next = regnext(scan);
3095 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3097 if (OP(next) == code || code == IFTHEN) {
3098 /* NOTE - There is similar code to this block below for handling
3099 TRIE nodes on a re-study. If you change stuff here check there
3101 I32 max1 = 0, min1 = I32_MAX, num = 0;
3102 struct regnode_charclass_class accum;
3103 regnode * const startbranch=scan;
3105 if (flags & SCF_DO_SUBSTR)
3106 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3107 if (flags & SCF_DO_STCLASS)
3108 cl_init_zero(pRExC_state, &accum);
3110 while (OP(scan) == code) {
3111 I32 deltanext, minnext, f = 0, fake;
3112 struct regnode_charclass_class this_class;
3115 data_fake.flags = 0;
3117 data_fake.whilem_c = data->whilem_c;
3118 data_fake.last_closep = data->last_closep;
3121 data_fake.last_closep = &fake;
3123 data_fake.pos_delta = delta;
3124 next = regnext(scan);
3125 scan = NEXTOPER(scan);
3127 scan = NEXTOPER(scan);
3128 if (flags & SCF_DO_STCLASS) {
3129 cl_init(pRExC_state, &this_class);
3130 data_fake.start_class = &this_class;
3131 f = SCF_DO_STCLASS_AND;
3133 if (flags & SCF_WHILEM_VISITED_POS)
3134 f |= SCF_WHILEM_VISITED_POS;
3136 /* we suppose the run is continuous, last=next...*/
3137 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3139 stopparen, recursed, NULL, f,depth+1);
3142 if (max1 < minnext + deltanext)
3143 max1 = minnext + deltanext;
3144 if (deltanext == I32_MAX)
3145 is_inf = is_inf_internal = 1;
3147 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3149 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3150 if ( stopmin > minnext)
3151 stopmin = min + min1;
3152 flags &= ~SCF_DO_SUBSTR;
3154 data->flags |= SCF_SEEN_ACCEPT;
3157 if (data_fake.flags & SF_HAS_EVAL)
3158 data->flags |= SF_HAS_EVAL;
3159 data->whilem_c = data_fake.whilem_c;
3161 if (flags & SCF_DO_STCLASS)
3162 cl_or(pRExC_state, &accum, &this_class);
3164 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3166 if (flags & SCF_DO_SUBSTR) {
3167 data->pos_min += min1;
3168 data->pos_delta += max1 - min1;
3169 if (max1 != min1 || is_inf)
3170 data->longest = &(data->longest_float);
3173 delta += max1 - min1;
3174 if (flags & SCF_DO_STCLASS_OR) {
3175 cl_or(pRExC_state, data->start_class, &accum);
3177 cl_and(data->start_class, and_withp);
3178 flags &= ~SCF_DO_STCLASS;
3181 else if (flags & SCF_DO_STCLASS_AND) {
3183 cl_and(data->start_class, &accum);
3184 flags &= ~SCF_DO_STCLASS;
3187 /* Switch to OR mode: cache the old value of
3188 * data->start_class */
3190 StructCopy(data->start_class, and_withp,
3191 struct regnode_charclass_class);
3192 flags &= ~SCF_DO_STCLASS_AND;
3193 StructCopy(&accum, data->start_class,
3194 struct regnode_charclass_class);
3195 flags |= SCF_DO_STCLASS_OR;
3196 data->start_class->flags |= ANYOF_EOS;
3200 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3203 Assuming this was/is a branch we are dealing with: 'scan' now
3204 points at the item that follows the branch sequence, whatever
3205 it is. We now start at the beginning of the sequence and look
3212 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3214 If we can find such a subsequence we need to turn the first
3215 element into a trie and then add the subsequent branch exact
3216 strings to the trie.
3220 1. patterns where the whole set of branches can be converted.
3222 2. patterns where only a subset can be converted.
3224 In case 1 we can replace the whole set with a single regop
3225 for the trie. In case 2 we need to keep the start and end
3228 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3229 becomes BRANCH TRIE; BRANCH X;
3231 There is an additional case, that being where there is a
3232 common prefix, which gets split out into an EXACT like node
3233 preceding the TRIE node.
3235 If x(1..n)==tail then we can do a simple trie, if not we make
3236 a "jump" trie, such that when we match the appropriate word
3237 we "jump" to the appropriate tail node. Essentially we turn
3238 a nested if into a case structure of sorts.
3243 if (!re_trie_maxbuff) {
3244 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3245 if (!SvIOK(re_trie_maxbuff))
3246 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3248 if ( SvIV(re_trie_maxbuff)>=0 ) {
3250 regnode *first = (regnode *)NULL;
3251 regnode *last = (regnode *)NULL;
3252 regnode *tail = scan;
3257 SV * const mysv = sv_newmortal(); /* for dumping */
3259 /* var tail is used because there may be a TAIL
3260 regop in the way. Ie, the exacts will point to the
3261 thing following the TAIL, but the last branch will
3262 point at the TAIL. So we advance tail. If we
3263 have nested (?:) we may have to move through several
3267 while ( OP( tail ) == TAIL ) {
3268 /* this is the TAIL generated by (?:) */
3269 tail = regnext( tail );
3273 DEBUG_TRIE_COMPILE_r({
3274 regprop(RExC_rx, mysv, tail );
3275 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3276 (int)depth * 2 + 2, "",
3277 "Looking for TRIE'able sequences. Tail node is: ",
3278 SvPV_nolen_const( mysv )
3284 Step through the branches
3285 cur represents each branch,
3286 noper is the first thing to be matched as part of that branch
3287 noper_next is the regnext() of that node.
3289 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3290 via a "jump trie" but we also support building with NOJUMPTRIE,
3291 which restricts the trie logic to structures like /FOO|BAR/.
3293 If noper is a trieable nodetype then the branch is a possible optimization
3294 target. If we are building under NOJUMPTRIE then we require that noper_next
3295 is the same as scan (our current position in the regex program).
3297 Once we have two or more consecutive such branches we can create a
3298 trie of the EXACT's contents and stitch it in place into the program.
3300 If the sequence represents all of the branches in the alternation we
3301 replace the entire thing with a single TRIE node.
3303 Otherwise when it is a subsequence we need to stitch it in place and
3304 replace only the relevant branches. This means the first branch has
3305 to remain as it is used by the alternation logic, and its next pointer,
3306 and needs to be repointed at the item on the branch chain following
3307 the last branch we have optimized away.
3309 This could be either a BRANCH, in which case the subsequence is internal,
3310 or it could be the item following the branch sequence in which case the
3311 subsequence is at the end (which does not necessarily mean the first node
3312 is the start of the alternation).
3314 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3317 ----------------+-----------
3321 EXACTFU_SS | EXACTFU
3322 EXACTFU_TRICKYFOLD | EXACTFU
3327 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3328 ( EXACT == (X) ) ? EXACT : \
3329 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3332 /* dont use tail as the end marker for this traverse */
3333 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3334 regnode * const noper = NEXTOPER( cur );
3335 U8 noper_type = OP( noper );
3336 U8 noper_trietype = TRIE_TYPE( noper_type );
3337 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3338 regnode * const noper_next = regnext( noper );
3339 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3340 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3343 DEBUG_TRIE_COMPILE_r({
3344 regprop(RExC_rx, mysv, cur);
3345 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3346 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3348 regprop(RExC_rx, mysv, noper);
3349 PerlIO_printf( Perl_debug_log, " -> %s",
3350 SvPV_nolen_const(mysv));
3353 regprop(RExC_rx, mysv, noper_next );
3354 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3355 SvPV_nolen_const(mysv));
3357 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3358 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3359 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3363 /* Is noper a trieable nodetype that can be merged with the
3364 * current trie (if there is one)? */
3368 ( noper_trietype == NOTHING)
3369 || ( trietype == NOTHING )
3370 || ( trietype == noper_trietype )
3373 && noper_next == tail
3377 /* Handle mergable triable node
3378 * Either we are the first node in a new trieable sequence,
3379 * in which case we do some bookkeeping, otherwise we update
3380 * the end pointer. */
3383 if ( noper_trietype == NOTHING ) {
3384 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3385 regnode * const noper_next = regnext( noper );
3386 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3387 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3390 if ( noper_next_trietype ) {
3391 trietype = noper_next_trietype;
3392 } else if (noper_next_type) {
3393 /* a NOTHING regop is 1 regop wide. We need at least two
3394 * for a trie so we can't merge this in */
3398 trietype = noper_trietype;
3401 if ( trietype == NOTHING )
3402 trietype = noper_trietype;
3407 } /* end handle mergable triable node */
3409 /* handle unmergable node -
3410 * noper may either be a triable node which can not be tried
3411 * together with the current trie, or a non triable node */
3413 /* If last is set and trietype is not NOTHING then we have found
3414 * at least two triable branch sequences in a row of a similar
3415 * trietype so we can turn them into a trie. If/when we
3416 * allow NOTHING to start a trie sequence this condition will be
3417 * required, and it isn't expensive so we leave it in for now. */
3418 if ( trietype != NOTHING )
3419 make_trie( pRExC_state,
3420 startbranch, first, cur, tail, count,
3421 trietype, depth+1 );
3422 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3426 && noper_next == tail
3429 /* noper is triable, so we can start a new trie sequence */
3432 trietype = noper_trietype;
3434 /* if we already saw a first but the current node is not triable then we have
3435 * to reset the first information. */
3440 } /* end handle unmergable node */
3441 } /* loop over branches */
3442 DEBUG_TRIE_COMPILE_r({
3443 regprop(RExC_rx, mysv, cur);
3444 PerlIO_printf( Perl_debug_log,
3445 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3446 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3450 if ( trietype != NOTHING ) {
3451 /* the last branch of the sequence was part of a trie,
3452 * so we have to construct it here outside of the loop
3454 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3455 #ifdef TRIE_STUDY_OPT
3456 if ( ((made == MADE_EXACT_TRIE &&
3457 startbranch == first)
3458 || ( first_non_open == first )) &&
3460 flags |= SCF_TRIE_RESTUDY;
3461 if ( startbranch == first
3464 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3469 /* at this point we know whatever we have is a NOTHING sequence/branch
3470 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3472 if ( startbranch == first ) {
3474 /* the entire thing is a NOTHING sequence, something like this:
3475 * (?:|) So we can turn it into a plain NOTHING op. */
3476 DEBUG_TRIE_COMPILE_r({
3477 regprop(RExC_rx, mysv, cur);
3478 PerlIO_printf( Perl_debug_log,
3479 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3480 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3483 OP(startbranch)= NOTHING;
3484 NEXT_OFF(startbranch)= tail - startbranch;
3485 for ( opt= startbranch + 1; opt < tail ; opt++ )
3489 } /* end if ( last) */
3490 } /* TRIE_MAXBUF is non zero */
3495 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3496 scan = NEXTOPER(NEXTOPER(scan));
3497 } else /* single branch is optimized. */
3498 scan = NEXTOPER(scan);
3500 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3501 scan_frame *newframe = NULL;
3506 if (OP(scan) != SUSPEND) {
3507 /* set the pointer */
3508 if (OP(scan) == GOSUB) {
3510 RExC_recurse[ARG2L(scan)] = scan;
3511 start = RExC_open_parens[paren-1];
3512 end = RExC_close_parens[paren-1];
3515 start = RExC_rxi->program + 1;
3519 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3520 SAVEFREEPV(recursed);
3522 if (!PAREN_TEST(recursed,paren+1)) {
3523 PAREN_SET(recursed,paren+1);
3524 Newx(newframe,1,scan_frame);
3526 if (flags & SCF_DO_SUBSTR) {
3527 SCAN_COMMIT(pRExC_state,data,minlenp);
3528 data->longest = &(data->longest_float);
3530 is_inf = is_inf_internal = 1;
3531 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3532 cl_anything(pRExC_state, data->start_class);
3533 flags &= ~SCF_DO_STCLASS;
3536 Newx(newframe,1,scan_frame);
3539 end = regnext(scan);
3544 SAVEFREEPV(newframe);
3545 newframe->next = regnext(scan);
3546 newframe->last = last;
3547 newframe->stop = stopparen;
3548 newframe->prev = frame;
3558 else if (OP(scan) == EXACT) {
3559 I32 l = STR_LEN(scan);
3562 const U8 * const s = (U8*)STRING(scan);
3563 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3564 l = utf8_length(s, s + l);
3566 uc = *((U8*)STRING(scan));
3569 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3570 /* The code below prefers earlier match for fixed
3571 offset, later match for variable offset. */
3572 if (data->last_end == -1) { /* Update the start info. */
3573 data->last_start_min = data->pos_min;
3574 data->last_start_max = is_inf
3575 ? I32_MAX : data->pos_min + data->pos_delta;
3577 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3579 SvUTF8_on(data->last_found);
3581 SV * const sv = data->last_found;
3582 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3583 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3584 if (mg && mg->mg_len >= 0)
3585 mg->mg_len += utf8_length((U8*)STRING(scan),
3586 (U8*)STRING(scan)+STR_LEN(scan));
3588 data->last_end = data->pos_min + l;
3589 data->pos_min += l; /* As in the first entry. */
3590 data->flags &= ~SF_BEFORE_EOL;
3592 if (flags & SCF_DO_STCLASS_AND) {
3593 /* Check whether it is compatible with what we know already! */
3597 /* If compatible, we or it in below. It is compatible if is
3598 * in the bitmp and either 1) its bit or its fold is set, or 2)
3599 * it's for a locale. Even if there isn't unicode semantics
3600 * here, at runtime there may be because of matching against a
3601 * utf8 string, so accept a possible false positive for
3602 * latin1-range folds */
3604 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3605 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3606 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3607 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3612 ANYOF_CLASS_ZERO(data->start_class);
3613 ANYOF_BITMAP_ZERO(data->start_class);
3615 ANYOF_BITMAP_SET(data->start_class, uc);
3616 else if (uc >= 0x100) {
3619 /* Some Unicode code points fold to the Latin1 range; as
3620 * XXX temporary code, instead of figuring out if this is
3621 * one, just assume it is and set all the start class bits
3622 * that could be some such above 255 code point's fold
3623 * which will generate fals positives. As the code
3624 * elsewhere that does compute the fold settles down, it
3625 * can be extracted out and re-used here */
3626 for (i = 0; i < 256; i++){
3627 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3628 ANYOF_BITMAP_SET(data->start_class, i);
3632 data->start_class->flags &= ~ANYOF_EOS;
3634 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3636 else if (flags & SCF_DO_STCLASS_OR) {
3637 /* false positive possible if the class is case-folded */
3639 ANYOF_BITMAP_SET(data->start_class, uc);
3641 data->start_class->flags |= ANYOF_UNICODE_ALL;
3642 data->start_class->flags &= ~ANYOF_EOS;
3643 cl_and(data->start_class, and_withp);
3645 flags &= ~SCF_DO_STCLASS;
3647 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3648 I32 l = STR_LEN(scan);
3649 UV uc = *((U8*)STRING(scan));
3651 /* Search for fixed substrings supports EXACT only. */
3652 if (flags & SCF_DO_SUBSTR) {
3654 SCAN_COMMIT(pRExC_state, data, minlenp);
3657 const U8 * const s = (U8 *)STRING(scan);
3658 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3659 l = utf8_length(s, s + l);
3661 else if (has_exactf_sharp_s) {
3662 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3664 min += l - min_subtract;
3668 delta += min_subtract;
3669 if (flags & SCF_DO_SUBSTR) {
3670 data->pos_min += l - min_subtract;
3671 if (data->pos_min < 0) {
3674 data->pos_delta += min_subtract;
3676 data->longest = &(data->longest_float);
3679 if (flags & SCF_DO_STCLASS_AND) {
3680 /* Check whether it is compatible with what we know already! */
3683 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3684 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3685 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3689 ANYOF_CLASS_ZERO(data->start_class);
3690 ANYOF_BITMAP_ZERO(data->start_class);
3692 ANYOF_BITMAP_SET(data->start_class, uc);
3693 data->start_class->flags &= ~ANYOF_EOS;
3694 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3695 if (OP(scan) == EXACTFL) {
3696 /* XXX This set is probably no longer necessary, and
3697 * probably wrong as LOCALE now is on in the initial
3699 data->start_class->flags |= ANYOF_LOCALE;
3703 /* Also set the other member of the fold pair. In case
3704 * that unicode semantics is called for at runtime, use
3705 * the full latin1 fold. (Can't do this for locale,
3706 * because not known until runtime) */
3707 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3709 /* All other (EXACTFL handled above) folds except under
3710 * /iaa that include s, S, and sharp_s also may include
3712 if (OP(scan) != EXACTFA) {
3713 if (uc == 's' || uc == 'S') {
3714 ANYOF_BITMAP_SET(data->start_class,
3715 LATIN_SMALL_LETTER_SHARP_S);
3717 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3718 ANYOF_BITMAP_SET(data->start_class, 's');
3719 ANYOF_BITMAP_SET(data->start_class, 'S');
3724 else if (uc >= 0x100) {
3726 for (i = 0; i < 256; i++){
3727 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3728 ANYOF_BITMAP_SET(data->start_class, i);
3733 else if (flags & SCF_DO_STCLASS_OR) {
3734 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3735 /* false positive possible if the class is case-folded.
3736 Assume that the locale settings are the same... */
3738 ANYOF_BITMAP_SET(data->start_class, uc);
3739 if (OP(scan) != EXACTFL) {
3741 /* And set the other member of the fold pair, but
3742 * can't do that in locale because not known until
3744 ANYOF_BITMAP_SET(data->start_class,
3745 PL_fold_latin1[uc]);
3747 /* All folds except under /iaa that include s, S,
3748 * and sharp_s also may include the others */
3749 if (OP(scan) != EXACTFA) {
3750 if (uc == 's' || uc == 'S') {
3751 ANYOF_BITMAP_SET(data->start_class,
3752 LATIN_SMALL_LETTER_SHARP_S);
3754 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3755 ANYOF_BITMAP_SET(data->start_class, 's');
3756 ANYOF_BITMAP_SET(data->start_class, 'S');
3761 data->start_class->flags &= ~ANYOF_EOS;
3763 cl_and(data->start_class, and_withp);
3765 flags &= ~SCF_DO_STCLASS;
3767 else if (REGNODE_VARIES(OP(scan))) {
3768 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3769 I32 f = flags, pos_before = 0;
3770 regnode * const oscan = scan;
3771 struct regnode_charclass_class this_class;
3772 struct regnode_charclass_class *oclass = NULL;
3773 I32 next_is_eval = 0;
3775 switch (PL_regkind[OP(scan)]) {
3776 case WHILEM: /* End of (?:...)* . */
3777 scan = NEXTOPER(scan);
3780 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3781 next = NEXTOPER(scan);
3782 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3784 maxcount = REG_INFTY;
3785 next = regnext(scan);
3786 scan = NEXTOPER(scan);
3790 if (flags & SCF_DO_SUBSTR)
3795 if (flags & SCF_DO_STCLASS) {
3797 maxcount = REG_INFTY;
3798 next = regnext(scan);
3799 scan = NEXTOPER(scan);
3802 is_inf = is_inf_internal = 1;
3803 scan = regnext(scan);
3804 if (flags & SCF_DO_SUBSTR) {
3805 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3806 data->longest = &(data->longest_float);
3808 goto optimize_curly_tail;
3810 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3811 && (scan->flags == stopparen))
3816 mincount = ARG1(scan);
3817 maxcount = ARG2(scan);
3819 next = regnext(scan);
3820 if (OP(scan) == CURLYX) {
3821 I32 lp = (data ? *(data->last_closep) : 0);
3822 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3824 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3825 next_is_eval = (OP(scan) == EVAL);
3827 if (flags & SCF_DO_SUBSTR) {
3828 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3829 pos_before = data->pos_min;
3833 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3835 data->flags |= SF_IS_INF;
3837 if (flags & SCF_DO_STCLASS) {
3838 cl_init(pRExC_state, &this_class);
3839 oclass = data->start_class;
3840 data->start_class = &this_class;
3841 f |= SCF_DO_STCLASS_AND;
3842 f &= ~SCF_DO_STCLASS_OR;
3844 /* Exclude from super-linear cache processing any {n,m}
3845 regops for which the combination of input pos and regex
3846 pos is not enough information to determine if a match
3849 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3850 regex pos at the \s*, the prospects for a match depend not
3851 only on the input position but also on how many (bar\s*)
3852 repeats into the {4,8} we are. */
3853 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3854 f &= ~SCF_WHILEM_VISITED_POS;
3856 /* This will finish on WHILEM, setting scan, or on NULL: */
3857 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3858 last, data, stopparen, recursed, NULL,
3860 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3862 if (flags & SCF_DO_STCLASS)
3863 data->start_class = oclass;
3864 if (mincount == 0 || minnext == 0) {
3865 if (flags & SCF_DO_STCLASS_OR) {
3866 cl_or(pRExC_state, data->start_class, &this_class);
3868 else if (flags & SCF_DO_STCLASS_AND) {
3869 /* Switch to OR mode: cache the old value of
3870 * data->start_class */
3872 StructCopy(data->start_class, and_withp,
3873 struct regnode_charclass_class);
3874 flags &= ~SCF_DO_STCLASS_AND;
3875 StructCopy(&this_class, data->start_class,
3876 struct regnode_charclass_class);
3877 flags |= SCF_DO_STCLASS_OR;
3878 data->start_class->flags |= ANYOF_EOS;
3880 } else { /* Non-zero len */
3881 if (flags & SCF_DO_STCLASS_OR) {
3882 cl_or(pRExC_state, data->start_class, &this_class);
3883 cl_and(data->start_class, and_withp);
3885 else if (flags & SCF_DO_STCLASS_AND)
3886 cl_and(data->start_class, &this_class);
3887 flags &= ~SCF_DO_STCLASS;
3889 if (!scan) /* It was not CURLYX, but CURLY. */
3891 if ( /* ? quantifier ok, except for (?{ ... }) */
3892 (next_is_eval || !(mincount == 0 && maxcount == 1))
3893 && (minnext == 0) && (deltanext == 0)
3894 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3895 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3897 ckWARNreg(RExC_parse,
3898 "Quantifier unexpected on zero-length expression");
3901 min += minnext * mincount;
3902 is_inf_internal |= ((maxcount == REG_INFTY
3903 && (minnext + deltanext) > 0)
3904 || deltanext == I32_MAX);
3905 is_inf |= is_inf_internal;
3906 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3908 /* Try powerful optimization CURLYX => CURLYN. */
3909 if ( OP(oscan) == CURLYX && data
3910 && data->flags & SF_IN_PAR
3911 && !(data->flags & SF_HAS_EVAL)
3912 && !deltanext && minnext == 1 ) {
3913 /* Try to optimize to CURLYN. */
3914 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3915 regnode * const nxt1 = nxt;
3922 if (!REGNODE_SIMPLE(OP(nxt))
3923 && !(PL_regkind[OP(nxt)] == EXACT
3924 && STR_LEN(nxt) == 1))
3930 if (OP(nxt) != CLOSE)
3932 if (RExC_open_parens) {
3933 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3934 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3936 /* Now we know that nxt2 is the only contents: */
3937 oscan->flags = (U8)ARG(nxt);
3939 OP(nxt1) = NOTHING; /* was OPEN. */
3942 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3943 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3944 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3945 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3946 OP(nxt + 1) = OPTIMIZED; /* was count. */
3947 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3952 /* Try optimization CURLYX => CURLYM. */
3953 if ( OP(oscan) == CURLYX && data
3954 && !(data->flags & SF_HAS_PAR)
3955 && !(data->flags & SF_HAS_EVAL)
3956 && !deltanext /* atom is fixed width */
3957 && minnext != 0 /* CURLYM can't handle zero width */
3959 /* XXXX How to optimize if data == 0? */
3960 /* Optimize to a simpler form. */
3961 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3965 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3966 && (OP(nxt2) != WHILEM))
3968 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3969 /* Need to optimize away parenths. */
3970 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3971 /* Set the parenth number. */
3972 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3974 oscan->flags = (U8)ARG(nxt);
3975 if (RExC_open_parens) {
3976 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3977 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3979 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3980 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3983 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3984 OP(nxt + 1) = OPTIMIZED; /* was count. */
3985 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3986 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3989 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3990 regnode *nnxt = regnext(nxt1);
3992 if (reg_off_by_arg[OP(nxt1)])
3993 ARG_SET(nxt1, nxt2 - nxt1);
3994 else if (nxt2 - nxt1 < U16_MAX)
3995 NEXT_OFF(nxt1) = nxt2 - nxt1;
3997 OP(nxt) = NOTHING; /* Cannot beautify */
4002 /* Optimize again: */
4003 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4004 NULL, stopparen, recursed, NULL, 0,depth+1);
4009 else if ((OP(oscan) == CURLYX)
4010 && (flags & SCF_WHILEM_VISITED_POS)
4011 /* See the comment on a similar expression above.
4012 However, this time it's not a subexpression
4013 we care about, but the expression itself. */
4014 && (maxcount == REG_INFTY)
4015 && data && ++data->whilem_c < 16) {
4016 /* This stays as CURLYX, we can put the count/of pair. */
4017 /* Find WHILEM (as in regexec.c) */
4018 regnode *nxt = oscan + NEXT_OFF(oscan);
4020 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4022 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4023 | (RExC_whilem_seen << 4)); /* On WHILEM */
4025 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4027 if (flags & SCF_DO_SUBSTR) {
4028 SV *last_str = NULL;
4029 int counted = mincount != 0;
4031 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4032 #if defined(SPARC64_GCC_WORKAROUND)
4035 const char *s = NULL;
4038 if (pos_before >= data->last_start_min)
4041 b = data->last_start_min;
4044 s = SvPV_const(data->last_found, l);
4045 old = b - data->last_start_min;
4048 I32 b = pos_before >= data->last_start_min
4049 ? pos_before : data->last_start_min;
4051 const char * const s = SvPV_const(data->last_found, l);
4052 I32 old = b - data->last_start_min;
4056 old = utf8_hop((U8*)s, old) - (U8*)s;
4058 /* Get the added string: */
4059 last_str = newSVpvn_utf8(s + old, l, UTF);
4060 if (deltanext == 0 && pos_before == b) {
4061 /* What was added is a constant string */
4063 SvGROW(last_str, (mincount * l) + 1);
4064 repeatcpy(SvPVX(last_str) + l,
4065 SvPVX_const(last_str), l, mincount - 1);
4066 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4067 /* Add additional parts. */
4068 SvCUR_set(data->last_found,
4069 SvCUR(data->last_found) - l);
4070 sv_catsv(data->last_found, last_str);
4072 SV * sv = data->last_found;
4074 SvUTF8(sv) && SvMAGICAL(sv) ?
4075 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4076 if (mg && mg->mg_len >= 0)
4077 mg->mg_len += CHR_SVLEN(last_str) - l;
4079 data->last_end += l * (mincount - 1);
4082 /* start offset must point into the last copy */
4083 data->last_start_min += minnext * (mincount - 1);
4084 data->last_start_max += is_inf ? I32_MAX
4085 : (maxcount - 1) * (minnext + data->pos_delta);
4088 /* It is counted once already... */
4089 data->pos_min += minnext * (mincount - counted);
4090 data->pos_delta += - counted * deltanext +
4091 (minnext + deltanext) * maxcount - minnext * mincount;
4092 if (mincount != maxcount) {
4093 /* Cannot extend fixed substrings found inside
4095 SCAN_COMMIT(pRExC_state,data,minlenp);
4096 if (mincount && last_str) {
4097 SV * const sv = data->last_found;
4098 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4099 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4103 sv_setsv(sv, last_str);
4104 data->last_end = data->pos_min;
4105 data->last_start_min =
4106 data->pos_min - CHR_SVLEN(last_str);
4107 data->last_start_max = is_inf
4109 : data->pos_min + data->pos_delta
4110 - CHR_SVLEN(last_str);
4112 data->longest = &(data->longest_float);
4114 SvREFCNT_dec(last_str);
4116 if (data && (fl & SF_HAS_EVAL))
4117 data->flags |= SF_HAS_EVAL;
4118 optimize_curly_tail:
4119 if (OP(oscan) != CURLYX) {
4120 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4122 NEXT_OFF(oscan) += NEXT_OFF(next);
4125 default: /* REF, ANYOFV, and CLUMP only? */
4126 if (flags & SCF_DO_SUBSTR) {
4127 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4128 data->longest = &(data->longest_float);
4130 is_inf = is_inf_internal = 1;
4131 if (flags & SCF_DO_STCLASS_OR)
4132 cl_anything(pRExC_state, data->start_class);
4133 flags &= ~SCF_DO_STCLASS;
4137 else if (OP(scan) == LNBREAK) {
4138 if (flags & SCF_DO_STCLASS) {
4140 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4141 if (flags & SCF_DO_STCLASS_AND) {
4142 for (value = 0; value < 256; value++)
4143 if (!is_VERTWS_cp(value))
4144 ANYOF_BITMAP_CLEAR(data->start_class, value);
4147 for (value = 0; value < 256; value++)
4148 if (is_VERTWS_cp(value))
4149 ANYOF_BITMAP_SET(data->start_class, value);
4151 if (flags & SCF_DO_STCLASS_OR)
4152 cl_and(data->start_class, and_withp);
4153 flags &= ~SCF_DO_STCLASS;
4157 if (flags & SCF_DO_SUBSTR) {
4158 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4160 data->pos_delta += 1;
4161 data->longest = &(data->longest_float);
4164 else if (REGNODE_SIMPLE(OP(scan))) {
4167 if (flags & SCF_DO_SUBSTR) {
4168 SCAN_COMMIT(pRExC_state,data,minlenp);
4172 if (flags & SCF_DO_STCLASS) {
4173 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4175 /* Some of the logic below assumes that switching
4176 locale on will only add false positives. */
4177 switch (PL_regkind[OP(scan)]) {
4181 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4182 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4183 cl_anything(pRExC_state, data->start_class);
4186 if (OP(scan) == SANY)
4188 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4189 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4190 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4191 cl_anything(pRExC_state, data->start_class);
4193 if (flags & SCF_DO_STCLASS_AND || !value)
4194 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4197 if (flags & SCF_DO_STCLASS_AND)
4198 cl_and(data->start_class,
4199 (struct regnode_charclass_class*)scan);
4201 cl_or(pRExC_state, data->start_class,
4202 (struct regnode_charclass_class*)scan);
4205 if (flags & SCF_DO_STCLASS_AND) {
4206 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4207 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4208 if (OP(scan) == ALNUMU) {
4209 for (value = 0; value < 256; value++) {
4210 if (!isWORDCHAR_L1(value)) {
4211 ANYOF_BITMAP_CLEAR(data->start_class, value);
4215 for (value = 0; value < 256; value++) {
4216 if (!isALNUM(value)) {
4217 ANYOF_BITMAP_CLEAR(data->start_class, value);
4224 if (data->start_class->flags & ANYOF_LOCALE)
4225 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4227 /* Even if under locale, set the bits for non-locale
4228 * in case it isn't a true locale-node. This will
4229 * create false positives if it truly is locale */
4230 if (OP(scan) == ALNUMU) {
4231 for (value = 0; value < 256; value++) {
4232 if (isWORDCHAR_L1(value)) {
4233 ANYOF_BITMAP_SET(data->start_class, value);
4237 for (value = 0; value < 256; value++) {
4238 if (isALNUM(value)) {
4239 ANYOF_BITMAP_SET(data->start_class, value);
4246 if (flags & SCF_DO_STCLASS_AND) {
4247 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4248 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4249 if (OP(scan) == NALNUMU) {
4250 for (value = 0; value < 256; value++) {
4251 if (isWORDCHAR_L1(value)) {
4252 ANYOF_BITMAP_CLEAR(data->start_class, value);
4256 for (value = 0; value < 256; value++) {
4257 if (isALNUM(value)) {
4258 ANYOF_BITMAP_CLEAR(data->start_class, value);
4265 if (data->start_class->flags & ANYOF_LOCALE)
4266 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4268 /* Even if under locale, set the bits for non-locale in
4269 * case it isn't a true locale-node. This will create
4270 * false positives if it truly is locale */
4271 if (OP(scan) == NALNUMU) {
4272 for (value = 0; value < 256; value++) {
4273 if (! isWORDCHAR_L1(value)) {
4274 ANYOF_BITMAP_SET(data->start_class, value);
4278 for (value = 0; value < 256; value++) {
4279 if (! isALNUM(value)) {
4280 ANYOF_BITMAP_SET(data->start_class, value);
4287 if (flags & SCF_DO_STCLASS_AND) {
4288 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4289 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4290 if (OP(scan) == SPACEU) {
4291 for (value = 0; value < 256; value++) {
4292 if (!isSPACE_L1(value)) {
4293 ANYOF_BITMAP_CLEAR(data->start_class, value);
4297 for (value = 0; value < 256; value++) {
4298 if (!isSPACE(value)) {
4299 ANYOF_BITMAP_CLEAR(data->start_class, value);
4306 if (data->start_class->flags & ANYOF_LOCALE) {
4307 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4309 if (OP(scan) == SPACEU) {
4310 for (value = 0; value < 256; value++) {
4311 if (isSPACE_L1(value)) {
4312 ANYOF_BITMAP_SET(data->start_class, value);
4316 for (value = 0; value < 256; value++) {
4317 if (isSPACE(value)) {
4318 ANYOF_BITMAP_SET(data->start_class, value);
4325 if (flags & SCF_DO_STCLASS_AND) {
4326 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4327 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4328 if (OP(scan) == NSPACEU) {
4329 for (value = 0; value < 256; value++) {
4330 if (isSPACE_L1(value)) {
4331 ANYOF_BITMAP_CLEAR(data->start_class, value);
4335 for (value = 0; value < 256; value++) {
4336 if (isSPACE(value)) {
4337 ANYOF_BITMAP_CLEAR(data->start_class, value);
4344 if (data->start_class->flags & ANYOF_LOCALE)
4345 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4346 if (OP(scan) == NSPACEU) {
4347 for (value = 0; value < 256; value++) {
4348 if (!isSPACE_L1(value)) {
4349 ANYOF_BITMAP_SET(data->start_class, value);
4354 for (value = 0; value < 256; value++) {
4355 if (!isSPACE(value)) {
4356 ANYOF_BITMAP_SET(data->start_class, value);
4363 if (flags & SCF_DO_STCLASS_AND) {
4364 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4365 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4366 for (value = 0; value < 256; value++)
4367 if (!isDIGIT(value))
4368 ANYOF_BITMAP_CLEAR(data->start_class, value);
4372 if (data->start_class->flags & ANYOF_LOCALE)
4373 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4374 for (value = 0; value < 256; value++)
4376 ANYOF_BITMAP_SET(data->start_class, value);
4380 if (flags & SCF_DO_STCLASS_AND) {
4381 if (!(data->start_class->flags & ANYOF_LOCALE))
4382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4383 for (value = 0; value < 256; value++)
4385 ANYOF_BITMAP_CLEAR(data->start_class, value);
4388 if (data->start_class->flags & ANYOF_LOCALE)
4389 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4390 for (value = 0; value < 256; value++)
4391 if (!isDIGIT(value))
4392 ANYOF_BITMAP_SET(data->start_class, value);
4395 CASE_SYNST_FNC(VERTWS);
4396 CASE_SYNST_FNC(HORIZWS);
4399 if (flags & SCF_DO_STCLASS_OR)
4400 cl_and(data->start_class, and_withp);
4401 flags &= ~SCF_DO_STCLASS;
4404 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4405 data->flags |= (OP(scan) == MEOL
4408 SCAN_COMMIT(pRExC_state, data, minlenp);
4411 else if ( PL_regkind[OP(scan)] == BRANCHJ
4412 /* Lookbehind, or need to calculate parens/evals/stclass: */
4413 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4414 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4415 if ( OP(scan) == UNLESSM &&
4417 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4418 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4421 regnode *upto= regnext(scan);
4423 SV * const mysv_val=sv_newmortal();
4424 DEBUG_STUDYDATA("OPFAIL",data,depth);
4426 /*DEBUG_PARSE_MSG("opfail");*/
4427 regprop(RExC_rx, mysv_val, upto);
4428 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4429 SvPV_nolen_const(mysv_val),
4430 (IV)REG_NODE_NUM(upto),
4435 NEXT_OFF(scan) = upto - scan;
4436 for (opt= scan + 1; opt < upto ; opt++)
4437 OP(opt) = OPTIMIZED;
4441 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4442 || OP(scan) == UNLESSM )
4444 /* Negative Lookahead/lookbehind
4445 In this case we can't do fixed string optimisation.
4448 I32 deltanext, minnext, fake = 0;
4450 struct regnode_charclass_class intrnl;
4453 data_fake.flags = 0;
4455 data_fake.whilem_c = data->whilem_c;
4456 data_fake.last_closep = data->last_closep;
4459 data_fake.last_closep = &fake;
4460 data_fake.pos_delta = delta;
4461 if ( flags & SCF_DO_STCLASS && !scan->flags
4462 && OP(scan) == IFMATCH ) { /* Lookahead */
4463 cl_init(pRExC_state, &intrnl);
4464 data_fake.start_class = &intrnl;
4465 f |= SCF_DO_STCLASS_AND;
4467 if (flags & SCF_WHILEM_VISITED_POS)
4468 f |= SCF_WHILEM_VISITED_POS;
4469 next = regnext(scan);
4470 nscan = NEXTOPER(NEXTOPER(scan));
4471 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4472 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4475 FAIL("Variable length lookbehind not implemented");
4477 else if (minnext > (I32)U8_MAX) {
4478 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4480 scan->flags = (U8)minnext;
4483 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4485 if (data_fake.flags & SF_HAS_EVAL)
4486 data->flags |= SF_HAS_EVAL;
4487 data->whilem_c = data_fake.whilem_c;
4489 if (f & SCF_DO_STCLASS_AND) {
4490 if (flags & SCF_DO_STCLASS_OR) {
4491 /* OR before, AND after: ideally we would recurse with
4492 * data_fake to get the AND applied by study of the
4493 * remainder of the pattern, and then derecurse;
4494 * *** HACK *** for now just treat as "no information".
4495 * See [perl #56690].
4497 cl_init(pRExC_state, data->start_class);
4499 /* AND before and after: combine and continue */
4500 const int was = (data->start_class->flags & ANYOF_EOS);
4502 cl_and(data->start_class, &intrnl);
4504 data->start_class->flags |= ANYOF_EOS;
4508 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4510 /* Positive Lookahead/lookbehind
4511 In this case we can do fixed string optimisation,
4512 but we must be careful about it. Note in the case of
4513 lookbehind the positions will be offset by the minimum
4514 length of the pattern, something we won't know about
4515 until after the recurse.
4517 I32 deltanext, fake = 0;
4519 struct regnode_charclass_class intrnl;
4521 /* We use SAVEFREEPV so that when the full compile
4522 is finished perl will clean up the allocated
4523 minlens when it's all done. This way we don't
4524 have to worry about freeing them when we know
4525 they wont be used, which would be a pain.
4528 Newx( minnextp, 1, I32 );
4529 SAVEFREEPV(minnextp);
4532 StructCopy(data, &data_fake, scan_data_t);
4533 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4536 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4537 data_fake.last_found=newSVsv(data->last_found);
4541 data_fake.last_closep = &fake;
4542 data_fake.flags = 0;
4543 data_fake.pos_delta = delta;
4545 data_fake.flags |= SF_IS_INF;
4546 if ( flags & SCF_DO_STCLASS && !scan->flags
4547 && OP(scan) == IFMATCH ) { /* Lookahead */
4548 cl_init(pRExC_state, &intrnl);
4549 data_fake.start_class = &intrnl;
4550 f |= SCF_DO_STCLASS_AND;
4552 if (flags & SCF_WHILEM_VISITED_POS)
4553 f |= SCF_WHILEM_VISITED_POS;
4554 next = regnext(scan);
4555 nscan = NEXTOPER(NEXTOPER(scan));
4557 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4558 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4561 FAIL("Variable length lookbehind not implemented");
4563 else if (*minnextp > (I32)U8_MAX) {
4564 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4566 scan->flags = (U8)*minnextp;
4571 if (f & SCF_DO_STCLASS_AND) {
4572 const int was = (data->start_class->flags & ANYOF_EOS);
4574 cl_and(data->start_class, &intrnl);
4576 data->start_class->flags |= ANYOF_EOS;
4579 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4581 if (data_fake.flags & SF_HAS_EVAL)
4582 data->flags |= SF_HAS_EVAL;
4583 data->whilem_c = data_fake.whilem_c;
4584 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4585 if (RExC_rx->minlen<*minnextp)
4586 RExC_rx->minlen=*minnextp;
4587 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4588 SvREFCNT_dec(data_fake.last_found);
4590 if ( data_fake.minlen_fixed != minlenp )
4592 data->offset_fixed= data_fake.offset_fixed;
4593 data->minlen_fixed= data_fake.minlen_fixed;
4594 data->lookbehind_fixed+= scan->flags;
4596 if ( data_fake.minlen_float != minlenp )
4598 data->minlen_float= data_fake.minlen_float;
4599 data->offset_float_min=data_fake.offset_float_min;
4600 data->offset_float_max=data_fake.offset_float_max;
4601 data->lookbehind_float+= scan->flags;
4608 else if (OP(scan) == OPEN) {
4609 if (stopparen != (I32)ARG(scan))
4612 else if (OP(scan) == CLOSE) {
4613 if (stopparen == (I32)ARG(scan)) {
4616 if ((I32)ARG(scan) == is_par) {
4617 next = regnext(scan);
4619 if ( next && (OP(next) != WHILEM) && next < last)
4620 is_par = 0; /* Disable optimization */
4623 *(data->last_closep) = ARG(scan);
4625 else if (OP(scan) == EVAL) {
4627 data->flags |= SF_HAS_EVAL;
4629 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4630 if (flags & SCF_DO_SUBSTR) {
4631 SCAN_COMMIT(pRExC_state,data,minlenp);
4632 flags &= ~SCF_DO_SUBSTR;
4634 if (data && OP(scan)==ACCEPT) {
4635 data->flags |= SCF_SEEN_ACCEPT;
4640 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4642 if (flags & SCF_DO_SUBSTR) {
4643 SCAN_COMMIT(pRExC_state,data,minlenp);
4644 data->longest = &(data->longest_float);
4646 is_inf = is_inf_internal = 1;
4647 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4648 cl_anything(pRExC_state, data->start_class);
4649 flags &= ~SCF_DO_STCLASS;
4651 else if (OP(scan) == GPOS) {
4652 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4653 !(delta || is_inf || (data && data->pos_delta)))
4655 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4656 RExC_rx->extflags |= RXf_ANCH_GPOS;
4657 if (RExC_rx->gofs < (U32)min)
4658 RExC_rx->gofs = min;
4660 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4664 #ifdef TRIE_STUDY_OPT
4665 #ifdef FULL_TRIE_STUDY
4666 else if (PL_regkind[OP(scan)] == TRIE) {
4667 /* NOTE - There is similar code to this block above for handling
4668 BRANCH nodes on the initial study. If you change stuff here
4670 regnode *trie_node= scan;
4671 regnode *tail= regnext(scan);
4672 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4673 I32 max1 = 0, min1 = I32_MAX;
4674 struct regnode_charclass_class accum;
4676 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4677 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4678 if (flags & SCF_DO_STCLASS)
4679 cl_init_zero(pRExC_state, &accum);
4685 const regnode *nextbranch= NULL;
4688 for ( word=1 ; word <= trie->wordcount ; word++)
4690 I32 deltanext=0, minnext=0, f = 0, fake;
4691 struct regnode_charclass_class this_class;
4693 data_fake.flags = 0;
4695 data_fake.whilem_c = data->whilem_c;
4696 data_fake.last_closep = data->last_closep;
4699 data_fake.last_closep = &fake;
4700 data_fake.pos_delta = delta;
4701 if (flags & SCF_DO_STCLASS) {
4702 cl_init(pRExC_state, &this_class);
4703 data_fake.start_class = &this_class;
4704 f = SCF_DO_STCLASS_AND;
4706 if (flags & SCF_WHILEM_VISITED_POS)
4707 f |= SCF_WHILEM_VISITED_POS;
4709 if (trie->jump[word]) {
4711 nextbranch = trie_node + trie->jump[0];
4712 scan= trie_node + trie->jump[word];
4713 /* We go from the jump point to the branch that follows
4714 it. Note this means we need the vestigal unused branches
4715 even though they arent otherwise used.
4717 minnext = study_chunk(pRExC_state, &scan, minlenp,
4718 &deltanext, (regnode *)nextbranch, &data_fake,
4719 stopparen, recursed, NULL, f,depth+1);
4721 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4722 nextbranch= regnext((regnode*)nextbranch);
4724 if (min1 > (I32)(minnext + trie->minlen))
4725 min1 = minnext + trie->minlen;
4726 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4727 max1 = minnext + deltanext + trie->maxlen;
4728 if (deltanext == I32_MAX)
4729 is_inf = is_inf_internal = 1;
4731 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4733 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4734 if ( stopmin > min + min1)
4735 stopmin = min + min1;
4736 flags &= ~SCF_DO_SUBSTR;
4738 data->flags |= SCF_SEEN_ACCEPT;
4741 if (data_fake.flags & SF_HAS_EVAL)
4742 data->flags |= SF_HAS_EVAL;
4743 data->whilem_c = data_fake.whilem_c;
4745 if (flags & SCF_DO_STCLASS)
4746 cl_or(pRExC_state, &accum, &this_class);
4749 if (flags & SCF_DO_SUBSTR) {
4750 data->pos_min += min1;
4751 data->pos_delta += max1 - min1;
4752 if (max1 != min1 || is_inf)
4753 data->longest = &(data->longest_float);
4756 delta += max1 - min1;
4757 if (flags & SCF_DO_STCLASS_OR) {
4758 cl_or(pRExC_state, data->start_class, &accum);
4760 cl_and(data->start_class, and_withp);
4761 flags &= ~SCF_DO_STCLASS;
4764 else if (flags & SCF_DO_STCLASS_AND) {
4766 cl_and(data->start_class, &accum);
4767 flags &= ~SCF_DO_STCLASS;
4770 /* Switch to OR mode: cache the old value of
4771 * data->start_class */
4773 StructCopy(data->start_class, and_withp,
4774 struct regnode_charclass_class);
4775 flags &= ~SCF_DO_STCLASS_AND;
4776 StructCopy(&accum, data->start_class,
4777 struct regnode_charclass_class);
4778 flags |= SCF_DO_STCLASS_OR;
4779 data->start_class->flags |= ANYOF_EOS;
4786 else if (PL_regkind[OP(scan)] == TRIE) {
4787 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4790 min += trie->minlen;
4791 delta += (trie->maxlen - trie->minlen);
4792 flags &= ~SCF_DO_STCLASS; /* xxx */
4793 if (flags & SCF_DO_SUBSTR) {
4794 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4795 data->pos_min += trie->minlen;
4796 data->pos_delta += (trie->maxlen - trie->minlen);
4797 if (trie->maxlen != trie->minlen)
4798 data->longest = &(data->longest_float);
4800 if (trie->jump) /* no more substrings -- for now /grr*/
4801 flags &= ~SCF_DO_SUBSTR;
4803 #endif /* old or new */
4804 #endif /* TRIE_STUDY_OPT */
4806 /* Else: zero-length, ignore. */
4807 scan = regnext(scan);
4812 stopparen = frame->stop;
4813 frame = frame->prev;
4814 goto fake_study_recurse;
4819 DEBUG_STUDYDATA("pre-fin:",data,depth);
4822 *deltap = is_inf_internal ? I32_MAX : delta;
4823 if (flags & SCF_DO_SUBSTR && is_inf)
4824 data->pos_delta = I32_MAX - data->pos_min;
4825 if (is_par > (I32)U8_MAX)
4827 if (is_par && pars==1 && data) {
4828 data->flags |= SF_IN_PAR;
4829 data->flags &= ~SF_HAS_PAR;
4831 else if (pars && data) {
4832 data->flags |= SF_HAS_PAR;
4833 data->flags &= ~SF_IN_PAR;
4835 if (flags & SCF_DO_STCLASS_OR)
4836 cl_and(data->start_class, and_withp);
4837 if (flags & SCF_TRIE_RESTUDY)
4838 data->flags |= SCF_TRIE_RESTUDY;
4840 DEBUG_STUDYDATA("post-fin:",data,depth);
4842 return min < stopmin ? min : stopmin;
4846 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4848 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4850 PERL_ARGS_ASSERT_ADD_DATA;
4852 Renewc(RExC_rxi->data,
4853 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4854 char, struct reg_data);
4856 Renew(RExC_rxi->data->what, count + n, U8);
4858 Newx(RExC_rxi->data->what, n, U8);
4859 RExC_rxi->data->count = count + n;
4860 Copy(s, RExC_rxi->data->what + count, n, U8);
4864 /*XXX: todo make this not included in a non debugging perl */
4865 #ifndef PERL_IN_XSUB_RE
4867 Perl_reginitcolors(pTHX)
4870 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4872 char *t = savepv(s);
4876 t = strchr(t, '\t');
4882 PL_colors[i] = t = (char *)"";
4887 PL_colors[i++] = (char *)"";
4894 #ifdef TRIE_STUDY_OPT
4895 #define CHECK_RESTUDY_GOTO \
4897 (data.flags & SCF_TRIE_RESTUDY) \
4901 #define CHECK_RESTUDY_GOTO
4905 * pregcomp - compile a regular expression into internal code
4907 * Decides which engine's compiler to call based on the hint currently in
4911 #ifndef PERL_IN_XSUB_RE
4913 /* return the currently in-scope regex engine (or the default if none) */
4915 regexp_engine const *
4916 Perl_current_re_engine(pTHX)
4920 if (IN_PERL_COMPILETIME) {
4921 HV * const table = GvHV(PL_hintgv);
4925 return &PL_core_reg_engine;
4926 ptr = hv_fetchs(table, "regcomp", FALSE);
4927 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4928 return &PL_core_reg_engine;
4929 return INT2PTR(regexp_engine*,SvIV(*ptr));
4933 if (!PL_curcop->cop_hints_hash)
4934 return &PL_core_reg_engine;
4935 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4936 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4937 return &PL_core_reg_engine;
4938 return INT2PTR(regexp_engine*,SvIV(ptr));
4944 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4947 regexp_engine const *eng = current_re_engine();
4948 GET_RE_DEBUG_FLAGS_DECL;
4950 PERL_ARGS_ASSERT_PREGCOMP;
4952 /* Dispatch a request to compile a regexp to correct regexp engine. */
4954 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4957 return CALLREGCOMP_ENG(eng, pattern, flags);
4961 /* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4962 * pattern rather than a list of OPs */
4965 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4967 SV *pat = pattern; /* defeat constness! */
4968 PERL_ARGS_ASSERT_RE_COMPILE;
4969 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
4970 NULL, NULL, rx_flags, 0);
4973 /* see if there are any run-time code blocks in the pattern.
4974 * False positives are allowed */
4977 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4978 U32 pm_flags, char *pat, STRLEN plen)
4983 /* avoid infinitely recursing when we recompile the pattern parcelled up
4984 * as qr'...'. A single constant qr// string can't have have any
4985 * run-time component in it, and thus, no runtime code. (A non-qr
4986 * string, however, can, e.g. $x =~ '(?{})') */
4987 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4990 for (s = 0; s < plen; s++) {
4991 if (n < pRExC_state->num_code_blocks
4992 && s == pRExC_state->code_blocks[n].start)
4994 s = pRExC_state->code_blocks[n].end;
4998 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5000 if (pat[s] == '(' && pat[s+1] == '?' &&
5001 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
5008 /* Handle run-time code blocks. We will already have compiled any direct
5009 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5010 * copy of it, but with any literal code blocks blanked out and
5011 * appropriate chars escaped; then feed it into
5013 * eval "qr'modified_pattern'"
5017 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5021 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5023 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5024 * and merge them with any code blocks of the original regexp.
5026 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5027 * instead, just save the qr and return FALSE; this tells our caller that
5028 * the original pattern needs upgrading to utf8.
5032 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5033 char *pat, STRLEN plen)
5037 GET_RE_DEBUG_FLAGS_DECL;
5039 if (pRExC_state->runtime_code_qr) {
5040 /* this is the second time we've been called; this should
5041 * only happen if the main pattern got upgraded to utf8
5042 * during compilation; re-use the qr we compiled first time
5043 * round (which should be utf8 too)
5045 qr = pRExC_state->runtime_code_qr;
5046 pRExC_state->runtime_code_qr = NULL;
5047 assert(RExC_utf8 && SvUTF8(qr));
5053 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5057 /* determine how many extra chars we need for ' and \ escaping */
5058 for (s = 0; s < plen; s++) {
5059 if (pat[s] == '\'' || pat[s] == '\\')
5063 Newx(newpat, newlen, char);
5065 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5067 for (s = 0; s < plen; s++) {
5068 if (n < pRExC_state->num_code_blocks
5069 && s == pRExC_state->code_blocks[n].start)
5071 /* blank out literal code block */
5072 assert(pat[s] == '(');
5073 while (s <= pRExC_state->code_blocks[n].end) {
5081 if (pat[s] == '\'' || pat[s] == '\\')
5086 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5090 PerlIO_printf(Perl_debug_log,
5091 "%sre-parsing pattern for runtime code:%s %s\n",
5092 PL_colors[4],PL_colors[5],newpat);
5095 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5101 PUSHSTACKi(PERLSI_REQUIRE);
5102 /* this causes the toker to collapse \\ into \ when parsing
5103 * qr''; normally only q'' does this. It also alters hints
5105 PL_reg_state.re_reparsing = TRUE;
5106 eval_sv(sv, G_SCALAR);
5112 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5113 assert(SvROK(qr_ref));
5115 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5116 /* the leaving below frees the tmp qr_ref.
5117 * Give qr a life of its own */
5125 if (!RExC_utf8 && SvUTF8(qr)) {
5126 /* first time through; the pattern got upgraded; save the
5127 * qr for the next time through */
5128 assert(!pRExC_state->runtime_code_qr);
5129 pRExC_state->runtime_code_qr = qr;
5134 /* extract any code blocks within the returned qr// */
5137 /* merge the main (r1) and run-time (r2) code blocks into one */
5139 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5140 struct reg_code_block *new_block, *dst;
5141 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5144 if (!r2->num_code_blocks) /* we guessed wrong */
5148 r1->num_code_blocks + r2->num_code_blocks,
5149 struct reg_code_block);
5152 while ( i1 < r1->num_code_blocks
5153 || i2 < r2->num_code_blocks)
5155 struct reg_code_block *src;
5158 if (i1 == r1->num_code_blocks) {
5159 src = &r2->code_blocks[i2++];
5162 else if (i2 == r2->num_code_blocks)
5163 src = &r1->code_blocks[i1++];
5164 else if ( r1->code_blocks[i1].start
5165 < r2->code_blocks[i2].start)
5167 src = &r1->code_blocks[i1++];
5168 assert(src->end < r2->code_blocks[i2].start);
5171 assert( r1->code_blocks[i1].start
5172 > r2->code_blocks[i2].start);
5173 src = &r2->code_blocks[i2++];
5175 assert(src->end < r1->code_blocks[i1].start);
5178 assert(pat[src->start] == '(');
5179 assert(pat[src->end] == ')');
5180 dst->start = src->start;
5181 dst->end = src->end;
5182 dst->block = src->block;
5183 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5187 r1->num_code_blocks += r2->num_code_blocks;
5188 Safefree(r1->code_blocks);
5189 r1->code_blocks = new_block;
5198 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5199 * regular expression into internal code.
5200 * The pattern may be passed either as:
5201 * a list of SVs (patternp plus pat_count)
5202 * a list of OPs (expr)
5203 * If both are passed, the SV list is used, but the OP list indicates
5204 * which SVs are actually pre-compiled code blocks
5206 * The SVs in the list have magic and qr overloading applied to them (and
5207 * the list may be modified in-place with replacement SVs in the latter
5210 * If the pattern hasn't changed from old_re, then old_re will be
5213 * eng is the current engine. If that engine has an op_comp method, then
5214 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5215 * do the initial concatenation of arguments and pass on to the external
5218 * If is_bare_re is not null, set it to a boolean indicating whether the
5219 * arg list reduced (after overloading) to a single bare regex which has
5220 * been returned (i.e. /$qr/).
5222 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5224 * pm_flags contains the PMf_* flags, typically based on those from the
5225 * pm_flags field of the related PMOP. Currently we're only interested in
5226 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5228 * We can't allocate space until we know how big the compiled form will be,
5229 * but we can't compile it (and thus know how big it is) until we've got a
5230 * place to put the code. So we cheat: we compile it twice, once with code
5231 * generation turned off and size counting turned on, and once "for real".
5232 * This also means that we don't allocate space until we are sure that the
5233 * thing really will compile successfully, and we never have to move the
5234 * code and thus invalidate pointers into it. (Note that it has to be in
5235 * one piece because free() must be able to free it all.) [NB: not true in perl]
5237 * Beware that the optimization-preparation code in here knows about some
5238 * of the structure of the compiled regexp. [I'll say.]
5242 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5243 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5244 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5249 register regexp_internal *ri;
5259 /* these are all flags - maybe they should be turned
5260 * into a single int with different bit masks */
5261 I32 sawlookahead = 0;
5264 bool used_setjump = FALSE;
5265 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5266 bool code_is_utf8 = 0;
5267 bool VOL recompile = 0;
5268 bool runtime_code = 0;
5272 RExC_state_t RExC_state;
5273 RExC_state_t * const pRExC_state = &RExC_state;
5274 #ifdef TRIE_STUDY_OPT
5276 RExC_state_t copyRExC_state;
5278 GET_RE_DEBUG_FLAGS_DECL;
5280 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5282 DEBUG_r(if (!PL_colorset) reginitcolors());
5284 #ifndef PERL_IN_XSUB_RE
5285 /* Initialize these here instead of as-needed, as is quick and avoids
5286 * having to test them each time otherwise */
5287 if (! PL_AboveLatin1) {
5288 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5289 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5290 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5292 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5293 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5295 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5296 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5298 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5299 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5301 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5303 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5304 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5306 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5308 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5309 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5311 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5312 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5314 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5315 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5317 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5318 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5320 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5321 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5323 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5324 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5326 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5327 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5329 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5330 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5332 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5334 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5335 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5337 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5338 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5342 pRExC_state->code_blocks = NULL;
5343 pRExC_state->num_code_blocks = 0;
5346 *is_bare_re = FALSE;
5348 if (expr && (expr->op_type == OP_LIST ||
5349 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5351 /* is the source UTF8, and how many code blocks are there? */
5355 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5356 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5358 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5359 /* count of DO blocks */
5363 pRExC_state->num_code_blocks = ncode;
5364 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5369 /* handle a list of SVs */
5373 /* apply magic and RE overloading to each arg */
5374 for (svp = patternp; svp < patternp + pat_count; svp++) {
5377 if (SvROK(rx) && SvAMAGIC(rx)) {
5378 SV *sv = AMG_CALLunary(rx, regexp_amg);
5382 if (SvTYPE(sv) != SVt_REGEXP)
5383 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5389 if (pat_count > 1) {
5390 /* concat multiple args and find any code block indexes */
5395 STRLEN orig_patlen = 0;
5397 if (pRExC_state->num_code_blocks) {
5398 o = cLISTOPx(expr)->op_first;
5399 assert(o->op_type == OP_PUSHMARK);
5403 pat = newSVpvn("", 0);
5406 /* determine if the pattern is going to be utf8 (needed
5407 * in advance to align code block indices correctly).
5408 * XXX This could fail to be detected for an arg with
5409 * overloading but not concat overloading; but the main effect
5410 * in this obscure case is to need a 'use re eval' for a
5411 * literal code block */
5412 for (svp = patternp; svp < patternp + pat_count; svp++) {
5419 for (svp = patternp; svp < patternp + pat_count; svp++) {
5420 SV *sv, *msv = *svp;
5424 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5425 assert(n < pRExC_state->num_code_blocks);
5426 pRExC_state->code_blocks[n].start = SvCUR(pat);
5427 pRExC_state->code_blocks[n].block = o;
5428 pRExC_state->code_blocks[n].src_regex = NULL;
5431 o = o->op_sibling; /* skip CONST */
5437 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5438 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5441 /* overloading involved: all bets are off over literal
5442 * code. Pretend we haven't seen it */
5443 pRExC_state->num_code_blocks -= n;
5449 while (SvAMAGIC(msv)
5450 && (sv = AMG_CALLunary(msv, string_amg))
5456 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5458 orig_patlen = SvCUR(pat);
5459 sv_catsv_nomg(pat, msv);
5462 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5465 /* extract any code blocks within any embedded qr//'s */
5466 if (rx && SvTYPE(rx) == SVt_REGEXP
5467 && RX_ENGINE((REGEXP*)rx)->op_comp)
5470 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5471 if (ri->num_code_blocks) {
5473 /* the presence of an embedded qr// with code means
5474 * we should always recompile: the text of the
5475 * qr// may not have changed, but it may be a
5476 * different closure than last time */
5478 Renew(pRExC_state->code_blocks,
5479 pRExC_state->num_code_blocks + ri->num_code_blocks,
5480 struct reg_code_block);
5481 pRExC_state->num_code_blocks += ri->num_code_blocks;
5482 for (i=0; i < ri->num_code_blocks; i++) {
5483 struct reg_code_block *src, *dst;
5484 STRLEN offset = orig_patlen
5485 + ((struct regexp *)SvANY(rx))->pre_prefix;
5486 assert(n < pRExC_state->num_code_blocks);
5487 src = &ri->code_blocks[i];
5488 dst = &pRExC_state->code_blocks[n];
5489 dst->start = src->start + offset;
5490 dst->end = src->end + offset;
5491 dst->block = src->block;
5492 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5506 while (SvAMAGIC(pat)
5507 && (sv = AMG_CALLunary(pat, string_amg))
5515 /* handle bare regex: foo =~ $re */
5520 if (SvTYPE(re) == SVt_REGEXP) {
5524 Safefree(pRExC_state->code_blocks);
5530 /* not a list of SVs, so must be a list of OPs */
5532 if (expr->op_type == OP_LIST) {
5537 pat = newSVpvn("", 0);
5542 /* given a list of CONSTs and DO blocks in expr, append all
5543 * the CONSTs to pat, and record the start and end of each
5544 * code block in code_blocks[] (each DO{} op is followed by an
5545 * OP_CONST containing the corresponding literal '(?{...})
5548 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5549 if (o->op_type == OP_CONST) {
5550 sv_catsv(pat, cSVOPo_sv);
5552 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5556 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5557 assert(i+1 < pRExC_state->num_code_blocks);
5558 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5559 pRExC_state->code_blocks[i].block = o;
5560 pRExC_state->code_blocks[i].src_regex = NULL;
5566 assert(expr->op_type == OP_CONST);
5567 pat = cSVOPx_sv(expr);
5571 exp = SvPV_nomg(pat, plen);
5573 if (!eng->op_comp) {
5574 if ((SvUTF8(pat) && IN_BYTES)
5575 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5577 /* make a temporary copy; either to convert to bytes,
5578 * or to avoid repeating get-magic / overloaded stringify */
5579 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5580 (IN_BYTES ? 0 : SvUTF8(pat)));
5582 Safefree(pRExC_state->code_blocks);
5583 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5586 /* ignore the utf8ness if the pattern is 0 length */
5587 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5588 RExC_uni_semantics = 0;
5589 RExC_contains_locale = 0;
5590 pRExC_state->runtime_code_qr = NULL;
5592 /****************** LONG JUMP TARGET HERE***********************/
5593 /* Longjmp back to here if have to switch in midstream to utf8 */
5594 if (! RExC_orig_utf8) {
5595 JMPENV_PUSH(jump_ret);
5596 used_setjump = TRUE;
5599 if (jump_ret == 0) { /* First time through */
5603 SV *dsv= sv_newmortal();
5604 RE_PV_QUOTED_DECL(s, RExC_utf8,
5605 dsv, exp, plen, 60);
5606 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5607 PL_colors[4],PL_colors[5],s);
5610 else { /* longjumped back */
5613 STRLEN s = 0, d = 0;
5616 /* If the cause for the longjmp was other than changing to utf8, pop
5617 * our own setjmp, and longjmp to the correct handler */
5618 if (jump_ret != UTF8_LONGJMP) {
5620 JMPENV_JUMP(jump_ret);
5625 /* It's possible to write a regexp in ascii that represents Unicode
5626 codepoints outside of the byte range, such as via \x{100}. If we
5627 detect such a sequence we have to convert the entire pattern to utf8
5628 and then recompile, as our sizing calculation will have been based
5629 on 1 byte == 1 character, but we will need to use utf8 to encode
5630 at least some part of the pattern, and therefore must convert the whole
5633 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5634 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5636 /* upgrade pattern to UTF8, and if there are code blocks,
5637 * recalculate the indices.
5638 * This is essentially an unrolled Perl_bytes_to_utf8() */
5640 src = (U8*)SvPV_nomg(pat, plen);
5641 Newx(dst, plen * 2 + 1, U8);
5644 const UV uv = NATIVE_TO_ASCII(src[s]);
5645 if (UNI_IS_INVARIANT(uv))
5646 dst[d] = (U8)UTF_TO_NATIVE(uv);
5648 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5649 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5651 if (n < pRExC_state->num_code_blocks) {
5652 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5653 pRExC_state->code_blocks[n].start = d;
5654 assert(dst[d] == '(');
5657 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5658 pRExC_state->code_blocks[n].end = d;
5659 assert(dst[d] == ')');
5672 RExC_orig_utf8 = RExC_utf8 = 1;
5675 /* return old regex if pattern hasn't changed */
5679 && !!RX_UTF8(old_re) == !!RExC_utf8
5680 && RX_PRECOMP(old_re)
5681 && RX_PRELEN(old_re) == plen
5682 && memEQ(RX_PRECOMP(old_re), exp, plen))
5684 /* with runtime code, always recompile */
5685 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5687 if (!runtime_code) {
5688 ReREFCNT_inc(old_re);
5692 Safefree(pRExC_state->code_blocks);
5696 else if ((pm_flags & PMf_USE_RE_EVAL)
5697 /* this second condition covers the non-regex literal case,
5698 * i.e. $foo =~ '(?{})'. */
5699 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5700 && (PL_hints & HINT_RE_EVAL))
5702 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5705 #ifdef TRIE_STUDY_OPT
5709 rx_flags = orig_rx_flags;
5711 if (initial_charset == REGEX_LOCALE_CHARSET) {
5712 RExC_contains_locale = 1;
5714 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5716 /* Set to use unicode semantics if the pattern is in utf8 and has the
5717 * 'depends' charset specified, as it means unicode when utf8 */
5718 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5722 RExC_flags = rx_flags;
5723 RExC_pm_flags = pm_flags;
5726 if (PL_tainting && PL_tainted)
5727 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5729 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5730 /* whoops, we have a non-utf8 pattern, whilst run-time code
5731 * got compiled as utf8. Try again with a utf8 pattern */
5732 JMPENV_JUMP(UTF8_LONGJMP);
5735 assert(!pRExC_state->runtime_code_qr);
5740 RExC_in_lookbehind = 0;
5741 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5743 RExC_override_recoding = 0;
5745 /* First pass: determine size, legality. */
5753 RExC_emit = &PL_regdummy;
5754 RExC_whilem_seen = 0;
5755 RExC_open_parens = NULL;
5756 RExC_close_parens = NULL;
5758 RExC_paren_names = NULL;
5760 RExC_paren_name_list = NULL;
5762 RExC_recurse = NULL;
5763 RExC_recurse_count = 0;
5764 pRExC_state->code_index = 0;
5766 #if 0 /* REGC() is (currently) a NOP at the first pass.
5767 * Clever compilers notice this and complain. --jhi */
5768 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5771 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5773 RExC_lastparse=NULL;
5775 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5776 RExC_precomp = NULL;
5777 Safefree(pRExC_state->code_blocks);
5781 /* Here, finished first pass. Get rid of any added setjmp */
5787 PerlIO_printf(Perl_debug_log,
5788 "Required size %"IVdf" nodes\n"
5789 "Starting second pass (creation)\n",
5792 RExC_lastparse=NULL;
5795 /* The first pass could have found things that force Unicode semantics */
5796 if ((RExC_utf8 || RExC_uni_semantics)
5797 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5799 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5802 /* Small enough for pointer-storage convention?
5803 If extralen==0, this means that we will not need long jumps. */
5804 if (RExC_size >= 0x10000L && RExC_extralen)
5805 RExC_size += RExC_extralen;
5808 if (RExC_whilem_seen > 15)
5809 RExC_whilem_seen = 15;
5811 /* Allocate space and zero-initialize. Note, the two step process
5812 of zeroing when in debug mode, thus anything assigned has to
5813 happen after that */
5814 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5815 r = (struct regexp*)SvANY(rx);
5816 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5817 char, regexp_internal);
5818 if ( r == NULL || ri == NULL )
5819 FAIL("Regexp out of space");
5821 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5822 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5824 /* bulk initialize base fields with 0. */
5825 Zero(ri, sizeof(regexp_internal), char);
5828 /* non-zero initialization begins here */
5831 r->extflags = rx_flags;
5832 if (pm_flags & PMf_IS_QR) {
5833 ri->code_blocks = pRExC_state->code_blocks;
5834 ri->num_code_blocks = pRExC_state->num_code_blocks;
5837 SAVEFREEPV(pRExC_state->code_blocks);
5840 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5841 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5843 /* The caret is output if there are any defaults: if not all the STD
5844 * flags are set, or if no character set specifier is needed */
5846 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5848 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5849 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5850 >> RXf_PMf_STD_PMMOD_SHIFT);
5851 const char *fptr = STD_PAT_MODS; /*"msix"*/
5853 /* Allocate for the worst case, which is all the std flags are turned
5854 * on. If more precision is desired, we could do a population count of
5855 * the flags set. This could be done with a small lookup table, or by
5856 * shifting, masking and adding, or even, when available, assembly
5857 * language for a machine-language population count.
5858 * We never output a minus, as all those are defaults, so are
5859 * covered by the caret */
5860 const STRLEN wraplen = plen + has_p + has_runon
5861 + has_default /* If needs a caret */
5863 /* If needs a character set specifier */
5864 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5865 + (sizeof(STD_PAT_MODS) - 1)
5866 + (sizeof("(?:)") - 1);
5868 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5871 SvFLAGS(rx) |= SVf_UTF8;
5874 /* If a default, cover it using the caret */
5876 *p++= DEFAULT_PAT_MOD;
5880 const char* const name = get_regex_charset_name(r->extflags, &len);
5881 Copy(name, p, len, char);
5885 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5888 while((ch = *fptr++)) {
5896 Copy(RExC_precomp, p, plen, char);
5897 assert ((RX_WRAPPED(rx) - p) < 16);
5898 r->pre_prefix = p - RX_WRAPPED(rx);
5904 SvCUR_set(rx, p - SvPVX_const(rx));
5908 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5910 if (RExC_seen & REG_SEEN_RECURSE) {
5911 Newxz(RExC_open_parens, RExC_npar,regnode *);
5912 SAVEFREEPV(RExC_open_parens);
5913 Newxz(RExC_close_parens,RExC_npar,regnode *);
5914 SAVEFREEPV(RExC_close_parens);
5917 /* Useful during FAIL. */
5918 #ifdef RE_TRACK_PATTERN_OFFSETS
5919 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5920 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5921 "%s %"UVuf" bytes for offset annotations.\n",
5922 ri->u.offsets ? "Got" : "Couldn't get",
5923 (UV)((2*RExC_size+1) * sizeof(U32))));
5925 SetProgLen(ri,RExC_size);
5929 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5931 /* Second pass: emit code. */
5932 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5933 RExC_pm_flags = pm_flags;
5938 RExC_emit_start = ri->program;
5939 RExC_emit = ri->program;
5940 RExC_emit_bound = ri->program + RExC_size + 1;
5941 pRExC_state->code_index = 0;
5943 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5944 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5948 /* XXXX To minimize changes to RE engine we always allocate
5949 3-units-long substrs field. */
5950 Newx(r->substrs, 1, struct reg_substr_data);
5951 if (RExC_recurse_count) {
5952 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5953 SAVEFREEPV(RExC_recurse);
5957 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5958 Zero(r->substrs, 1, struct reg_substr_data);
5960 #ifdef TRIE_STUDY_OPT
5962 StructCopy(&zero_scan_data, &data, scan_data_t);
5963 copyRExC_state = RExC_state;
5966 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5968 RExC_state = copyRExC_state;
5969 if (seen & REG_TOP_LEVEL_BRANCHES)
5970 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5972 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5973 if (data.last_found) {
5974 SvREFCNT_dec(data.longest_fixed);
5975 SvREFCNT_dec(data.longest_float);
5976 SvREFCNT_dec(data.last_found);
5978 StructCopy(&zero_scan_data, &data, scan_data_t);
5981 StructCopy(&zero_scan_data, &data, scan_data_t);
5984 /* Dig out information for optimizations. */
5985 r->extflags = RExC_flags; /* was pm_op */
5986 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5989 SvUTF8_on(rx); /* Unicode in it? */
5990 ri->regstclass = NULL;
5991 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5992 r->intflags |= PREGf_NAUGHTY;
5993 scan = ri->program + 1; /* First BRANCH. */
5995 /* testing for BRANCH here tells us whether there is "must appear"
5996 data in the pattern. If there is then we can use it for optimisations */
5997 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5999 STRLEN longest_float_length, longest_fixed_length;
6000 struct regnode_charclass_class ch_class; /* pointed to by data */
6002 I32 last_close = 0; /* pointed to by data */
6003 regnode *first= scan;
6004 regnode *first_next= regnext(first);
6006 * Skip introductions and multiplicators >= 1
6007 * so that we can extract the 'meat' of the pattern that must
6008 * match in the large if() sequence following.
6009 * NOTE that EXACT is NOT covered here, as it is normally
6010 * picked up by the optimiser separately.
6012 * This is unfortunate as the optimiser isnt handling lookahead
6013 * properly currently.
6016 while ((OP(first) == OPEN && (sawopen = 1)) ||
6017 /* An OR of *one* alternative - should not happen now. */
6018 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6019 /* for now we can't handle lookbehind IFMATCH*/
6020 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6021 (OP(first) == PLUS) ||
6022 (OP(first) == MINMOD) ||
6023 /* An {n,m} with n>0 */
6024 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6025 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6028 * the only op that could be a regnode is PLUS, all the rest
6029 * will be regnode_1 or regnode_2.
6032 if (OP(first) == PLUS)
6035 first += regarglen[OP(first)];
6037 first = NEXTOPER(first);
6038 first_next= regnext(first);
6041 /* Starting-point info. */
6043 DEBUG_PEEP("first:",first,0);
6044 /* Ignore EXACT as we deal with it later. */
6045 if (PL_regkind[OP(first)] == EXACT) {
6046 if (OP(first) == EXACT)
6047 NOOP; /* Empty, get anchored substr later. */
6049 ri->regstclass = first;
6052 else if (PL_regkind[OP(first)] == TRIE &&
6053 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6056 /* this can happen only on restudy */
6057 if ( OP(first) == TRIE ) {
6058 struct regnode_1 *trieop = (struct regnode_1 *)
6059 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6060 StructCopy(first,trieop,struct regnode_1);
6061 trie_op=(regnode *)trieop;
6063 struct regnode_charclass *trieop = (struct regnode_charclass *)
6064 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6065 StructCopy(first,trieop,struct regnode_charclass);
6066 trie_op=(regnode *)trieop;
6069 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6070 ri->regstclass = trie_op;
6073 else if (REGNODE_SIMPLE(OP(first)))
6074 ri->regstclass = first;
6075 else if (PL_regkind[OP(first)] == BOUND ||
6076 PL_regkind[OP(first)] == NBOUND)
6077 ri->regstclass = first;
6078 else if (PL_regkind[OP(first)] == BOL) {
6079 r->extflags |= (OP(first) == MBOL
6081 : (OP(first) == SBOL
6084 first = NEXTOPER(first);
6087 else if (OP(first) == GPOS) {
6088 r->extflags |= RXf_ANCH_GPOS;
6089 first = NEXTOPER(first);
6092 else if ((!sawopen || !RExC_sawback) &&
6093 (OP(first) == STAR &&
6094 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6095 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6097 /* turn .* into ^.* with an implied $*=1 */
6099 (OP(NEXTOPER(first)) == REG_ANY)
6102 r->extflags |= type;
6103 r->intflags |= PREGf_IMPLICIT;
6104 first = NEXTOPER(first);
6107 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6108 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6109 /* x+ must match at the 1st pos of run of x's */
6110 r->intflags |= PREGf_SKIP;
6112 /* Scan is after the zeroth branch, first is atomic matcher. */
6113 #ifdef TRIE_STUDY_OPT
6116 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6117 (IV)(first - scan + 1))
6121 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6122 (IV)(first - scan + 1))
6128 * If there's something expensive in the r.e., find the
6129 * longest literal string that must appear and make it the
6130 * regmust. Resolve ties in favor of later strings, since
6131 * the regstart check works with the beginning of the r.e.
6132 * and avoiding duplication strengthens checking. Not a
6133 * strong reason, but sufficient in the absence of others.
6134 * [Now we resolve ties in favor of the earlier string if
6135 * it happens that c_offset_min has been invalidated, since the
6136 * earlier string may buy us something the later one won't.]
6139 data.longest_fixed = newSVpvs("");
6140 data.longest_float = newSVpvs("");
6141 data.last_found = newSVpvs("");
6142 data.longest = &(data.longest_fixed);
6144 if (!ri->regstclass) {
6145 cl_init(pRExC_state, &ch_class);
6146 data.start_class = &ch_class;
6147 stclass_flag = SCF_DO_STCLASS_AND;
6148 } else /* XXXX Check for BOUND? */
6150 data.last_closep = &last_close;
6152 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6153 &data, -1, NULL, NULL,
6154 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6160 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6161 && data.last_start_min == 0 && data.last_end > 0
6162 && !RExC_seen_zerolen
6163 && !(RExC_seen & REG_SEEN_VERBARG)
6164 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6165 r->extflags |= RXf_CHECK_ALL;
6166 scan_commit(pRExC_state, &data,&minlen,0);
6167 SvREFCNT_dec(data.last_found);
6169 /* Note that code very similar to this but for anchored string
6170 follows immediately below, changes may need to be made to both.
6173 longest_float_length = CHR_SVLEN(data.longest_float);
6174 if (longest_float_length
6175 || (data.flags & SF_FL_BEFORE_EOL
6176 && (!(data.flags & SF_FL_BEFORE_MEOL)
6177 || (RExC_flags & RXf_PMf_MULTILINE))))
6181 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6182 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6183 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6184 && data.offset_fixed == data.offset_float_min
6185 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6186 goto remove_float; /* As in (a)+. */
6188 /* copy the information about the longest float from the reg_scan_data
6189 over to the program. */
6190 if (SvUTF8(data.longest_float)) {
6191 r->float_utf8 = data.longest_float;
6192 r->float_substr = NULL;
6194 r->float_substr = data.longest_float;
6195 r->float_utf8 = NULL;
6197 /* float_end_shift is how many chars that must be matched that
6198 follow this item. We calculate it ahead of time as once the
6199 lookbehind offset is added in we lose the ability to correctly
6201 ml = data.minlen_float ? *(data.minlen_float)
6202 : (I32)longest_float_length;
6203 r->float_end_shift = ml - data.offset_float_min
6204 - longest_float_length + (SvTAIL(data.longest_float) != 0)
6205 + data.lookbehind_float;
6206 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6207 r->float_max_offset = data.offset_float_max;
6208 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6209 r->float_max_offset -= data.lookbehind_float;
6211 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
6212 && (!(data.flags & SF_FL_BEFORE_MEOL)
6213 || (RExC_flags & RXf_PMf_MULTILINE)));
6214 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
6218 r->float_substr = r->float_utf8 = NULL;
6219 SvREFCNT_dec(data.longest_float);
6220 longest_float_length = 0;
6223 /* Note that code very similar to this but for floating string
6224 is immediately above, changes may need to be made to both.
6227 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6229 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6230 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6231 && (longest_fixed_length
6232 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
6233 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6234 || (RExC_flags & RXf_PMf_MULTILINE)))) )
6238 /* copy the information about the longest fixed
6239 from the reg_scan_data over to the program. */
6240 if (SvUTF8(data.longest_fixed)) {
6241 r->anchored_utf8 = data.longest_fixed;
6242 r->anchored_substr = NULL;
6244 r->anchored_substr = data.longest_fixed;
6245 r->anchored_utf8 = NULL;
6247 /* fixed_end_shift is how many chars that must be matched that
6248 follow this item. We calculate it ahead of time as once the
6249 lookbehind offset is added in we lose the ability to correctly
6251 ml = data.minlen_fixed ? *(data.minlen_fixed)
6252 : (I32)longest_fixed_length;
6253 r->anchored_end_shift = ml - data.offset_fixed
6254 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
6255 + data.lookbehind_fixed;
6256 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6258 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
6259 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6260 || (RExC_flags & RXf_PMf_MULTILINE)));
6261 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
6264 r->anchored_substr = r->anchored_utf8 = NULL;
6265 SvREFCNT_dec(data.longest_fixed);
6266 longest_fixed_length = 0;
6269 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6270 ri->regstclass = NULL;
6272 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6274 && !(data.start_class->flags & ANYOF_EOS)
6275 && !cl_is_anything(data.start_class))
6277 const U32 n = add_data(pRExC_state, 1, "f");
6278 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6280 Newx(RExC_rxi->data->data[n], 1,
6281 struct regnode_charclass_class);
6282 StructCopy(data.start_class,
6283 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6284 struct regnode_charclass_class);
6285 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6286 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6287 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6288 regprop(r, sv, (regnode*)data.start_class);
6289 PerlIO_printf(Perl_debug_log,
6290 "synthetic stclass \"%s\".\n",
6291 SvPVX_const(sv));});
6294 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6295 if (longest_fixed_length > longest_float_length) {
6296 r->check_end_shift = r->anchored_end_shift;
6297 r->check_substr = r->anchored_substr;
6298 r->check_utf8 = r->anchored_utf8;
6299 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6300 if (r->extflags & RXf_ANCH_SINGLE)
6301 r->extflags |= RXf_NOSCAN;
6304 r->check_end_shift = r->float_end_shift;
6305 r->check_substr = r->float_substr;
6306 r->check_utf8 = r->float_utf8;
6307 r->check_offset_min = r->float_min_offset;
6308 r->check_offset_max = r->float_max_offset;
6310 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6311 This should be changed ASAP! */
6312 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6313 r->extflags |= RXf_USE_INTUIT;
6314 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6315 r->extflags |= RXf_INTUIT_TAIL;
6317 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6318 if ( (STRLEN)minlen < longest_float_length )
6319 minlen= longest_float_length;
6320 if ( (STRLEN)minlen < longest_fixed_length )
6321 minlen= longest_fixed_length;
6325 /* Several toplevels. Best we can is to set minlen. */
6327 struct regnode_charclass_class ch_class;
6330 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6332 scan = ri->program + 1;
6333 cl_init(pRExC_state, &ch_class);
6334 data.start_class = &ch_class;
6335 data.last_closep = &last_close;
6338 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6339 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6343 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6344 = r->float_substr = r->float_utf8 = NULL;
6346 if (!(data.start_class->flags & ANYOF_EOS)
6347 && !cl_is_anything(data.start_class))
6349 const U32 n = add_data(pRExC_state, 1, "f");
6350 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6352 Newx(RExC_rxi->data->data[n], 1,
6353 struct regnode_charclass_class);
6354 StructCopy(data.start_class,
6355 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6356 struct regnode_charclass_class);
6357 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6358 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6359 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6360 regprop(r, sv, (regnode*)data.start_class);
6361 PerlIO_printf(Perl_debug_log,
6362 "synthetic stclass \"%s\".\n",
6363 SvPVX_const(sv));});
6367 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6368 the "real" pattern. */
6370 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6371 (IV)minlen, (IV)r->minlen);
6373 r->minlenret = minlen;
6374 if (r->minlen < minlen)
6377 if (RExC_seen & REG_SEEN_GPOS)
6378 r->extflags |= RXf_GPOS_SEEN;
6379 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6380 r->extflags |= RXf_LOOKBEHIND_SEEN;
6381 if (pRExC_state->num_code_blocks)
6382 r->extflags |= RXf_EVAL_SEEN;
6383 if (RExC_seen & REG_SEEN_CANY)
6384 r->extflags |= RXf_CANY_SEEN;
6385 if (RExC_seen & REG_SEEN_VERBARG)
6386 r->intflags |= PREGf_VERBARG_SEEN;
6387 if (RExC_seen & REG_SEEN_CUTGROUP)
6388 r->intflags |= PREGf_CUTGROUP_SEEN;
6389 if (pm_flags & PMf_USE_RE_EVAL)
6390 r->intflags |= PREGf_USE_RE_EVAL;
6391 if (RExC_paren_names)
6392 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6394 RXp_PAREN_NAMES(r) = NULL;
6396 #ifdef STUPID_PATTERN_CHECKS
6397 if (RX_PRELEN(rx) == 0)
6398 r->extflags |= RXf_NULL;
6399 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6400 /* XXX: this should happen BEFORE we compile */
6401 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6402 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6403 r->extflags |= RXf_WHITE;
6404 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6405 r->extflags |= RXf_START_ONLY;
6407 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6408 /* XXX: this should happen BEFORE we compile */
6409 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6411 regnode *first = ri->program + 1;
6414 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6415 r->extflags |= RXf_NULL;
6416 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6417 r->extflags |= RXf_START_ONLY;
6418 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6419 && OP(regnext(first)) == END)
6420 r->extflags |= RXf_WHITE;
6424 if (RExC_paren_names) {
6425 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6426 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6429 ri->name_list_idx = 0;
6431 if (RExC_recurse_count) {
6432 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6433 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6434 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6437 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6438 /* assume we don't need to swap parens around before we match */
6441 PerlIO_printf(Perl_debug_log,"Final program:\n");
6444 #ifdef RE_TRACK_PATTERN_OFFSETS
6445 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6446 const U32 len = ri->u.offsets[0];
6448 GET_RE_DEBUG_FLAGS_DECL;
6449 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6450 for (i = 1; i <= len; i++) {
6451 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6452 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6453 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6455 PerlIO_printf(Perl_debug_log, "\n");
6463 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6466 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6468 PERL_UNUSED_ARG(value);
6470 if (flags & RXapif_FETCH) {
6471 return reg_named_buff_fetch(rx, key, flags);
6472 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6473 Perl_croak_no_modify(aTHX);
6475 } else if (flags & RXapif_EXISTS) {
6476 return reg_named_buff_exists(rx, key, flags)
6479 } else if (flags & RXapif_REGNAMES) {
6480 return reg_named_buff_all(rx, flags);
6481 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6482 return reg_named_buff_scalar(rx, flags);
6484 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6490 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6493 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6494 PERL_UNUSED_ARG(lastkey);
6496 if (flags & RXapif_FIRSTKEY)
6497 return reg_named_buff_firstkey(rx, flags);
6498 else if (flags & RXapif_NEXTKEY)
6499 return reg_named_buff_nextkey(rx, flags);
6501 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6507 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6510 AV *retarray = NULL;
6512 struct regexp *const rx = (struct regexp *)SvANY(r);
6514 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6516 if (flags & RXapif_ALL)
6519 if (rx && RXp_PAREN_NAMES(rx)) {
6520 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6523 SV* sv_dat=HeVAL(he_str);
6524 I32 *nums=(I32*)SvPVX(sv_dat);
6525 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6526 if ((I32)(rx->nparens) >= nums[i]
6527 && rx->offs[nums[i]].start != -1
6528 && rx->offs[nums[i]].end != -1)
6531 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6536 ret = newSVsv(&PL_sv_undef);
6539 av_push(retarray, ret);
6542 return newRV_noinc(MUTABLE_SV(retarray));
6549 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6552 struct regexp *const rx = (struct regexp *)SvANY(r);
6554 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6556 if (rx && RXp_PAREN_NAMES(rx)) {
6557 if (flags & RXapif_ALL) {
6558 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6560 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6574 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6576 struct regexp *const rx = (struct regexp *)SvANY(r);
6578 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6580 if ( rx && RXp_PAREN_NAMES(rx) ) {
6581 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6583 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6590 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6592 struct regexp *const rx = (struct regexp *)SvANY(r);
6593 GET_RE_DEBUG_FLAGS_DECL;
6595 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6597 if (rx && RXp_PAREN_NAMES(rx)) {
6598 HV *hv = RXp_PAREN_NAMES(rx);
6600 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6603 SV* sv_dat = HeVAL(temphe);
6604 I32 *nums = (I32*)SvPVX(sv_dat);
6605 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6606 if ((I32)(rx->lastparen) >= nums[i] &&
6607 rx->offs[nums[i]].start != -1 &&
6608 rx->offs[nums[i]].end != -1)
6614 if (parno || flags & RXapif_ALL) {
6615 return newSVhek(HeKEY_hek(temphe));
6623 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6628 struct regexp *const rx = (struct regexp *)SvANY(r);
6630 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6632 if (rx && RXp_PAREN_NAMES(rx)) {
6633 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6634 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6635 } else if (flags & RXapif_ONE) {
6636 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6637 av = MUTABLE_AV(SvRV(ret));
6638 length = av_len(av);
6640 return newSViv(length + 1);
6642 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6646 return &PL_sv_undef;
6650 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6652 struct regexp *const rx = (struct regexp *)SvANY(r);
6655 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6657 if (rx && RXp_PAREN_NAMES(rx)) {
6658 HV *hv= RXp_PAREN_NAMES(rx);
6660 (void)hv_iterinit(hv);
6661 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6664 SV* sv_dat = HeVAL(temphe);
6665 I32 *nums = (I32*)SvPVX(sv_dat);
6666 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6667 if ((I32)(rx->lastparen) >= nums[i] &&
6668 rx->offs[nums[i]].start != -1 &&
6669 rx->offs[nums[i]].end != -1)
6675 if (parno || flags & RXapif_ALL) {
6676 av_push(av, newSVhek(HeKEY_hek(temphe)));
6681 return newRV_noinc(MUTABLE_SV(av));
6685 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6688 struct regexp *const rx = (struct regexp *)SvANY(r);
6693 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6696 sv_setsv(sv,&PL_sv_undef);
6700 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6702 i = rx->offs[0].start;
6706 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6708 s = rx->subbeg + rx->offs[0].end;
6709 i = rx->sublen - rx->offs[0].end;
6712 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6713 (s1 = rx->offs[paren].start) != -1 &&
6714 (t1 = rx->offs[paren].end) != -1)
6718 s = rx->subbeg + s1;
6720 sv_setsv(sv,&PL_sv_undef);
6723 assert(rx->sublen >= (s - rx->subbeg) + i );
6725 const int oldtainted = PL_tainted;
6727 sv_setpvn(sv, s, i);
6728 PL_tainted = oldtainted;
6729 if ( (rx->extflags & RXf_CANY_SEEN)
6730 ? (RXp_MATCH_UTF8(rx)
6731 && (!i || is_utf8_string((U8*)s, i)))
6732 : (RXp_MATCH_UTF8(rx)) )
6739 if (RXp_MATCH_TAINTED(rx)) {
6740 if (SvTYPE(sv) >= SVt_PVMG) {
6741 MAGIC* const mg = SvMAGIC(sv);
6744 SvMAGIC_set(sv, mg->mg_moremagic);
6746 if ((mgt = SvMAGIC(sv))) {
6747 mg->mg_moremagic = mgt;
6748 SvMAGIC_set(sv, mg);
6758 sv_setsv(sv,&PL_sv_undef);
6764 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6765 SV const * const value)
6767 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6769 PERL_UNUSED_ARG(rx);
6770 PERL_UNUSED_ARG(paren);
6771 PERL_UNUSED_ARG(value);
6774 Perl_croak_no_modify(aTHX);
6778 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6781 struct regexp *const rx = (struct regexp *)SvANY(r);
6785 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6787 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6789 /* $` / ${^PREMATCH} */
6790 case RX_BUFF_IDX_PREMATCH:
6791 if (rx->offs[0].start != -1) {
6792 i = rx->offs[0].start;
6800 /* $' / ${^POSTMATCH} */
6801 case RX_BUFF_IDX_POSTMATCH:
6802 if (rx->offs[0].end != -1) {
6803 i = rx->sublen - rx->offs[0].end;
6805 s1 = rx->offs[0].end;
6811 /* $& / ${^MATCH}, $1, $2, ... */
6813 if (paren <= (I32)rx->nparens &&
6814 (s1 = rx->offs[paren].start) != -1 &&
6815 (t1 = rx->offs[paren].end) != -1)
6820 if (ckWARN(WARN_UNINITIALIZED))
6821 report_uninit((const SV *)sv);
6826 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6827 const char * const s = rx->subbeg + s1;
6832 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6839 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6841 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6842 PERL_UNUSED_ARG(rx);
6846 return newSVpvs("Regexp");
6849 /* Scans the name of a named buffer from the pattern.
6850 * If flags is REG_RSN_RETURN_NULL returns null.
6851 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6852 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6853 * to the parsed name as looked up in the RExC_paren_names hash.
6854 * If there is an error throws a vFAIL().. type exception.
6857 #define REG_RSN_RETURN_NULL 0
6858 #define REG_RSN_RETURN_NAME 1
6859 #define REG_RSN_RETURN_DATA 2
6862 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6864 char *name_start = RExC_parse;
6866 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6868 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6869 /* skip IDFIRST by using do...while */
6872 RExC_parse += UTF8SKIP(RExC_parse);
6873 } while (isALNUM_utf8((U8*)RExC_parse));
6877 } while (isALNUM(*RExC_parse));
6879 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6880 vFAIL("Group name must start with a non-digit word character");
6884 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6885 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6886 if ( flags == REG_RSN_RETURN_NAME)
6888 else if (flags==REG_RSN_RETURN_DATA) {
6891 if ( ! sv_name ) /* should not happen*/
6892 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6893 if (RExC_paren_names)
6894 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6896 sv_dat = HeVAL(he_str);
6898 vFAIL("Reference to nonexistent named group");
6902 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6903 (unsigned long) flags);
6905 assert(0); /* NOT REACHED */
6910 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6911 int rem=(int)(RExC_end - RExC_parse); \
6920 if (RExC_lastparse!=RExC_parse) \
6921 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6924 iscut ? "..." : "<" \
6927 PerlIO_printf(Perl_debug_log,"%16s",""); \
6930 num = RExC_size + 1; \
6932 num=REG_NODE_NUM(RExC_emit); \
6933 if (RExC_lastnum!=num) \
6934 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6936 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6937 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6938 (int)((depth*2)), "", \
6942 RExC_lastparse=RExC_parse; \
6947 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6948 DEBUG_PARSE_MSG((funcname)); \
6949 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6951 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6952 DEBUG_PARSE_MSG((funcname)); \
6953 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6956 /* This section of code defines the inversion list object and its methods. The
6957 * interfaces are highly subject to change, so as much as possible is static to
6958 * this file. An inversion list is here implemented as a malloc'd C UV array
6959 * with some added info that is placed as UVs at the beginning in a header
6960 * portion. An inversion list for Unicode is an array of code points, sorted
6961 * by ordinal number. The zeroth element is the first code point in the list.
6962 * The 1th element is the first element beyond that not in the list. In other
6963 * words, the first range is
6964 * invlist[0]..(invlist[1]-1)
6965 * The other ranges follow. Thus every element whose index is divisible by two
6966 * marks the beginning of a range that is in the list, and every element not
6967 * divisible by two marks the beginning of a range not in the list. A single
6968 * element inversion list that contains the single code point N generally
6969 * consists of two elements
6972 * (The exception is when N is the highest representable value on the
6973 * machine, in which case the list containing just it would be a single
6974 * element, itself. By extension, if the last range in the list extends to
6975 * infinity, then the first element of that range will be in the inversion list
6976 * at a position that is divisible by two, and is the final element in the
6978 * Taking the complement (inverting) an inversion list is quite simple, if the
6979 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6980 * This implementation reserves an element at the beginning of each inversion list
6981 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6982 * beginning of the list is either that element if 0, or the next one if 1.
6984 * More about inversion lists can be found in "Unicode Demystified"
6985 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6986 * More will be coming when functionality is added later.
6988 * The inversion list data structure is currently implemented as an SV pointing
6989 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6990 * array of UV whose memory management is automatically handled by the existing
6991 * facilities for SV's.
6993 * Some of the methods should always be private to the implementation, and some
6994 * should eventually be made public */
6996 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6997 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6999 /* This is a combination of a version and data structure type, so that one
7000 * being passed in can be validated to be an inversion list of the correct
7001 * vintage. When the structure of the header is changed, a new random number
7002 * in the range 2**31-1 should be generated and the new() method changed to
7003 * insert that at this location. Then, if an auxiliary program doesn't change
7004 * correspondingly, it will be discovered immediately */
7005 #define INVLIST_VERSION_ID_OFFSET 2
7006 #define INVLIST_VERSION_ID 1064334010
7008 /* For safety, when adding new elements, remember to #undef them at the end of
7009 * the inversion list code section */
7011 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
7012 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
7013 * contains the code point U+00000, and begins here. If 1, the inversion list
7014 * doesn't contain U+0000, and it begins at the next UV in the array.
7015 * Inverting an inversion list consists of adding or removing the 0 at the
7016 * beginning of it. By reserving a space for that 0, inversion can be made
7019 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7021 /* Internally things are UVs */
7022 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7023 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7025 #define INVLIST_INITIAL_LEN 10
7027 PERL_STATIC_INLINE UV*
7028 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7030 /* Returns a pointer to the first element in the inversion list's array.
7031 * This is called upon initialization of an inversion list. Where the
7032 * array begins depends on whether the list has the code point U+0000
7033 * in it or not. The other parameter tells it whether the code that
7034 * follows this call is about to put a 0 in the inversion list or not.
7035 * The first element is either the element with 0, if 0, or the next one,
7038 UV* zero = get_invlist_zero_addr(invlist);
7040 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7043 assert(! *get_invlist_len_addr(invlist));
7045 /* 1^1 = 0; 1^0 = 1 */
7046 *zero = 1 ^ will_have_0;
7047 return zero + *zero;
7050 PERL_STATIC_INLINE UV*
7051 S_invlist_array(pTHX_ SV* const invlist)
7053 /* Returns the pointer to the inversion list's array. Every time the
7054 * length changes, this needs to be called in case malloc or realloc moved
7057 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7059 /* Must not be empty. If these fail, you probably didn't check for <len>
7060 * being non-zero before trying to get the array */
7061 assert(*get_invlist_len_addr(invlist));
7062 assert(*get_invlist_zero_addr(invlist) == 0
7063 || *get_invlist_zero_addr(invlist) == 1);
7065 /* The array begins either at the element reserved for zero if the
7066 * list contains 0 (that element will be set to 0), or otherwise the next
7067 * element (in which case the reserved element will be set to 1). */
7068 return (UV *) (get_invlist_zero_addr(invlist)
7069 + *get_invlist_zero_addr(invlist));
7072 PERL_STATIC_INLINE UV*
7073 S_get_invlist_len_addr(pTHX_ SV* invlist)
7075 /* Return the address of the UV that contains the current number
7076 * of used elements in the inversion list */
7078 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7080 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7083 PERL_STATIC_INLINE UV
7084 S_invlist_len(pTHX_ SV* const invlist)
7086 /* Returns the current number of elements stored in the inversion list's
7089 PERL_ARGS_ASSERT_INVLIST_LEN;
7091 return *get_invlist_len_addr(invlist);
7094 PERL_STATIC_INLINE void
7095 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7097 /* Sets the current number of elements stored in the inversion list */
7099 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7101 *get_invlist_len_addr(invlist) = len;
7103 assert(len <= SvLEN(invlist));
7105 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7106 /* If the list contains U+0000, that element is part of the header,
7107 * and should not be counted as part of the array. It will contain
7108 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7110 * SvCUR_set(invlist,
7111 * TO_INTERNAL_SIZE(len
7112 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7113 * But, this is only valid if len is not 0. The consequences of not doing
7114 * this is that the memory allocation code may think that 1 more UV is
7115 * being used than actually is, and so might do an unnecessary grow. That
7116 * seems worth not bothering to make this the precise amount.
7118 * Note that when inverting, SvCUR shouldn't change */
7121 PERL_STATIC_INLINE UV
7122 S_invlist_max(pTHX_ SV* const invlist)
7124 /* Returns the maximum number of elements storable in the inversion list's
7125 * array, without having to realloc() */
7127 PERL_ARGS_ASSERT_INVLIST_MAX;
7129 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7132 PERL_STATIC_INLINE UV*
7133 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7135 /* Return the address of the UV that is reserved to hold 0 if the inversion
7136 * list contains 0. This has to be the last element of the heading, as the
7137 * list proper starts with either it if 0, or the next element if not.
7138 * (But we force it to contain either 0 or 1) */
7140 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7142 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7145 #ifndef PERL_IN_XSUB_RE
7147 Perl__new_invlist(pTHX_ IV initial_size)
7150 /* Return a pointer to a newly constructed inversion list, with enough
7151 * space to store 'initial_size' elements. If that number is negative, a
7152 * system default is used instead */
7156 if (initial_size < 0) {
7157 initial_size = INVLIST_INITIAL_LEN;
7160 /* Allocate the initial space */
7161 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7162 invlist_set_len(new_list, 0);
7164 /* Force iterinit() to be used to get iteration to work */
7165 *get_invlist_iter_addr(new_list) = UV_MAX;
7167 /* This should force a segfault if a method doesn't initialize this
7169 *get_invlist_zero_addr(new_list) = UV_MAX;
7171 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7172 #if HEADER_LENGTH != 4
7173 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7181 S__new_invlist_C_array(pTHX_ UV* list)
7183 /* Return a pointer to a newly constructed inversion list, initialized to
7184 * point to <list>, which has to be in the exact correct inversion list
7185 * form, including internal fields. Thus this is a dangerous routine that
7186 * should not be used in the wrong hands */
7188 SV* invlist = newSV_type(SVt_PV);
7190 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7192 SvPV_set(invlist, (char *) list);
7193 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7194 shouldn't touch it */
7195 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7197 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7198 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7205 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7207 /* Grow the maximum size of an inversion list */
7209 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7211 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7214 PERL_STATIC_INLINE void
7215 S_invlist_trim(pTHX_ SV* const invlist)
7217 PERL_ARGS_ASSERT_INVLIST_TRIM;
7219 /* Change the length of the inversion list to how many entries it currently
7222 SvPV_shrink_to_cur((SV *) invlist);
7225 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7227 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7228 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7230 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7233 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7235 /* Subject to change or removal. Append the range from 'start' to 'end' at
7236 * the end of the inversion list. The range must be above any existing
7240 UV max = invlist_max(invlist);
7241 UV len = invlist_len(invlist);
7243 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7245 if (len == 0) { /* Empty lists must be initialized */
7246 array = _invlist_array_init(invlist, start == 0);
7249 /* Here, the existing list is non-empty. The current max entry in the
7250 * list is generally the first value not in the set, except when the
7251 * set extends to the end of permissible values, in which case it is
7252 * the first entry in that final set, and so this call is an attempt to
7253 * append out-of-order */
7255 UV final_element = len - 1;
7256 array = invlist_array(invlist);
7257 if (array[final_element] > start
7258 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7260 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7261 array[final_element], start,
7262 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7265 /* Here, it is a legal append. If the new range begins with the first
7266 * value not in the set, it is extending the set, so the new first
7267 * value not in the set is one greater than the newly extended range.
7269 if (array[final_element] == start) {
7270 if (end != UV_MAX) {
7271 array[final_element] = end + 1;
7274 /* But if the end is the maximum representable on the machine,
7275 * just let the range that this would extend to have no end */
7276 invlist_set_len(invlist, len - 1);
7282 /* Here the new range doesn't extend any existing set. Add it */
7284 len += 2; /* Includes an element each for the start and end of range */
7286 /* If overflows the existing space, extend, which may cause the array to be
7289 invlist_extend(invlist, len);
7290 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7291 failure in invlist_array() */
7292 array = invlist_array(invlist);
7295 invlist_set_len(invlist, len);
7298 /* The next item on the list starts the range, the one after that is
7299 * one past the new range. */
7300 array[len - 2] = start;
7301 if (end != UV_MAX) {
7302 array[len - 1] = end + 1;
7305 /* But if the end is the maximum representable on the machine, just let
7306 * the range have no end */
7307 invlist_set_len(invlist, len - 1);
7311 #ifndef PERL_IN_XSUB_RE
7314 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7316 /* Searches the inversion list for the entry that contains the input code
7317 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7318 * return value is the index into the list's array of the range that
7322 IV high = invlist_len(invlist);
7323 const UV * const array = invlist_array(invlist);
7325 PERL_ARGS_ASSERT_INVLIST_SEARCH;
7327 /* If list is empty or the code point is before the first element, return
7329 if (high == 0 || cp < array[0]) {
7333 /* Binary search. What we are looking for is <i> such that
7334 * array[i] <= cp < array[i+1]
7335 * The loop below converges on the i+1. */
7336 while (low < high) {
7337 IV mid = (low + high) / 2;
7338 if (array[mid] <= cp) {
7341 /* We could do this extra test to exit the loop early.
7342 if (cp < array[low]) {
7347 else { /* cp < array[mid] */
7356 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7358 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7359 * but is used when the swash has an inversion list. This makes this much
7360 * faster, as it uses a binary search instead of a linear one. This is
7361 * intimately tied to that function, and perhaps should be in utf8.c,
7362 * except it is intimately tied to inversion lists as well. It assumes
7363 * that <swatch> is all 0's on input */
7366 const IV len = invlist_len(invlist);
7370 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7372 if (len == 0) { /* Empty inversion list */
7376 array = invlist_array(invlist);
7378 /* Find which element it is */
7379 i = invlist_search(invlist, start);
7381 /* We populate from <start> to <end> */
7382 while (current < end) {
7385 /* The inversion list gives the results for every possible code point
7386 * after the first one in the list. Only those ranges whose index is
7387 * even are ones that the inversion list matches. For the odd ones,
7388 * and if the initial code point is not in the list, we have to skip
7389 * forward to the next element */
7390 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7392 if (i >= len) { /* Finished if beyond the end of the array */
7396 if (current >= end) { /* Finished if beyond the end of what we
7401 assert(current >= start);
7403 /* The current range ends one below the next one, except don't go past
7406 upper = (i < len && array[i] < end) ? array[i] : end;
7408 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7409 * for each code point in it */
7410 for (; current < upper; current++) {
7411 const STRLEN offset = (STRLEN)(current - start);
7412 swatch[offset >> 3] |= 1 << (offset & 7);
7415 /* Quit if at the end of the list */
7418 /* But first, have to deal with the highest possible code point on
7419 * the platform. The previous code assumes that <end> is one
7420 * beyond where we want to populate, but that is impossible at the
7421 * platform's infinity, so have to handle it specially */
7422 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7424 const STRLEN offset = (STRLEN)(end - start);
7425 swatch[offset >> 3] |= 1 << (offset & 7);
7430 /* Advance to the next range, which will be for code points not in the
7440 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7442 /* Take the union of two inversion lists and point <output> to it. *output
7443 * should be defined upon input, and if it points to one of the two lists,
7444 * the reference count to that list will be decremented. The first list,
7445 * <a>, may be NULL, in which case a copy of the second list is returned.
7446 * If <complement_b> is TRUE, the union is taken of the complement
7447 * (inversion) of <b> instead of b itself.
7449 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7450 * Richard Gillam, published by Addison-Wesley, and explained at some
7451 * length there. The preface says to incorporate its examples into your
7452 * code at your own risk.
7454 * The algorithm is like a merge sort.
7456 * XXX A potential performance improvement is to keep track as we go along
7457 * if only one of the inputs contributes to the result, meaning the other
7458 * is a subset of that one. In that case, we can skip the final copy and
7459 * return the larger of the input lists, but then outside code might need
7460 * to keep track of whether to free the input list or not */
7462 UV* array_a; /* a's array */
7464 UV len_a; /* length of a's array */
7467 SV* u; /* the resulting union */
7471 UV i_a = 0; /* current index into a's array */
7475 /* running count, as explained in the algorithm source book; items are
7476 * stopped accumulating and are output when the count changes to/from 0.
7477 * The count is incremented when we start a range that's in the set, and
7478 * decremented when we start a range that's not in the set. So its range
7479 * is 0 to 2. Only when the count is zero is something not in the set.
7483 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7486 /* If either one is empty, the union is the other one */
7487 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7494 *output = invlist_clone(b);
7496 _invlist_invert(*output);
7498 } /* else *output already = b; */
7501 else if ((len_b = invlist_len(b)) == 0) {
7506 /* The complement of an empty list is a list that has everything in it,
7507 * so the union with <a> includes everything too */
7512 *output = _new_invlist(1);
7513 _append_range_to_invlist(*output, 0, UV_MAX);
7515 else if (*output != a) {
7516 *output = invlist_clone(a);
7518 /* else *output already = a; */
7522 /* Here both lists exist and are non-empty */
7523 array_a = invlist_array(a);
7524 array_b = invlist_array(b);
7526 /* If are to take the union of 'a' with the complement of b, set it
7527 * up so are looking at b's complement. */
7530 /* To complement, we invert: if the first element is 0, remove it. To
7531 * do this, we just pretend the array starts one later, and clear the
7532 * flag as we don't have to do anything else later */
7533 if (array_b[0] == 0) {
7536 complement_b = FALSE;
7540 /* But if the first element is not zero, we unshift a 0 before the
7541 * array. The data structure reserves a space for that 0 (which
7542 * should be a '1' right now), so physical shifting is unneeded,
7543 * but temporarily change that element to 0. Before exiting the
7544 * routine, we must restore the element to '1' */
7551 /* Size the union for the worst case: that the sets are completely
7553 u = _new_invlist(len_a + len_b);
7555 /* Will contain U+0000 if either component does */
7556 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7557 || (len_b > 0 && array_b[0] == 0));
7559 /* Go through each list item by item, stopping when exhausted one of
7561 while (i_a < len_a && i_b < len_b) {
7562 UV cp; /* The element to potentially add to the union's array */
7563 bool cp_in_set; /* is it in the the input list's set or not */
7565 /* We need to take one or the other of the two inputs for the union.
7566 * Since we are merging two sorted lists, we take the smaller of the
7567 * next items. In case of a tie, we take the one that is in its set
7568 * first. If we took one not in the set first, it would decrement the
7569 * count, possibly to 0 which would cause it to be output as ending the
7570 * range, and the next time through we would take the same number, and
7571 * output it again as beginning the next range. By doing it the
7572 * opposite way, there is no possibility that the count will be
7573 * momentarily decremented to 0, and thus the two adjoining ranges will
7574 * be seamlessly merged. (In a tie and both are in the set or both not
7575 * in the set, it doesn't matter which we take first.) */
7576 if (array_a[i_a] < array_b[i_b]
7577 || (array_a[i_a] == array_b[i_b]
7578 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7580 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7584 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7588 /* Here, have chosen which of the two inputs to look at. Only output
7589 * if the running count changes to/from 0, which marks the
7590 * beginning/end of a range in that's in the set */
7593 array_u[i_u++] = cp;
7600 array_u[i_u++] = cp;
7605 /* Here, we are finished going through at least one of the lists, which
7606 * means there is something remaining in at most one. We check if the list
7607 * that hasn't been exhausted is positioned such that we are in the middle
7608 * of a range in its set or not. (i_a and i_b point to the element beyond
7609 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7610 * is potentially more to output.
7611 * There are four cases:
7612 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7613 * in the union is entirely from the non-exhausted set.
7614 * 2) Both were in their sets, count is 2. Nothing further should
7615 * be output, as everything that remains will be in the exhausted
7616 * list's set, hence in the union; decrementing to 1 but not 0 insures
7618 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7619 * Nothing further should be output because the union includes
7620 * everything from the exhausted set. Not decrementing ensures that.
7621 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7622 * decrementing to 0 insures that we look at the remainder of the
7623 * non-exhausted set */
7624 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7625 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7630 /* The final length is what we've output so far, plus what else is about to
7631 * be output. (If 'count' is non-zero, then the input list we exhausted
7632 * has everything remaining up to the machine's limit in its set, and hence
7633 * in the union, so there will be no further output. */
7636 /* At most one of the subexpressions will be non-zero */
7637 len_u += (len_a - i_a) + (len_b - i_b);
7640 /* Set result to final length, which can change the pointer to array_u, so
7642 if (len_u != invlist_len(u)) {
7643 invlist_set_len(u, len_u);
7645 array_u = invlist_array(u);
7648 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7649 * the other) ended with everything above it not in its set. That means
7650 * that the remaining part of the union is precisely the same as the
7651 * non-exhausted list, so can just copy it unchanged. (If both list were
7652 * exhausted at the same time, then the operations below will be both 0.)
7655 IV copy_count; /* At most one will have a non-zero copy count */
7656 if ((copy_count = len_a - i_a) > 0) {
7657 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7659 else if ((copy_count = len_b - i_b) > 0) {
7660 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7664 /* We may be removing a reference to one of the inputs */
7665 if (a == *output || b == *output) {
7666 SvREFCNT_dec(*output);
7669 /* If we've changed b, restore it */
7679 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7681 /* Take the intersection of two inversion lists and point <i> to it. *i
7682 * should be defined upon input, and if it points to one of the two lists,
7683 * the reference count to that list will be decremented.
7684 * If <complement_b> is TRUE, the result will be the intersection of <a>
7685 * and the complement (or inversion) of <b> instead of <b> directly.
7687 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7688 * Richard Gillam, published by Addison-Wesley, and explained at some
7689 * length there. The preface says to incorporate its examples into your
7690 * code at your own risk. In fact, it had bugs
7692 * The algorithm is like a merge sort, and is essentially the same as the
7696 UV* array_a; /* a's array */
7698 UV len_a; /* length of a's array */
7701 SV* r; /* the resulting intersection */
7705 UV i_a = 0; /* current index into a's array */
7709 /* running count, as explained in the algorithm source book; items are
7710 * stopped accumulating and are output when the count changes to/from 2.
7711 * The count is incremented when we start a range that's in the set, and
7712 * decremented when we start a range that's not in the set. So its range
7713 * is 0 to 2. Only when the count is 2 is something in the intersection.
7717 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7720 /* Special case if either one is empty */
7721 len_a = invlist_len(a);
7722 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7724 if (len_a != 0 && complement_b) {
7726 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7727 * be empty. Here, also we are using 'b's complement, which hence
7728 * must be every possible code point. Thus the intersection is
7731 *i = invlist_clone(a);
7737 /* else *i is already 'a' */
7741 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7742 * intersection must be empty */
7749 *i = _new_invlist(0);
7753 /* Here both lists exist and are non-empty */
7754 array_a = invlist_array(a);
7755 array_b = invlist_array(b);
7757 /* If are to take the intersection of 'a' with the complement of b, set it
7758 * up so are looking at b's complement. */
7761 /* To complement, we invert: if the first element is 0, remove it. To
7762 * do this, we just pretend the array starts one later, and clear the
7763 * flag as we don't have to do anything else later */
7764 if (array_b[0] == 0) {
7767 complement_b = FALSE;
7771 /* But if the first element is not zero, we unshift a 0 before the
7772 * array. The data structure reserves a space for that 0 (which
7773 * should be a '1' right now), so physical shifting is unneeded,
7774 * but temporarily change that element to 0. Before exiting the
7775 * routine, we must restore the element to '1' */
7782 /* Size the intersection for the worst case: that the intersection ends up
7783 * fragmenting everything to be completely disjoint */
7784 r= _new_invlist(len_a + len_b);
7786 /* Will contain U+0000 iff both components do */
7787 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7788 && len_b > 0 && array_b[0] == 0);
7790 /* Go through each list item by item, stopping when exhausted one of
7792 while (i_a < len_a && i_b < len_b) {
7793 UV cp; /* The element to potentially add to the intersection's
7795 bool cp_in_set; /* Is it in the input list's set or not */
7797 /* We need to take one or the other of the two inputs for the
7798 * intersection. Since we are merging two sorted lists, we take the
7799 * smaller of the next items. In case of a tie, we take the one that
7800 * is not in its set first (a difference from the union algorithm). If
7801 * we took one in the set first, it would increment the count, possibly
7802 * to 2 which would cause it to be output as starting a range in the
7803 * intersection, and the next time through we would take that same
7804 * number, and output it again as ending the set. By doing it the
7805 * opposite of this, there is no possibility that the count will be
7806 * momentarily incremented to 2. (In a tie and both are in the set or
7807 * both not in the set, it doesn't matter which we take first.) */
7808 if (array_a[i_a] < array_b[i_b]
7809 || (array_a[i_a] == array_b[i_b]
7810 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7812 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7816 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7820 /* Here, have chosen which of the two inputs to look at. Only output
7821 * if the running count changes to/from 2, which marks the
7822 * beginning/end of a range that's in the intersection */
7826 array_r[i_r++] = cp;
7831 array_r[i_r++] = cp;
7837 /* Here, we are finished going through at least one of the lists, which
7838 * means there is something remaining in at most one. We check if the list
7839 * that has been exhausted is positioned such that we are in the middle
7840 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7841 * the ones we care about.) There are four cases:
7842 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7843 * nothing left in the intersection.
7844 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7845 * above 2. What should be output is exactly that which is in the
7846 * non-exhausted set, as everything it has is also in the intersection
7847 * set, and everything it doesn't have can't be in the intersection
7848 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7849 * gets incremented to 2. Like the previous case, the intersection is
7850 * everything that remains in the non-exhausted set.
7851 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7852 * remains 1. And the intersection has nothing more. */
7853 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7854 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7859 /* The final length is what we've output so far plus what else is in the
7860 * intersection. At most one of the subexpressions below will be non-zero */
7863 len_r += (len_a - i_a) + (len_b - i_b);
7866 /* Set result to final length, which can change the pointer to array_r, so
7868 if (len_r != invlist_len(r)) {
7869 invlist_set_len(r, len_r);
7871 array_r = invlist_array(r);
7874 /* Finish outputting any remaining */
7875 if (count >= 2) { /* At most one will have a non-zero copy count */
7877 if ((copy_count = len_a - i_a) > 0) {
7878 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7880 else if ((copy_count = len_b - i_b) > 0) {
7881 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7885 /* We may be removing a reference to one of the inputs */
7886 if (a == *i || b == *i) {
7890 /* If we've changed b, restore it */
7900 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7902 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7903 * set. A pointer to the inversion list is returned. This may actually be
7904 * a new list, in which case the passed in one has been destroyed. The
7905 * passed in inversion list can be NULL, in which case a new one is created
7906 * with just the one range in it */
7911 if (invlist == NULL) {
7912 invlist = _new_invlist(2);
7916 len = invlist_len(invlist);
7919 /* If comes after the final entry, can just append it to the end */
7921 || start >= invlist_array(invlist)
7922 [invlist_len(invlist) - 1])
7924 _append_range_to_invlist(invlist, start, end);
7928 /* Here, can't just append things, create and return a new inversion list
7929 * which is the union of this range and the existing inversion list */
7930 range_invlist = _new_invlist(2);
7931 _append_range_to_invlist(range_invlist, start, end);
7933 _invlist_union(invlist, range_invlist, &invlist);
7935 /* The temporary can be freed */
7936 SvREFCNT_dec(range_invlist);
7943 PERL_STATIC_INLINE SV*
7944 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7945 return _add_range_to_invlist(invlist, cp, cp);
7948 #ifndef PERL_IN_XSUB_RE
7950 Perl__invlist_invert(pTHX_ SV* const invlist)
7952 /* Complement the input inversion list. This adds a 0 if the list didn't
7953 * have a zero; removes it otherwise. As described above, the data
7954 * structure is set up so that this is very efficient */
7956 UV* len_pos = get_invlist_len_addr(invlist);
7958 PERL_ARGS_ASSERT__INVLIST_INVERT;
7960 /* The inverse of matching nothing is matching everything */
7961 if (*len_pos == 0) {
7962 _append_range_to_invlist(invlist, 0, UV_MAX);
7966 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7967 * zero element was a 0, so it is being removed, so the length decrements
7968 * by 1; and vice-versa. SvCUR is unaffected */
7969 if (*get_invlist_zero_addr(invlist) ^= 1) {
7978 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7980 /* Complement the input inversion list (which must be a Unicode property,
7981 * all of which don't match above the Unicode maximum code point.) And
7982 * Perl has chosen to not have the inversion match above that either. This
7983 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7989 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7991 _invlist_invert(invlist);
7993 len = invlist_len(invlist);
7995 if (len != 0) { /* If empty do nothing */
7996 array = invlist_array(invlist);
7997 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7998 /* Add 0x110000. First, grow if necessary */
8000 if (invlist_max(invlist) < len) {
8001 invlist_extend(invlist, len);
8002 array = invlist_array(invlist);
8004 invlist_set_len(invlist, len);
8005 array[len - 1] = PERL_UNICODE_MAX + 1;
8007 else { /* Remove the 0x110000 */
8008 invlist_set_len(invlist, len - 1);
8016 PERL_STATIC_INLINE SV*
8017 S_invlist_clone(pTHX_ SV* const invlist)
8020 /* Return a new inversion list that is a copy of the input one, which is
8023 /* Need to allocate extra space to accommodate Perl's addition of a
8024 * trailing NUL to SvPV's, since it thinks they are always strings */
8025 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8026 STRLEN length = SvCUR(invlist);
8028 PERL_ARGS_ASSERT_INVLIST_CLONE;
8030 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8031 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8036 PERL_STATIC_INLINE UV*
8037 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8039 /* Return the address of the UV that contains the current iteration
8042 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8044 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8047 PERL_STATIC_INLINE UV*
8048 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8050 /* Return the address of the UV that contains the version id. */
8052 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8054 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8057 PERL_STATIC_INLINE void
8058 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8060 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8062 *get_invlist_iter_addr(invlist) = 0;
8066 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8068 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8069 * This call sets in <*start> and <*end>, the next range in <invlist>.
8070 * Returns <TRUE> if successful and the next call will return the next
8071 * range; <FALSE> if was already at the end of the list. If the latter,
8072 * <*start> and <*end> are unchanged, and the next call to this function
8073 * will start over at the beginning of the list */
8075 UV* pos = get_invlist_iter_addr(invlist);
8076 UV len = invlist_len(invlist);
8079 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8082 *pos = UV_MAX; /* Force iternit() to be required next time */
8086 array = invlist_array(invlist);
8088 *start = array[(*pos)++];
8094 *end = array[(*pos)++] - 1;
8100 PERL_STATIC_INLINE UV
8101 S_invlist_highest(pTHX_ SV* const invlist)
8103 /* Returns the highest code point that matches an inversion list. This API
8104 * has an ambiguity, as it returns 0 under either the highest is actually
8105 * 0, or if the list is empty. If this distinction matters to you, check
8106 * for emptiness before calling this function */
8108 UV len = invlist_len(invlist);
8111 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8117 array = invlist_array(invlist);
8119 /* The last element in the array in the inversion list always starts a
8120 * range that goes to infinity. That range may be for code points that are
8121 * matched in the inversion list, or it may be for ones that aren't
8122 * matched. In the latter case, the highest code point in the set is one
8123 * less than the beginning of this range; otherwise it is the final element
8124 * of this range: infinity */
8125 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8127 : array[len - 1] - 1;
8130 #ifndef PERL_IN_XSUB_RE
8132 Perl__invlist_contents(pTHX_ SV* const invlist)
8134 /* Get the contents of an inversion list into a string SV so that they can
8135 * be printed out. It uses the format traditionally done for debug tracing
8139 SV* output = newSVpvs("\n");
8141 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8143 invlist_iterinit(invlist);
8144 while (invlist_iternext(invlist, &start, &end)) {
8145 if (end == UV_MAX) {
8146 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8148 else if (end != start) {
8149 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8153 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8163 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8165 /* Dumps out the ranges in an inversion list. The string 'header'
8166 * if present is output on a line before the first range */
8170 if (header && strlen(header)) {
8171 PerlIO_printf(Perl_debug_log, "%s\n", header);
8173 invlist_iterinit(invlist);
8174 while (invlist_iternext(invlist, &start, &end)) {
8175 if (end == UV_MAX) {
8176 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8179 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8185 #undef HEADER_LENGTH
8186 #undef INVLIST_INITIAL_LENGTH
8187 #undef TO_INTERNAL_SIZE
8188 #undef FROM_INTERNAL_SIZE
8189 #undef INVLIST_LEN_OFFSET
8190 #undef INVLIST_ZERO_OFFSET
8191 #undef INVLIST_ITER_OFFSET
8192 #undef INVLIST_VERSION_ID
8194 /* End of inversion list object */
8197 - reg - regular expression, i.e. main body or parenthesized thing
8199 * Caller must absorb opening parenthesis.
8201 * Combining parenthesis handling with the base level of regular expression
8202 * is a trifle forced, but the need to tie the tails of the branches to what
8203 * follows makes it hard to avoid.
8205 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8207 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8209 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8213 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8214 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8217 register regnode *ret; /* Will be the head of the group. */
8218 register regnode *br;
8219 register regnode *lastbr;
8220 register regnode *ender = NULL;
8221 register I32 parno = 0;
8223 U32 oregflags = RExC_flags;
8224 bool have_branch = 0;
8226 I32 freeze_paren = 0;
8227 I32 after_freeze = 0;
8229 /* for (?g), (?gc), and (?o) warnings; warning
8230 about (?c) will warn about (?g) -- japhy */
8232 #define WASTED_O 0x01
8233 #define WASTED_G 0x02
8234 #define WASTED_C 0x04
8235 #define WASTED_GC (0x02|0x04)
8236 I32 wastedflags = 0x00;
8238 char * parse_start = RExC_parse; /* MJD */
8239 char * const oregcomp_parse = RExC_parse;
8241 GET_RE_DEBUG_FLAGS_DECL;
8243 PERL_ARGS_ASSERT_REG;
8244 DEBUG_PARSE("reg ");
8246 *flagp = 0; /* Tentatively. */
8249 /* Make an OPEN node, if parenthesized. */
8251 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8252 char *start_verb = RExC_parse;
8253 STRLEN verb_len = 0;
8254 char *start_arg = NULL;
8255 unsigned char op = 0;
8257 int internal_argval = 0; /* internal_argval is only useful if !argok */
8258 while ( *RExC_parse && *RExC_parse != ')' ) {
8259 if ( *RExC_parse == ':' ) {
8260 start_arg = RExC_parse + 1;
8266 verb_len = RExC_parse - start_verb;
8269 while ( *RExC_parse && *RExC_parse != ')' )
8271 if ( *RExC_parse != ')' )
8272 vFAIL("Unterminated verb pattern argument");
8273 if ( RExC_parse == start_arg )
8276 if ( *RExC_parse != ')' )
8277 vFAIL("Unterminated verb pattern");
8280 switch ( *start_verb ) {
8281 case 'A': /* (*ACCEPT) */
8282 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8284 internal_argval = RExC_nestroot;
8287 case 'C': /* (*COMMIT) */
8288 if ( memEQs(start_verb,verb_len,"COMMIT") )
8291 case 'F': /* (*FAIL) */
8292 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8297 case ':': /* (*:NAME) */
8298 case 'M': /* (*MARK:NAME) */
8299 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8304 case 'P': /* (*PRUNE) */
8305 if ( memEQs(start_verb,verb_len,"PRUNE") )
8308 case 'S': /* (*SKIP) */
8309 if ( memEQs(start_verb,verb_len,"SKIP") )
8312 case 'T': /* (*THEN) */
8313 /* [19:06] <TimToady> :: is then */
8314 if ( memEQs(start_verb,verb_len,"THEN") ) {
8316 RExC_seen |= REG_SEEN_CUTGROUP;
8322 vFAIL3("Unknown verb pattern '%.*s'",
8323 verb_len, start_verb);
8326 if ( start_arg && internal_argval ) {
8327 vFAIL3("Verb pattern '%.*s' may not have an argument",
8328 verb_len, start_verb);
8329 } else if ( argok < 0 && !start_arg ) {
8330 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8331 verb_len, start_verb);
8333 ret = reganode(pRExC_state, op, internal_argval);
8334 if ( ! internal_argval && ! SIZE_ONLY ) {
8336 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8337 ARG(ret) = add_data( pRExC_state, 1, "S" );
8338 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8345 if (!internal_argval)
8346 RExC_seen |= REG_SEEN_VERBARG;
8347 } else if ( start_arg ) {
8348 vFAIL3("Verb pattern '%.*s' may not have an argument",
8349 verb_len, start_verb);
8351 ret = reg_node(pRExC_state, op);
8353 nextchar(pRExC_state);
8356 if (*RExC_parse == '?') { /* (?...) */
8357 bool is_logical = 0;
8358 const char * const seqstart = RExC_parse;
8359 bool has_use_defaults = FALSE;
8362 paren = *RExC_parse++;
8363 ret = NULL; /* For look-ahead/behind. */
8366 case 'P': /* (?P...) variants for those used to PCRE/Python */
8367 paren = *RExC_parse++;
8368 if ( paren == '<') /* (?P<...>) named capture */
8370 else if (paren == '>') { /* (?P>name) named recursion */
8371 goto named_recursion;
8373 else if (paren == '=') { /* (?P=...) named backref */
8374 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8375 you change this make sure you change that */
8376 char* name_start = RExC_parse;
8378 SV *sv_dat = reg_scan_name(pRExC_state,
8379 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8380 if (RExC_parse == name_start || *RExC_parse != ')')
8381 vFAIL2("Sequence %.3s... not terminated",parse_start);
8384 num = add_data( pRExC_state, 1, "S" );
8385 RExC_rxi->data->data[num]=(void*)sv_dat;
8386 SvREFCNT_inc_simple_void(sv_dat);
8389 ret = reganode(pRExC_state,
8392 : (MORE_ASCII_RESTRICTED)
8394 : (AT_LEAST_UNI_SEMANTICS)
8402 Set_Node_Offset(ret, parse_start+1);
8403 Set_Node_Cur_Length(ret); /* MJD */
8405 nextchar(pRExC_state);
8409 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8411 case '<': /* (?<...) */
8412 if (*RExC_parse == '!')
8414 else if (*RExC_parse != '=')
8420 case '\'': /* (?'...') */
8421 name_start= RExC_parse;
8422 svname = reg_scan_name(pRExC_state,
8423 SIZE_ONLY ? /* reverse test from the others */
8424 REG_RSN_RETURN_NAME :
8425 REG_RSN_RETURN_NULL);
8426 if (RExC_parse == name_start) {
8428 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8431 if (*RExC_parse != paren)
8432 vFAIL2("Sequence (?%c... not terminated",
8433 paren=='>' ? '<' : paren);
8437 if (!svname) /* shouldn't happen */
8439 "panic: reg_scan_name returned NULL");
8440 if (!RExC_paren_names) {
8441 RExC_paren_names= newHV();
8442 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8444 RExC_paren_name_list= newAV();
8445 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8448 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8450 sv_dat = HeVAL(he_str);
8452 /* croak baby croak */
8454 "panic: paren_name hash element allocation failed");
8455 } else if ( SvPOK(sv_dat) ) {
8456 /* (?|...) can mean we have dupes so scan to check
8457 its already been stored. Maybe a flag indicating
8458 we are inside such a construct would be useful,
8459 but the arrays are likely to be quite small, so
8460 for now we punt -- dmq */
8461 IV count = SvIV(sv_dat);
8462 I32 *pv = (I32*)SvPVX(sv_dat);
8464 for ( i = 0 ; i < count ; i++ ) {
8465 if ( pv[i] == RExC_npar ) {
8471 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8472 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8473 pv[count] = RExC_npar;
8474 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8477 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8478 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8480 SvIV_set(sv_dat, 1);
8483 /* Yes this does cause a memory leak in debugging Perls */
8484 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8485 SvREFCNT_dec(svname);
8488 /*sv_dump(sv_dat);*/
8490 nextchar(pRExC_state);
8492 goto capturing_parens;
8494 RExC_seen |= REG_SEEN_LOOKBEHIND;
8495 RExC_in_lookbehind++;
8497 case '=': /* (?=...) */
8498 RExC_seen_zerolen++;
8500 case '!': /* (?!...) */
8501 RExC_seen_zerolen++;
8502 if (*RExC_parse == ')') {
8503 ret=reg_node(pRExC_state, OPFAIL);
8504 nextchar(pRExC_state);
8508 case '|': /* (?|...) */
8509 /* branch reset, behave like a (?:...) except that
8510 buffers in alternations share the same numbers */
8512 after_freeze = freeze_paren = RExC_npar;
8514 case ':': /* (?:...) */
8515 case '>': /* (?>...) */
8517 case '$': /* (?$...) */
8518 case '@': /* (?@...) */
8519 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8521 case '#': /* (?#...) */
8522 while (*RExC_parse && *RExC_parse != ')')
8524 if (*RExC_parse != ')')
8525 FAIL("Sequence (?#... not terminated");
8526 nextchar(pRExC_state);
8529 case '0' : /* (?0) */
8530 case 'R' : /* (?R) */
8531 if (*RExC_parse != ')')
8532 FAIL("Sequence (?R) not terminated");
8533 ret = reg_node(pRExC_state, GOSTART);
8534 *flagp |= POSTPONED;
8535 nextchar(pRExC_state);
8538 { /* named and numeric backreferences */
8540 case '&': /* (?&NAME) */
8541 parse_start = RExC_parse - 1;
8544 SV *sv_dat = reg_scan_name(pRExC_state,
8545 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8546 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8548 goto gen_recurse_regop;
8549 assert(0); /* NOT REACHED */
8551 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8553 vFAIL("Illegal pattern");
8555 goto parse_recursion;
8557 case '-': /* (?-1) */
8558 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8559 RExC_parse--; /* rewind to let it be handled later */
8563 case '1': case '2': case '3': case '4': /* (?1) */
8564 case '5': case '6': case '7': case '8': case '9':
8567 num = atoi(RExC_parse);
8568 parse_start = RExC_parse - 1; /* MJD */
8569 if (*RExC_parse == '-')
8571 while (isDIGIT(*RExC_parse))
8573 if (*RExC_parse!=')')
8574 vFAIL("Expecting close bracket");
8577 if ( paren == '-' ) {
8579 Diagram of capture buffer numbering.
8580 Top line is the normal capture buffer numbers
8581 Bottom line is the negative indexing as from
8585 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8589 num = RExC_npar + num;
8592 vFAIL("Reference to nonexistent group");
8594 } else if ( paren == '+' ) {
8595 num = RExC_npar + num - 1;
8598 ret = reganode(pRExC_state, GOSUB, num);
8600 if (num > (I32)RExC_rx->nparens) {
8602 vFAIL("Reference to nonexistent group");
8604 ARG2L_SET( ret, RExC_recurse_count++);
8606 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8607 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8611 RExC_seen |= REG_SEEN_RECURSE;
8612 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8613 Set_Node_Offset(ret, parse_start); /* MJD */
8615 *flagp |= POSTPONED;
8616 nextchar(pRExC_state);
8618 } /* named and numeric backreferences */
8619 assert(0); /* NOT REACHED */
8621 case '?': /* (??...) */
8623 if (*RExC_parse != '{') {
8625 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8628 *flagp |= POSTPONED;
8629 paren = *RExC_parse++;
8631 case '{': /* (?{...}) */
8634 struct reg_code_block *cb;
8636 RExC_seen_zerolen++;
8638 if ( !pRExC_state->num_code_blocks
8639 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8640 || pRExC_state->code_blocks[pRExC_state->code_index].start
8641 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8644 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8645 FAIL("panic: Sequence (?{...}): no code block found\n");
8646 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8648 /* this is a pre-compiled code block (?{...}) */
8649 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8650 RExC_parse = RExC_start + cb->end;
8653 if (cb->src_regex) {
8654 n = add_data(pRExC_state, 2, "rl");
8655 RExC_rxi->data->data[n] =
8656 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8657 RExC_rxi->data->data[n+1] = (void*)o;
8660 n = add_data(pRExC_state, 1,
8661 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8662 RExC_rxi->data->data[n] = (void*)o;
8665 pRExC_state->code_index++;
8666 nextchar(pRExC_state);
8670 ret = reg_node(pRExC_state, LOGICAL);
8671 eval = reganode(pRExC_state, EVAL, n);
8674 /* for later propagation into (??{}) return value */
8675 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8677 REGTAIL(pRExC_state, ret, eval);
8678 /* deal with the length of this later - MJD */
8681 ret = reganode(pRExC_state, EVAL, n);
8682 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8683 Set_Node_Offset(ret, parse_start);
8686 case '(': /* (?(?{...})...) and (?(?=...)...) */
8689 if (RExC_parse[0] == '?') { /* (?(?...)) */
8690 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8691 || RExC_parse[1] == '<'
8692 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8695 ret = reg_node(pRExC_state, LOGICAL);
8698 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8702 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8703 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8705 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8706 char *name_start= RExC_parse++;
8708 SV *sv_dat=reg_scan_name(pRExC_state,
8709 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8710 if (RExC_parse == name_start || *RExC_parse != ch)
8711 vFAIL2("Sequence (?(%c... not terminated",
8712 (ch == '>' ? '<' : ch));
8715 num = add_data( pRExC_state, 1, "S" );
8716 RExC_rxi->data->data[num]=(void*)sv_dat;
8717 SvREFCNT_inc_simple_void(sv_dat);
8719 ret = reganode(pRExC_state,NGROUPP,num);
8720 goto insert_if_check_paren;
8722 else if (RExC_parse[0] == 'D' &&
8723 RExC_parse[1] == 'E' &&
8724 RExC_parse[2] == 'F' &&
8725 RExC_parse[3] == 'I' &&
8726 RExC_parse[4] == 'N' &&
8727 RExC_parse[5] == 'E')
8729 ret = reganode(pRExC_state,DEFINEP,0);
8732 goto insert_if_check_paren;
8734 else if (RExC_parse[0] == 'R') {
8737 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8738 parno = atoi(RExC_parse++);
8739 while (isDIGIT(*RExC_parse))
8741 } else if (RExC_parse[0] == '&') {
8744 sv_dat = reg_scan_name(pRExC_state,
8745 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8746 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8748 ret = reganode(pRExC_state,INSUBP,parno);
8749 goto insert_if_check_paren;
8751 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8754 parno = atoi(RExC_parse++);
8756 while (isDIGIT(*RExC_parse))
8758 ret = reganode(pRExC_state, GROUPP, parno);
8760 insert_if_check_paren:
8761 if ((c = *nextchar(pRExC_state)) != ')')
8762 vFAIL("Switch condition not recognized");
8764 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8765 br = regbranch(pRExC_state, &flags, 1,depth+1);
8767 br = reganode(pRExC_state, LONGJMP, 0);
8769 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8770 c = *nextchar(pRExC_state);
8775 vFAIL("(?(DEFINE)....) does not allow branches");
8776 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8777 regbranch(pRExC_state, &flags, 1,depth+1);
8778 REGTAIL(pRExC_state, ret, lastbr);
8781 c = *nextchar(pRExC_state);
8786 vFAIL("Switch (?(condition)... contains too many branches");
8787 ender = reg_node(pRExC_state, TAIL);
8788 REGTAIL(pRExC_state, br, ender);
8790 REGTAIL(pRExC_state, lastbr, ender);
8791 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8794 REGTAIL(pRExC_state, ret, ender);
8795 RExC_size++; /* XXX WHY do we need this?!!
8796 For large programs it seems to be required
8797 but I can't figure out why. -- dmq*/
8801 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8805 RExC_parse--; /* for vFAIL to print correctly */
8806 vFAIL("Sequence (? incomplete");
8808 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8810 has_use_defaults = TRUE;
8811 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8812 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8813 ? REGEX_UNICODE_CHARSET
8814 : REGEX_DEPENDS_CHARSET);
8818 parse_flags: /* (?i) */
8820 U32 posflags = 0, negflags = 0;
8821 U32 *flagsp = &posflags;
8822 char has_charset_modifier = '\0';
8823 regex_charset cs = get_regex_charset(RExC_flags);
8824 if (cs == REGEX_DEPENDS_CHARSET
8825 && (RExC_utf8 || RExC_uni_semantics))
8827 cs = REGEX_UNICODE_CHARSET;
8830 while (*RExC_parse) {
8831 /* && strchr("iogcmsx", *RExC_parse) */
8832 /* (?g), (?gc) and (?o) are useless here
8833 and must be globally applied -- japhy */
8834 switch (*RExC_parse) {
8835 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8836 case LOCALE_PAT_MOD:
8837 if (has_charset_modifier) {
8838 goto excess_modifier;
8840 else if (flagsp == &negflags) {
8843 cs = REGEX_LOCALE_CHARSET;
8844 has_charset_modifier = LOCALE_PAT_MOD;
8845 RExC_contains_locale = 1;
8847 case UNICODE_PAT_MOD:
8848 if (has_charset_modifier) {
8849 goto excess_modifier;
8851 else if (flagsp == &negflags) {
8854 cs = REGEX_UNICODE_CHARSET;
8855 has_charset_modifier = UNICODE_PAT_MOD;
8857 case ASCII_RESTRICT_PAT_MOD:
8858 if (flagsp == &negflags) {
8861 if (has_charset_modifier) {
8862 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8863 goto excess_modifier;
8865 /* Doubled modifier implies more restricted */
8866 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8869 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8871 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8873 case DEPENDS_PAT_MOD:
8874 if (has_use_defaults) {
8875 goto fail_modifiers;
8877 else if (flagsp == &negflags) {
8880 else if (has_charset_modifier) {
8881 goto excess_modifier;
8884 /* The dual charset means unicode semantics if the
8885 * pattern (or target, not known until runtime) are
8886 * utf8, or something in the pattern indicates unicode
8888 cs = (RExC_utf8 || RExC_uni_semantics)
8889 ? REGEX_UNICODE_CHARSET
8890 : REGEX_DEPENDS_CHARSET;
8891 has_charset_modifier = DEPENDS_PAT_MOD;
8895 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8896 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8898 else if (has_charset_modifier == *(RExC_parse - 1)) {
8899 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8902 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8907 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8909 case ONCE_PAT_MOD: /* 'o' */
8910 case GLOBAL_PAT_MOD: /* 'g' */
8911 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8912 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8913 if (! (wastedflags & wflagbit) ) {
8914 wastedflags |= wflagbit;
8917 "Useless (%s%c) - %suse /%c modifier",
8918 flagsp == &negflags ? "?-" : "?",
8920 flagsp == &negflags ? "don't " : "",
8927 case CONTINUE_PAT_MOD: /* 'c' */
8928 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8929 if (! (wastedflags & WASTED_C) ) {
8930 wastedflags |= WASTED_GC;
8933 "Useless (%sc) - %suse /gc modifier",
8934 flagsp == &negflags ? "?-" : "?",
8935 flagsp == &negflags ? "don't " : ""
8940 case KEEPCOPY_PAT_MOD: /* 'p' */
8941 if (flagsp == &negflags) {
8943 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8945 *flagsp |= RXf_PMf_KEEPCOPY;
8949 /* A flag is a default iff it is following a minus, so
8950 * if there is a minus, it means will be trying to
8951 * re-specify a default which is an error */
8952 if (has_use_defaults || flagsp == &negflags) {
8955 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8959 wastedflags = 0; /* reset so (?g-c) warns twice */
8965 RExC_flags |= posflags;
8966 RExC_flags &= ~negflags;
8967 set_regex_charset(&RExC_flags, cs);
8969 oregflags |= posflags;
8970 oregflags &= ~negflags;
8971 set_regex_charset(&oregflags, cs);
8973 nextchar(pRExC_state);
8984 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8989 }} /* one for the default block, one for the switch */
8996 ret = reganode(pRExC_state, OPEN, parno);
8999 RExC_nestroot = parno;
9000 if (RExC_seen & REG_SEEN_RECURSE
9001 && !RExC_open_parens[parno-1])
9003 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9004 "Setting open paren #%"IVdf" to %d\n",
9005 (IV)parno, REG_NODE_NUM(ret)));
9006 RExC_open_parens[parno-1]= ret;
9009 Set_Node_Length(ret, 1); /* MJD */
9010 Set_Node_Offset(ret, RExC_parse); /* MJD */
9018 /* Pick up the branches, linking them together. */
9019 parse_start = RExC_parse; /* MJD */
9020 br = regbranch(pRExC_state, &flags, 1,depth+1);
9022 /* branch_len = (paren != 0); */
9026 if (*RExC_parse == '|') {
9027 if (!SIZE_ONLY && RExC_extralen) {
9028 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9031 reginsert(pRExC_state, BRANCH, br, depth+1);
9032 Set_Node_Length(br, paren != 0);
9033 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9037 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9039 else if (paren == ':') {
9040 *flagp |= flags&SIMPLE;
9042 if (is_open) { /* Starts with OPEN. */
9043 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9045 else if (paren != '?') /* Not Conditional */
9047 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9049 while (*RExC_parse == '|') {
9050 if (!SIZE_ONLY && RExC_extralen) {
9051 ender = reganode(pRExC_state, LONGJMP,0);
9052 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9055 RExC_extralen += 2; /* Account for LONGJMP. */
9056 nextchar(pRExC_state);
9058 if (RExC_npar > after_freeze)
9059 after_freeze = RExC_npar;
9060 RExC_npar = freeze_paren;
9062 br = regbranch(pRExC_state, &flags, 0, depth+1);
9066 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9068 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9071 if (have_branch || paren != ':') {
9072 /* Make a closing node, and hook it on the end. */
9075 ender = reg_node(pRExC_state, TAIL);
9078 ender = reganode(pRExC_state, CLOSE, parno);
9079 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9080 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9081 "Setting close paren #%"IVdf" to %d\n",
9082 (IV)parno, REG_NODE_NUM(ender)));
9083 RExC_close_parens[parno-1]= ender;
9084 if (RExC_nestroot == parno)
9087 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9088 Set_Node_Length(ender,1); /* MJD */
9094 *flagp &= ~HASWIDTH;
9097 ender = reg_node(pRExC_state, SUCCEED);
9100 ender = reg_node(pRExC_state, END);
9102 assert(!RExC_opend); /* there can only be one! */
9107 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9108 SV * const mysv_val1=sv_newmortal();
9109 SV * const mysv_val2=sv_newmortal();
9110 DEBUG_PARSE_MSG("lsbr");
9111 regprop(RExC_rx, mysv_val1, lastbr);
9112 regprop(RExC_rx, mysv_val2, ender);
9113 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9114 SvPV_nolen_const(mysv_val1),
9115 (IV)REG_NODE_NUM(lastbr),
9116 SvPV_nolen_const(mysv_val2),
9117 (IV)REG_NODE_NUM(ender),
9118 (IV)(ender - lastbr)
9121 REGTAIL(pRExC_state, lastbr, ender);
9123 if (have_branch && !SIZE_ONLY) {
9126 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9128 /* Hook the tails of the branches to the closing node. */
9129 for (br = ret; br; br = regnext(br)) {
9130 const U8 op = PL_regkind[OP(br)];
9132 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9133 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9136 else if (op == BRANCHJ) {
9137 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9138 /* for now we always disable this optimisation * /
9139 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9145 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9146 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9147 SV * const mysv_val1=sv_newmortal();
9148 SV * const mysv_val2=sv_newmortal();
9149 DEBUG_PARSE_MSG("NADA");
9150 regprop(RExC_rx, mysv_val1, ret);
9151 regprop(RExC_rx, mysv_val2, ender);
9152 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9153 SvPV_nolen_const(mysv_val1),
9154 (IV)REG_NODE_NUM(ret),
9155 SvPV_nolen_const(mysv_val2),
9156 (IV)REG_NODE_NUM(ender),
9161 if (OP(ender) == TAIL) {
9166 for ( opt= br + 1; opt < ender ; opt++ )
9168 NEXT_OFF(br)= ender - br;
9176 static const char parens[] = "=!<,>";
9178 if (paren && (p = strchr(parens, paren))) {
9179 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9180 int flag = (p - parens) > 1;
9183 node = SUSPEND, flag = 0;
9184 reginsert(pRExC_state, node,ret, depth+1);
9185 Set_Node_Cur_Length(ret);
9186 Set_Node_Offset(ret, parse_start + 1);
9188 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9192 /* Check for proper termination. */
9194 RExC_flags = oregflags;
9195 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9196 RExC_parse = oregcomp_parse;
9197 vFAIL("Unmatched (");
9200 else if (!paren && RExC_parse < RExC_end) {
9201 if (*RExC_parse == ')') {
9203 vFAIL("Unmatched )");
9206 FAIL("Junk on end of regexp"); /* "Can't happen". */
9207 assert(0); /* NOTREACHED */
9210 if (RExC_in_lookbehind) {
9211 RExC_in_lookbehind--;
9213 if (after_freeze > RExC_npar)
9214 RExC_npar = after_freeze;
9219 - regbranch - one alternative of an | operator
9221 * Implements the concatenation operator.
9224 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9227 register regnode *ret;
9228 register regnode *chain = NULL;
9229 register regnode *latest;
9230 I32 flags = 0, c = 0;
9231 GET_RE_DEBUG_FLAGS_DECL;
9233 PERL_ARGS_ASSERT_REGBRANCH;
9235 DEBUG_PARSE("brnc");
9240 if (!SIZE_ONLY && RExC_extralen)
9241 ret = reganode(pRExC_state, BRANCHJ,0);
9243 ret = reg_node(pRExC_state, BRANCH);
9244 Set_Node_Length(ret, 1);
9248 if (!first && SIZE_ONLY)
9249 RExC_extralen += 1; /* BRANCHJ */
9251 *flagp = WORST; /* Tentatively. */
9254 nextchar(pRExC_state);
9255 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9257 latest = regpiece(pRExC_state, &flags,depth+1);
9258 if (latest == NULL) {
9259 if (flags & TRYAGAIN)
9263 else if (ret == NULL)
9265 *flagp |= flags&(HASWIDTH|POSTPONED);
9266 if (chain == NULL) /* First piece. */
9267 *flagp |= flags&SPSTART;
9270 REGTAIL(pRExC_state, chain, latest);
9275 if (chain == NULL) { /* Loop ran zero times. */
9276 chain = reg_node(pRExC_state, NOTHING);
9281 *flagp |= flags&SIMPLE;
9288 - regpiece - something followed by possible [*+?]
9290 * Note that the branching code sequences used for ? and the general cases
9291 * of * and + are somewhat optimized: they use the same NOTHING node as
9292 * both the endmarker for their branch list and the body of the last branch.
9293 * It might seem that this node could be dispensed with entirely, but the
9294 * endmarker role is not redundant.
9297 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9300 register regnode *ret;
9302 register char *next;
9304 const char * const origparse = RExC_parse;
9306 I32 max = REG_INFTY;
9307 #ifdef RE_TRACK_PATTERN_OFFSETS
9310 const char *maxpos = NULL;
9311 GET_RE_DEBUG_FLAGS_DECL;
9313 PERL_ARGS_ASSERT_REGPIECE;
9315 DEBUG_PARSE("piec");
9317 ret = regatom(pRExC_state, &flags,depth+1);
9319 if (flags & TRYAGAIN)
9326 if (op == '{' && regcurly(RExC_parse)) {
9328 #ifdef RE_TRACK_PATTERN_OFFSETS
9329 parse_start = RExC_parse; /* MJD */
9331 next = RExC_parse + 1;
9332 while (isDIGIT(*next) || *next == ',') {
9341 if (*next == '}') { /* got one */
9345 min = atoi(RExC_parse);
9349 maxpos = RExC_parse;
9351 if (!max && *maxpos != '0')
9352 max = REG_INFTY; /* meaning "infinity" */
9353 else if (max >= REG_INFTY)
9354 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9356 nextchar(pRExC_state);
9359 if ((flags&SIMPLE)) {
9360 RExC_naughty += 2 + RExC_naughty / 2;
9361 reginsert(pRExC_state, CURLY, ret, depth+1);
9362 Set_Node_Offset(ret, parse_start+1); /* MJD */
9363 Set_Node_Cur_Length(ret);
9366 regnode * const w = reg_node(pRExC_state, WHILEM);
9369 REGTAIL(pRExC_state, ret, w);
9370 if (!SIZE_ONLY && RExC_extralen) {
9371 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9372 reginsert(pRExC_state, NOTHING,ret, depth+1);
9373 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9375 reginsert(pRExC_state, CURLYX,ret, depth+1);
9377 Set_Node_Offset(ret, parse_start+1);
9378 Set_Node_Length(ret,
9379 op == '{' ? (RExC_parse - parse_start) : 1);
9381 if (!SIZE_ONLY && RExC_extralen)
9382 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9383 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9385 RExC_whilem_seen++, RExC_extralen += 3;
9386 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9395 vFAIL("Can't do {n,m} with n > m");
9397 ARG1_SET(ret, (U16)min);
9398 ARG2_SET(ret, (U16)max);
9410 #if 0 /* Now runtime fix should be reliable. */
9412 /* if this is reinstated, don't forget to put this back into perldiag:
9414 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9416 (F) The part of the regexp subject to either the * or + quantifier
9417 could match an empty string. The {#} shows in the regular
9418 expression about where the problem was discovered.
9422 if (!(flags&HASWIDTH) && op != '?')
9423 vFAIL("Regexp *+ operand could be empty");
9426 #ifdef RE_TRACK_PATTERN_OFFSETS
9427 parse_start = RExC_parse;
9429 nextchar(pRExC_state);
9431 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9433 if (op == '*' && (flags&SIMPLE)) {
9434 reginsert(pRExC_state, STAR, ret, depth+1);
9438 else if (op == '*') {
9442 else if (op == '+' && (flags&SIMPLE)) {
9443 reginsert(pRExC_state, PLUS, ret, depth+1);
9447 else if (op == '+') {
9451 else if (op == '?') {
9456 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9457 ckWARN3reg(RExC_parse,
9458 "%.*s matches null string many times",
9459 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9463 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9464 nextchar(pRExC_state);
9465 reginsert(pRExC_state, MINMOD, ret, depth+1);
9466 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9468 #ifndef REG_ALLOW_MINMOD_SUSPEND
9471 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9473 nextchar(pRExC_state);
9474 ender = reg_node(pRExC_state, SUCCEED);
9475 REGTAIL(pRExC_state, ret, ender);
9476 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9478 ender = reg_node(pRExC_state, TAIL);
9479 REGTAIL(pRExC_state, ret, ender);
9483 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9485 vFAIL("Nested quantifiers");
9492 /* reg_namedseq(pRExC_state,UVp, UV depth)
9494 This is expected to be called by a parser routine that has
9495 recognized '\N' and needs to handle the rest. RExC_parse is
9496 expected to point at the first char following the N at the time
9499 The \N may be inside (indicated by valuep not being NULL) or outside a
9502 \N may begin either a named sequence, or if outside a character class, mean
9503 to match a non-newline. For non single-quoted regexes, the tokenizer has
9504 attempted to decide which, and in the case of a named sequence converted it
9505 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9506 where c1... are the characters in the sequence. For single-quoted regexes,
9507 the tokenizer passes the \N sequence through unchanged; this code will not
9508 attempt to determine this nor expand those. The net effect is that if the
9509 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9510 signals that this \N occurrence means to match a non-newline.
9512 Only the \N{U+...} form should occur in a character class, for the same
9513 reason that '.' inside a character class means to just match a period: it
9514 just doesn't make sense.
9516 If valuep is non-null then it is assumed that we are parsing inside
9517 of a charclass definition and the first codepoint in the resolved
9518 string is returned via *valuep and the routine will return NULL.
9519 In this mode if a multichar string is returned from the charnames
9520 handler, a warning will be issued, and only the first char in the
9521 sequence will be examined. If the string returned is zero length
9522 then the value of *valuep is undefined and NON-NULL will
9523 be returned to indicate failure. (This will NOT be a valid pointer
9526 If valuep is null then it is assumed that we are parsing normal text and a
9527 new EXACT node is inserted into the program containing the resolved string,
9528 and a pointer to the new node is returned. But if the string is zero length
9529 a NOTHING node is emitted instead.
9531 On success RExC_parse is set to the char following the endbrace.
9532 Parsing failures will generate a fatal error via vFAIL(...)
9535 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9537 char * endbrace; /* '}' following the name */
9538 regnode *ret = NULL;
9541 GET_RE_DEBUG_FLAGS_DECL;
9543 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9547 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9548 * modifier. The other meaning does not */
9549 p = (RExC_flags & RXf_PMf_EXTENDED)
9550 ? regwhite( pRExC_state, RExC_parse )
9553 /* Disambiguate between \N meaning a named character versus \N meaning
9554 * [^\n]. The former is assumed when it can't be the latter. */
9555 if (*p != '{' || regcurly(p)) {
9558 /* no bare \N in a charclass */
9559 vFAIL("\\N in a character class must be a named character: \\N{...}");
9561 nextchar(pRExC_state);
9562 ret = reg_node(pRExC_state, REG_ANY);
9563 *flagp |= HASWIDTH|SIMPLE;
9566 Set_Node_Length(ret, 1); /* MJD */
9570 /* Here, we have decided it should be a named sequence */
9572 /* The test above made sure that the next real character is a '{', but
9573 * under the /x modifier, it could be separated by space (or a comment and
9574 * \n) and this is not allowed (for consistency with \x{...} and the
9575 * tokenizer handling of \N{NAME}). */
9576 if (*RExC_parse != '{') {
9577 vFAIL("Missing braces on \\N{}");
9580 RExC_parse++; /* Skip past the '{' */
9582 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9583 || ! (endbrace == RExC_parse /* nothing between the {} */
9584 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9585 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9587 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9588 vFAIL("\\N{NAME} must be resolved by the lexer");
9591 if (endbrace == RExC_parse) { /* empty: \N{} */
9593 RExC_parse = endbrace + 1;
9594 return reg_node(pRExC_state,NOTHING);
9598 ckWARNreg(RExC_parse,
9599 "Ignoring zero length \\N{} in character class"
9601 RExC_parse = endbrace + 1;
9604 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9607 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
9608 RExC_parse += 2; /* Skip past the 'U+' */
9610 if (valuep) { /* In a bracketed char class */
9611 /* We only pay attention to the first char of
9612 multichar strings being returned. I kinda wonder
9613 if this makes sense as it does change the behaviour
9614 from earlier versions, OTOH that behaviour was broken
9615 as well. XXX Solution is to recharacterize as
9616 [rest-of-class]|multi1|multi2... */
9618 STRLEN length_of_hex;
9619 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9620 | PERL_SCAN_DISALLOW_PREFIX
9621 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9623 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9624 if (endchar < endbrace) {
9625 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9628 length_of_hex = (STRLEN)(endchar - RExC_parse);
9629 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9631 /* The tokenizer should have guaranteed validity, but it's possible to
9632 * bypass it by using single quoting, so check */
9633 if (length_of_hex == 0
9634 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9636 RExC_parse += length_of_hex; /* Includes all the valid */
9637 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9638 ? UTF8SKIP(RExC_parse)
9640 /* Guard against malformed utf8 */
9641 if (RExC_parse >= endchar) RExC_parse = endchar;
9642 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9645 RExC_parse = endbrace + 1;
9646 if (endchar == endbrace) return NULL;
9648 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
9650 else { /* Not a char class */
9652 /* What is done here is to convert this to a sub-pattern of the form
9653 * (?:\x{char1}\x{char2}...)
9654 * and then call reg recursively. That way, it retains its atomicness,
9655 * while not having to worry about special handling that some code
9656 * points may have. toke.c has converted the original Unicode values
9657 * to native, so that we can just pass on the hex values unchanged. We
9658 * do have to set a flag to keep recoding from happening in the
9661 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9663 char *endchar; /* Points to '.' or '}' ending cur char in the input
9665 char *orig_end = RExC_end;
9667 while (RExC_parse < endbrace) {
9669 /* Code points are separated by dots. If none, there is only one
9670 * code point, and is terminated by the brace */
9671 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9673 /* Convert to notation the rest of the code understands */
9674 sv_catpv(substitute_parse, "\\x{");
9675 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9676 sv_catpv(substitute_parse, "}");
9678 /* Point to the beginning of the next character in the sequence. */
9679 RExC_parse = endchar + 1;
9681 sv_catpv(substitute_parse, ")");
9683 RExC_parse = SvPV(substitute_parse, len);
9685 /* Don't allow empty number */
9687 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9689 RExC_end = RExC_parse + len;
9691 /* The values are Unicode, and therefore not subject to recoding */
9692 RExC_override_recoding = 1;
9694 ret = reg(pRExC_state, 1, flagp, depth+1);
9696 RExC_parse = endbrace;
9697 RExC_end = orig_end;
9698 RExC_override_recoding = 0;
9700 nextchar(pRExC_state);
9710 * It returns the code point in utf8 for the value in *encp.
9711 * value: a code value in the source encoding
9712 * encp: a pointer to an Encode object
9714 * If the result from Encode is not a single character,
9715 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9718 S_reg_recode(pTHX_ const char value, SV **encp)
9721 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9722 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9723 const STRLEN newlen = SvCUR(sv);
9724 UV uv = UNICODE_REPLACEMENT;
9726 PERL_ARGS_ASSERT_REG_RECODE;
9730 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9733 if (!newlen || numlen != newlen) {
9734 uv = UNICODE_REPLACEMENT;
9742 - regatom - the lowest level
9744 Try to identify anything special at the start of the pattern. If there
9745 is, then handle it as required. This may involve generating a single regop,
9746 such as for an assertion; or it may involve recursing, such as to
9747 handle a () structure.
9749 If the string doesn't start with something special then we gobble up
9750 as much literal text as we can.
9752 Once we have been able to handle whatever type of thing started the
9753 sequence, we return.
9755 Note: we have to be careful with escapes, as they can be both literal
9756 and special, and in the case of \10 and friends, context determines which.
9758 A summary of the code structure is:
9760 switch (first_byte) {
9761 cases for each special:
9762 handle this special;
9766 cases for each unambiguous special:
9767 handle this special;
9769 cases for each ambigous special/literal:
9771 if (special) handle here
9773 default: // unambiguously literal:
9776 default: // is a literal char
9779 create EXACTish node for literal;
9780 while (more input and node isn't full) {
9781 switch (input_byte) {
9782 cases for each special;
9783 make sure parse pointer is set so that the next call to
9784 regatom will see this special first
9785 goto loopdone; // EXACTish node terminated by prev. char
9787 append char to EXACTISH node;
9789 get next input byte;
9793 return the generated node;
9795 Specifically there are two separate switches for handling
9796 escape sequences, with the one for handling literal escapes requiring
9797 a dummy entry for all of the special escapes that are actually handled
9802 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9805 register regnode *ret = NULL;
9807 char *parse_start = RExC_parse;
9809 GET_RE_DEBUG_FLAGS_DECL;
9810 DEBUG_PARSE("atom");
9811 *flagp = WORST; /* Tentatively. */
9813 PERL_ARGS_ASSERT_REGATOM;
9816 switch ((U8)*RExC_parse) {
9818 RExC_seen_zerolen++;
9819 nextchar(pRExC_state);
9820 if (RExC_flags & RXf_PMf_MULTILINE)
9821 ret = reg_node(pRExC_state, MBOL);
9822 else if (RExC_flags & RXf_PMf_SINGLELINE)
9823 ret = reg_node(pRExC_state, SBOL);
9825 ret = reg_node(pRExC_state, BOL);
9826 Set_Node_Length(ret, 1); /* MJD */
9829 nextchar(pRExC_state);
9831 RExC_seen_zerolen++;
9832 if (RExC_flags & RXf_PMf_MULTILINE)
9833 ret = reg_node(pRExC_state, MEOL);
9834 else if (RExC_flags & RXf_PMf_SINGLELINE)
9835 ret = reg_node(pRExC_state, SEOL);
9837 ret = reg_node(pRExC_state, EOL);
9838 Set_Node_Length(ret, 1); /* MJD */
9841 nextchar(pRExC_state);
9842 if (RExC_flags & RXf_PMf_SINGLELINE)
9843 ret = reg_node(pRExC_state, SANY);
9845 ret = reg_node(pRExC_state, REG_ANY);
9846 *flagp |= HASWIDTH|SIMPLE;
9848 Set_Node_Length(ret, 1); /* MJD */
9852 char * const oregcomp_parse = ++RExC_parse;
9853 ret = regclass(pRExC_state,depth+1);
9854 if (*RExC_parse != ']') {
9855 RExC_parse = oregcomp_parse;
9856 vFAIL("Unmatched [");
9858 nextchar(pRExC_state);
9859 *flagp |= HASWIDTH|SIMPLE;
9860 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9864 nextchar(pRExC_state);
9865 ret = reg(pRExC_state, 1, &flags,depth+1);
9867 if (flags & TRYAGAIN) {
9868 if (RExC_parse == RExC_end) {
9869 /* Make parent create an empty node if needed. */
9877 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9881 if (flags & TRYAGAIN) {
9885 vFAIL("Internal urp");
9886 /* Supposed to be caught earlier. */
9892 vFAIL("Quantifier follows nothing");
9897 This switch handles escape sequences that resolve to some kind
9898 of special regop and not to literal text. Escape sequnces that
9899 resolve to literal text are handled below in the switch marked
9902 Every entry in this switch *must* have a corresponding entry
9903 in the literal escape switch. However, the opposite is not
9904 required, as the default for this switch is to jump to the
9905 literal text handling code.
9907 switch ((U8)*++RExC_parse) {
9908 /* Special Escapes */
9910 RExC_seen_zerolen++;
9911 ret = reg_node(pRExC_state, SBOL);
9913 goto finish_meta_pat;
9915 ret = reg_node(pRExC_state, GPOS);
9916 RExC_seen |= REG_SEEN_GPOS;
9918 goto finish_meta_pat;
9920 RExC_seen_zerolen++;
9921 ret = reg_node(pRExC_state, KEEPS);
9923 /* XXX:dmq : disabling in-place substitution seems to
9924 * be necessary here to avoid cases of memory corruption, as
9925 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9927 RExC_seen |= REG_SEEN_LOOKBEHIND;
9928 goto finish_meta_pat;
9930 ret = reg_node(pRExC_state, SEOL);
9932 RExC_seen_zerolen++; /* Do not optimize RE away */
9933 goto finish_meta_pat;
9935 ret = reg_node(pRExC_state, EOS);
9937 RExC_seen_zerolen++; /* Do not optimize RE away */
9938 goto finish_meta_pat;
9940 ret = reg_node(pRExC_state, CANY);
9941 RExC_seen |= REG_SEEN_CANY;
9942 *flagp |= HASWIDTH|SIMPLE;
9943 goto finish_meta_pat;
9945 ret = reg_node(pRExC_state, CLUMP);
9947 goto finish_meta_pat;
9949 op = ALNUM + get_regex_charset(RExC_flags);
9950 if (op > ALNUMA) { /* /aa is same as /a */
9953 ret = reg_node(pRExC_state, op);
9954 *flagp |= HASWIDTH|SIMPLE;
9955 goto finish_meta_pat;
9957 op = NALNUM + get_regex_charset(RExC_flags);
9958 if (op > NALNUMA) { /* /aa is same as /a */
9961 ret = reg_node(pRExC_state, op);
9962 *flagp |= HASWIDTH|SIMPLE;
9963 goto finish_meta_pat;
9965 RExC_seen_zerolen++;
9966 RExC_seen |= REG_SEEN_LOOKBEHIND;
9967 op = BOUND + get_regex_charset(RExC_flags);
9968 if (op > BOUNDA) { /* /aa is same as /a */
9971 ret = reg_node(pRExC_state, op);
9972 FLAGS(ret) = get_regex_charset(RExC_flags);
9974 goto finish_meta_pat;
9976 RExC_seen_zerolen++;
9977 RExC_seen |= REG_SEEN_LOOKBEHIND;
9978 op = NBOUND + get_regex_charset(RExC_flags);
9979 if (op > NBOUNDA) { /* /aa is same as /a */
9982 ret = reg_node(pRExC_state, op);
9983 FLAGS(ret) = get_regex_charset(RExC_flags);
9985 goto finish_meta_pat;
9987 op = SPACE + get_regex_charset(RExC_flags);
9988 if (op > SPACEA) { /* /aa is same as /a */
9991 ret = reg_node(pRExC_state, op);
9992 *flagp |= HASWIDTH|SIMPLE;
9993 goto finish_meta_pat;
9995 op = NSPACE + get_regex_charset(RExC_flags);
9996 if (op > NSPACEA) { /* /aa is same as /a */
9999 ret = reg_node(pRExC_state, op);
10000 *flagp |= HASWIDTH|SIMPLE;
10001 goto finish_meta_pat;
10009 U8 offset = get_regex_charset(RExC_flags);
10010 if (offset == REGEX_UNICODE_CHARSET) {
10011 offset = REGEX_DEPENDS_CHARSET;
10013 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10014 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10018 ret = reg_node(pRExC_state, op);
10019 *flagp |= HASWIDTH|SIMPLE;
10020 goto finish_meta_pat;
10022 ret = reg_node(pRExC_state, LNBREAK);
10023 *flagp |= HASWIDTH|SIMPLE;
10024 goto finish_meta_pat;
10026 ret = reg_node(pRExC_state, HORIZWS);
10027 *flagp |= HASWIDTH|SIMPLE;
10028 goto finish_meta_pat;
10030 ret = reg_node(pRExC_state, NHORIZWS);
10031 *flagp |= HASWIDTH|SIMPLE;
10032 goto finish_meta_pat;
10034 ret = reg_node(pRExC_state, VERTWS);
10035 *flagp |= HASWIDTH|SIMPLE;
10036 goto finish_meta_pat;
10038 ret = reg_node(pRExC_state, NVERTWS);
10039 *flagp |= HASWIDTH|SIMPLE;
10041 nextchar(pRExC_state);
10042 Set_Node_Length(ret, 2); /* MJD */
10047 char* const oldregxend = RExC_end;
10049 char* parse_start = RExC_parse - 2;
10052 if (RExC_parse[1] == '{') {
10053 /* a lovely hack--pretend we saw [\pX] instead */
10054 RExC_end = strchr(RExC_parse, '}');
10056 const U8 c = (U8)*RExC_parse;
10058 RExC_end = oldregxend;
10059 vFAIL2("Missing right brace on \\%c{}", c);
10064 RExC_end = RExC_parse + 2;
10065 if (RExC_end > oldregxend)
10066 RExC_end = oldregxend;
10070 ret = regclass(pRExC_state,depth+1);
10072 RExC_end = oldregxend;
10075 Set_Node_Offset(ret, parse_start + 2);
10076 Set_Node_Cur_Length(ret);
10077 nextchar(pRExC_state);
10078 *flagp |= HASWIDTH|SIMPLE;
10082 /* Handle \N and \N{NAME} here and not below because it can be
10083 multicharacter. join_exact() will join them up later on.
10084 Also this makes sure that things like /\N{BLAH}+/ and
10085 \N{BLAH} being multi char Just Happen. dmq*/
10087 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
10089 case 'k': /* Handle \k<NAME> and \k'NAME' */
10092 char ch= RExC_parse[1];
10093 if (ch != '<' && ch != '\'' && ch != '{') {
10095 vFAIL2("Sequence %.2s... not terminated",parse_start);
10097 /* this pretty much dupes the code for (?P=...) in reg(), if
10098 you change this make sure you change that */
10099 char* name_start = (RExC_parse += 2);
10101 SV *sv_dat = reg_scan_name(pRExC_state,
10102 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10103 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10104 if (RExC_parse == name_start || *RExC_parse != ch)
10105 vFAIL2("Sequence %.3s... not terminated",parse_start);
10108 num = add_data( pRExC_state, 1, "S" );
10109 RExC_rxi->data->data[num]=(void*)sv_dat;
10110 SvREFCNT_inc_simple_void(sv_dat);
10114 ret = reganode(pRExC_state,
10117 : (MORE_ASCII_RESTRICTED)
10119 : (AT_LEAST_UNI_SEMANTICS)
10125 *flagp |= HASWIDTH;
10127 /* override incorrect value set in reganode MJD */
10128 Set_Node_Offset(ret, parse_start+1);
10129 Set_Node_Cur_Length(ret); /* MJD */
10130 nextchar(pRExC_state);
10136 case '1': case '2': case '3': case '4':
10137 case '5': case '6': case '7': case '8': case '9':
10140 bool isg = *RExC_parse == 'g';
10145 if (*RExC_parse == '{') {
10149 if (*RExC_parse == '-') {
10153 if (hasbrace && !isDIGIT(*RExC_parse)) {
10154 if (isrel) RExC_parse--;
10156 goto parse_named_seq;
10158 num = atoi(RExC_parse);
10159 if (isg && num == 0)
10160 vFAIL("Reference to invalid group 0");
10162 num = RExC_npar - num;
10164 vFAIL("Reference to nonexistent or unclosed group");
10166 if (!isg && num > 9 && num >= RExC_npar)
10167 /* Probably a character specified in octal, e.g. \35 */
10170 char * const parse_start = RExC_parse - 1; /* MJD */
10171 while (isDIGIT(*RExC_parse))
10173 if (parse_start == RExC_parse - 1)
10174 vFAIL("Unterminated \\g... pattern");
10176 if (*RExC_parse != '}')
10177 vFAIL("Unterminated \\g{...} pattern");
10181 if (num > (I32)RExC_rx->nparens)
10182 vFAIL("Reference to nonexistent group");
10185 ret = reganode(pRExC_state,
10188 : (MORE_ASCII_RESTRICTED)
10190 : (AT_LEAST_UNI_SEMANTICS)
10196 *flagp |= HASWIDTH;
10198 /* override incorrect value set in reganode MJD */
10199 Set_Node_Offset(ret, parse_start+1);
10200 Set_Node_Cur_Length(ret); /* MJD */
10202 nextchar(pRExC_state);
10207 if (RExC_parse >= RExC_end)
10208 FAIL("Trailing \\");
10211 /* Do not generate "unrecognized" warnings here, we fall
10212 back into the quick-grab loop below */
10219 if (RExC_flags & RXf_PMf_EXTENDED) {
10220 if ( reg_skipcomment( pRExC_state ) )
10227 parse_start = RExC_parse - 1;
10232 register STRLEN len;
10237 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10240 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
10241 * it is folded to 'ss' even if not utf8 */
10242 bool is_exactfu_sharp_s;
10249 node_type = get_regex_charset(RExC_flags);
10250 if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) {
10251 node_type--; /* /a is same as /u, and map /aa's offset to
10252 what /a's would have been, so there is no
10255 node_type += EXACTF;
10257 ret = reg_node(pRExC_state, node_type);
10260 /* XXX The node can hold up to 255 bytes, yet this only goes to
10261 * 127. I (khw) do not know why. Keeping it somewhat less than
10262 * 255 allows us to not have to worry about overflow due to
10263 * converting to utf8 and fold expansion, but that value is
10264 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10265 * split up by this limit into a single one using the real max of
10266 * 255. Even at 127, this breaks under rare circumstances. If
10267 * folding, we do not want to split a node at a character that is a
10268 * non-final in a multi-char fold, as an input string could just
10269 * happen to want to match across the node boundary. The join
10270 * would solve that problem if the join actually happens. But a
10271 * series of more than two nodes in a row each of 127 would cause
10272 * the first join to succeed to get to 254, but then there wouldn't
10273 * be room for the next one, which could at be one of those split
10274 * multi-char folds. I don't know of any fool-proof solution. One
10275 * could back off to end with only a code point that isn't such a
10276 * non-final, but it is possible for there not to be any in the
10278 for (len = 0, p = RExC_parse - 1;
10279 len < 127 && p < RExC_end;
10282 char * const oldp = p;
10284 if (RExC_flags & RXf_PMf_EXTENDED)
10285 p = regwhite( pRExC_state, p );
10296 /* Literal Escapes Switch
10298 This switch is meant to handle escape sequences that
10299 resolve to a literal character.
10301 Every escape sequence that represents something
10302 else, like an assertion or a char class, is handled
10303 in the switch marked 'Special Escapes' above in this
10304 routine, but also has an entry here as anything that
10305 isn't explicitly mentioned here will be treated as
10306 an unescaped equivalent literal.
10309 switch ((U8)*++p) {
10310 /* These are all the special escapes. */
10311 case 'A': /* Start assertion */
10312 case 'b': case 'B': /* Word-boundary assertion*/
10313 case 'C': /* Single char !DANGEROUS! */
10314 case 'd': case 'D': /* digit class */
10315 case 'g': case 'G': /* generic-backref, pos assertion */
10316 case 'h': case 'H': /* HORIZWS */
10317 case 'k': case 'K': /* named backref, keep marker */
10318 case 'N': /* named char sequence */
10319 case 'p': case 'P': /* Unicode property */
10320 case 'R': /* LNBREAK */
10321 case 's': case 'S': /* space class */
10322 case 'v': case 'V': /* VERTWS */
10323 case 'w': case 'W': /* word class */
10324 case 'X': /* eXtended Unicode "combining character sequence" */
10325 case 'z': case 'Z': /* End of line/string assertion */
10329 /* Anything after here is an escape that resolves to a
10330 literal. (Except digits, which may or may not)
10349 ender = ASCII_TO_NATIVE('\033');
10353 ender = ASCII_TO_NATIVE('\007');
10358 STRLEN brace_len = len;
10360 const char* error_msg;
10362 bool valid = grok_bslash_o(p,
10369 RExC_parse = p; /* going to die anyway; point
10370 to exact spot of failure */
10377 if (PL_encoding && ender < 0x100) {
10378 goto recode_encoding;
10380 if (ender > 0xff) {
10387 STRLEN brace_len = len;
10389 const char* error_msg;
10391 bool valid = grok_bslash_x(p,
10398 RExC_parse = p; /* going to die anyway; point
10399 to exact spot of failure */
10405 if (PL_encoding && ender < 0x100) {
10406 goto recode_encoding;
10408 if (ender > 0xff) {
10415 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10417 case '0': case '1': case '2': case '3':case '4':
10418 case '5': case '6': case '7':
10420 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10422 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10424 ender = grok_oct(p, &numlen, &flags, NULL);
10425 if (ender > 0xff) {
10434 if (PL_encoding && ender < 0x100)
10435 goto recode_encoding;
10438 if (! RExC_override_recoding) {
10439 SV* enc = PL_encoding;
10440 ender = reg_recode((const char)(U8)ender, &enc);
10441 if (!enc && SIZE_ONLY)
10442 ckWARNreg(p, "Invalid escape in the specified encoding");
10448 FAIL("Trailing \\");
10451 if (!SIZE_ONLY&& isALNUMC(*p)) {
10452 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10454 goto normal_default;
10458 /* Currently we don't warn when the lbrace is at the start
10459 * of a construct. This catches it in the middle of a
10460 * literal string, or when its the first thing after
10461 * something like "\b" */
10463 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10465 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10470 if (UTF8_IS_START(*p) && UTF) {
10472 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10473 &numlen, UTF8_ALLOW_DEFAULT);
10479 } /* End of switch on the literal */
10481 is_exactfu_sharp_s = (node_type == EXACTFU
10482 && ender == LATIN_SMALL_LETTER_SHARP_S);
10483 if ( RExC_flags & RXf_PMf_EXTENDED)
10484 p = regwhite( pRExC_state, p );
10485 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10486 /* Prime the casefolded buffer. Locale rules, which apply
10487 * only to code points < 256, aren't known until execution,
10488 * so for them, just output the original character using
10489 * utf8. If we start to fold non-UTF patterns, be sure to
10490 * update join_exact() */
10491 if (LOC && ender < 256) {
10492 if (UNI_IS_INVARIANT(ender)) {
10493 *tmpbuf = (U8) ender;
10496 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10497 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10501 else if (isASCII(ender)) { /* Note: Here can't also be LOC
10503 ender = toLOWER(ender);
10504 *tmpbuf = (U8) ender;
10507 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10509 /* Locale and /aa require more selectivity about the
10510 * fold, so are handled below. Otherwise, here, just
10512 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10515 /* Under locale rules or /aa we are not to mix,
10516 * respectively, ords < 256 or ASCII with non-. So
10517 * reject folds that mix them, using only the
10518 * non-folded code point. So do the fold to a
10519 * temporary, and inspect each character in it. */
10520 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10522 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10523 U8* e = s + foldlen;
10524 bool fold_ok = TRUE;
10528 || (LOC && (UTF8_IS_INVARIANT(*s)
10529 || UTF8_IS_DOWNGRADEABLE_START(*s))))
10537 Copy(trialbuf, tmpbuf, foldlen, U8);
10541 uvuni_to_utf8(tmpbuf, ender);
10542 foldlen = UNISKIP(ender);
10546 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10549 else if (UTF || is_exactfu_sharp_s) {
10551 /* Emit all the Unicode characters. */
10553 for (foldbuf = tmpbuf;
10555 foldlen -= numlen) {
10557 /* tmpbuf has been constructed by us, so we
10558 * know it is valid utf8 */
10559 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10561 const STRLEN unilen = reguni(pRExC_state, ender, s);
10564 /* In EBCDIC the numlen
10565 * and unilen can differ. */
10567 if (numlen >= foldlen)
10571 break; /* "Can't happen." */
10575 const STRLEN unilen = reguni(pRExC_state, ender, s);
10584 REGC((char)ender, s++);
10588 if (UTF || is_exactfu_sharp_s) {
10590 /* Emit all the Unicode characters. */
10592 for (foldbuf = tmpbuf;
10594 foldlen -= numlen) {
10595 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10597 const STRLEN unilen = reguni(pRExC_state, ender, s);
10600 /* In EBCDIC the numlen
10601 * and unilen can differ. */
10603 if (numlen >= foldlen)
10611 const STRLEN unilen = reguni(pRExC_state, ender, s);
10620 REGC((char)ender, s++);
10623 loopdone: /* Jumped to when encounters something that shouldn't be in
10625 RExC_parse = p - 1;
10626 Set_Node_Cur_Length(ret); /* MJD */
10627 nextchar(pRExC_state);
10629 /* len is STRLEN which is unsigned, need to copy to signed */
10632 vFAIL("Internal disaster");
10635 *flagp |= HASWIDTH;
10636 if (len == 1 && UNI_IS_INVARIANT(ender))
10640 RExC_size += STR_SZ(len);
10642 STR_LEN(ret) = len;
10643 RExC_emit += STR_SZ(len);
10653 S_regwhite( RExC_state_t *pRExC_state, char *p )
10655 const char *e = RExC_end;
10657 PERL_ARGS_ASSERT_REGWHITE;
10662 else if (*p == '#') {
10665 if (*p++ == '\n') {
10671 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10679 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10680 Character classes ([:foo:]) can also be negated ([:^foo:]).
10681 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10682 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10683 but trigger failures because they are currently unimplemented. */
10685 #define POSIXCC_DONE(c) ((c) == ':')
10686 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10687 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10690 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10693 I32 namedclass = OOB_NAMEDCLASS;
10695 PERL_ARGS_ASSERT_REGPPOSIXCC;
10697 if (value == '[' && RExC_parse + 1 < RExC_end &&
10698 /* I smell either [: or [= or [. -- POSIX has been here, right? */
10699 POSIXCC(UCHARAT(RExC_parse))) {
10700 const char c = UCHARAT(RExC_parse);
10701 char* const s = RExC_parse++;
10703 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10705 if (RExC_parse == RExC_end)
10706 /* Grandfather lone [:, [=, [. */
10709 const char* const t = RExC_parse++; /* skip over the c */
10712 if (UCHARAT(RExC_parse) == ']') {
10713 const char *posixcc = s + 1;
10714 RExC_parse++; /* skip over the ending ] */
10717 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10718 const I32 skip = t - posixcc;
10720 /* Initially switch on the length of the name. */
10723 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10724 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10727 /* Names all of length 5. */
10728 /* alnum alpha ascii blank cntrl digit graph lower
10729 print punct space upper */
10730 /* Offset 4 gives the best switch position. */
10731 switch (posixcc[4]) {
10733 if (memEQ(posixcc, "alph", 4)) /* alpha */
10734 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10737 if (memEQ(posixcc, "spac", 4)) /* space */
10738 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10741 if (memEQ(posixcc, "grap", 4)) /* graph */
10742 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10745 if (memEQ(posixcc, "asci", 4)) /* ascii */
10746 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10749 if (memEQ(posixcc, "blan", 4)) /* blank */
10750 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10753 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10754 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10757 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10758 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10761 if (memEQ(posixcc, "lowe", 4)) /* lower */
10762 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10763 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10764 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10767 if (memEQ(posixcc, "digi", 4)) /* digit */
10768 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10769 else if (memEQ(posixcc, "prin", 4)) /* print */
10770 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10771 else if (memEQ(posixcc, "punc", 4)) /* punct */
10772 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10777 if (memEQ(posixcc, "xdigit", 6))
10778 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10782 if (namedclass == OOB_NAMEDCLASS)
10783 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10785 assert (posixcc[skip] == ':');
10786 assert (posixcc[skip+1] == ']');
10787 } else if (!SIZE_ONLY) {
10788 /* [[=foo=]] and [[.foo.]] are still future. */
10790 /* adjust RExC_parse so the warning shows after
10791 the class closes */
10792 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10794 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10797 /* Maternal grandfather:
10798 * "[:" ending in ":" but not in ":]" */
10808 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10812 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10814 if (POSIXCC(UCHARAT(RExC_parse))) {
10815 const char *s = RExC_parse;
10816 const char c = *s++;
10818 while (isALNUM(*s))
10820 if (*s && c == *s && s[1] == ']') {
10822 "POSIX syntax [%c %c] belongs inside character classes",
10825 /* [[=foo=]] and [[.foo.]] are still future. */
10826 if (POSIXCC_NOTYET(c)) {
10827 /* adjust RExC_parse so the error shows after
10828 the class closes */
10829 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10831 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10837 /* Generate the code to add a full posix character <class> to the bracketed
10838 * character class given by <node>. (<node> is needed only under locale rules)
10839 * destlist is the inversion list for non-locale rules that this class is
10841 * sourcelist is the ASCII-range inversion list to add under /a rules
10842 * Xsourcelist is the full Unicode range list to use otherwise. */
10843 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10845 SV* scratch_list = NULL; \
10847 /* Set this class in the node for runtime matching */ \
10848 ANYOF_CLASS_SET(node, class); \
10850 /* For above Latin1 code points, we use the full Unicode range */ \
10851 _invlist_intersection(PL_AboveLatin1, \
10854 /* And set the output to it, adding instead if there already is an \
10855 * output. Checking if <destlist> is NULL first saves an extra \
10856 * clone. Its reference count will be decremented at the next \
10857 * union, etc, or if this is the only instance, at the end of the \
10859 if (! destlist) { \
10860 destlist = scratch_list; \
10863 _invlist_union(destlist, scratch_list, &destlist); \
10864 SvREFCNT_dec(scratch_list); \
10868 /* For non-locale, just add it to any existing list */ \
10869 _invlist_union(destlist, \
10870 (AT_LEAST_ASCII_RESTRICTED) \
10876 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10878 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10880 SV* scratch_list = NULL; \
10881 ANYOF_CLASS_SET(node, class); \
10882 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10883 if (! destlist) { \
10884 destlist = scratch_list; \
10887 _invlist_union(destlist, scratch_list, &destlist); \
10888 SvREFCNT_dec(scratch_list); \
10892 _invlist_union_complement_2nd(destlist, \
10893 (AT_LEAST_ASCII_RESTRICTED) \
10897 /* Under /d, everything in the upper half of the Latin1 range \
10898 * matches this complement */ \
10899 if (DEPENDS_SEMANTICS) { \
10900 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10904 /* Generate the code to add a posix character <class> to the bracketed
10905 * character class given by <node>. (<node> is needed only under locale rules)
10906 * destlist is the inversion list for non-locale rules that this class is
10908 * sourcelist is the ASCII-range inversion list to add under /a rules
10909 * l1_sourcelist is the Latin1 range list to use otherwise.
10910 * Xpropertyname is the name to add to <run_time_list> of the property to
10911 * specify the code points above Latin1 that will have to be
10912 * determined at run-time
10913 * run_time_list is a SV* that contains text names of properties that are to
10914 * be computed at run time. This concatenates <Xpropertyname>
10915 * to it, apppropriately
10916 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10918 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10919 l1_sourcelist, Xpropertyname, run_time_list) \
10920 /* First, resolve whether to use the ASCII-only list or the L1 \
10922 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10923 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10924 Xpropertyname, run_time_list)
10926 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10927 Xpropertyname, run_time_list) \
10928 /* If not /a matching, there are going to be code points we will have \
10929 * to defer to runtime to look-up */ \
10930 if (! AT_LEAST_ASCII_RESTRICTED) { \
10931 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10934 ANYOF_CLASS_SET(node, class); \
10937 _invlist_union(destlist, sourcelist, &destlist); \
10940 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10941 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
10943 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10944 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
10945 if (AT_LEAST_ASCII_RESTRICTED) { \
10946 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10949 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10950 matches_above_unicode = TRUE; \
10952 ANYOF_CLASS_SET(node, namedclass); \
10955 SV* scratch_list = NULL; \
10956 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10957 if (! destlist) { \
10958 destlist = scratch_list; \
10961 _invlist_union(destlist, scratch_list, &destlist); \
10962 SvREFCNT_dec(scratch_list); \
10964 if (DEPENDS_SEMANTICS) { \
10965 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10971 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10973 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10974 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10975 * the multi-character folds of characters in the node */
10978 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10980 if (! *alternate_ptr) {
10981 *alternate_ptr = newAV();
10983 sv = newSVpvn_utf8((char*)string, len, TRUE);
10984 av_push(*alternate_ptr, sv);
10989 parse a class specification and produce either an ANYOF node that
10990 matches the pattern or perhaps will be optimized into an EXACTish node
10991 instead. The node contains a bit map for the first 256 characters, with the
10992 corresponding bit set if that character is in the list. For characters
10993 above 255, a range list is used */
10996 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10999 register UV nextvalue;
11000 register IV prevvalue = OOB_UNICODE;
11001 register IV range = 0;
11002 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
11003 register regnode *ret;
11005 IV namedclass = OOB_NAMEDCLASS;
11006 char *rangebegin = NULL;
11007 bool need_class = 0;
11008 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
11010 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11011 than just initialized. */
11012 SV* properties = NULL; /* Code points that match \p{} \P{} */
11013 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11014 extended beyond the Latin1 range */
11015 UV element_count = 0; /* Number of distinct elements in the class.
11016 Optimizations may be possible if this is tiny */
11019 /* Certain named classes have equivalents that can appear outside a
11020 * character class, e.g. \w. These flags are set for these classes. The
11021 * first flag indicates the op depends on the character set modifier, like
11022 * /d, /u.... The second is for those that don't have this dependency. */
11023 bool has_special_charset_op = FALSE;
11024 bool has_special_non_charset_op = FALSE;
11026 /* Unicode properties are stored in a swash; this holds the current one
11027 * being parsed. If this swash is the only above-latin1 component of the
11028 * character class, an optimization is to pass it directly on to the
11029 * execution engine. Otherwise, it is set to NULL to indicate that there
11030 * are other things in the class that have to be dealt with at execution
11032 SV* swash = NULL; /* Code points that match \p{} \P{} */
11034 /* Set if a component of this character class is user-defined; just passed
11035 * on to the engine */
11036 bool has_user_defined_property = FALSE;
11038 /* inversion list of code points this node matches only when the target
11039 * string is in UTF-8. (Because is under /d) */
11040 SV* depends_list = NULL;
11042 /* inversion list of code points this node matches. For much of the
11043 * function, it includes only those that match regardless of the utf8ness
11044 * of the target string */
11045 SV* cp_list = NULL;
11047 /* List of multi-character folds that are matched by this node */
11048 AV* unicode_alternate = NULL;
11050 /* In a range, counts how many 0-2 of the ends of it came from literals,
11051 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11052 UV literal_endpoint = 0;
11054 UV stored = 0; /* how many chars stored in the bitmap */
11055 bool invert = FALSE; /* Is this class to be complemented */
11057 /* Is there any thing like \W or [:^digit:] that matches above the legal
11058 * Unicode range? */
11059 bool runtime_posix_matches_above_Unicode = FALSE;
11061 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11062 case we need to change the emitted regop to an EXACT. */
11063 const char * orig_parse = RExC_parse;
11064 GET_RE_DEBUG_FLAGS_DECL;
11066 PERL_ARGS_ASSERT_REGCLASS;
11068 PERL_UNUSED_ARG(depth);
11071 DEBUG_PARSE("clas");
11073 /* Assume we are going to generate an ANYOF node. */
11074 ret = reganode(pRExC_state, ANYOF, 0);
11078 ANYOF_FLAGS(ret) = 0;
11081 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11086 /* We have decided to not allow multi-char folds in inverted character
11087 * classes, due to the confusion that can happen, especially with
11088 * classes that are designed for a non-Unicode world: You have the
11089 * peculiar case that:
11090 "s s" =~ /^[^\xDF]+$/i => Y
11091 "ss" =~ /^[^\xDF]+$/i => N
11093 * See [perl #89750] */
11094 allow_full_fold = FALSE;
11098 RExC_size += ANYOF_SKIP;
11099 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11102 RExC_emit += ANYOF_SKIP;
11104 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11106 ANYOF_BITMAP_ZERO(ret);
11107 listsv = newSVpvs("# comment\n");
11108 initial_listsv_len = SvCUR(listsv);
11111 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11113 if (!SIZE_ONLY && POSIXCC(nextvalue))
11114 checkposixcc(pRExC_state);
11116 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11117 if (UCHARAT(RExC_parse) == ']')
11118 goto charclassloop;
11121 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11125 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11128 rangebegin = RExC_parse;
11132 value = utf8n_to_uvchr((U8*)RExC_parse,
11133 RExC_end - RExC_parse,
11134 &numlen, UTF8_ALLOW_DEFAULT);
11135 RExC_parse += numlen;
11138 value = UCHARAT(RExC_parse++);
11140 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11141 if (value == '[' && POSIXCC(nextvalue))
11142 namedclass = regpposixcc(pRExC_state, value);
11143 else if (value == '\\') {
11145 value = utf8n_to_uvchr((U8*)RExC_parse,
11146 RExC_end - RExC_parse,
11147 &numlen, UTF8_ALLOW_DEFAULT);
11148 RExC_parse += numlen;
11151 value = UCHARAT(RExC_parse++);
11152 /* Some compilers cannot handle switching on 64-bit integer
11153 * values, therefore value cannot be an UV. Yes, this will
11154 * be a problem later if we want switch on Unicode.
11155 * A similar issue a little bit later when switching on
11156 * namedclass. --jhi */
11157 switch ((I32)value) {
11158 case 'w': namedclass = ANYOF_ALNUM; break;
11159 case 'W': namedclass = ANYOF_NALNUM; break;
11160 case 's': namedclass = ANYOF_SPACE; break;
11161 case 'S': namedclass = ANYOF_NSPACE; break;
11162 case 'd': namedclass = ANYOF_DIGIT; break;
11163 case 'D': namedclass = ANYOF_NDIGIT; break;
11164 case 'v': namedclass = ANYOF_VERTWS; break;
11165 case 'V': namedclass = ANYOF_NVERTWS; break;
11166 case 'h': namedclass = ANYOF_HORIZWS; break;
11167 case 'H': namedclass = ANYOF_NHORIZWS; break;
11168 case 'N': /* Handle \N{NAME} in class */
11170 /* We only pay attention to the first char of
11171 multichar strings being returned. I kinda wonder
11172 if this makes sense as it does change the behaviour
11173 from earlier versions, OTOH that behaviour was broken
11175 UV v; /* value is register so we cant & it /grrr */
11176 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11186 if (RExC_parse >= RExC_end)
11187 vFAIL2("Empty \\%c{}", (U8)value);
11188 if (*RExC_parse == '{') {
11189 const U8 c = (U8)value;
11190 e = strchr(RExC_parse++, '}');
11192 vFAIL2("Missing right brace on \\%c{}", c);
11193 while (isSPACE(UCHARAT(RExC_parse)))
11195 if (e == RExC_parse)
11196 vFAIL2("Empty \\%c{}", c);
11197 n = e - RExC_parse;
11198 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11210 if (UCHARAT(RExC_parse) == '^') {
11213 value = value == 'p' ? 'P' : 'p'; /* toggle */
11214 while (isSPACE(UCHARAT(RExC_parse))) {
11219 /* Try to get the definition of the property into
11220 * <invlist>. If /i is in effect, the effective property
11221 * will have its name be <__NAME_i>. The design is
11222 * discussed in commit
11223 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11224 Newx(name, n + sizeof("_i__\n"), char);
11226 sprintf(name, "%s%.*s%s\n",
11227 (FOLD) ? "__" : "",
11233 /* Look up the property name, and get its swash and
11234 * inversion list, if the property is found */
11236 SvREFCNT_dec(swash);
11238 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11241 TRUE, /* this routine will handle
11242 undefined properties */
11243 NULL, FALSE /* No inversion list */
11247 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11249 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11251 || ! (invlist = *invlistsvp))
11254 SvREFCNT_dec(swash);
11258 /* Here didn't find it. It could be a user-defined
11259 * property that will be available at run-time. Add it
11260 * to the list to look up then */
11261 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11262 (value == 'p' ? '+' : '!'),
11264 has_user_defined_property = TRUE;
11266 /* We don't know yet, so have to assume that the
11267 * property could match something in the Latin1 range,
11268 * hence something that isn't utf8 */
11269 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11273 /* Here, did get the swash and its inversion list. If
11274 * the swash is from a user-defined property, then this
11275 * whole character class should be regarded as such */
11276 has_user_defined_property =
11277 _is_swash_user_defined(swash);
11279 /* Invert if asking for the complement */
11280 if (value == 'P') {
11281 _invlist_union_complement_2nd(properties,
11285 /* The swash can't be used as-is, because we've
11286 * inverted things; delay removing it to here after
11287 * have copied its invlist above */
11288 SvREFCNT_dec(swash);
11292 _invlist_union(properties, invlist, &properties);
11297 RExC_parse = e + 1;
11298 namedclass = ANYOF_MAX; /* no official name, but it's named */
11300 /* \p means they want Unicode semantics */
11301 RExC_uni_semantics = 1;
11304 case 'n': value = '\n'; break;
11305 case 'r': value = '\r'; break;
11306 case 't': value = '\t'; break;
11307 case 'f': value = '\f'; break;
11308 case 'b': value = '\b'; break;
11309 case 'e': value = ASCII_TO_NATIVE('\033');break;
11310 case 'a': value = ASCII_TO_NATIVE('\007');break;
11312 RExC_parse--; /* function expects to be pointed at the 'o' */
11314 const char* error_msg;
11315 bool valid = grok_bslash_o(RExC_parse,
11320 RExC_parse += numlen;
11325 if (PL_encoding && value < 0x100) {
11326 goto recode_encoding;
11330 RExC_parse--; /* function expects to be pointed at the 'x' */
11332 const char* error_msg;
11333 bool valid = grok_bslash_x(RExC_parse,
11338 RExC_parse += numlen;
11343 if (PL_encoding && value < 0x100)
11344 goto recode_encoding;
11347 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11349 case '0': case '1': case '2': case '3': case '4':
11350 case '5': case '6': case '7':
11352 /* Take 1-3 octal digits */
11353 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11355 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11356 RExC_parse += numlen;
11357 if (PL_encoding && value < 0x100)
11358 goto recode_encoding;
11362 if (! RExC_override_recoding) {
11363 SV* enc = PL_encoding;
11364 value = reg_recode((const char)(U8)value, &enc);
11365 if (!enc && SIZE_ONLY)
11366 ckWARNreg(RExC_parse,
11367 "Invalid escape in the specified encoding");
11371 /* Allow \_ to not give an error */
11372 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11373 ckWARN2reg(RExC_parse,
11374 "Unrecognized escape \\%c in character class passed through",
11379 } /* end of \blah */
11382 literal_endpoint++;
11385 /* What matches in a locale is not known until runtime. This
11386 * includes what the Posix classes (like \w, [:space:]) match.
11387 * Room must be reserved (one time per class) to store such
11388 * classes, either if Perl is compiled so that locale nodes always
11389 * should have this space, or if there is such class info to be
11390 * stored. The space will contain a bit for each named class that
11391 * is to be matched against. This isn't needed for \p{} and
11392 * pseudo-classes, as they are not affected by locale, and hence
11393 * are dealt with separately */
11396 && (ANYOF_LOCALE == ANYOF_CLASS
11397 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11401 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11404 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11405 ANYOF_CLASS_ZERO(ret);
11407 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11410 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11412 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11413 * literal, as is the character that began the false range, i.e.
11414 * the 'a' in the examples */
11418 RExC_parse >= rangebegin ?
11419 RExC_parse - rangebegin : 0;
11420 ckWARN4reg(RExC_parse,
11421 "False [] range \"%*.*s\"",
11423 cp_list = add_cp_to_invlist(cp_list, '-');
11424 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11427 range = 0; /* this was not a true range */
11428 element_count += 2; /* So counts for three values */
11433 /* In the first pass, do a little extra work so below can
11434 * possibly optimize the whole node to one of the nodes that
11435 * correspond to the classes given below */
11437 /* The optimization will only take place if there is a single
11438 * element in the class, so can skip if there is more than one
11440 if (element_count == 1) {
11442 /* Possible truncation here but in some 64-bit environments
11443 * the compiler gets heartburn about switch on 64-bit values.
11444 * A similar issue a little earlier when switching on value.
11446 switch ((I32)namedclass) {
11453 has_special_charset_op = TRUE;
11456 case ANYOF_HORIZWS:
11457 case ANYOF_NHORIZWS:
11459 case ANYOF_NVERTWS:
11460 has_special_non_charset_op = TRUE;
11466 switch ((I32)namedclass) {
11468 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11469 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11470 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11472 case ANYOF_NALNUMC:
11473 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11474 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11475 runtime_posix_matches_above_Unicode);
11478 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11479 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11482 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11483 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11484 runtime_posix_matches_above_Unicode);
11488 ANYOF_CLASS_SET(ret, namedclass);
11491 _invlist_union(posixes, PL_ASCII, &posixes);
11496 ANYOF_CLASS_SET(ret, namedclass);
11499 _invlist_union_complement_2nd(posixes,
11500 PL_ASCII, &posixes);
11501 if (DEPENDS_SEMANTICS) {
11502 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11507 DO_POSIX(ret, namedclass, posixes,
11508 PL_PosixBlank, PL_XPosixBlank);
11511 DO_N_POSIX(ret, namedclass, posixes,
11512 PL_PosixBlank, PL_XPosixBlank);
11515 DO_POSIX(ret, namedclass, posixes,
11516 PL_PosixCntrl, PL_XPosixCntrl);
11519 DO_N_POSIX(ret, namedclass, posixes,
11520 PL_PosixCntrl, PL_XPosixCntrl);
11523 /* There are no digits in the Latin1 range outside of
11524 * ASCII, so call the macro that doesn't have to resolve
11526 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11527 PL_PosixDigit, "XPosixDigit", listsv);
11528 has_special_charset_op = TRUE;
11531 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11532 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11533 runtime_posix_matches_above_Unicode);
11534 has_special_charset_op = TRUE;
11537 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11538 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11541 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11542 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11543 runtime_posix_matches_above_Unicode);
11545 case ANYOF_HORIZWS:
11546 /* For these, we use the cp_list, as /d doesn't make a
11547 * difference in what these match. There would be problems
11548 * if these characters had folds other than themselves, as
11549 * cp_list is subject to folding. It turns out that \h
11550 * is just a synonym for XPosixBlank */
11551 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11552 has_special_non_charset_op = TRUE;
11554 case ANYOF_NHORIZWS:
11555 _invlist_union_complement_2nd(cp_list,
11556 PL_XPosixBlank, &cp_list);
11557 has_special_non_charset_op = TRUE;
11561 { /* These require special handling, as they differ under
11562 folding, matching Cased there (which in the ASCII range
11563 is the same as Alpha */
11569 if (FOLD && ! LOC) {
11570 ascii_source = PL_PosixAlpha;
11571 l1_source = PL_L1Cased;
11575 ascii_source = PL_PosixLower;
11576 l1_source = PL_L1PosixLower;
11577 Xname = "XPosixLower";
11579 if (namedclass == ANYOF_LOWER) {
11580 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11581 ascii_source, l1_source, Xname, listsv);
11584 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11585 posixes, ascii_source, l1_source, Xname, listsv,
11586 runtime_posix_matches_above_Unicode);
11591 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11592 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11595 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11596 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
11597 runtime_posix_matches_above_Unicode);
11600 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11601 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11604 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11605 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
11606 runtime_posix_matches_above_Unicode);
11609 DO_POSIX(ret, namedclass, posixes,
11610 PL_PosixSpace, PL_XPosixSpace);
11612 case ANYOF_NPSXSPC:
11613 DO_N_POSIX(ret, namedclass, posixes,
11614 PL_PosixSpace, PL_XPosixSpace);
11617 DO_POSIX(ret, namedclass, posixes,
11618 PL_PerlSpace, PL_XPerlSpace);
11619 has_special_charset_op = TRUE;
11622 DO_N_POSIX(ret, namedclass, posixes,
11623 PL_PerlSpace, PL_XPerlSpace);
11624 has_special_charset_op = TRUE;
11626 case ANYOF_UPPER: /* Same as LOWER, above */
11633 if (FOLD && ! LOC) {
11634 ascii_source = PL_PosixAlpha;
11635 l1_source = PL_L1Cased;
11639 ascii_source = PL_PosixUpper;
11640 l1_source = PL_L1PosixUpper;
11641 Xname = "XPosixUpper";
11643 if (namedclass == ANYOF_UPPER) {
11644 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11645 ascii_source, l1_source, Xname, listsv);
11648 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11649 posixes, ascii_source, l1_source, Xname, listsv,
11650 runtime_posix_matches_above_Unicode);
11654 case ANYOF_ALNUM: /* Really is 'Word' */
11655 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11656 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11657 has_special_charset_op = TRUE;
11660 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11661 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
11662 runtime_posix_matches_above_Unicode);
11663 has_special_charset_op = TRUE;
11666 /* For these, we use the cp_list, as /d doesn't make a
11667 * difference in what these match. There would be problems
11668 * if these characters had folds other than themselves, as
11669 * cp_list is subject to folding */
11670 _invlist_union(cp_list, PL_VertSpace, &cp_list);
11671 has_special_non_charset_op = TRUE;
11673 case ANYOF_NVERTWS:
11674 _invlist_union_complement_2nd(cp_list,
11675 PL_VertSpace, &cp_list);
11676 has_special_non_charset_op = TRUE;
11679 DO_POSIX(ret, namedclass, posixes,
11680 PL_PosixXDigit, PL_XPosixXDigit);
11682 case ANYOF_NXDIGIT:
11683 DO_N_POSIX(ret, namedclass, posixes,
11684 PL_PosixXDigit, PL_XPosixXDigit);
11687 /* this is to handle \p and \P */
11690 vFAIL("Invalid [::] class");
11696 } /* end of namedclass \blah */
11699 if (prevvalue > (IV)value) /* b-a */ {
11700 const int w = RExC_parse - rangebegin;
11701 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11702 range = 0; /* not a valid range */
11706 prevvalue = value; /* save the beginning of the range */
11707 if (RExC_parse+1 < RExC_end
11708 && *RExC_parse == '-'
11709 && RExC_parse[1] != ']')
11713 /* a bad range like \w-, [:word:]- ? */
11714 if (namedclass > OOB_NAMEDCLASS) {
11715 if (ckWARN(WARN_REGEXP)) {
11717 RExC_parse >= rangebegin ?
11718 RExC_parse - rangebegin : 0;
11720 "False [] range \"%*.*s\"",
11724 cp_list = add_cp_to_invlist(cp_list, '-');
11726 range = 1; /* yeah, it's a range! */
11727 continue; /* but do it the next time */
11731 /* non-Latin1 code point implies unicode semantics. Must be set in
11732 * pass1 so is there for the whole of pass 2 */
11734 RExC_uni_semantics = 1;
11737 /* now is the next time */
11740 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
11742 UV* this_range = _new_invlist(1);
11743 _append_range_to_invlist(this_range, prevvalue, value);
11745 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
11746 * If this range was specified using something like 'i-j', we want
11747 * to include only the 'i' and the 'j', and not anything in
11748 * between, so exclude non-ASCII, non-alphabetics from it.
11749 * However, if the range was specified with something like
11750 * [\x89-\x91] or [\x89-j], all code points within it should be
11751 * included. literal_endpoint==2 means both ends of the range used
11752 * a literal character, not \x{foo} */
11753 if (literal_endpoint == 2
11754 && (prevvalue >= 'a' && value <= 'z')
11755 || (prevvalue >= 'A' && value <= 'Z'))
11757 _invlist_intersection(this_range, PL_ASCII, &this_range, );
11758 _invlist_intersection(this_range, PL_Alpha, &this_range, );
11760 _invlist_union(cp_list, this_range, &cp_list);
11761 literal_endpoint = 0;
11765 range = 0; /* this range (if it was one) is done now */
11768 /* [\w] can be optimized into \w, but not if there is anything else in the
11769 * brackets (except for an initial '^' which indictes omplementing). We
11770 * also can optimize the common special case /[0-9]/ into /\d/a */
11771 if (element_count == 1 &&
11772 (has_special_charset_op
11773 || has_special_non_charset_op
11774 || (prevvalue == '0' && value == '9')))
11777 const char * cur_parse = RExC_parse;
11779 if (has_special_charset_op) {
11780 U8 offset = get_regex_charset(RExC_flags);
11782 /* /aa is the same as /a for these */
11783 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
11784 offset = REGEX_ASCII_RESTRICTED_CHARSET;
11786 switch ((I32)namedclass) {
11805 /* There is no DIGITU */
11806 if (offset == REGEX_UNICODE_CHARSET) {
11807 offset = REGEX_DEPENDS_CHARSET;
11811 Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
11814 /* The number of varieties of each of these is the same, hence, so
11815 * is the delta between the normal and complemented nodes */
11817 offset += NALNUM - ALNUM;
11822 else if (has_special_non_charset_op) {
11823 switch ((I32)namedclass) {
11824 case ANYOF_NHORIZWS:
11827 case ANYOF_HORIZWS:
11830 case ANYOF_NVERTWS:
11837 Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
11840 /* The complement version of each of these nodes is adjacently next
11846 else { /* The remaining possibility is [0-9] */
11847 op = (invert) ? NDIGITA : DIGITA;
11850 /* Throw away this ANYOF regnode, and emit the calculated one, which
11851 * should correspond to the beginning, not current, state of the parse
11853 RExC_parse = (char *)orig_parse;
11854 RExC_emit = (regnode *)orig_emit;
11855 ret = reg_node(pRExC_state, op);
11856 RExC_parse = (char *) cur_parse;
11858 SvREFCNT_dec(listsv);
11864 /****** !SIZE_ONLY AFTER HERE *********/
11866 /* If folding, we calculate all characters that could fold to or from the
11867 * ones already on the list */
11868 if (FOLD && cp_list) {
11869 UV start, end; /* End points of code point ranges */
11871 SV* fold_intersection = NULL;
11873 /* In the Latin1 range, the characters that can be folded-to or -from
11874 * are precisely the alphabetic characters. If the highest code point
11875 * is within Latin1, we can use the compiled-in list, and not have to
11876 * go out to disk. */
11877 if (invlist_highest(cp_list) < 256) {
11878 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
11882 /* This is a list of all the characters that participate in folds
11883 * (except marks, etc in multi-char folds */
11884 if (! PL_utf8_foldable) {
11885 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11886 PL_utf8_foldable = _swash_to_invlist(swash);
11887 SvREFCNT_dec(swash);
11890 /* This is a hash that for a particular fold gives all characters
11891 * that are involved in it */
11892 if (! PL_utf8_foldclosures) {
11894 /* If we were unable to find any folds, then we likely won't be
11895 * able to find the closures. So just create an empty list.
11896 * Folding will effectively be restricted to the non-Unicode
11897 * rules hard-coded into Perl. (This case happens legitimately
11898 * during compilation of Perl itself before the Unicode tables
11899 * are generated) */
11900 if (invlist_len(PL_utf8_foldable) == 0) {
11901 PL_utf8_foldclosures = newHV();
11904 /* If the folds haven't been read in, call a fold function
11906 if (! PL_utf8_tofold) {
11907 U8 dummy[UTF8_MAXBYTES+1];
11910 /* This particular string is above \xff in both UTF-8
11912 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11913 assert(PL_utf8_tofold); /* Verify that worked */
11915 PL_utf8_foldclosures =
11916 _swash_inversion_hash(PL_utf8_tofold);
11920 /* Only the characters in this class that participate in folds need
11921 * be checked. Get the intersection of this class and all the
11922 * possible characters that are foldable. This can quickly narrow
11923 * down a large class */
11924 _invlist_intersection(PL_utf8_foldable, cp_list,
11925 &fold_intersection);
11928 /* Now look at the foldable characters in this class individually */
11929 invlist_iterinit(fold_intersection);
11930 while (invlist_iternext(fold_intersection, &start, &end)) {
11933 /* Locale folding for Latin1 characters is deferred until runtime */
11934 if (LOC && start < 256) {
11938 /* Look at every character in the range */
11939 for (j = start; j <= end; j++) {
11941 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11947 /* We have the latin1 folding rules hard-coded here so that
11948 * an innocent-looking character class, like /[ks]/i won't
11949 * have to go out to disk to find the possible matches.
11950 * XXX It would be better to generate these via regen, in
11951 * case a new version of the Unicode standard adds new
11952 * mappings, though that is not really likely, and may be
11953 * caught by the default: case of the switch below. */
11955 if (PL_fold_latin1[j] != j) {
11957 /* ASCII is always matched; non-ASCII is matched only
11958 * under Unicode rules */
11959 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
11961 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
11965 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
11969 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
11970 && (! isASCII(j) || ! MORE_ASCII_RESTRICTED))
11972 /* Certain Latin1 characters have matches outside
11973 * Latin1, or are multi-character. To get here, 'j' is
11974 * one of those characters. None of these matches is
11975 * valid for ASCII characters under /aa, which is why
11976 * the 'if' just above excludes those. The matches
11977 * fall into three categories:
11978 * 1) They are singly folded-to or -from an above 255
11979 * character, e.g., LATIN SMALL LETTER Y WITH
11980 * DIAERESIS and LATIN CAPITAL LETTER Y WITH
11982 * 2) They are part of a multi-char fold with another
11983 * latin1 character; only LATIN SMALL LETTER
11984 * SHARP S => "ss" fits this;
11985 * 3) They are part of a multi-char fold with a
11986 * character outside of Latin1, such as various
11988 * We aren't dealing fully with multi-char folds, except
11989 * we do deal with the pattern containing a character
11990 * that has a multi-char fold (not so much the inverse).
11991 * For types 1) and 3), the matches only happen when the
11992 * target string is utf8; that's not true for 2), and we
11993 * set a flag for it.
11995 * The code below adds the single fold closures for 'j'
11996 * to the inversion list. */
12002 add_cp_to_invlist(cp_list, 0x212A);
12006 /* LATIN SMALL LETTER LONG S */
12008 add_cp_to_invlist(cp_list, 0x017F);
12011 cp_list = add_cp_to_invlist(cp_list,
12012 GREEK_SMALL_LETTER_MU);
12013 cp_list = add_cp_to_invlist(cp_list,
12014 GREEK_CAPITAL_LETTER_MU);
12016 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12017 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12018 /* ANGSTROM SIGN */
12020 add_cp_to_invlist(cp_list, 0x212B);
12022 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12023 cp_list = add_cp_to_invlist(cp_list,
12024 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12026 case LATIN_SMALL_LETTER_SHARP_S:
12027 cp_list = add_cp_to_invlist(cp_list,
12028 LATIN_CAPITAL_LETTER_SHARP_S);
12030 /* Under /a, /d, and /u, this can match the two
12032 if (! MORE_ASCII_RESTRICTED) {
12033 add_alternate(&unicode_alternate,
12036 /* And under /u or /a, it can match even if
12037 * the target is not utf8 */
12038 if (AT_LEAST_UNI_SEMANTICS) {
12039 ANYOF_FLAGS(ret) |=
12040 ANYOF_NONBITMAP_NON_UTF8;
12044 case 'F': case 'f':
12045 case 'I': case 'i':
12046 case 'L': case 'l':
12047 case 'T': case 't':
12048 case 'A': case 'a':
12049 case 'H': case 'h':
12050 case 'J': case 'j':
12051 case 'N': case 'n':
12052 case 'W': case 'w':
12053 case 'Y': case 'y':
12054 /* These all are targets of multi-character
12055 * folds from code points that require UTF8 to
12056 * express, so they can't match unless the
12057 * target string is in UTF-8, so no action here
12058 * is necessary, as regexec.c properly handles
12059 * the general case for UTF-8 matching */
12062 /* Use deprecated warning to increase the
12063 * chances of this being output */
12064 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12071 /* Here is an above Latin1 character. We don't have the rules
12072 * hard-coded for it. First, get its fold */
12073 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12074 ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12076 ? FOLD_FLAGS_LOCALE
12077 : (MORE_ASCII_RESTRICTED)
12078 ? FOLD_FLAGS_NOMIX_ASCII
12081 if (foldlen > (STRLEN)UNISKIP(f)) {
12083 /* Any multicharacter foldings (disallowed in lookbehind
12084 * patterns) require the following transform: [ABCDEF] ->
12085 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12086 * folds into "rst", all other characters fold to single
12087 * characters. We save away these multicharacter foldings,
12088 * to be later saved as part of the additional "s" data. */
12089 if (! RExC_in_lookbehind) {
12091 U8* e = foldbuf + foldlen;
12093 /* If any of the folded characters of this are in the
12094 * Latin1 range, tell the regex engine that this can
12095 * match a non-utf8 target string. */
12097 if (UTF8_IS_INVARIANT(*loc)
12098 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12101 |= ANYOF_NONBITMAP_NON_UTF8;
12104 loc += UTF8SKIP(loc);
12107 add_alternate(&unicode_alternate, foldbuf, foldlen);
12111 /* Single character fold of above Latin1. Add everything
12112 * in its fold closure to the list that this node should
12116 /* The fold closures data structure is a hash with the keys
12117 * being every character that is folded to, like 'k', and
12118 * the values each an array of everything that folds to its
12119 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
12120 if ((listp = hv_fetch(PL_utf8_foldclosures,
12121 (char *) foldbuf, foldlen, FALSE)))
12123 AV* list = (AV*) *listp;
12125 for (k = 0; k <= av_len(list); k++) {
12126 SV** c_p = av_fetch(list, k, FALSE);
12129 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12133 /* /aa doesn't allow folds between ASCII and non-;
12134 * /l doesn't allow them between above and below
12136 if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
12137 || (LOC && ((c < 256) != (j < 256))))
12142 /* Folds involving non-ascii Latin1 characters
12143 * under /d are added to a separate list */
12144 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12146 cp_list = add_cp_to_invlist(cp_list, c);
12149 depends_list = add_cp_to_invlist(depends_list, c);
12156 SvREFCNT_dec(fold_intersection);
12159 /* And combine the result (if any) with any inversion list from posix
12160 * classes. The lists are kept separate up to now because we don't want to
12161 * fold the classes */
12163 if (AT_LEAST_UNI_SEMANTICS) {
12165 _invlist_union(cp_list, posixes, &cp_list);
12166 SvREFCNT_dec(posixes);
12174 /* Under /d, we put into a separate list the Latin1 things that
12175 * match only when the target string is utf8 */
12176 SV* nonascii_but_latin1_properties = NULL;
12177 _invlist_intersection(posixes, PL_Latin1,
12178 &nonascii_but_latin1_properties);
12179 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12180 &nonascii_but_latin1_properties);
12181 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12184 _invlist_union(cp_list, posixes, &cp_list);
12185 SvREFCNT_dec(posixes);
12191 if (depends_list) {
12192 _invlist_union(depends_list, nonascii_but_latin1_properties,
12194 SvREFCNT_dec(nonascii_but_latin1_properties);
12197 depends_list = nonascii_but_latin1_properties;
12202 /* And combine the result (if any) with any inversion list from properties.
12203 * The lists are kept separate up to now so that we can distinguish the two
12204 * in regards to matching above-Unicode. A run-time warning is generated
12205 * if a Unicode property is matched against a non-Unicode code point. But,
12206 * we allow user-defined properties to match anything, without any warning,
12207 * and we also suppress the warning if there is a portion of the character
12208 * class that isn't a Unicode property, and which matches above Unicode, \W
12209 * or [\x{110000}] for example.
12210 * (Note that in this case, unlike the Posix one above, there is no
12211 * <depends_list>, because having a Unicode property forces Unicode
12214 bool warn_super = ! has_user_defined_property;
12217 /* If it matters to the final outcome, see if a non-property
12218 * component of the class matches above Unicode. If so, the
12219 * warning gets suppressed. This is true even if just a single
12220 * such code point is specified, as though not strictly correct if
12221 * another such code point is matched against, the fact that they
12222 * are using above-Unicode code points indicates they should know
12223 * the issues involved */
12225 bool non_prop_matches_above_Unicode =
12226 runtime_posix_matches_above_Unicode
12227 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12229 non_prop_matches_above_Unicode =
12230 ! non_prop_matches_above_Unicode;
12232 warn_super = ! non_prop_matches_above_Unicode;
12235 _invlist_union(properties, cp_list, &cp_list);
12236 SvREFCNT_dec(properties);
12239 cp_list = properties;
12243 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12247 /* Here, we have calculated what code points should be in the character
12250 * Now we can see about various optimizations. Fold calculation (which we
12251 * did above) needs to take place before inversion. Otherwise /[^k]/i
12252 * would invert to include K, which under /i would match k, which it
12255 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
12256 * set the FOLD flag yet, so this does optimize those. It doesn't
12257 * optimize locale. Doing so perhaps could be done as long as there is
12258 * nothing like \w in it; some thought also would have to be given to the
12259 * interaction with above 0x100 chars */
12263 && ! unicode_alternate
12264 && SvCUR(listsv) == initial_listsv_len)
12266 _invlist_invert(cp_list);
12268 /* Any swash can't be used as-is, because we've inverted things */
12270 SvREFCNT_dec(swash);
12274 /* Clear the invert flag since have just done it here */
12278 /* Here, <cp_list> contains all the code points we can determine at
12279 * compile time that match under all conditions. Go through it, and
12280 * for things that belong in the bitmap, put them there, and delete from
12284 /* This gets set if we actually need to modify things */
12285 bool change_invlist = FALSE;
12289 /* Start looking through <cp_list> */
12290 invlist_iterinit(cp_list);
12291 while (invlist_iternext(cp_list, &start, &end)) {
12295 /* Quit if are above what we should change */
12300 change_invlist = TRUE;
12302 /* Set all the bits in the range, up to the max that we are doing */
12303 high = (end < 255) ? end : 255;
12304 for (i = start; i <= (int) high; i++) {
12305 if (! ANYOF_BITMAP_TEST(ret, i)) {
12306 ANYOF_BITMAP_SET(ret, i);
12314 /* Done with loop; remove any code points that are in the bitmap from
12316 if (change_invlist) {
12317 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12320 /* If have completely emptied it, remove it completely */
12321 if (invlist_len(cp_list) == 0) {
12322 SvREFCNT_dec(cp_list);
12328 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12331 /* Combine the two lists into one. */
12332 if (depends_list) {
12334 _invlist_union(cp_list, depends_list, &cp_list);
12335 SvREFCNT_dec(depends_list);
12338 cp_list = depends_list;
12342 /* Folding in the bitmap is taken care of above, but not for locale (for
12343 * which we have to wait to see what folding is in effect at runtime), and
12344 * for some things not in the bitmap (only the upper latin folds in this
12345 * case, as all other single-char folding has been set above). Set
12346 * run-time fold flag for these */
12348 || (DEPENDS_SEMANTICS
12350 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12351 || unicode_alternate))
12353 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12356 /* A single character class can be "optimized" into an EXACTish node.
12357 * Note that since we don't currently count how many characters there are
12358 * outside the bitmap, we are XXX missing optimization possibilities for
12359 * them. This optimization can't happen unless this is a truly single
12360 * character class, which means that it can't be an inversion into a
12361 * many-character class, and there must be no possibility of there being
12362 * things outside the bitmap. 'stored' (only) for locales doesn't include
12363 * \w, etc, so have to make a special test that they aren't present
12365 * Similarly A 2-character class of the very special form like [bB] can be
12366 * optimized into an EXACTFish node, but only for non-locales, and for
12367 * characters which only have the two folds; so things like 'fF' and 'Ii'
12368 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12371 && ! unicode_alternate
12372 && SvCUR(listsv) == initial_listsv_len
12373 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12374 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12375 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12376 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12377 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12378 /* If the latest code point has a fold whose
12379 * bit is set, it must be the only other one */
12380 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12381 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12383 /* Note that the information needed to decide to do this optimization
12384 * is not currently available until the 2nd pass, and that the actually
12385 * used EXACTish node takes less space than the calculated ANYOF node,
12386 * and hence the amount of space calculated in the first pass is larger
12387 * than actually used, so this optimization doesn't gain us any space.
12388 * But an EXACT node is faster than an ANYOF node, and can be combined
12389 * with any adjacent EXACT nodes later by the optimizer for further
12390 * gains. The speed of executing an EXACTF is similar to an ANYOF
12391 * node, so the optimization advantage comes from the ability to join
12392 * it to adjacent EXACT nodes */
12394 const char * cur_parse= RExC_parse;
12396 RExC_emit = (regnode *)orig_emit;
12397 RExC_parse = (char *)orig_parse;
12401 /* A locale node with one point can be folded; all the other cases
12402 * with folding will have two points, since we calculate them above
12404 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12411 else { /* else 2 chars in the bit map: the folds of each other */
12413 /* Use the folded value, which for the cases where we get here,
12414 * is just the lower case of the current one (which may resolve to
12415 * itself, or to the other one */
12416 value = toLOWER_LATIN1(value);
12418 /* To join adjacent nodes, they must be the exact EXACTish type.
12419 * Try to use the most likely type, by using EXACTFA if possible,
12420 * then EXACTFU if the regex calls for it, or is required because
12421 * the character is non-ASCII. (If <value> is ASCII, its fold is
12422 * also ASCII for the cases where we get here.) */
12423 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12426 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12429 else { /* Otherwise, more likely to be EXACTF type */
12434 ret = reg_node(pRExC_state, op);
12435 RExC_parse = (char *)cur_parse;
12436 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12437 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12438 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12440 RExC_emit += STR_SZ(2);
12443 *STRING(ret)= (char)value;
12445 RExC_emit += STR_SZ(1);
12447 SvREFCNT_dec(listsv);
12451 /* If there is a swash and more than one element, we can't use the swash in
12452 * the optimization below. */
12453 if (swash && element_count > 1) {
12454 SvREFCNT_dec(swash);
12458 && SvCUR(listsv) == initial_listsv_len
12459 && ! unicode_alternate)
12461 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12462 SvREFCNT_dec(listsv);
12463 SvREFCNT_dec(unicode_alternate);
12466 /* av[0] stores the character class description in its textual form:
12467 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12468 * appropriate swash, and is also useful for dumping the regnode.
12469 * av[1] if NULL, is a placeholder to later contain the swash computed
12470 * from av[0]. But if no further computation need be done, the
12471 * swash is stored there now.
12472 * av[2] stores the multicharacter foldings, used later in
12473 * regexec.c:S_reginclass().
12474 * av[3] stores the cp_list inversion list for use in addition or
12475 * instead of av[0]; used only if av[1] is NULL
12476 * av[4] is set if any component of the class is from a user-defined
12477 * property; used only if av[1] is NULL */
12478 AV * const av = newAV();
12481 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12485 av_store(av, 1, swash);
12486 SvREFCNT_dec(cp_list);
12489 av_store(av, 1, NULL);
12491 av_store(av, 3, cp_list);
12492 av_store(av, 4, newSVuv(has_user_defined_property));
12496 /* Store any computed multi-char folds only if we are allowing
12498 if (allow_full_fold) {
12499 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12500 if (unicode_alternate) { /* This node is variable length */
12505 av_store(av, 2, NULL);
12507 rv = newRV_noinc(MUTABLE_SV(av));
12508 n = add_data(pRExC_state, 1, "s");
12509 RExC_rxi->data->data[n] = (void*)rv;
12516 /* reg_skipcomment()
12518 Absorbs an /x style # comments from the input stream.
12519 Returns true if there is more text remaining in the stream.
12520 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12521 terminates the pattern without including a newline.
12523 Note its the callers responsibility to ensure that we are
12524 actually in /x mode
12529 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12533 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12535 while (RExC_parse < RExC_end)
12536 if (*RExC_parse++ == '\n') {
12541 /* we ran off the end of the pattern without ending
12542 the comment, so we have to add an \n when wrapping */
12543 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12551 Advances the parse position, and optionally absorbs
12552 "whitespace" from the inputstream.
12554 Without /x "whitespace" means (?#...) style comments only,
12555 with /x this means (?#...) and # comments and whitespace proper.
12557 Returns the RExC_parse point from BEFORE the scan occurs.
12559 This is the /x friendly way of saying RExC_parse++.
12563 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12565 char* const retval = RExC_parse++;
12567 PERL_ARGS_ASSERT_NEXTCHAR;
12570 if (RExC_end - RExC_parse >= 3
12571 && *RExC_parse == '('
12572 && RExC_parse[1] == '?'
12573 && RExC_parse[2] == '#')
12575 while (*RExC_parse != ')') {
12576 if (RExC_parse == RExC_end)
12577 FAIL("Sequence (?#... not terminated");
12583 if (RExC_flags & RXf_PMf_EXTENDED) {
12584 if (isSPACE(*RExC_parse)) {
12588 else if (*RExC_parse == '#') {
12589 if ( reg_skipcomment( pRExC_state ) )
12598 - reg_node - emit a node
12600 STATIC regnode * /* Location. */
12601 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12604 register regnode *ptr;
12605 regnode * const ret = RExC_emit;
12606 GET_RE_DEBUG_FLAGS_DECL;
12608 PERL_ARGS_ASSERT_REG_NODE;
12611 SIZE_ALIGN(RExC_size);
12615 if (RExC_emit >= RExC_emit_bound)
12616 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12617 op, RExC_emit, RExC_emit_bound);
12619 NODE_ALIGN_FILL(ret);
12621 FILL_ADVANCE_NODE(ptr, op);
12622 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
12623 #ifdef RE_TRACK_PATTERN_OFFSETS
12624 if (RExC_offsets) { /* MJD */
12625 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12626 "reg_node", __LINE__,
12628 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12629 ? "Overwriting end of array!\n" : "OK",
12630 (UV)(RExC_emit - RExC_emit_start),
12631 (UV)(RExC_parse - RExC_start),
12632 (UV)RExC_offsets[0]));
12633 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12641 - reganode - emit a node with an argument
12643 STATIC regnode * /* Location. */
12644 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12647 register regnode *ptr;
12648 regnode * const ret = RExC_emit;
12649 GET_RE_DEBUG_FLAGS_DECL;
12651 PERL_ARGS_ASSERT_REGANODE;
12654 SIZE_ALIGN(RExC_size);
12659 assert(2==regarglen[op]+1);
12661 Anything larger than this has to allocate the extra amount.
12662 If we changed this to be:
12664 RExC_size += (1 + regarglen[op]);
12666 then it wouldn't matter. Its not clear what side effect
12667 might come from that so its not done so far.
12672 if (RExC_emit >= RExC_emit_bound)
12673 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12674 op, RExC_emit, RExC_emit_bound);
12676 NODE_ALIGN_FILL(ret);
12678 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12679 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
12680 #ifdef RE_TRACK_PATTERN_OFFSETS
12681 if (RExC_offsets) { /* MJD */
12682 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12686 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12687 "Overwriting end of array!\n" : "OK",
12688 (UV)(RExC_emit - RExC_emit_start),
12689 (UV)(RExC_parse - RExC_start),
12690 (UV)RExC_offsets[0]));
12691 Set_Cur_Node_Offset;
12699 - reguni - emit (if appropriate) a Unicode character
12702 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12706 PERL_ARGS_ASSERT_REGUNI;
12708 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12712 - reginsert - insert an operator in front of already-emitted operand
12714 * Means relocating the operand.
12717 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12720 register regnode *src;
12721 register regnode *dst;
12722 register regnode *place;
12723 const int offset = regarglen[(U8)op];
12724 const int size = NODE_STEP_REGNODE + offset;
12725 GET_RE_DEBUG_FLAGS_DECL;
12727 PERL_ARGS_ASSERT_REGINSERT;
12728 PERL_UNUSED_ARG(depth);
12729 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12730 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12739 if (RExC_open_parens) {
12741 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12742 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12743 if ( RExC_open_parens[paren] >= opnd ) {
12744 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12745 RExC_open_parens[paren] += size;
12747 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12749 if ( RExC_close_parens[paren] >= opnd ) {
12750 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12751 RExC_close_parens[paren] += size;
12753 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12758 while (src > opnd) {
12759 StructCopy(--src, --dst, regnode);
12760 #ifdef RE_TRACK_PATTERN_OFFSETS
12761 if (RExC_offsets) { /* MJD 20010112 */
12762 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12766 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12767 ? "Overwriting end of array!\n" : "OK",
12768 (UV)(src - RExC_emit_start),
12769 (UV)(dst - RExC_emit_start),
12770 (UV)RExC_offsets[0]));
12771 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12772 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12778 place = opnd; /* Op node, where operand used to be. */
12779 #ifdef RE_TRACK_PATTERN_OFFSETS
12780 if (RExC_offsets) { /* MJD */
12781 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12785 (UV)(place - RExC_emit_start) > RExC_offsets[0]
12786 ? "Overwriting end of array!\n" : "OK",
12787 (UV)(place - RExC_emit_start),
12788 (UV)(RExC_parse - RExC_start),
12789 (UV)RExC_offsets[0]));
12790 Set_Node_Offset(place, RExC_parse);
12791 Set_Node_Length(place, 1);
12794 src = NEXTOPER(place);
12795 FILL_ADVANCE_NODE(place, op);
12796 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
12797 Zero(src, offset, regnode);
12801 - regtail - set the next-pointer at the end of a node chain of p to val.
12802 - SEE ALSO: regtail_study
12804 /* TODO: All three parms should be const */
12806 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12809 register regnode *scan;
12810 GET_RE_DEBUG_FLAGS_DECL;
12812 PERL_ARGS_ASSERT_REGTAIL;
12814 PERL_UNUSED_ARG(depth);
12820 /* Find last node. */
12823 regnode * const temp = regnext(scan);
12825 SV * const mysv=sv_newmortal();
12826 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12827 regprop(RExC_rx, mysv, scan);
12828 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12829 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12830 (temp == NULL ? "->" : ""),
12831 (temp == NULL ? PL_reg_name[OP(val)] : "")
12839 if (reg_off_by_arg[OP(scan)]) {
12840 ARG_SET(scan, val - scan);
12843 NEXT_OFF(scan) = val - scan;
12849 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12850 - Look for optimizable sequences at the same time.
12851 - currently only looks for EXACT chains.
12853 This is experimental code. The idea is to use this routine to perform
12854 in place optimizations on branches and groups as they are constructed,
12855 with the long term intention of removing optimization from study_chunk so
12856 that it is purely analytical.
12858 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12859 to control which is which.
12862 /* TODO: All four parms should be const */
12865 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12868 register regnode *scan;
12870 #ifdef EXPERIMENTAL_INPLACESCAN
12873 GET_RE_DEBUG_FLAGS_DECL;
12875 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12881 /* Find last node. */
12885 regnode * const temp = regnext(scan);
12886 #ifdef EXPERIMENTAL_INPLACESCAN
12887 if (PL_regkind[OP(scan)] == EXACT) {
12888 bool has_exactf_sharp_s; /* Unexamined in this routine */
12889 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12894 switch (OP(scan)) {
12900 case EXACTFU_TRICKYFOLD:
12902 if( exact == PSEUDO )
12904 else if ( exact != OP(scan) )
12913 SV * const mysv=sv_newmortal();
12914 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12915 regprop(RExC_rx, mysv, scan);
12916 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12917 SvPV_nolen_const(mysv),
12918 REG_NODE_NUM(scan),
12919 PL_reg_name[exact]);
12926 SV * const mysv_val=sv_newmortal();
12927 DEBUG_PARSE_MSG("");
12928 regprop(RExC_rx, mysv_val, val);
12929 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12930 SvPV_nolen_const(mysv_val),
12931 (IV)REG_NODE_NUM(val),
12935 if (reg_off_by_arg[OP(scan)]) {
12936 ARG_SET(scan, val - scan);
12939 NEXT_OFF(scan) = val - scan;
12947 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12951 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12957 for (bit=0; bit<32; bit++) {
12958 if (flags & (1<<bit)) {
12959 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12962 if (!set++ && lead)
12963 PerlIO_printf(Perl_debug_log, "%s",lead);
12964 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12967 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12968 if (!set++ && lead) {
12969 PerlIO_printf(Perl_debug_log, "%s",lead);
12972 case REGEX_UNICODE_CHARSET:
12973 PerlIO_printf(Perl_debug_log, "UNICODE");
12975 case REGEX_LOCALE_CHARSET:
12976 PerlIO_printf(Perl_debug_log, "LOCALE");
12978 case REGEX_ASCII_RESTRICTED_CHARSET:
12979 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12981 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12982 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12985 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12991 PerlIO_printf(Perl_debug_log, "\n");
12993 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12999 Perl_regdump(pTHX_ const regexp *r)
13003 SV * const sv = sv_newmortal();
13004 SV *dsv= sv_newmortal();
13005 RXi_GET_DECL(r,ri);
13006 GET_RE_DEBUG_FLAGS_DECL;
13008 PERL_ARGS_ASSERT_REGDUMP;
13010 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13012 /* Header fields of interest. */
13013 if (r->anchored_substr) {
13014 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13015 RE_SV_DUMPLEN(r->anchored_substr), 30);
13016 PerlIO_printf(Perl_debug_log,
13017 "anchored %s%s at %"IVdf" ",
13018 s, RE_SV_TAIL(r->anchored_substr),
13019 (IV)r->anchored_offset);
13020 } else if (r->anchored_utf8) {
13021 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13022 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13023 PerlIO_printf(Perl_debug_log,
13024 "anchored utf8 %s%s at %"IVdf" ",
13025 s, RE_SV_TAIL(r->anchored_utf8),
13026 (IV)r->anchored_offset);
13028 if (r->float_substr) {
13029 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13030 RE_SV_DUMPLEN(r->float_substr), 30);
13031 PerlIO_printf(Perl_debug_log,
13032 "floating %s%s at %"IVdf"..%"UVuf" ",
13033 s, RE_SV_TAIL(r->float_substr),
13034 (IV)r->float_min_offset, (UV)r->float_max_offset);
13035 } else if (r->float_utf8) {
13036 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13037 RE_SV_DUMPLEN(r->float_utf8), 30);
13038 PerlIO_printf(Perl_debug_log,
13039 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13040 s, RE_SV_TAIL(r->float_utf8),
13041 (IV)r->float_min_offset, (UV)r->float_max_offset);
13043 if (r->check_substr || r->check_utf8)
13044 PerlIO_printf(Perl_debug_log,
13046 (r->check_substr == r->float_substr
13047 && r->check_utf8 == r->float_utf8
13048 ? "(checking floating" : "(checking anchored"));
13049 if (r->extflags & RXf_NOSCAN)
13050 PerlIO_printf(Perl_debug_log, " noscan");
13051 if (r->extflags & RXf_CHECK_ALL)
13052 PerlIO_printf(Perl_debug_log, " isall");
13053 if (r->check_substr || r->check_utf8)
13054 PerlIO_printf(Perl_debug_log, ") ");
13056 if (ri->regstclass) {
13057 regprop(r, sv, ri->regstclass);
13058 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13060 if (r->extflags & RXf_ANCH) {
13061 PerlIO_printf(Perl_debug_log, "anchored");
13062 if (r->extflags & RXf_ANCH_BOL)
13063 PerlIO_printf(Perl_debug_log, "(BOL)");
13064 if (r->extflags & RXf_ANCH_MBOL)
13065 PerlIO_printf(Perl_debug_log, "(MBOL)");
13066 if (r->extflags & RXf_ANCH_SBOL)
13067 PerlIO_printf(Perl_debug_log, "(SBOL)");
13068 if (r->extflags & RXf_ANCH_GPOS)
13069 PerlIO_printf(Perl_debug_log, "(GPOS)");
13070 PerlIO_putc(Perl_debug_log, ' ');
13072 if (r->extflags & RXf_GPOS_SEEN)
13073 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13074 if (r->intflags & PREGf_SKIP)
13075 PerlIO_printf(Perl_debug_log, "plus ");
13076 if (r->intflags & PREGf_IMPLICIT)
13077 PerlIO_printf(Perl_debug_log, "implicit ");
13078 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13079 if (r->extflags & RXf_EVAL_SEEN)
13080 PerlIO_printf(Perl_debug_log, "with eval ");
13081 PerlIO_printf(Perl_debug_log, "\n");
13082 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13084 PERL_ARGS_ASSERT_REGDUMP;
13085 PERL_UNUSED_CONTEXT;
13086 PERL_UNUSED_ARG(r);
13087 #endif /* DEBUGGING */
13091 - regprop - printable representation of opcode
13093 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13096 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13097 if (flags & ANYOF_INVERT) \
13098 /*make sure the invert info is in each */ \
13099 sv_catpvs(sv, "^"); \
13105 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13110 RXi_GET_DECL(prog,progi);
13111 GET_RE_DEBUG_FLAGS_DECL;
13113 PERL_ARGS_ASSERT_REGPROP;
13117 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13118 /* It would be nice to FAIL() here, but this may be called from
13119 regexec.c, and it would be hard to supply pRExC_state. */
13120 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13121 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13123 k = PL_regkind[OP(o)];
13126 sv_catpvs(sv, " ");
13127 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13128 * is a crude hack but it may be the best for now since
13129 * we have no flag "this EXACTish node was UTF-8"
13131 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13132 PERL_PV_ESCAPE_UNI_DETECT |
13133 PERL_PV_ESCAPE_NONASCII |
13134 PERL_PV_PRETTY_ELLIPSES |
13135 PERL_PV_PRETTY_LTGT |
13136 PERL_PV_PRETTY_NOCLEAR
13138 } else if (k == TRIE) {
13139 /* print the details of the trie in dumpuntil instead, as
13140 * progi->data isn't available here */
13141 const char op = OP(o);
13142 const U32 n = ARG(o);
13143 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13144 (reg_ac_data *)progi->data->data[n] :
13146 const reg_trie_data * const trie
13147 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13149 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13150 DEBUG_TRIE_COMPILE_r(
13151 Perl_sv_catpvf(aTHX_ sv,
13152 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13153 (UV)trie->startstate,
13154 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13155 (UV)trie->wordcount,
13158 (UV)TRIE_CHARCOUNT(trie),
13159 (UV)trie->uniquecharcount
13162 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13164 int rangestart = -1;
13165 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13166 sv_catpvs(sv, "[");
13167 for (i = 0; i <= 256; i++) {
13168 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13169 if (rangestart == -1)
13171 } else if (rangestart != -1) {
13172 if (i <= rangestart + 3)
13173 for (; rangestart < i; rangestart++)
13174 put_byte(sv, rangestart);
13176 put_byte(sv, rangestart);
13177 sv_catpvs(sv, "-");
13178 put_byte(sv, i - 1);
13183 sv_catpvs(sv, "]");
13186 } else if (k == CURLY) {
13187 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13188 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13189 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13191 else if (k == WHILEM && o->flags) /* Ordinal/of */
13192 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13193 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13194 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13195 if ( RXp_PAREN_NAMES(prog) ) {
13196 if ( k != REF || (OP(o) < NREF)) {
13197 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13198 SV **name= av_fetch(list, ARG(o), 0 );
13200 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13203 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13204 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13205 I32 *nums=(I32*)SvPVX(sv_dat);
13206 SV **name= av_fetch(list, nums[0], 0 );
13209 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13210 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13211 (n ? "," : ""), (IV)nums[n]);
13213 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13217 } else if (k == GOSUB)
13218 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13219 else if (k == VERB) {
13221 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13222 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13223 } else if (k == LOGICAL)
13224 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13225 else if (k == ANYOF) {
13226 int i, rangestart = -1;
13227 const U8 flags = ANYOF_FLAGS(o);
13230 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13231 static const char * const anyofs[] = {
13264 if (flags & ANYOF_LOCALE)
13265 sv_catpvs(sv, "{loc}");
13266 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13267 sv_catpvs(sv, "{i}");
13268 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13269 if (flags & ANYOF_INVERT)
13270 sv_catpvs(sv, "^");
13272 /* output what the standard cp 0-255 bitmap matches */
13273 for (i = 0; i <= 256; i++) {
13274 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13275 if (rangestart == -1)
13277 } else if (rangestart != -1) {
13278 if (i <= rangestart + 3)
13279 for (; rangestart < i; rangestart++)
13280 put_byte(sv, rangestart);
13282 put_byte(sv, rangestart);
13283 sv_catpvs(sv, "-");
13284 put_byte(sv, i - 1);
13291 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13292 /* output any special charclass tests (used entirely under use locale) */
13293 if (ANYOF_CLASS_TEST_ANY_SET(o))
13294 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13295 if (ANYOF_CLASS_TEST(o,i)) {
13296 sv_catpv(sv, anyofs[i]);
13300 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13302 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13303 sv_catpvs(sv, "{non-utf8-latin1-all}");
13306 /* output information about the unicode matching */
13307 if (flags & ANYOF_UNICODE_ALL)
13308 sv_catpvs(sv, "{unicode_all}");
13309 else if (ANYOF_NONBITMAP(o))
13310 sv_catpvs(sv, "{unicode}");
13311 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13312 sv_catpvs(sv, "{outside bitmap}");
13314 if (ANYOF_NONBITMAP(o)) {
13315 SV *lv; /* Set if there is something outside the bit map */
13316 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13317 bool byte_output = FALSE; /* If something in the bitmap has been
13320 if (lv && lv != &PL_sv_undef) {
13322 U8 s[UTF8_MAXBYTES_CASE+1];
13324 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13325 uvchr_to_utf8(s, i);
13328 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13332 && swash_fetch(sw, s, TRUE))
13334 if (rangestart == -1)
13336 } else if (rangestart != -1) {
13337 byte_output = TRUE;
13338 if (i <= rangestart + 3)
13339 for (; rangestart < i; rangestart++) {
13340 put_byte(sv, rangestart);
13343 put_byte(sv, rangestart);
13344 sv_catpvs(sv, "-");
13353 char *s = savesvpv(lv);
13354 char * const origs = s;
13356 while (*s && *s != '\n')
13360 const char * const t = ++s;
13363 sv_catpvs(sv, " ");
13369 /* Truncate very long output */
13370 if (s - origs > 256) {
13371 Perl_sv_catpvf(aTHX_ sv,
13373 (int) (s - origs - 1),
13379 else if (*s == '\t') {
13398 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13400 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13401 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13403 PERL_UNUSED_CONTEXT;
13404 PERL_UNUSED_ARG(sv);
13405 PERL_UNUSED_ARG(o);
13406 PERL_UNUSED_ARG(prog);
13407 #endif /* DEBUGGING */
13411 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13412 { /* Assume that RE_INTUIT is set */
13414 struct regexp *const prog = (struct regexp *)SvANY(r);
13415 GET_RE_DEBUG_FLAGS_DECL;
13417 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13418 PERL_UNUSED_CONTEXT;
13422 const char * const s = SvPV_nolen_const(prog->check_substr
13423 ? prog->check_substr : prog->check_utf8);
13425 if (!PL_colorset) reginitcolors();
13426 PerlIO_printf(Perl_debug_log,
13427 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13429 prog->check_substr ? "" : "utf8 ",
13430 PL_colors[5],PL_colors[0],
13433 (strlen(s) > 60 ? "..." : ""));
13436 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13442 handles refcounting and freeing the perl core regexp structure. When
13443 it is necessary to actually free the structure the first thing it
13444 does is call the 'free' method of the regexp_engine associated to
13445 the regexp, allowing the handling of the void *pprivate; member
13446 first. (This routine is not overridable by extensions, which is why
13447 the extensions free is called first.)
13449 See regdupe and regdupe_internal if you change anything here.
13451 #ifndef PERL_IN_XSUB_RE
13453 Perl_pregfree(pTHX_ REGEXP *r)
13459 Perl_pregfree2(pTHX_ REGEXP *rx)
13462 struct regexp *const r = (struct regexp *)SvANY(rx);
13463 GET_RE_DEBUG_FLAGS_DECL;
13465 PERL_ARGS_ASSERT_PREGFREE2;
13467 if (r->mother_re) {
13468 ReREFCNT_dec(r->mother_re);
13470 CALLREGFREE_PVT(rx); /* free the private data */
13471 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13474 SvREFCNT_dec(r->anchored_substr);
13475 SvREFCNT_dec(r->anchored_utf8);
13476 SvREFCNT_dec(r->float_substr);
13477 SvREFCNT_dec(r->float_utf8);
13478 Safefree(r->substrs);
13480 RX_MATCH_COPY_FREE(rx);
13481 #ifdef PERL_OLD_COPY_ON_WRITE
13482 SvREFCNT_dec(r->saved_copy);
13485 SvREFCNT_dec(r->qr_anoncv);
13490 This is a hacky workaround to the structural issue of match results
13491 being stored in the regexp structure which is in turn stored in
13492 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13493 could be PL_curpm in multiple contexts, and could require multiple
13494 result sets being associated with the pattern simultaneously, such
13495 as when doing a recursive match with (??{$qr})
13497 The solution is to make a lightweight copy of the regexp structure
13498 when a qr// is returned from the code executed by (??{$qr}) this
13499 lightweight copy doesn't actually own any of its data except for
13500 the starp/end and the actual regexp structure itself.
13506 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13508 struct regexp *ret;
13509 struct regexp *const r = (struct regexp *)SvANY(rx);
13511 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13514 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13515 ret = (struct regexp *)SvANY(ret_x);
13517 (void)ReREFCNT_inc(rx);
13518 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13519 by pointing directly at the buffer, but flagging that the allocated
13520 space in the copy is zero. As we've just done a struct copy, it's now
13521 a case of zero-ing that, rather than copying the current length. */
13522 SvPV_set(ret_x, RX_WRAPPED(rx));
13523 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13524 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13525 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13526 SvLEN_set(ret_x, 0);
13527 SvSTASH_set(ret_x, NULL);
13528 SvMAGIC_set(ret_x, NULL);
13530 const I32 npar = r->nparens+1;
13531 Newx(ret->offs, npar, regexp_paren_pair);
13532 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13535 Newx(ret->substrs, 1, struct reg_substr_data);
13536 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13538 SvREFCNT_inc_void(ret->anchored_substr);
13539 SvREFCNT_inc_void(ret->anchored_utf8);
13540 SvREFCNT_inc_void(ret->float_substr);
13541 SvREFCNT_inc_void(ret->float_utf8);
13543 /* check_substr and check_utf8, if non-NULL, point to either their
13544 anchored or float namesakes, and don't hold a second reference. */
13546 RX_MATCH_COPIED_off(ret_x);
13547 #ifdef PERL_OLD_COPY_ON_WRITE
13548 ret->saved_copy = NULL;
13550 ret->mother_re = rx;
13551 SvREFCNT_inc_void(ret->qr_anoncv);
13557 /* regfree_internal()
13559 Free the private data in a regexp. This is overloadable by
13560 extensions. Perl takes care of the regexp structure in pregfree(),
13561 this covers the *pprivate pointer which technically perl doesn't
13562 know about, however of course we have to handle the
13563 regexp_internal structure when no extension is in use.
13565 Note this is called before freeing anything in the regexp
13570 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13573 struct regexp *const r = (struct regexp *)SvANY(rx);
13574 RXi_GET_DECL(r,ri);
13575 GET_RE_DEBUG_FLAGS_DECL;
13577 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13583 SV *dsv= sv_newmortal();
13584 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13585 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13586 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13587 PL_colors[4],PL_colors[5],s);
13590 #ifdef RE_TRACK_PATTERN_OFFSETS
13592 Safefree(ri->u.offsets); /* 20010421 MJD */
13594 if (ri->code_blocks) {
13596 for (n = 0; n < ri->num_code_blocks; n++)
13597 SvREFCNT_dec(ri->code_blocks[n].src_regex);
13598 Safefree(ri->code_blocks);
13602 int n = ri->data->count;
13605 /* If you add a ->what type here, update the comment in regcomp.h */
13606 switch (ri->data->what[n]) {
13612 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13615 Safefree(ri->data->data[n]);
13621 { /* Aho Corasick add-on structure for a trie node.
13622 Used in stclass optimization only */
13624 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13626 refcount = --aho->refcount;
13629 PerlMemShared_free(aho->states);
13630 PerlMemShared_free(aho->fail);
13631 /* do this last!!!! */
13632 PerlMemShared_free(ri->data->data[n]);
13633 PerlMemShared_free(ri->regstclass);
13639 /* trie structure. */
13641 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13643 refcount = --trie->refcount;
13646 PerlMemShared_free(trie->charmap);
13647 PerlMemShared_free(trie->states);
13648 PerlMemShared_free(trie->trans);
13650 PerlMemShared_free(trie->bitmap);
13652 PerlMemShared_free(trie->jump);
13653 PerlMemShared_free(trie->wordinfo);
13654 /* do this last!!!! */
13655 PerlMemShared_free(ri->data->data[n]);
13660 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13663 Safefree(ri->data->what);
13664 Safefree(ri->data);
13670 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13671 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13672 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
13675 re_dup - duplicate a regexp.
13677 This routine is expected to clone a given regexp structure. It is only
13678 compiled under USE_ITHREADS.
13680 After all of the core data stored in struct regexp is duplicated
13681 the regexp_engine.dupe method is used to copy any private data
13682 stored in the *pprivate pointer. This allows extensions to handle
13683 any duplication it needs to do.
13685 See pregfree() and regfree_internal() if you change anything here.
13687 #if defined(USE_ITHREADS)
13688 #ifndef PERL_IN_XSUB_RE
13690 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13694 const struct regexp *r = (const struct regexp *)SvANY(sstr);
13695 struct regexp *ret = (struct regexp *)SvANY(dstr);
13697 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13699 npar = r->nparens+1;
13700 Newx(ret->offs, npar, regexp_paren_pair);
13701 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13703 /* no need to copy these */
13704 Newx(ret->swap, npar, regexp_paren_pair);
13707 if (ret->substrs) {
13708 /* Do it this way to avoid reading from *r after the StructCopy().
13709 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13710 cache, it doesn't matter. */
13711 const bool anchored = r->check_substr
13712 ? r->check_substr == r->anchored_substr
13713 : r->check_utf8 == r->anchored_utf8;
13714 Newx(ret->substrs, 1, struct reg_substr_data);
13715 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13717 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13718 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13719 ret->float_substr = sv_dup_inc(ret->float_substr, param);
13720 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13722 /* check_substr and check_utf8, if non-NULL, point to either their
13723 anchored or float namesakes, and don't hold a second reference. */
13725 if (ret->check_substr) {
13727 assert(r->check_utf8 == r->anchored_utf8);
13728 ret->check_substr = ret->anchored_substr;
13729 ret->check_utf8 = ret->anchored_utf8;
13731 assert(r->check_substr == r->float_substr);
13732 assert(r->check_utf8 == r->float_utf8);
13733 ret->check_substr = ret->float_substr;
13734 ret->check_utf8 = ret->float_utf8;
13736 } else if (ret->check_utf8) {
13738 ret->check_utf8 = ret->anchored_utf8;
13740 ret->check_utf8 = ret->float_utf8;
13745 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13746 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13749 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13751 if (RX_MATCH_COPIED(dstr))
13752 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
13754 ret->subbeg = NULL;
13755 #ifdef PERL_OLD_COPY_ON_WRITE
13756 ret->saved_copy = NULL;
13759 if (ret->mother_re) {
13760 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13761 /* Our storage points directly to our mother regexp, but that's
13762 1: a buffer in a different thread
13763 2: something we no longer hold a reference on
13764 so we need to copy it locally. */
13765 /* Note we need to use SvCUR(), rather than
13766 SvLEN(), on our mother_re, because it, in
13767 turn, may well be pointing to its own mother_re. */
13768 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13769 SvCUR(ret->mother_re)+1));
13770 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13772 ret->mother_re = NULL;
13776 #endif /* PERL_IN_XSUB_RE */
13781 This is the internal complement to regdupe() which is used to copy
13782 the structure pointed to by the *pprivate pointer in the regexp.
13783 This is the core version of the extension overridable cloning hook.
13784 The regexp structure being duplicated will be copied by perl prior
13785 to this and will be provided as the regexp *r argument, however
13786 with the /old/ structures pprivate pointer value. Thus this routine
13787 may override any copying normally done by perl.
13789 It returns a pointer to the new regexp_internal structure.
13793 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13796 struct regexp *const r = (struct regexp *)SvANY(rx);
13797 regexp_internal *reti;
13799 RXi_GET_DECL(r,ri);
13801 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13805 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13806 Copy(ri->program, reti->program, len+1, regnode);
13808 reti->num_code_blocks = ri->num_code_blocks;
13809 if (ri->code_blocks) {
13811 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13812 struct reg_code_block);
13813 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13814 struct reg_code_block);
13815 for (n = 0; n < ri->num_code_blocks; n++)
13816 reti->code_blocks[n].src_regex = (REGEXP*)
13817 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13820 reti->code_blocks = NULL;
13822 reti->regstclass = NULL;
13825 struct reg_data *d;
13826 const int count = ri->data->count;
13829 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13830 char, struct reg_data);
13831 Newx(d->what, count, U8);
13834 for (i = 0; i < count; i++) {
13835 d->what[i] = ri->data->what[i];
13836 switch (d->what[i]) {
13837 /* see also regcomp.h and regfree_internal() */
13838 case 'a': /* actually an AV, but the dup function is identical. */
13842 case 'u': /* actually an HV, but the dup function is identical. */
13843 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13846 /* This is cheating. */
13847 Newx(d->data[i], 1, struct regnode_charclass_class);
13848 StructCopy(ri->data->data[i], d->data[i],
13849 struct regnode_charclass_class);
13850 reti->regstclass = (regnode*)d->data[i];
13853 /* Trie stclasses are readonly and can thus be shared
13854 * without duplication. We free the stclass in pregfree
13855 * when the corresponding reg_ac_data struct is freed.
13857 reti->regstclass= ri->regstclass;
13861 ((reg_trie_data*)ri->data->data[i])->refcount++;
13866 d->data[i] = ri->data->data[i];
13869 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13878 reti->name_list_idx = ri->name_list_idx;
13880 #ifdef RE_TRACK_PATTERN_OFFSETS
13881 if (ri->u.offsets) {
13882 Newx(reti->u.offsets, 2*len+1, U32);
13883 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13886 SetProgLen(reti,len);
13889 return (void*)reti;
13892 #endif /* USE_ITHREADS */
13894 #ifndef PERL_IN_XSUB_RE
13897 - regnext - dig the "next" pointer out of a node
13900 Perl_regnext(pTHX_ register regnode *p)
13903 register I32 offset;
13908 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13909 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13912 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13921 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13924 STRLEN l1 = strlen(pat1);
13925 STRLEN l2 = strlen(pat2);
13928 const char *message;
13930 PERL_ARGS_ASSERT_RE_CROAK2;
13936 Copy(pat1, buf, l1 , char);
13937 Copy(pat2, buf + l1, l2 , char);
13938 buf[l1 + l2] = '\n';
13939 buf[l1 + l2 + 1] = '\0';
13941 /* ANSI variant takes additional second argument */
13942 va_start(args, pat2);
13946 msv = vmess(buf, &args);
13948 message = SvPV_const(msv,l1);
13951 Copy(message, buf, l1 , char);
13952 buf[l1-1] = '\0'; /* Overwrite \n */
13953 Perl_croak(aTHX_ "%s", buf);
13956 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13958 #ifndef PERL_IN_XSUB_RE
13960 Perl_save_re_context(pTHX)
13964 struct re_save_state *state;
13966 SAVEVPTR(PL_curcop);
13967 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13969 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13970 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13971 SSPUSHUV(SAVEt_RE_STATE);
13973 Copy(&PL_reg_state, state, 1, struct re_save_state);
13975 PL_reg_oldsaved = NULL;
13976 PL_reg_oldsavedlen = 0;
13977 PL_reg_maxiter = 0;
13978 PL_reg_leftiter = 0;
13979 PL_reg_poscache = NULL;
13980 PL_reg_poscache_size = 0;
13981 #ifdef PERL_OLD_COPY_ON_WRITE
13985 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13987 const REGEXP * const rx = PM_GETRE(PL_curpm);
13990 for (i = 1; i <= RX_NPARENS(rx); i++) {
13991 char digits[TYPE_CHARS(long)];
13992 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13993 GV *const *const gvp
13994 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13997 GV * const gv = *gvp;
13998 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14008 clear_re(pTHX_ void *r)
14011 ReREFCNT_dec((REGEXP *)r);
14017 S_put_byte(pTHX_ SV *sv, int c)
14019 PERL_ARGS_ASSERT_PUT_BYTE;
14021 /* Our definition of isPRINT() ignores locales, so only bytes that are
14022 not part of UTF-8 are considered printable. I assume that the same
14023 holds for UTF-EBCDIC.
14024 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14025 which Wikipedia says:
14027 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14028 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14029 identical, to the ASCII delete (DEL) or rubout control character.
14030 ) So the old condition can be simplified to !isPRINT(c) */
14033 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14036 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14040 const char string = c;
14041 if (c == '-' || c == ']' || c == '\\' || c == '^')
14042 sv_catpvs(sv, "\\");
14043 sv_catpvn(sv, &string, 1);
14048 #define CLEAR_OPTSTART \
14049 if (optstart) STMT_START { \
14050 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14054 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14056 STATIC const regnode *
14057 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14058 const regnode *last, const regnode *plast,
14059 SV* sv, I32 indent, U32 depth)
14062 register U8 op = PSEUDO; /* Arbitrary non-END op. */
14063 register const regnode *next;
14064 const regnode *optstart= NULL;
14066 RXi_GET_DECL(r,ri);
14067 GET_RE_DEBUG_FLAGS_DECL;
14069 PERL_ARGS_ASSERT_DUMPUNTIL;
14071 #ifdef DEBUG_DUMPUNTIL
14072 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14073 last ? last-start : 0,plast ? plast-start : 0);
14076 if (plast && plast < last)
14079 while (PL_regkind[op] != END && (!last || node < last)) {
14080 /* While that wasn't END last time... */
14083 if (op == CLOSE || op == WHILEM)
14085 next = regnext((regnode *)node);
14088 if (OP(node) == OPTIMIZED) {
14089 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14096 regprop(r, sv, node);
14097 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14098 (int)(2*indent + 1), "", SvPVX_const(sv));
14100 if (OP(node) != OPTIMIZED) {
14101 if (next == NULL) /* Next ptr. */
14102 PerlIO_printf(Perl_debug_log, " (0)");
14103 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14104 PerlIO_printf(Perl_debug_log, " (FAIL)");
14106 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14107 (void)PerlIO_putc(Perl_debug_log, '\n');
14111 if (PL_regkind[(U8)op] == BRANCHJ) {
14114 register const regnode *nnode = (OP(next) == LONGJMP
14115 ? regnext((regnode *)next)
14117 if (last && nnode > last)
14119 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14122 else if (PL_regkind[(U8)op] == BRANCH) {
14124 DUMPUNTIL(NEXTOPER(node), next);
14126 else if ( PL_regkind[(U8)op] == TRIE ) {
14127 const regnode *this_trie = node;
14128 const char op = OP(node);
14129 const U32 n = ARG(node);
14130 const reg_ac_data * const ac = op>=AHOCORASICK ?
14131 (reg_ac_data *)ri->data->data[n] :
14133 const reg_trie_data * const trie =
14134 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14136 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14138 const regnode *nextbranch= NULL;
14141 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14142 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14144 PerlIO_printf(Perl_debug_log, "%*s%s ",
14145 (int)(2*(indent+3)), "",
14146 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14147 PL_colors[0], PL_colors[1],
14148 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14149 PERL_PV_PRETTY_ELLIPSES |
14150 PERL_PV_PRETTY_LTGT
14155 U16 dist= trie->jump[word_idx+1];
14156 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14157 (UV)((dist ? this_trie + dist : next) - start));
14160 nextbranch= this_trie + trie->jump[0];
14161 DUMPUNTIL(this_trie + dist, nextbranch);
14163 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14164 nextbranch= regnext((regnode *)nextbranch);
14166 PerlIO_printf(Perl_debug_log, "\n");
14169 if (last && next > last)
14174 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14175 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14176 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14178 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14180 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14182 else if ( op == PLUS || op == STAR) {
14183 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14185 else if (PL_regkind[(U8)op] == ANYOF) {
14186 /* arglen 1 + class block */
14187 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14188 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14189 node = NEXTOPER(node);
14191 else if (PL_regkind[(U8)op] == EXACT) {
14192 /* Literal string, where present. */
14193 node += NODE_SZ_STR(node) - 1;
14194 node = NEXTOPER(node);
14197 node = NEXTOPER(node);
14198 node += regarglen[(U8)op];
14200 if (op == CURLYX || op == OPEN)
14204 #ifdef DEBUG_DUMPUNTIL
14205 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14210 #endif /* DEBUGGING */
14214 * c-indentation-style: bsd
14215 * c-basic-offset: 4
14216 * indent-tabs-mode: nil
14219 * ex: set ts=8 sts=4 sw=4 et: