5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 # if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107 # endif /* BUGGY_MSC6 */
111 #define STATIC static
115 typedef struct RExC_state_t {
116 U32 flags; /* RXf_* are we folding, multilining? */
117 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
118 char *precomp; /* uncompiled string. */
119 REGEXP *rx_sv; /* The SV that is the regexp. */
120 regexp *rx; /* perl core regexp structure */
121 regexp_internal *rxi; /* internal data for regexp object pprivate field */
122 char *start; /* Start of input for compile */
123 char *end; /* End of input for compile */
124 char *parse; /* Input-scan pointer. */
125 I32 whilem_seen; /* number of WHILEM in this expr */
126 regnode *emit_start; /* Start of emitted-code area */
127 regnode *emit_bound; /* First regnode outside of the allocated space */
128 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
129 I32 naughty; /* How bad is this pattern? */
130 I32 sawback; /* Did we see \1, ...? */
132 I32 size; /* Code size. */
133 I32 npar; /* Capture buffer count, (OPEN). */
134 I32 cpar; /* Capture buffer count, (CLOSE). */
135 I32 nestroot; /* root parens we are in - used by accept */
138 regnode **open_parens; /* pointers to open parens */
139 regnode **close_parens; /* pointers to close parens */
140 regnode *opend; /* END node in program */
141 I32 utf8; /* whether the pattern is utf8 or not */
142 I32 orig_utf8; /* whether the pattern was originally in utf8 */
143 /* XXX use this for future optimisation of case
144 * where pattern must be upgraded to utf8. */
145 I32 uni_semantics; /* If a d charset modifier should use unicode
146 rules, even if the pattern is not in
148 HV *paren_names; /* Paren names */
150 regnode **recurse; /* Recurse regops */
151 I32 recurse_count; /* Number of recurse regops */
154 I32 override_recoding;
155 struct reg_code_block *code_blocks; /* positions of literal (?{})
157 int num_code_blocks; /* size of code_blocks[] */
158 int code_index; /* next code_blocks[] slot */
160 char *starttry; /* -Dr: where regtry was called. */
161 #define RExC_starttry (pRExC_state->starttry)
163 SV *runtime_code_qr; /* qr with the runtime code blocks */
165 const char *lastparse;
167 AV *paren_name_list; /* idx -> name */
168 #define RExC_lastparse (pRExC_state->lastparse)
169 #define RExC_lastnum (pRExC_state->lastnum)
170 #define RExC_paren_name_list (pRExC_state->paren_name_list)
174 #define RExC_flags (pRExC_state->flags)
175 #define RExC_pm_flags (pRExC_state->pm_flags)
176 #define RExC_precomp (pRExC_state->precomp)
177 #define RExC_rx_sv (pRExC_state->rx_sv)
178 #define RExC_rx (pRExC_state->rx)
179 #define RExC_rxi (pRExC_state->rxi)
180 #define RExC_start (pRExC_state->start)
181 #define RExC_end (pRExC_state->end)
182 #define RExC_parse (pRExC_state->parse)
183 #define RExC_whilem_seen (pRExC_state->whilem_seen)
184 #ifdef RE_TRACK_PATTERN_OFFSETS
185 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
187 #define RExC_emit (pRExC_state->emit)
188 #define RExC_emit_start (pRExC_state->emit_start)
189 #define RExC_emit_bound (pRExC_state->emit_bound)
190 #define RExC_naughty (pRExC_state->naughty)
191 #define RExC_sawback (pRExC_state->sawback)
192 #define RExC_seen (pRExC_state->seen)
193 #define RExC_size (pRExC_state->size)
194 #define RExC_npar (pRExC_state->npar)
195 #define RExC_nestroot (pRExC_state->nestroot)
196 #define RExC_extralen (pRExC_state->extralen)
197 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
198 #define RExC_utf8 (pRExC_state->utf8)
199 #define RExC_uni_semantics (pRExC_state->uni_semantics)
200 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
201 #define RExC_open_parens (pRExC_state->open_parens)
202 #define RExC_close_parens (pRExC_state->close_parens)
203 #define RExC_opend (pRExC_state->opend)
204 #define RExC_paren_names (pRExC_state->paren_names)
205 #define RExC_recurse (pRExC_state->recurse)
206 #define RExC_recurse_count (pRExC_state->recurse_count)
207 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
208 #define RExC_contains_locale (pRExC_state->contains_locale)
209 #define RExC_override_recoding (pRExC_state->override_recoding)
212 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
213 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
214 ((*s) == '{' && regcurly(s)))
217 #undef SPSTART /* dratted cpp namespace... */
220 * Flags to be passed up and down.
222 #define WORST 0 /* Worst case. */
223 #define HASWIDTH 0x01 /* Known to match non-null strings. */
225 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
226 * character, and if utf8, must be invariant. Note that this is not the same
227 * thing as REGNODE_SIMPLE */
229 #define SPSTART 0x04 /* Starts with * or +. */
230 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
231 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
233 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
235 /* whether trie related optimizations are enabled */
236 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
237 #define TRIE_STUDY_OPT
238 #define FULL_TRIE_STUDY
244 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
245 #define PBITVAL(paren) (1 << ((paren) & 7))
246 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
247 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
248 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
250 /* If not already in utf8, do a longjmp back to the beginning */
251 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
252 #define REQUIRE_UTF8 STMT_START { \
253 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
256 /* About scan_data_t.
258 During optimisation we recurse through the regexp program performing
259 various inplace (keyhole style) optimisations. In addition study_chunk
260 and scan_commit populate this data structure with information about
261 what strings MUST appear in the pattern. We look for the longest
262 string that must appear at a fixed location, and we look for the
263 longest string that may appear at a floating location. So for instance
268 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
269 strings (because they follow a .* construct). study_chunk will identify
270 both FOO and BAR as being the longest fixed and floating strings respectively.
272 The strings can be composites, for instance
276 will result in a composite fixed substring 'foo'.
278 For each string some basic information is maintained:
280 - offset or min_offset
281 This is the position the string must appear at, or not before.
282 It also implicitly (when combined with minlenp) tells us how many
283 characters must match before the string we are searching for.
284 Likewise when combined with minlenp and the length of the string it
285 tells us how many characters must appear after the string we have
289 Only used for floating strings. This is the rightmost point that
290 the string can appear at. If set to I32 max it indicates that the
291 string can occur infinitely far to the right.
294 A pointer to the minimum length of the pattern that the string
295 was found inside. This is important as in the case of positive
296 lookahead or positive lookbehind we can have multiple patterns
301 The minimum length of the pattern overall is 3, the minimum length
302 of the lookahead part is 3, but the minimum length of the part that
303 will actually match is 1. So 'FOO's minimum length is 3, but the
304 minimum length for the F is 1. This is important as the minimum length
305 is used to determine offsets in front of and behind the string being
306 looked for. Since strings can be composites this is the length of the
307 pattern at the time it was committed with a scan_commit. Note that
308 the length is calculated by study_chunk, so that the minimum lengths
309 are not known until the full pattern has been compiled, thus the
310 pointer to the value.
314 In the case of lookbehind the string being searched for can be
315 offset past the start point of the final matching string.
316 If this value was just blithely removed from the min_offset it would
317 invalidate some of the calculations for how many chars must match
318 before or after (as they are derived from min_offset and minlen and
319 the length of the string being searched for).
320 When the final pattern is compiled and the data is moved from the
321 scan_data_t structure into the regexp structure the information
322 about lookbehind is factored in, with the information that would
323 have been lost precalculated in the end_shift field for the
326 The fields pos_min and pos_delta are used to store the minimum offset
327 and the delta to the maximum offset at the current point in the pattern.
331 typedef struct scan_data_t {
332 /*I32 len_min; unused */
333 /*I32 len_delta; unused */
337 I32 last_end; /* min value, <0 unless valid. */
340 SV **longest; /* Either &l_fixed, or &l_float. */
341 SV *longest_fixed; /* longest fixed string found in pattern */
342 I32 offset_fixed; /* offset where it starts */
343 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
344 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
345 SV *longest_float; /* longest floating string found in pattern */
346 I32 offset_float_min; /* earliest point in string it can appear */
347 I32 offset_float_max; /* latest point in string it can appear */
348 I32 *minlen_float; /* pointer to the minlen relevant to the string */
349 I32 lookbehind_float; /* is the position of the string modified by LB */
353 struct regnode_charclass_class *start_class;
357 * Forward declarations for pregcomp()'s friends.
360 static const scan_data_t zero_scan_data =
361 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
363 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
364 #define SF_BEFORE_SEOL 0x0001
365 #define SF_BEFORE_MEOL 0x0002
366 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
367 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
370 # define SF_FIX_SHIFT_EOL (0+2)
371 # define SF_FL_SHIFT_EOL (0+4)
373 # define SF_FIX_SHIFT_EOL (+2)
374 # define SF_FL_SHIFT_EOL (+4)
377 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
378 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
380 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
381 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
382 #define SF_IS_INF 0x0040
383 #define SF_HAS_PAR 0x0080
384 #define SF_IN_PAR 0x0100
385 #define SF_HAS_EVAL 0x0200
386 #define SCF_DO_SUBSTR 0x0400
387 #define SCF_DO_STCLASS_AND 0x0800
388 #define SCF_DO_STCLASS_OR 0x1000
389 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
390 #define SCF_WHILEM_VISITED_POS 0x2000
392 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
393 #define SCF_SEEN_ACCEPT 0x8000
395 #define UTF cBOOL(RExC_utf8)
397 /* The enums for all these are ordered so things work out correctly */
398 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
399 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
400 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
401 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
402 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
403 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
404 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
406 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
408 #define OOB_NAMEDCLASS -1
410 /* There is no code point that is out-of-bounds, so this is problematic. But
411 * its only current use is to initialize a variable that is always set before
413 #define OOB_UNICODE 0xDEADBEEF
415 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
416 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
419 /* length of regex to show in messages that don't mark a position within */
420 #define RegexLengthToShowInErrorMessages 127
423 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
424 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
425 * op/pragma/warn/regcomp.
427 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
428 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
430 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
433 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
434 * arg. Show regex, up to a maximum length. If it's too long, chop and add
437 #define _FAIL(code) STMT_START { \
438 const char *ellipses = ""; \
439 IV len = RExC_end - RExC_precomp; \
442 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
443 if (len > RegexLengthToShowInErrorMessages) { \
444 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
445 len = RegexLengthToShowInErrorMessages - 10; \
451 #define FAIL(msg) _FAIL( \
452 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
453 msg, (int)len, RExC_precomp, ellipses))
455 #define FAIL2(msg,arg) _FAIL( \
456 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
457 arg, (int)len, RExC_precomp, ellipses))
460 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
462 #define Simple_vFAIL(m) STMT_START { \
463 const IV offset = RExC_parse - RExC_precomp; \
464 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
465 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
469 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
471 #define vFAIL(m) STMT_START { \
473 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
478 * Like Simple_vFAIL(), but accepts two arguments.
480 #define Simple_vFAIL2(m,a1) STMT_START { \
481 const IV offset = RExC_parse - RExC_precomp; \
482 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
483 (int)offset, RExC_precomp, RExC_precomp + offset); \
487 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
489 #define vFAIL2(m,a1) STMT_START { \
491 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
492 Simple_vFAIL2(m, a1); \
497 * Like Simple_vFAIL(), but accepts three arguments.
499 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
500 const IV offset = RExC_parse - RExC_precomp; \
501 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
502 (int)offset, RExC_precomp, RExC_precomp + offset); \
506 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
508 #define vFAIL3(m,a1,a2) STMT_START { \
510 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
511 Simple_vFAIL3(m, a1, a2); \
515 * Like Simple_vFAIL(), but accepts four arguments.
517 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
518 const IV offset = RExC_parse - RExC_precomp; \
519 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
520 (int)offset, RExC_precomp, RExC_precomp + offset); \
523 #define ckWARNreg(loc,m) STMT_START { \
524 const IV offset = loc - RExC_precomp; \
525 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
526 (int)offset, RExC_precomp, RExC_precomp + offset); \
529 #define ckWARNregdep(loc,m) STMT_START { \
530 const IV offset = loc - RExC_precomp; \
531 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
533 (int)offset, RExC_precomp, RExC_precomp + offset); \
536 #define ckWARN2regdep(loc,m, a1) STMT_START { \
537 const IV offset = loc - RExC_precomp; \
538 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
540 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
543 #define ckWARN2reg(loc, m, a1) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
549 #define vWARN3(loc, m, a1, a2) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
555 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
556 const IV offset = loc - RExC_precomp; \
557 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
558 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
561 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
564 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
567 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
568 const IV offset = loc - RExC_precomp; \
569 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
570 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
573 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
574 const IV offset = loc - RExC_precomp; \
575 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
576 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
580 /* Allow for side effects in s */
581 #define REGC(c,s) STMT_START { \
582 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
585 /* Macros for recording node offsets. 20001227 mjd@plover.com
586 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
587 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
588 * Element 0 holds the number n.
589 * Position is 1 indexed.
591 #ifndef RE_TRACK_PATTERN_OFFSETS
592 #define Set_Node_Offset_To_R(node,byte)
593 #define Set_Node_Offset(node,byte)
594 #define Set_Cur_Node_Offset
595 #define Set_Node_Length_To_R(node,len)
596 #define Set_Node_Length(node,len)
597 #define Set_Node_Cur_Length(node)
598 #define Node_Offset(n)
599 #define Node_Length(n)
600 #define Set_Node_Offset_Length(node,offset,len)
601 #define ProgLen(ri) ri->u.proglen
602 #define SetProgLen(ri,x) ri->u.proglen = x
604 #define ProgLen(ri) ri->u.offsets[0]
605 #define SetProgLen(ri,x) ri->u.offsets[0] = x
606 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
608 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
609 __LINE__, (int)(node), (int)(byte))); \
611 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
613 RExC_offsets[2*(node)-1] = (byte); \
618 #define Set_Node_Offset(node,byte) \
619 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
620 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
622 #define Set_Node_Length_To_R(node,len) STMT_START { \
624 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
625 __LINE__, (int)(node), (int)(len))); \
627 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
629 RExC_offsets[2*(node)] = (len); \
634 #define Set_Node_Length(node,len) \
635 Set_Node_Length_To_R((node)-RExC_emit_start, len)
636 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
637 #define Set_Node_Cur_Length(node) \
638 Set_Node_Length(node, RExC_parse - parse_start)
640 /* Get offsets and lengths */
641 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
642 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
644 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
645 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
646 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
650 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
651 #define EXPERIMENTAL_INPLACESCAN
652 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
654 #define DEBUG_STUDYDATA(str,data,depth) \
655 DEBUG_OPTIMISE_MORE_r(if(data){ \
656 PerlIO_printf(Perl_debug_log, \
657 "%*s" str "Pos:%"IVdf"/%"IVdf \
658 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
659 (int)(depth)*2, "", \
660 (IV)((data)->pos_min), \
661 (IV)((data)->pos_delta), \
662 (UV)((data)->flags), \
663 (IV)((data)->whilem_c), \
664 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
665 is_inf ? "INF " : "" \
667 if ((data)->last_found) \
668 PerlIO_printf(Perl_debug_log, \
669 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
670 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
671 SvPVX_const((data)->last_found), \
672 (IV)((data)->last_end), \
673 (IV)((data)->last_start_min), \
674 (IV)((data)->last_start_max), \
675 ((data)->longest && \
676 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
677 SvPVX_const((data)->longest_fixed), \
678 (IV)((data)->offset_fixed), \
679 ((data)->longest && \
680 (data)->longest==&((data)->longest_float)) ? "*" : "", \
681 SvPVX_const((data)->longest_float), \
682 (IV)((data)->offset_float_min), \
683 (IV)((data)->offset_float_max) \
685 PerlIO_printf(Perl_debug_log,"\n"); \
688 static void clear_re(pTHX_ void *r);
690 /* Mark that we cannot extend a found fixed substring at this point.
691 Update the longest found anchored substring and the longest found
692 floating substrings if needed. */
695 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
697 const STRLEN l = CHR_SVLEN(data->last_found);
698 const STRLEN old_l = CHR_SVLEN(*data->longest);
699 GET_RE_DEBUG_FLAGS_DECL;
701 PERL_ARGS_ASSERT_SCAN_COMMIT;
703 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
704 SvSetMagicSV(*data->longest, data->last_found);
705 if (*data->longest == data->longest_fixed) {
706 data->offset_fixed = l ? data->last_start_min : data->pos_min;
707 if (data->flags & SF_BEFORE_EOL)
709 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
711 data->flags &= ~SF_FIX_BEFORE_EOL;
712 data->minlen_fixed=minlenp;
713 data->lookbehind_fixed=0;
715 else { /* *data->longest == data->longest_float */
716 data->offset_float_min = l ? data->last_start_min : data->pos_min;
717 data->offset_float_max = (l
718 ? data->last_start_max
719 : data->pos_min + data->pos_delta);
720 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
721 data->offset_float_max = I32_MAX;
722 if (data->flags & SF_BEFORE_EOL)
724 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
726 data->flags &= ~SF_FL_BEFORE_EOL;
727 data->minlen_float=minlenp;
728 data->lookbehind_float=0;
731 SvCUR_set(data->last_found, 0);
733 SV * const sv = data->last_found;
734 if (SvUTF8(sv) && SvMAGICAL(sv)) {
735 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
741 data->flags &= ~SF_BEFORE_EOL;
742 DEBUG_STUDYDATA("commit: ",data,0);
745 /* Can match anything (initialization) */
747 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
749 PERL_ARGS_ASSERT_CL_ANYTHING;
751 ANYOF_BITMAP_SETALL(cl);
752 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
753 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
755 /* If any portion of the regex is to operate under locale rules,
756 * initialization includes it. The reason this isn't done for all regexes
757 * is that the optimizer was written under the assumption that locale was
758 * all-or-nothing. Given the complexity and lack of documentation in the
759 * optimizer, and that there are inadequate test cases for locale, so many
760 * parts of it may not work properly, it is safest to avoid locale unless
762 if (RExC_contains_locale) {
763 ANYOF_CLASS_SETALL(cl); /* /l uses class */
764 cl->flags |= ANYOF_LOCALE;
767 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
771 /* Can match anything (initialization) */
773 S_cl_is_anything(const struct regnode_charclass_class *cl)
777 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
779 for (value = 0; value <= ANYOF_MAX; value += 2)
780 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
782 if (!(cl->flags & ANYOF_UNICODE_ALL))
784 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
789 /* Can match anything (initialization) */
791 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
793 PERL_ARGS_ASSERT_CL_INIT;
795 Zero(cl, 1, struct regnode_charclass_class);
797 cl_anything(pRExC_state, cl);
798 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
801 /* These two functions currently do the exact same thing */
802 #define cl_init_zero S_cl_init
804 /* 'AND' a given class with another one. Can create false positives. 'cl'
805 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
806 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
808 S_cl_and(struct regnode_charclass_class *cl,
809 const struct regnode_charclass_class *and_with)
811 PERL_ARGS_ASSERT_CL_AND;
813 assert(and_with->type == ANYOF);
815 /* I (khw) am not sure all these restrictions are necessary XXX */
816 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
817 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
818 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
819 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
820 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
823 if (and_with->flags & ANYOF_INVERT)
824 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
825 cl->bitmap[i] &= ~and_with->bitmap[i];
827 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
828 cl->bitmap[i] &= and_with->bitmap[i];
829 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
831 if (and_with->flags & ANYOF_INVERT) {
833 /* Here, the and'ed node is inverted. Get the AND of the flags that
834 * aren't affected by the inversion. Those that are affected are
835 * handled individually below */
836 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
837 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
838 cl->flags |= affected_flags;
840 /* We currently don't know how to deal with things that aren't in the
841 * bitmap, but we know that the intersection is no greater than what
842 * is already in cl, so let there be false positives that get sorted
843 * out after the synthetic start class succeeds, and the node is
844 * matched for real. */
846 /* The inversion of these two flags indicate that the resulting
847 * intersection doesn't have them */
848 if (and_with->flags & ANYOF_UNICODE_ALL) {
849 cl->flags &= ~ANYOF_UNICODE_ALL;
851 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
852 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
855 else { /* and'd node is not inverted */
856 U8 outside_bitmap_but_not_utf8; /* Temp variable */
858 if (! ANYOF_NONBITMAP(and_with)) {
860 /* Here 'and_with' doesn't match anything outside the bitmap
861 * (except possibly ANYOF_UNICODE_ALL), which means the
862 * intersection can't either, except for ANYOF_UNICODE_ALL, in
863 * which case we don't know what the intersection is, but it's no
864 * greater than what cl already has, so can just leave it alone,
865 * with possible false positives */
866 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
867 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
868 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
871 else if (! ANYOF_NONBITMAP(cl)) {
873 /* Here, 'and_with' does match something outside the bitmap, and cl
874 * doesn't have a list of things to match outside the bitmap. If
875 * cl can match all code points above 255, the intersection will
876 * be those above-255 code points that 'and_with' matches. If cl
877 * can't match all Unicode code points, it means that it can't
878 * match anything outside the bitmap (since the 'if' that got us
879 * into this block tested for that), so we leave the bitmap empty.
881 if (cl->flags & ANYOF_UNICODE_ALL) {
882 ARG_SET(cl, ARG(and_with));
884 /* and_with's ARG may match things that don't require UTF8.
885 * And now cl's will too, in spite of this being an 'and'. See
886 * the comments below about the kludge */
887 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
891 /* Here, both 'and_with' and cl match something outside the
892 * bitmap. Currently we do not do the intersection, so just match
893 * whatever cl had at the beginning. */
897 /* Take the intersection of the two sets of flags. However, the
898 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
899 * kludge around the fact that this flag is not treated like the others
900 * which are initialized in cl_anything(). The way the optimizer works
901 * is that the synthetic start class (SSC) is initialized to match
902 * anything, and then the first time a real node is encountered, its
903 * values are AND'd with the SSC's with the result being the values of
904 * the real node. However, there are paths through the optimizer where
905 * the AND never gets called, so those initialized bits are set
906 * inappropriately, which is not usually a big deal, as they just cause
907 * false positives in the SSC, which will just mean a probably
908 * imperceptible slow down in execution. However this bit has a
909 * higher false positive consequence in that it can cause utf8.pm,
910 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
911 * bigger slowdown and also causes significant extra memory to be used.
912 * In order to prevent this, the code now takes a different tack. The
913 * bit isn't set unless some part of the regular expression needs it,
914 * but once set it won't get cleared. This means that these extra
915 * modules won't get loaded unless there was some path through the
916 * pattern that would have required them anyway, and so any false
917 * positives that occur by not ANDing them out when they could be
918 * aren't as severe as they would be if we treated this bit like all
920 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
921 & ANYOF_NONBITMAP_NON_UTF8;
922 cl->flags &= and_with->flags;
923 cl->flags |= outside_bitmap_but_not_utf8;
927 /* 'OR' a given class with another one. Can create false positives. 'cl'
928 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
929 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
931 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
933 PERL_ARGS_ASSERT_CL_OR;
935 if (or_with->flags & ANYOF_INVERT) {
937 /* Here, the or'd node is to be inverted. This means we take the
938 * complement of everything not in the bitmap, but currently we don't
939 * know what that is, so give up and match anything */
940 if (ANYOF_NONBITMAP(or_with)) {
941 cl_anything(pRExC_state, cl);
944 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
945 * <= (B1 | !B2) | (CL1 | !CL2)
946 * which is wasteful if CL2 is small, but we ignore CL2:
947 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
948 * XXXX Can we handle case-fold? Unclear:
949 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
950 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
952 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
953 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
954 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
957 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
958 cl->bitmap[i] |= ~or_with->bitmap[i];
959 } /* XXXX: logic is complicated otherwise */
961 cl_anything(pRExC_state, cl);
964 /* And, we can just take the union of the flags that aren't affected
965 * by the inversion */
966 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
968 /* For the remaining flags:
969 ANYOF_UNICODE_ALL and inverted means to not match anything above
970 255, which means that the union with cl should just be
971 what cl has in it, so can ignore this flag
972 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
973 is 127-255 to match them, but then invert that, so the
974 union with cl should just be what cl has in it, so can
977 } else { /* 'or_with' is not inverted */
978 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
979 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
980 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
981 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
984 /* OR char bitmap and class bitmap separately */
985 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
986 cl->bitmap[i] |= or_with->bitmap[i];
987 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
988 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
989 cl->classflags[i] |= or_with->classflags[i];
990 cl->flags |= ANYOF_CLASS;
993 else { /* XXXX: logic is complicated, leave it along for a moment. */
994 cl_anything(pRExC_state, cl);
997 if (ANYOF_NONBITMAP(or_with)) {
999 /* Use the added node's outside-the-bit-map match if there isn't a
1000 * conflict. If there is a conflict (both nodes match something
1001 * outside the bitmap, but what they match outside is not the same
1002 * pointer, and hence not easily compared until XXX we extend
1003 * inversion lists this far), give up and allow the start class to
1004 * match everything outside the bitmap. If that stuff is all above
1005 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1006 if (! ANYOF_NONBITMAP(cl)) {
1007 ARG_SET(cl, ARG(or_with));
1009 else if (ARG(cl) != ARG(or_with)) {
1011 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1012 cl_anything(pRExC_state, cl);
1015 cl->flags |= ANYOF_UNICODE_ALL;
1020 /* Take the union */
1021 cl->flags |= or_with->flags;
1025 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1026 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1027 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1028 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1033 dump_trie(trie,widecharmap,revcharmap)
1034 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1035 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1037 These routines dump out a trie in a somewhat readable format.
1038 The _interim_ variants are used for debugging the interim
1039 tables that are used to generate the final compressed
1040 representation which is what dump_trie expects.
1042 Part of the reason for their existence is to provide a form
1043 of documentation as to how the different representations function.
1048 Dumps the final compressed table form of the trie to Perl_debug_log.
1049 Used for debugging make_trie().
1053 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1054 AV *revcharmap, U32 depth)
1057 SV *sv=sv_newmortal();
1058 int colwidth= widecharmap ? 6 : 4;
1060 GET_RE_DEBUG_FLAGS_DECL;
1062 PERL_ARGS_ASSERT_DUMP_TRIE;
1064 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1065 (int)depth * 2 + 2,"",
1066 "Match","Base","Ofs" );
1068 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1069 SV ** const tmp = av_fetch( revcharmap, state, 0);
1071 PerlIO_printf( Perl_debug_log, "%*s",
1073 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1074 PL_colors[0], PL_colors[1],
1075 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1076 PERL_PV_ESCAPE_FIRSTCHAR
1081 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1082 (int)depth * 2 + 2,"");
1084 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1085 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1086 PerlIO_printf( Perl_debug_log, "\n");
1088 for( state = 1 ; state < trie->statecount ; state++ ) {
1089 const U32 base = trie->states[ state ].trans.base;
1091 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1093 if ( trie->states[ state ].wordnum ) {
1094 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1096 PerlIO_printf( Perl_debug_log, "%6s", "" );
1099 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1104 while( ( base + ofs < trie->uniquecharcount ) ||
1105 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1106 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1109 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1111 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1112 if ( ( base + ofs >= trie->uniquecharcount ) &&
1113 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1114 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1116 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1118 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1120 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1124 PerlIO_printf( Perl_debug_log, "]");
1127 PerlIO_printf( Perl_debug_log, "\n" );
1129 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1130 for (word=1; word <= trie->wordcount; word++) {
1131 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1132 (int)word, (int)(trie->wordinfo[word].prev),
1133 (int)(trie->wordinfo[word].len));
1135 PerlIO_printf(Perl_debug_log, "\n" );
1138 Dumps a fully constructed but uncompressed trie in list form.
1139 List tries normally only are used for construction when the number of
1140 possible chars (trie->uniquecharcount) is very high.
1141 Used for debugging make_trie().
1144 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1145 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1149 SV *sv=sv_newmortal();
1150 int colwidth= widecharmap ? 6 : 4;
1151 GET_RE_DEBUG_FLAGS_DECL;
1153 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1155 /* print out the table precompression. */
1156 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1157 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1158 "------:-----+-----------------\n" );
1160 for( state=1 ; state < next_alloc ; state ++ ) {
1163 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1164 (int)depth * 2 + 2,"", (UV)state );
1165 if ( ! trie->states[ state ].wordnum ) {
1166 PerlIO_printf( Perl_debug_log, "%5s| ","");
1168 PerlIO_printf( Perl_debug_log, "W%4x| ",
1169 trie->states[ state ].wordnum
1172 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1173 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1175 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1177 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1178 PL_colors[0], PL_colors[1],
1179 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1180 PERL_PV_ESCAPE_FIRSTCHAR
1182 TRIE_LIST_ITEM(state,charid).forid,
1183 (UV)TRIE_LIST_ITEM(state,charid).newstate
1186 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1187 (int)((depth * 2) + 14), "");
1190 PerlIO_printf( Perl_debug_log, "\n");
1195 Dumps a fully constructed but uncompressed trie in table form.
1196 This is the normal DFA style state transition table, with a few
1197 twists to facilitate compression later.
1198 Used for debugging make_trie().
1201 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1202 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1207 SV *sv=sv_newmortal();
1208 int colwidth= widecharmap ? 6 : 4;
1209 GET_RE_DEBUG_FLAGS_DECL;
1211 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1214 print out the table precompression so that we can do a visual check
1215 that they are identical.
1218 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1220 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1221 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1223 PerlIO_printf( Perl_debug_log, "%*s",
1225 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1226 PL_colors[0], PL_colors[1],
1227 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1228 PERL_PV_ESCAPE_FIRSTCHAR
1234 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1236 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1237 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1240 PerlIO_printf( Perl_debug_log, "\n" );
1242 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1244 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1245 (int)depth * 2 + 2,"",
1246 (UV)TRIE_NODENUM( state ) );
1248 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1249 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1251 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1253 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1255 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1256 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1258 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1259 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1267 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1268 startbranch: the first branch in the whole branch sequence
1269 first : start branch of sequence of branch-exact nodes.
1270 May be the same as startbranch
1271 last : Thing following the last branch.
1272 May be the same as tail.
1273 tail : item following the branch sequence
1274 count : words in the sequence
1275 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1276 depth : indent depth
1278 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1280 A trie is an N'ary tree where the branches are determined by digital
1281 decomposition of the key. IE, at the root node you look up the 1st character and
1282 follow that branch repeat until you find the end of the branches. Nodes can be
1283 marked as "accepting" meaning they represent a complete word. Eg:
1287 would convert into the following structure. Numbers represent states, letters
1288 following numbers represent valid transitions on the letter from that state, if
1289 the number is in square brackets it represents an accepting state, otherwise it
1290 will be in parenthesis.
1292 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1296 (1) +-i->(6)-+-s->[7]
1298 +-s->(3)-+-h->(4)-+-e->[5]
1300 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1302 This shows that when matching against the string 'hers' we will begin at state 1
1303 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1304 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1305 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1306 single traverse. We store a mapping from accepting to state to which word was
1307 matched, and then when we have multiple possibilities we try to complete the
1308 rest of the regex in the order in which they occured in the alternation.
1310 The only prior NFA like behaviour that would be changed by the TRIE support is
1311 the silent ignoring of duplicate alternations which are of the form:
1313 / (DUPE|DUPE) X? (?{ ... }) Y /x
1315 Thus EVAL blocks following a trie may be called a different number of times with
1316 and without the optimisation. With the optimisations dupes will be silently
1317 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1318 the following demonstrates:
1320 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1322 which prints out 'word' three times, but
1324 'words'=~/(word|word|word)(?{ print $1 })S/
1326 which doesnt print it out at all. This is due to other optimisations kicking in.
1328 Example of what happens on a structural level:
1330 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1332 1: CURLYM[1] {1,32767}(18)
1343 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1344 and should turn into:
1346 1: CURLYM[1] {1,32767}(18)
1348 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1356 Cases where tail != last would be like /(?foo|bar)baz/:
1366 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1367 and would end up looking like:
1370 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1377 d = uvuni_to_utf8_flags(d, uv, 0);
1379 is the recommended Unicode-aware way of saying
1384 #define TRIE_STORE_REVCHAR(val) \
1387 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1388 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1389 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1390 SvCUR_set(zlopp, kapow - flrbbbbb); \
1393 av_push(revcharmap, zlopp); \
1395 char ooooff = (char)val; \
1396 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1400 #define TRIE_READ_CHAR STMT_START { \
1403 /* if it is UTF then it is either already folded, or does not need folding */ \
1404 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1406 else if (folder == PL_fold_latin1) { \
1407 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1408 if ( foldlen > 0 ) { \
1409 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1415 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1416 skiplen = UNISKIP(uvc); \
1417 foldlen -= skiplen; \
1418 scan = foldbuf + skiplen; \
1421 /* raw data, will be folded later if needed */ \
1429 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1430 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1431 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1432 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1434 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1435 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1436 TRIE_LIST_CUR( state )++; \
1439 #define TRIE_LIST_NEW(state) STMT_START { \
1440 Newxz( trie->states[ state ].trans.list, \
1441 4, reg_trie_trans_le ); \
1442 TRIE_LIST_CUR( state ) = 1; \
1443 TRIE_LIST_LEN( state ) = 4; \
1446 #define TRIE_HANDLE_WORD(state) STMT_START { \
1447 U16 dupe= trie->states[ state ].wordnum; \
1448 regnode * const noper_next = regnext( noper ); \
1451 /* store the word for dumping */ \
1453 if (OP(noper) != NOTHING) \
1454 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1456 tmp = newSVpvn_utf8( "", 0, UTF ); \
1457 av_push( trie_words, tmp ); \
1461 trie->wordinfo[curword].prev = 0; \
1462 trie->wordinfo[curword].len = wordlen; \
1463 trie->wordinfo[curword].accept = state; \
1465 if ( noper_next < tail ) { \
1467 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1468 trie->jump[curword] = (U16)(noper_next - convert); \
1470 jumper = noper_next; \
1472 nextbranch= regnext(cur); \
1476 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1477 /* chain, so that when the bits of chain are later */\
1478 /* linked together, the dups appear in the chain */\
1479 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1480 trie->wordinfo[dupe].prev = curword; \
1482 /* we haven't inserted this word yet. */ \
1483 trie->states[ state ].wordnum = curword; \
1488 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1489 ( ( base + charid >= ucharcount \
1490 && base + charid < ubound \
1491 && state == trie->trans[ base - ucharcount + charid ].check \
1492 && trie->trans[ base - ucharcount + charid ].next ) \
1493 ? trie->trans[ base - ucharcount + charid ].next \
1494 : ( state==1 ? special : 0 ) \
1498 #define MADE_JUMP_TRIE 2
1499 #define MADE_EXACT_TRIE 4
1502 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1505 /* first pass, loop through and scan words */
1506 reg_trie_data *trie;
1507 HV *widecharmap = NULL;
1508 AV *revcharmap = newAV();
1510 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1515 regnode *jumper = NULL;
1516 regnode *nextbranch = NULL;
1517 regnode *convert = NULL;
1518 U32 *prev_states; /* temp array mapping each state to previous one */
1519 /* we just use folder as a flag in utf8 */
1520 const U8 * folder = NULL;
1523 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1524 AV *trie_words = NULL;
1525 /* along with revcharmap, this only used during construction but both are
1526 * useful during debugging so we store them in the struct when debugging.
1529 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1530 STRLEN trie_charcount=0;
1532 SV *re_trie_maxbuff;
1533 GET_RE_DEBUG_FLAGS_DECL;
1535 PERL_ARGS_ASSERT_MAKE_TRIE;
1537 PERL_UNUSED_ARG(depth);
1544 case EXACTFU_TRICKYFOLD:
1545 case EXACTFU: folder = PL_fold_latin1; break;
1546 case EXACTF: folder = PL_fold; break;
1547 case EXACTFL: folder = PL_fold_locale; break;
1548 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1551 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1553 trie->startstate = 1;
1554 trie->wordcount = word_count;
1555 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1556 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1558 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1559 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1560 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1563 trie_words = newAV();
1566 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1567 if (!SvIOK(re_trie_maxbuff)) {
1568 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1570 DEBUG_TRIE_COMPILE_r({
1571 PerlIO_printf( Perl_debug_log,
1572 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1573 (int)depth * 2 + 2, "",
1574 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1575 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1579 /* Find the node we are going to overwrite */
1580 if ( first == startbranch && OP( last ) != BRANCH ) {
1581 /* whole branch chain */
1584 /* branch sub-chain */
1585 convert = NEXTOPER( first );
1588 /* -- First loop and Setup --
1590 We first traverse the branches and scan each word to determine if it
1591 contains widechars, and how many unique chars there are, this is
1592 important as we have to build a table with at least as many columns as we
1595 We use an array of integers to represent the character codes 0..255
1596 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1597 native representation of the character value as the key and IV's for the
1600 *TODO* If we keep track of how many times each character is used we can
1601 remap the columns so that the table compression later on is more
1602 efficient in terms of memory by ensuring the most common value is in the
1603 middle and the least common are on the outside. IMO this would be better
1604 than a most to least common mapping as theres a decent chance the most
1605 common letter will share a node with the least common, meaning the node
1606 will not be compressible. With a middle is most common approach the worst
1607 case is when we have the least common nodes twice.
1611 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1612 regnode *noper = NEXTOPER( cur );
1613 const U8 *uc = (U8*)STRING( noper );
1614 const U8 *e = uc + STR_LEN( noper );
1616 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1618 const U8 *scan = (U8*)NULL;
1619 U32 wordlen = 0; /* required init */
1621 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1623 if (OP(noper) == NOTHING) {
1624 regnode *noper_next= regnext(noper);
1625 if (noper_next != tail && OP(noper_next) == flags) {
1627 uc= (U8*)STRING(noper);
1628 e= uc + STR_LEN(noper);
1629 trie->minlen= STR_LEN(noper);
1636 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1637 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1638 regardless of encoding */
1639 if (OP( noper ) == EXACTFU_SS) {
1640 /* false positives are ok, so just set this */
1641 TRIE_BITMAP_SET(trie,0xDF);
1644 for ( ; uc < e ; uc += len ) {
1645 TRIE_CHARCOUNT(trie)++;
1650 U8 folded= folder[ (U8) uvc ];
1651 if ( !trie->charmap[ folded ] ) {
1652 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1653 TRIE_STORE_REVCHAR( folded );
1656 if ( !trie->charmap[ uvc ] ) {
1657 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1658 TRIE_STORE_REVCHAR( uvc );
1661 /* store the codepoint in the bitmap, and its folded
1663 TRIE_BITMAP_SET(trie, uvc);
1665 /* store the folded codepoint */
1666 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1669 /* store first byte of utf8 representation of
1670 variant codepoints */
1671 if (! UNI_IS_INVARIANT(uvc)) {
1672 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1675 set_bit = 0; /* We've done our bit :-) */
1680 widecharmap = newHV();
1682 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1685 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1687 if ( !SvTRUE( *svpp ) ) {
1688 sv_setiv( *svpp, ++trie->uniquecharcount );
1689 TRIE_STORE_REVCHAR(uvc);
1693 if( cur == first ) {
1694 trie->minlen = chars;
1695 trie->maxlen = chars;
1696 } else if (chars < trie->minlen) {
1697 trie->minlen = chars;
1698 } else if (chars > trie->maxlen) {
1699 trie->maxlen = chars;
1701 if (OP( noper ) == EXACTFU_SS) {
1702 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1703 if (trie->minlen > 1)
1706 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1707 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1708 * - We assume that any such sequence might match a 2 byte string */
1709 if (trie->minlen > 2 )
1713 } /* end first pass */
1714 DEBUG_TRIE_COMPILE_r(
1715 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1716 (int)depth * 2 + 2,"",
1717 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1718 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1719 (int)trie->minlen, (int)trie->maxlen )
1723 We now know what we are dealing with in terms of unique chars and
1724 string sizes so we can calculate how much memory a naive
1725 representation using a flat table will take. If it's over a reasonable
1726 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1727 conservative but potentially much slower representation using an array
1730 At the end we convert both representations into the same compressed
1731 form that will be used in regexec.c for matching with. The latter
1732 is a form that cannot be used to construct with but has memory
1733 properties similar to the list form and access properties similar
1734 to the table form making it both suitable for fast searches and
1735 small enough that its feasable to store for the duration of a program.
1737 See the comment in the code where the compressed table is produced
1738 inplace from the flat tabe representation for an explanation of how
1739 the compression works.
1744 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1747 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1749 Second Pass -- Array Of Lists Representation
1751 Each state will be represented by a list of charid:state records
1752 (reg_trie_trans_le) the first such element holds the CUR and LEN
1753 points of the allocated array. (See defines above).
1755 We build the initial structure using the lists, and then convert
1756 it into the compressed table form which allows faster lookups
1757 (but cant be modified once converted).
1760 STRLEN transcount = 1;
1762 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1763 "%*sCompiling trie using list compiler\n",
1764 (int)depth * 2 + 2, ""));
1766 trie->states = (reg_trie_state *)
1767 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1768 sizeof(reg_trie_state) );
1772 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1774 regnode *noper = NEXTOPER( cur );
1775 U8 *uc = (U8*)STRING( noper );
1776 const U8 *e = uc + STR_LEN( noper );
1777 U32 state = 1; /* required init */
1778 U16 charid = 0; /* sanity init */
1779 U8 *scan = (U8*)NULL; /* sanity init */
1780 STRLEN foldlen = 0; /* required init */
1781 U32 wordlen = 0; /* required init */
1782 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1785 if (OP(noper) == NOTHING) {
1786 regnode *noper_next= regnext(noper);
1787 if (noper_next != tail && OP(noper_next) == flags) {
1789 uc= (U8*)STRING(noper);
1790 e= uc + STR_LEN(noper);
1794 if (OP(noper) != NOTHING) {
1795 for ( ; uc < e ; uc += len ) {
1800 charid = trie->charmap[ uvc ];
1802 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1806 charid=(U16)SvIV( *svpp );
1809 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1816 if ( !trie->states[ state ].trans.list ) {
1817 TRIE_LIST_NEW( state );
1819 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1820 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1821 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1826 newstate = next_alloc++;
1827 prev_states[newstate] = state;
1828 TRIE_LIST_PUSH( state, charid, newstate );
1833 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1837 TRIE_HANDLE_WORD(state);
1839 } /* end second pass */
1841 /* next alloc is the NEXT state to be allocated */
1842 trie->statecount = next_alloc;
1843 trie->states = (reg_trie_state *)
1844 PerlMemShared_realloc( trie->states,
1846 * sizeof(reg_trie_state) );
1848 /* and now dump it out before we compress it */
1849 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1850 revcharmap, next_alloc,
1854 trie->trans = (reg_trie_trans *)
1855 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1862 for( state=1 ; state < next_alloc ; state ++ ) {
1866 DEBUG_TRIE_COMPILE_MORE_r(
1867 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1871 if (trie->states[state].trans.list) {
1872 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1876 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1877 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1878 if ( forid < minid ) {
1880 } else if ( forid > maxid ) {
1884 if ( transcount < tp + maxid - minid + 1) {
1886 trie->trans = (reg_trie_trans *)
1887 PerlMemShared_realloc( trie->trans,
1889 * sizeof(reg_trie_trans) );
1890 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1892 base = trie->uniquecharcount + tp - minid;
1893 if ( maxid == minid ) {
1895 for ( ; zp < tp ; zp++ ) {
1896 if ( ! trie->trans[ zp ].next ) {
1897 base = trie->uniquecharcount + zp - minid;
1898 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1899 trie->trans[ zp ].check = state;
1905 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1906 trie->trans[ tp ].check = state;
1911 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1912 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1913 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1914 trie->trans[ tid ].check = state;
1916 tp += ( maxid - minid + 1 );
1918 Safefree(trie->states[ state ].trans.list);
1921 DEBUG_TRIE_COMPILE_MORE_r(
1922 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1925 trie->states[ state ].trans.base=base;
1927 trie->lasttrans = tp + 1;
1931 Second Pass -- Flat Table Representation.
1933 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1934 We know that we will need Charcount+1 trans at most to store the data
1935 (one row per char at worst case) So we preallocate both structures
1936 assuming worst case.
1938 We then construct the trie using only the .next slots of the entry
1941 We use the .check field of the first entry of the node temporarily to
1942 make compression both faster and easier by keeping track of how many non
1943 zero fields are in the node.
1945 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1948 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1949 number representing the first entry of the node, and state as a
1950 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1951 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1952 are 2 entrys per node. eg:
1960 The table is internally in the right hand, idx form. However as we also
1961 have to deal with the states array which is indexed by nodenum we have to
1962 use TRIE_NODENUM() to convert.
1965 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1966 "%*sCompiling trie using table compiler\n",
1967 (int)depth * 2 + 2, ""));
1969 trie->trans = (reg_trie_trans *)
1970 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1971 * trie->uniquecharcount + 1,
1972 sizeof(reg_trie_trans) );
1973 trie->states = (reg_trie_state *)
1974 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1975 sizeof(reg_trie_state) );
1976 next_alloc = trie->uniquecharcount + 1;
1979 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1981 regnode *noper = NEXTOPER( cur );
1982 const U8 *uc = (U8*)STRING( noper );
1983 const U8 *e = uc + STR_LEN( noper );
1985 U32 state = 1; /* required init */
1987 U16 charid = 0; /* sanity init */
1988 U32 accept_state = 0; /* sanity init */
1989 U8 *scan = (U8*)NULL; /* sanity init */
1991 STRLEN foldlen = 0; /* required init */
1992 U32 wordlen = 0; /* required init */
1994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1996 if (OP(noper) == NOTHING) {
1997 regnode *noper_next= regnext(noper);
1998 if (noper_next != tail && OP(noper_next) == flags) {
2000 uc= (U8*)STRING(noper);
2001 e= uc + STR_LEN(noper);
2005 if ( OP(noper) != NOTHING ) {
2006 for ( ; uc < e ; uc += len ) {
2011 charid = trie->charmap[ uvc ];
2013 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2014 charid = svpp ? (U16)SvIV(*svpp) : 0;
2018 if ( !trie->trans[ state + charid ].next ) {
2019 trie->trans[ state + charid ].next = next_alloc;
2020 trie->trans[ state ].check++;
2021 prev_states[TRIE_NODENUM(next_alloc)]
2022 = TRIE_NODENUM(state);
2023 next_alloc += trie->uniquecharcount;
2025 state = trie->trans[ state + charid ].next;
2027 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2029 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2032 accept_state = TRIE_NODENUM( state );
2033 TRIE_HANDLE_WORD(accept_state);
2035 } /* end second pass */
2037 /* and now dump it out before we compress it */
2038 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2040 next_alloc, depth+1));
2044 * Inplace compress the table.*
2046 For sparse data sets the table constructed by the trie algorithm will
2047 be mostly 0/FAIL transitions or to put it another way mostly empty.
2048 (Note that leaf nodes will not contain any transitions.)
2050 This algorithm compresses the tables by eliminating most such
2051 transitions, at the cost of a modest bit of extra work during lookup:
2053 - Each states[] entry contains a .base field which indicates the
2054 index in the state[] array wheres its transition data is stored.
2056 - If .base is 0 there are no valid transitions from that node.
2058 - If .base is nonzero then charid is added to it to find an entry in
2061 -If trans[states[state].base+charid].check!=state then the
2062 transition is taken to be a 0/Fail transition. Thus if there are fail
2063 transitions at the front of the node then the .base offset will point
2064 somewhere inside the previous nodes data (or maybe even into a node
2065 even earlier), but the .check field determines if the transition is
2069 The following process inplace converts the table to the compressed
2070 table: We first do not compress the root node 1,and mark all its
2071 .check pointers as 1 and set its .base pointer as 1 as well. This
2072 allows us to do a DFA construction from the compressed table later,
2073 and ensures that any .base pointers we calculate later are greater
2076 - We set 'pos' to indicate the first entry of the second node.
2078 - We then iterate over the columns of the node, finding the first and
2079 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2080 and set the .check pointers accordingly, and advance pos
2081 appropriately and repreat for the next node. Note that when we copy
2082 the next pointers we have to convert them from the original
2083 NODEIDX form to NODENUM form as the former is not valid post
2086 - If a node has no transitions used we mark its base as 0 and do not
2087 advance the pos pointer.
2089 - If a node only has one transition we use a second pointer into the
2090 structure to fill in allocated fail transitions from other states.
2091 This pointer is independent of the main pointer and scans forward
2092 looking for null transitions that are allocated to a state. When it
2093 finds one it writes the single transition into the "hole". If the
2094 pointer doesnt find one the single transition is appended as normal.
2096 - Once compressed we can Renew/realloc the structures to release the
2099 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2100 specifically Fig 3.47 and the associated pseudocode.
2104 const U32 laststate = TRIE_NODENUM( next_alloc );
2107 trie->statecount = laststate;
2109 for ( state = 1 ; state < laststate ; state++ ) {
2111 const U32 stateidx = TRIE_NODEIDX( state );
2112 const U32 o_used = trie->trans[ stateidx ].check;
2113 U32 used = trie->trans[ stateidx ].check;
2114 trie->trans[ stateidx ].check = 0;
2116 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2117 if ( flag || trie->trans[ stateidx + charid ].next ) {
2118 if ( trie->trans[ stateidx + charid ].next ) {
2120 for ( ; zp < pos ; zp++ ) {
2121 if ( ! trie->trans[ zp ].next ) {
2125 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2126 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2127 trie->trans[ zp ].check = state;
2128 if ( ++zp > pos ) pos = zp;
2135 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2137 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2138 trie->trans[ pos ].check = state;
2143 trie->lasttrans = pos + 1;
2144 trie->states = (reg_trie_state *)
2145 PerlMemShared_realloc( trie->states, laststate
2146 * sizeof(reg_trie_state) );
2147 DEBUG_TRIE_COMPILE_MORE_r(
2148 PerlIO_printf( Perl_debug_log,
2149 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2150 (int)depth * 2 + 2,"",
2151 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2154 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2157 } /* end table compress */
2159 DEBUG_TRIE_COMPILE_MORE_r(
2160 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2161 (int)depth * 2 + 2, "",
2162 (UV)trie->statecount,
2163 (UV)trie->lasttrans)
2165 /* resize the trans array to remove unused space */
2166 trie->trans = (reg_trie_trans *)
2167 PerlMemShared_realloc( trie->trans, trie->lasttrans
2168 * sizeof(reg_trie_trans) );
2170 { /* Modify the program and insert the new TRIE node */
2171 U8 nodetype =(U8)(flags & 0xFF);
2175 regnode *optimize = NULL;
2176 #ifdef RE_TRACK_PATTERN_OFFSETS
2179 U32 mjd_nodelen = 0;
2180 #endif /* RE_TRACK_PATTERN_OFFSETS */
2181 #endif /* DEBUGGING */
2183 This means we convert either the first branch or the first Exact,
2184 depending on whether the thing following (in 'last') is a branch
2185 or not and whther first is the startbranch (ie is it a sub part of
2186 the alternation or is it the whole thing.)
2187 Assuming its a sub part we convert the EXACT otherwise we convert
2188 the whole branch sequence, including the first.
2190 /* Find the node we are going to overwrite */
2191 if ( first != startbranch || OP( last ) == BRANCH ) {
2192 /* branch sub-chain */
2193 NEXT_OFF( first ) = (U16)(last - first);
2194 #ifdef RE_TRACK_PATTERN_OFFSETS
2196 mjd_offset= Node_Offset((convert));
2197 mjd_nodelen= Node_Length((convert));
2200 /* whole branch chain */
2202 #ifdef RE_TRACK_PATTERN_OFFSETS
2205 const regnode *nop = NEXTOPER( convert );
2206 mjd_offset= Node_Offset((nop));
2207 mjd_nodelen= Node_Length((nop));
2211 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2212 (int)depth * 2 + 2, "",
2213 (UV)mjd_offset, (UV)mjd_nodelen)
2216 /* But first we check to see if there is a common prefix we can
2217 split out as an EXACT and put in front of the TRIE node. */
2218 trie->startstate= 1;
2219 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2221 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2225 const U32 base = trie->states[ state ].trans.base;
2227 if ( trie->states[state].wordnum )
2230 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2231 if ( ( base + ofs >= trie->uniquecharcount ) &&
2232 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2233 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2235 if ( ++count > 1 ) {
2236 SV **tmp = av_fetch( revcharmap, ofs, 0);
2237 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2238 if ( state == 1 ) break;
2240 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2242 PerlIO_printf(Perl_debug_log,
2243 "%*sNew Start State=%"UVuf" Class: [",
2244 (int)depth * 2 + 2, "",
2247 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2248 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2250 TRIE_BITMAP_SET(trie,*ch);
2252 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2254 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2258 TRIE_BITMAP_SET(trie,*ch);
2260 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2261 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2267 SV **tmp = av_fetch( revcharmap, idx, 0);
2269 char *ch = SvPV( *tmp, len );
2271 SV *sv=sv_newmortal();
2272 PerlIO_printf( Perl_debug_log,
2273 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2274 (int)depth * 2 + 2, "",
2276 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2277 PL_colors[0], PL_colors[1],
2278 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2279 PERL_PV_ESCAPE_FIRSTCHAR
2284 OP( convert ) = nodetype;
2285 str=STRING(convert);
2288 STR_LEN(convert) += len;
2294 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2299 trie->prefixlen = (state-1);
2301 regnode *n = convert+NODE_SZ_STR(convert);
2302 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2303 trie->startstate = state;
2304 trie->minlen -= (state - 1);
2305 trie->maxlen -= (state - 1);
2307 /* At least the UNICOS C compiler choked on this
2308 * being argument to DEBUG_r(), so let's just have
2311 #ifdef PERL_EXT_RE_BUILD
2317 regnode *fix = convert;
2318 U32 word = trie->wordcount;
2320 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2321 while( ++fix < n ) {
2322 Set_Node_Offset_Length(fix, 0, 0);
2325 SV ** const tmp = av_fetch( trie_words, word, 0 );
2327 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2328 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2330 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2338 NEXT_OFF(convert) = (U16)(tail - convert);
2339 DEBUG_r(optimize= n);
2345 if ( trie->maxlen ) {
2346 NEXT_OFF( convert ) = (U16)(tail - convert);
2347 ARG_SET( convert, data_slot );
2348 /* Store the offset to the first unabsorbed branch in
2349 jump[0], which is otherwise unused by the jump logic.
2350 We use this when dumping a trie and during optimisation. */
2352 trie->jump[0] = (U16)(nextbranch - convert);
2354 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2355 * and there is a bitmap
2356 * and the first "jump target" node we found leaves enough room
2357 * then convert the TRIE node into a TRIEC node, with the bitmap
2358 * embedded inline in the opcode - this is hypothetically faster.
2360 if ( !trie->states[trie->startstate].wordnum
2362 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2364 OP( convert ) = TRIEC;
2365 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2366 PerlMemShared_free(trie->bitmap);
2369 OP( convert ) = TRIE;
2371 /* store the type in the flags */
2372 convert->flags = nodetype;
2376 + regarglen[ OP( convert ) ];
2378 /* XXX We really should free up the resource in trie now,
2379 as we won't use them - (which resources?) dmq */
2381 /* needed for dumping*/
2382 DEBUG_r(if (optimize) {
2383 regnode *opt = convert;
2385 while ( ++opt < optimize) {
2386 Set_Node_Offset_Length(opt,0,0);
2389 Try to clean up some of the debris left after the
2392 while( optimize < jumper ) {
2393 mjd_nodelen += Node_Length((optimize));
2394 OP( optimize ) = OPTIMIZED;
2395 Set_Node_Offset_Length(optimize,0,0);
2398 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2400 } /* end node insert */
2401 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2403 /* Finish populating the prev field of the wordinfo array. Walk back
2404 * from each accept state until we find another accept state, and if
2405 * so, point the first word's .prev field at the second word. If the
2406 * second already has a .prev field set, stop now. This will be the
2407 * case either if we've already processed that word's accept state,
2408 * or that state had multiple words, and the overspill words were
2409 * already linked up earlier.
2416 for (word=1; word <= trie->wordcount; word++) {
2418 if (trie->wordinfo[word].prev)
2420 state = trie->wordinfo[word].accept;
2422 state = prev_states[state];
2425 prev = trie->states[state].wordnum;
2429 trie->wordinfo[word].prev = prev;
2431 Safefree(prev_states);
2435 /* and now dump out the compressed format */
2436 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2438 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2440 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2441 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2443 SvREFCNT_dec(revcharmap);
2447 : trie->startstate>1
2453 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2455 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2457 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2458 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2461 We find the fail state for each state in the trie, this state is the longest proper
2462 suffix of the current state's 'word' that is also a proper prefix of another word in our
2463 trie. State 1 represents the word '' and is thus the default fail state. This allows
2464 the DFA not to have to restart after its tried and failed a word at a given point, it
2465 simply continues as though it had been matching the other word in the first place.
2467 'abcdgu'=~/abcdefg|cdgu/
2468 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2469 fail, which would bring us to the state representing 'd' in the second word where we would
2470 try 'g' and succeed, proceeding to match 'cdgu'.
2472 /* add a fail transition */
2473 const U32 trie_offset = ARG(source);
2474 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2476 const U32 ucharcount = trie->uniquecharcount;
2477 const U32 numstates = trie->statecount;
2478 const U32 ubound = trie->lasttrans + ucharcount;
2482 U32 base = trie->states[ 1 ].trans.base;
2485 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2486 GET_RE_DEBUG_FLAGS_DECL;
2488 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2490 PERL_UNUSED_ARG(depth);
2494 ARG_SET( stclass, data_slot );
2495 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2496 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2497 aho->trie=trie_offset;
2498 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2499 Copy( trie->states, aho->states, numstates, reg_trie_state );
2500 Newxz( q, numstates, U32);
2501 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2504 /* initialize fail[0..1] to be 1 so that we always have
2505 a valid final fail state */
2506 fail[ 0 ] = fail[ 1 ] = 1;
2508 for ( charid = 0; charid < ucharcount ; charid++ ) {
2509 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2511 q[ q_write ] = newstate;
2512 /* set to point at the root */
2513 fail[ q[ q_write++ ] ]=1;
2516 while ( q_read < q_write) {
2517 const U32 cur = q[ q_read++ % numstates ];
2518 base = trie->states[ cur ].trans.base;
2520 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2521 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2523 U32 fail_state = cur;
2526 fail_state = fail[ fail_state ];
2527 fail_base = aho->states[ fail_state ].trans.base;
2528 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2530 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2531 fail[ ch_state ] = fail_state;
2532 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2534 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2536 q[ q_write++ % numstates] = ch_state;
2540 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2541 when we fail in state 1, this allows us to use the
2542 charclass scan to find a valid start char. This is based on the principle
2543 that theres a good chance the string being searched contains lots of stuff
2544 that cant be a start char.
2546 fail[ 0 ] = fail[ 1 ] = 0;
2547 DEBUG_TRIE_COMPILE_r({
2548 PerlIO_printf(Perl_debug_log,
2549 "%*sStclass Failtable (%"UVuf" states): 0",
2550 (int)(depth * 2), "", (UV)numstates
2552 for( q_read=1; q_read<numstates; q_read++ ) {
2553 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2555 PerlIO_printf(Perl_debug_log, "\n");
2558 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2563 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2564 * These need to be revisited when a newer toolchain becomes available.
2566 #if defined(__sparc64__) && defined(__GNUC__)
2567 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2568 # undef SPARC64_GCC_WORKAROUND
2569 # define SPARC64_GCC_WORKAROUND 1
2573 #define DEBUG_PEEP(str,scan,depth) \
2574 DEBUG_OPTIMISE_r({if (scan){ \
2575 SV * const mysv=sv_newmortal(); \
2576 regnode *Next = regnext(scan); \
2577 regprop(RExC_rx, mysv, scan); \
2578 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2579 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2580 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2584 /* The below joins as many adjacent EXACTish nodes as possible into a single
2585 * one, and looks for problematic sequences of characters whose folds vs.
2586 * non-folds have sufficiently different lengths, that the optimizer would be
2587 * fooled into rejecting legitimate matches of them, and the trie construction
2588 * code needs to handle specially. The joining is only done if:
2589 * 1) there is room in the current conglomerated node to entirely contain the
2591 * 2) they are the exact same node type
2593 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2594 * these get optimized out
2596 * If there are problematic code sequences, *min_subtract is set to the delta
2597 * that the minimum size of the node can be less than its actual size. And,
2598 * the node type of the result is changed to reflect that it contains these
2601 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2602 * and contains LATIN SMALL LETTER SHARP S
2604 * This is as good a place as any to discuss the design of handling these
2605 * problematic sequences. It's been wrong in Perl for a very long time. There
2606 * are three code points currently in Unicode whose folded lengths differ so
2607 * much from the un-folded lengths that it causes problems for the optimizer
2608 * and trie construction. Why only these are problematic, and not others where
2609 * lengths also differ is something I (khw) do not understand. New versions of
2610 * Unicode might add more such code points. Hopefully the logic in
2611 * fold_grind.t that figures out what to test (in part by verifying that each
2612 * size-combination gets tested) will catch any that do come along, so they can
2613 * be added to the special handling below. The chances of new ones are
2614 * actually rather small, as most, if not all, of the world's scripts that have
2615 * casefolding have already been encoded by Unicode. Also, a number of
2616 * Unicode's decisions were made to allow compatibility with pre-existing
2617 * standards, and almost all of those have already been dealt with. These
2618 * would otherwise be the most likely candidates for generating further tricky
2619 * sequences. In other words, Unicode by itself is unlikely to add new ones
2620 * unless it is for compatibility with pre-existing standards, and there aren't
2621 * many of those left.
2623 * The previous designs for dealing with these involved assigning a special
2624 * node for them. This approach doesn't work, as evidenced by this example:
2625 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2626 * Both these fold to "sss", but if the pattern is parsed to create a node
2627 * that would match just the \xDF, it won't be able to handle the case where a
2628 * successful match would have to cross the node's boundary. The new approach
2629 * that hopefully generally solves the problem generates an EXACTFU_SS node
2632 * There are a number of components to the approach (a lot of work for just
2633 * three code points!):
2634 * 1) This routine examines each EXACTFish node that could contain the
2635 * problematic sequences. It returns in *min_subtract how much to
2636 * subtract from the the actual length of the string to get a real minimum
2637 * for one that could match it. This number is usually 0 except for the
2638 * problematic sequences. This delta is used by the caller to adjust the
2639 * min length of the match, and the delta between min and max, so that the
2640 * optimizer doesn't reject these possibilities based on size constraints.
2641 * 2) These sequences require special handling by the trie code, so this code
2642 * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
2644 * 3) This is sufficient for the two Greek sequences (described below), but
2645 * the one involving the Sharp s (\xDF) needs more. The node type
2646 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2647 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2648 * case where there is a possible fold length change. That means that a
2649 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2650 * itself with length changes, and so can be processed faster. regexec.c
2651 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2652 * is pre-folded by regcomp.c. This saves effort in regex matching.
2653 * However, the pre-folding isn't done for non-UTF8 patterns because the
2654 * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL
2655 * nodes fold to isn't known until runtime. The fold possibilities for
2656 * the non-UTF8 patterns are quite simple, except for the sharp s. All
2657 * the ones that don't involve a UTF-8 target string are members of a
2658 * fold-pair, and arrays are set up for all of them so that the other
2659 * member of the pair can be found quickly. Code elsewhere in this file
2660 * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
2661 * if the pattern isn't UTF-8. This avoids the issues described in the
2663 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2664 * 'ss' or not is not knowable at compile time. It will match iff the
2665 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2666 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2667 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2668 * described in item 3). An assumption that the optimizer part of
2669 * regexec.c (probably unwittingly) makes is that a character in the
2670 * pattern corresponds to at most a single character in the target string.
2671 * (And I do mean character, and not byte here, unlike other parts of the
2672 * documentation that have never been updated to account for multibyte
2673 * Unicode.) This assumption is wrong only in this case, as all other
2674 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2675 * virtue of having this file pre-fold UTF-8 patterns. I'm
2676 * reluctant to try to change this assumption, so instead the code punts.
2677 * This routine examines EXACTF nodes for the sharp s, and returns a
2678 * boolean indicating whether or not the node is an EXACTF node that
2679 * contains a sharp s. When it is true, the caller sets a flag that later
2680 * causes the optimizer in this file to not set values for the floating
2681 * and fixed string lengths, and thus avoids the optimizer code in
2682 * regexec.c that makes the invalid assumption. Thus, there is no
2683 * optimization based on string lengths for EXACTF nodes that contain the
2684 * sharp s. This only happens for /id rules (which means the pattern
2688 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2689 if (PL_regkind[OP(scan)] == EXACT) \
2690 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2693 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2694 /* Merge several consecutive EXACTish nodes into one. */
2695 regnode *n = regnext(scan);
2697 regnode *next = scan + NODE_SZ_STR(scan);
2701 regnode *stop = scan;
2702 GET_RE_DEBUG_FLAGS_DECL;
2704 PERL_UNUSED_ARG(depth);
2707 PERL_ARGS_ASSERT_JOIN_EXACT;
2708 #ifndef EXPERIMENTAL_INPLACESCAN
2709 PERL_UNUSED_ARG(flags);
2710 PERL_UNUSED_ARG(val);
2712 DEBUG_PEEP("join",scan,depth);
2714 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2715 * EXACT ones that are mergeable to the current one. */
2717 && (PL_regkind[OP(n)] == NOTHING
2718 || (stringok && OP(n) == OP(scan)))
2720 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2723 if (OP(n) == TAIL || n > next)
2725 if (PL_regkind[OP(n)] == NOTHING) {
2726 DEBUG_PEEP("skip:",n,depth);
2727 NEXT_OFF(scan) += NEXT_OFF(n);
2728 next = n + NODE_STEP_REGNODE;
2735 else if (stringok) {
2736 const unsigned int oldl = STR_LEN(scan);
2737 regnode * const nnext = regnext(n);
2739 /* XXX I (khw) kind of doubt that this works on platforms where
2740 * U8_MAX is above 255 because of lots of other assumptions */
2741 if (oldl + STR_LEN(n) > U8_MAX)
2744 DEBUG_PEEP("merg",n,depth);
2747 NEXT_OFF(scan) += NEXT_OFF(n);
2748 STR_LEN(scan) += STR_LEN(n);
2749 next = n + NODE_SZ_STR(n);
2750 /* Now we can overwrite *n : */
2751 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2759 #ifdef EXPERIMENTAL_INPLACESCAN
2760 if (flags && !NEXT_OFF(n)) {
2761 DEBUG_PEEP("atch", val, depth);
2762 if (reg_off_by_arg[OP(n)]) {
2763 ARG_SET(n, val - n);
2766 NEXT_OFF(n) = val - n;
2774 *has_exactf_sharp_s = FALSE;
2776 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2777 * can now analyze for sequences of problematic code points. (Prior to
2778 * this final joining, sequences could have been split over boundaries, and
2779 * hence missed). The sequences only happen in folding, hence for any
2780 * non-EXACT EXACTish node */
2781 if (OP(scan) != EXACT) {
2783 U8 * s0 = (U8*) STRING(scan);
2784 U8 * const s_end = s0 + STR_LEN(scan);
2786 /* The below is perhaps overboard, but this allows us to save a test
2787 * each time through the loop at the expense of a mask. This is
2788 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2789 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2790 * This uses an exclusive 'or' to find that bit and then inverts it to
2791 * form a mask, with just a single 0, in the bit position where 'S' and
2793 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2794 const U8 s_masked = 's' & S_or_s_mask;
2796 /* One pass is made over the node's string looking for all the
2797 * possibilities. to avoid some tests in the loop, there are two main
2798 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2802 /* There are two problematic Greek code points in Unicode
2805 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2806 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2812 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2813 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2815 * This means that in case-insensitive matching (or "loose
2816 * matching", as Unicode calls it), an EXACTF of length six (the
2817 * UTF-8 encoded byte length of the above casefolded versions) can
2818 * match a target string of length two (the byte length of UTF-8
2819 * encoded U+0390 or U+03B0). This would rather mess up the
2820 * minimum length computation. (there are other code points that
2821 * also fold to these two sequences, but the delta is smaller)
2823 * If these sequences are found, the minimum length is decreased by
2824 * four (six minus two).
2826 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2827 * LETTER SHARP S. We decrease the min length by 1 for each
2828 * occurrence of 'ss' found */
2830 #define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE
2831 #define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE
2832 const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL
2833 COMBINING_DIAERESIS_UTF8
2834 COMBINING_ACUTE_ACCENT_UTF8;
2835 const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL
2836 COMBINING_DIAERESIS_UTF8
2837 COMBINING_ACUTE_ACCENT_UTF8;
2838 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2839 yields a net of 0 */
2840 /* Examine the string for one of the problematic sequences */
2842 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2843 * sequence we are looking for is 2 */
2847 /* Look for the first byte in each problematic sequence */
2849 /* We don't have to worry about other things that fold to
2850 * 's' (such as the long s, U+017F), as all above-latin1
2851 * code points have been pre-folded */
2855 /* Current character is an 's' or 'S'. If next one is
2856 * as well, we have the dreaded sequence */
2857 if (((*(s+1) & S_or_s_mask) == s_masked)
2858 /* These two node types don't have special handling
2860 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2863 OP(scan) = EXACTFU_SS;
2864 s++; /* No need to look at this character again */
2868 case U390_FIRST_BYTE:
2869 if (s_end - s >= len
2871 /* The 1's are because are skipping comparing the
2873 && memEQ(s + 1, U390_tail, len - 1))
2875 goto greek_sequence;
2879 case U3B0_FIRST_BYTE:
2880 if (! (s_end - s >= len
2881 && memEQ(s + 1, U3B0_tail, len - 1)))
2888 /* This requires special handling by trie's, so change
2889 * the node type to indicate this. If EXACTFA and
2890 * EXACTFL were ever to be handled by trie's, this
2891 * would have to be changed. If this node has already
2892 * been changed to EXACTFU_SS in this loop, leave it as
2893 * is. (I (khw) think it doesn't matter in regexec.c
2894 * for UTF patterns, but no need to change it */
2895 if (OP(scan) == EXACTFU) {
2896 OP(scan) = EXACTFU_TRICKYFOLD;
2898 s += 6; /* We already know what this sequence is. Skip
2904 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2906 /* Here, the pattern is not UTF-8. We need to look only for the
2907 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2908 * in the final position. Otherwise we can stop looking 1 byte
2909 * earlier because have to find both the first and second 's' */
2910 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2912 for (s = s0; s < upper; s++) {
2917 && ((*(s+1) & S_or_s_mask) == s_masked))
2921 /* EXACTF nodes need to know that the minimum
2922 * length changed so that a sharp s in the string
2923 * can match this ss in the pattern, but they
2924 * remain EXACTF nodes, as they won't match this
2925 * unless the target string is is UTF-8, which we
2926 * don't know until runtime */
2927 if (OP(scan) != EXACTF) {
2928 OP(scan) = EXACTFU_SS;
2933 case LATIN_SMALL_LETTER_SHARP_S:
2934 if (OP(scan) == EXACTF) {
2935 *has_exactf_sharp_s = TRUE;
2944 /* Allow dumping but overwriting the collection of skipped
2945 * ops and/or strings with fake optimized ops */
2946 n = scan + NODE_SZ_STR(scan);
2954 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2958 /* REx optimizer. Converts nodes into quicker variants "in place".
2959 Finds fixed substrings. */
2961 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2962 to the position after last scanned or to NULL. */
2964 #define INIT_AND_WITHP \
2965 assert(!and_withp); \
2966 Newx(and_withp,1,struct regnode_charclass_class); \
2967 SAVEFREEPV(and_withp)
2969 /* this is a chain of data about sub patterns we are processing that
2970 need to be handled separately/specially in study_chunk. Its so
2971 we can simulate recursion without losing state. */
2973 typedef struct scan_frame {
2974 regnode *last; /* last node to process in this frame */
2975 regnode *next; /* next node to process when last is reached */
2976 struct scan_frame *prev; /*previous frame*/
2977 I32 stop; /* what stopparen do we use */
2981 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2983 #define CASE_SYNST_FNC(nAmE) \
2985 if (flags & SCF_DO_STCLASS_AND) { \
2986 for (value = 0; value < 256; value++) \
2987 if (!is_ ## nAmE ## _cp(value)) \
2988 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2991 for (value = 0; value < 256; value++) \
2992 if (is_ ## nAmE ## _cp(value)) \
2993 ANYOF_BITMAP_SET(data->start_class, value); \
2997 if (flags & SCF_DO_STCLASS_AND) { \
2998 for (value = 0; value < 256; value++) \
2999 if (is_ ## nAmE ## _cp(value)) \
3000 ANYOF_BITMAP_CLEAR(data->start_class, value); \
3003 for (value = 0; value < 256; value++) \
3004 if (!is_ ## nAmE ## _cp(value)) \
3005 ANYOF_BITMAP_SET(data->start_class, value); \
3012 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3013 I32 *minlenp, I32 *deltap,
3018 struct regnode_charclass_class *and_withp,
3019 U32 flags, U32 depth)
3020 /* scanp: Start here (read-write). */
3021 /* deltap: Write maxlen-minlen here. */
3022 /* last: Stop before this one. */
3023 /* data: string data about the pattern */
3024 /* stopparen: treat close N as END */
3025 /* recursed: which subroutines have we recursed into */
3026 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3029 I32 min = 0, pars = 0, code;
3030 regnode *scan = *scanp, *next;
3032 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3033 int is_inf_internal = 0; /* The studied chunk is infinite */
3034 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3035 scan_data_t data_fake;
3036 SV *re_trie_maxbuff = NULL;
3037 regnode *first_non_open = scan;
3038 I32 stopmin = I32_MAX;
3039 scan_frame *frame = NULL;
3040 GET_RE_DEBUG_FLAGS_DECL;
3042 PERL_ARGS_ASSERT_STUDY_CHUNK;
3045 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3049 while (first_non_open && OP(first_non_open) == OPEN)
3050 first_non_open=regnext(first_non_open);
3055 while ( scan && OP(scan) != END && scan < last ){
3056 UV min_subtract = 0; /* How much to subtract from the minimum node
3057 length to get a real minimum (because the
3058 folded version may be shorter) */
3059 bool has_exactf_sharp_s = FALSE;
3060 /* Peephole optimizer: */
3061 DEBUG_STUDYDATA("Peep:", data,depth);
3062 DEBUG_PEEP("Peep",scan,depth);
3064 /* Its not clear to khw or hv why this is done here, and not in the
3065 * clauses that deal with EXACT nodes. khw's guess is that it's
3066 * because of a previous design */
3067 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3069 /* Follow the next-chain of the current node and optimize
3070 away all the NOTHINGs from it. */
3071 if (OP(scan) != CURLYX) {
3072 const int max = (reg_off_by_arg[OP(scan)]
3074 /* I32 may be smaller than U16 on CRAYs! */
3075 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3076 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3080 /* Skip NOTHING and LONGJMP. */
3081 while ((n = regnext(n))
3082 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3083 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3084 && off + noff < max)
3086 if (reg_off_by_arg[OP(scan)])
3089 NEXT_OFF(scan) = off;
3094 /* The principal pseudo-switch. Cannot be a switch, since we
3095 look into several different things. */
3096 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3097 || OP(scan) == IFTHEN) {
3098 next = regnext(scan);
3100 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3102 if (OP(next) == code || code == IFTHEN) {
3103 /* NOTE - There is similar code to this block below for handling
3104 TRIE nodes on a re-study. If you change stuff here check there
3106 I32 max1 = 0, min1 = I32_MAX, num = 0;
3107 struct regnode_charclass_class accum;
3108 regnode * const startbranch=scan;
3110 if (flags & SCF_DO_SUBSTR)
3111 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3112 if (flags & SCF_DO_STCLASS)
3113 cl_init_zero(pRExC_state, &accum);
3115 while (OP(scan) == code) {
3116 I32 deltanext, minnext, f = 0, fake;
3117 struct regnode_charclass_class this_class;
3120 data_fake.flags = 0;
3122 data_fake.whilem_c = data->whilem_c;
3123 data_fake.last_closep = data->last_closep;
3126 data_fake.last_closep = &fake;
3128 data_fake.pos_delta = delta;
3129 next = regnext(scan);
3130 scan = NEXTOPER(scan);
3132 scan = NEXTOPER(scan);
3133 if (flags & SCF_DO_STCLASS) {
3134 cl_init(pRExC_state, &this_class);
3135 data_fake.start_class = &this_class;
3136 f = SCF_DO_STCLASS_AND;
3138 if (flags & SCF_WHILEM_VISITED_POS)
3139 f |= SCF_WHILEM_VISITED_POS;
3141 /* we suppose the run is continuous, last=next...*/
3142 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3144 stopparen, recursed, NULL, f,depth+1);
3147 if (max1 < minnext + deltanext)
3148 max1 = minnext + deltanext;
3149 if (deltanext == I32_MAX)
3150 is_inf = is_inf_internal = 1;
3152 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3154 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3155 if ( stopmin > minnext)
3156 stopmin = min + min1;
3157 flags &= ~SCF_DO_SUBSTR;
3159 data->flags |= SCF_SEEN_ACCEPT;
3162 if (data_fake.flags & SF_HAS_EVAL)
3163 data->flags |= SF_HAS_EVAL;
3164 data->whilem_c = data_fake.whilem_c;
3166 if (flags & SCF_DO_STCLASS)
3167 cl_or(pRExC_state, &accum, &this_class);
3169 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3171 if (flags & SCF_DO_SUBSTR) {
3172 data->pos_min += min1;
3173 data->pos_delta += max1 - min1;
3174 if (max1 != min1 || is_inf)
3175 data->longest = &(data->longest_float);
3178 delta += max1 - min1;
3179 if (flags & SCF_DO_STCLASS_OR) {
3180 cl_or(pRExC_state, data->start_class, &accum);
3182 cl_and(data->start_class, and_withp);
3183 flags &= ~SCF_DO_STCLASS;
3186 else if (flags & SCF_DO_STCLASS_AND) {
3188 cl_and(data->start_class, &accum);
3189 flags &= ~SCF_DO_STCLASS;
3192 /* Switch to OR mode: cache the old value of
3193 * data->start_class */
3195 StructCopy(data->start_class, and_withp,
3196 struct regnode_charclass_class);
3197 flags &= ~SCF_DO_STCLASS_AND;
3198 StructCopy(&accum, data->start_class,
3199 struct regnode_charclass_class);
3200 flags |= SCF_DO_STCLASS_OR;
3201 data->start_class->flags |= ANYOF_EOS;
3205 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3208 Assuming this was/is a branch we are dealing with: 'scan' now
3209 points at the item that follows the branch sequence, whatever
3210 it is. We now start at the beginning of the sequence and look
3217 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3219 If we can find such a subsequence we need to turn the first
3220 element into a trie and then add the subsequent branch exact
3221 strings to the trie.
3225 1. patterns where the whole set of branches can be converted.
3227 2. patterns where only a subset can be converted.
3229 In case 1 we can replace the whole set with a single regop
3230 for the trie. In case 2 we need to keep the start and end
3233 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3234 becomes BRANCH TRIE; BRANCH X;
3236 There is an additional case, that being where there is a
3237 common prefix, which gets split out into an EXACT like node
3238 preceding the TRIE node.
3240 If x(1..n)==tail then we can do a simple trie, if not we make
3241 a "jump" trie, such that when we match the appropriate word
3242 we "jump" to the appropriate tail node. Essentially we turn
3243 a nested if into a case structure of sorts.
3248 if (!re_trie_maxbuff) {
3249 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3250 if (!SvIOK(re_trie_maxbuff))
3251 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3253 if ( SvIV(re_trie_maxbuff)>=0 ) {
3255 regnode *first = (regnode *)NULL;
3256 regnode *last = (regnode *)NULL;
3257 regnode *tail = scan;
3262 SV * const mysv = sv_newmortal(); /* for dumping */
3264 /* var tail is used because there may be a TAIL
3265 regop in the way. Ie, the exacts will point to the
3266 thing following the TAIL, but the last branch will
3267 point at the TAIL. So we advance tail. If we
3268 have nested (?:) we may have to move through several
3272 while ( OP( tail ) == TAIL ) {
3273 /* this is the TAIL generated by (?:) */
3274 tail = regnext( tail );
3278 DEBUG_TRIE_COMPILE_r({
3279 regprop(RExC_rx, mysv, tail );
3280 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3281 (int)depth * 2 + 2, "",
3282 "Looking for TRIE'able sequences. Tail node is: ",
3283 SvPV_nolen_const( mysv )
3289 Step through the branches
3290 cur represents each branch,
3291 noper is the first thing to be matched as part of that branch
3292 noper_next is the regnext() of that node.
3294 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3295 via a "jump trie" but we also support building with NOJUMPTRIE,
3296 which restricts the trie logic to structures like /FOO|BAR/.
3298 If noper is a trieable nodetype then the branch is a possible optimization
3299 target. If we are building under NOJUMPTRIE then we require that noper_next
3300 is the same as scan (our current position in the regex program).
3302 Once we have two or more consecutive such branches we can create a
3303 trie of the EXACT's contents and stitch it in place into the program.
3305 If the sequence represents all of the branches in the alternation we
3306 replace the entire thing with a single TRIE node.
3308 Otherwise when it is a subsequence we need to stitch it in place and
3309 replace only the relevant branches. This means the first branch has
3310 to remain as it is used by the alternation logic, and its next pointer,
3311 and needs to be repointed at the item on the branch chain following
3312 the last branch we have optimized away.
3314 This could be either a BRANCH, in which case the subsequence is internal,
3315 or it could be the item following the branch sequence in which case the
3316 subsequence is at the end (which does not necessarily mean the first node
3317 is the start of the alternation).
3319 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3322 ----------------+-----------
3326 EXACTFU_SS | EXACTFU
3327 EXACTFU_TRICKYFOLD | EXACTFU
3332 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3333 ( EXACT == (X) ) ? EXACT : \
3334 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3337 /* dont use tail as the end marker for this traverse */
3338 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3339 regnode * const noper = NEXTOPER( cur );
3340 U8 noper_type = OP( noper );
3341 U8 noper_trietype = TRIE_TYPE( noper_type );
3342 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3343 regnode * const noper_next = regnext( noper );
3344 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3345 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3348 DEBUG_TRIE_COMPILE_r({
3349 regprop(RExC_rx, mysv, cur);
3350 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3351 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3353 regprop(RExC_rx, mysv, noper);
3354 PerlIO_printf( Perl_debug_log, " -> %s",
3355 SvPV_nolen_const(mysv));
3358 regprop(RExC_rx, mysv, noper_next );
3359 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3360 SvPV_nolen_const(mysv));
3362 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3363 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3364 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3368 /* Is noper a trieable nodetype that can be merged with the
3369 * current trie (if there is one)? */
3373 ( noper_trietype == NOTHING)
3374 || ( trietype == NOTHING )
3375 || ( trietype == noper_trietype )
3378 && noper_next == tail
3382 /* Handle mergable triable node
3383 * Either we are the first node in a new trieable sequence,
3384 * in which case we do some bookkeeping, otherwise we update
3385 * the end pointer. */
3388 if ( noper_trietype == NOTHING ) {
3389 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3390 regnode * const noper_next = regnext( noper );
3391 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3392 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3395 if ( noper_next_trietype ) {
3396 trietype = noper_next_trietype;
3397 } else if (noper_next_type) {
3398 /* a NOTHING regop is 1 regop wide. We need at least two
3399 * for a trie so we can't merge this in */
3403 trietype = noper_trietype;
3406 if ( trietype == NOTHING )
3407 trietype = noper_trietype;
3412 } /* end handle mergable triable node */
3414 /* handle unmergable node -
3415 * noper may either be a triable node which can not be tried
3416 * together with the current trie, or a non triable node */
3418 /* If last is set and trietype is not NOTHING then we have found
3419 * at least two triable branch sequences in a row of a similar
3420 * trietype so we can turn them into a trie. If/when we
3421 * allow NOTHING to start a trie sequence this condition will be
3422 * required, and it isn't expensive so we leave it in for now. */
3423 if ( trietype != NOTHING )
3424 make_trie( pRExC_state,
3425 startbranch, first, cur, tail, count,
3426 trietype, depth+1 );
3427 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3431 && noper_next == tail
3434 /* noper is triable, so we can start a new trie sequence */
3437 trietype = noper_trietype;
3439 /* if we already saw a first but the current node is not triable then we have
3440 * to reset the first information. */
3445 } /* end handle unmergable node */
3446 } /* loop over branches */
3447 DEBUG_TRIE_COMPILE_r({
3448 regprop(RExC_rx, mysv, cur);
3449 PerlIO_printf( Perl_debug_log,
3450 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3451 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3455 if ( trietype != NOTHING ) {
3456 /* the last branch of the sequence was part of a trie,
3457 * so we have to construct it here outside of the loop
3459 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3460 #ifdef TRIE_STUDY_OPT
3461 if ( ((made == MADE_EXACT_TRIE &&
3462 startbranch == first)
3463 || ( first_non_open == first )) &&
3465 flags |= SCF_TRIE_RESTUDY;
3466 if ( startbranch == first
3469 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3474 /* at this point we know whatever we have is a NOTHING sequence/branch
3475 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3477 if ( startbranch == first ) {
3479 /* the entire thing is a NOTHING sequence, something like this:
3480 * (?:|) So we can turn it into a plain NOTHING op. */
3481 DEBUG_TRIE_COMPILE_r({
3482 regprop(RExC_rx, mysv, cur);
3483 PerlIO_printf( Perl_debug_log,
3484 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3485 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3488 OP(startbranch)= NOTHING;
3489 NEXT_OFF(startbranch)= tail - startbranch;
3490 for ( opt= startbranch + 1; opt < tail ; opt++ )
3494 } /* end if ( last) */
3495 } /* TRIE_MAXBUF is non zero */
3500 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3501 scan = NEXTOPER(NEXTOPER(scan));
3502 } else /* single branch is optimized. */
3503 scan = NEXTOPER(scan);
3505 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3506 scan_frame *newframe = NULL;
3511 if (OP(scan) != SUSPEND) {
3512 /* set the pointer */
3513 if (OP(scan) == GOSUB) {
3515 RExC_recurse[ARG2L(scan)] = scan;
3516 start = RExC_open_parens[paren-1];
3517 end = RExC_close_parens[paren-1];
3520 start = RExC_rxi->program + 1;
3524 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3525 SAVEFREEPV(recursed);
3527 if (!PAREN_TEST(recursed,paren+1)) {
3528 PAREN_SET(recursed,paren+1);
3529 Newx(newframe,1,scan_frame);
3531 if (flags & SCF_DO_SUBSTR) {
3532 SCAN_COMMIT(pRExC_state,data,minlenp);
3533 data->longest = &(data->longest_float);
3535 is_inf = is_inf_internal = 1;
3536 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3537 cl_anything(pRExC_state, data->start_class);
3538 flags &= ~SCF_DO_STCLASS;
3541 Newx(newframe,1,scan_frame);
3544 end = regnext(scan);
3549 SAVEFREEPV(newframe);
3550 newframe->next = regnext(scan);
3551 newframe->last = last;
3552 newframe->stop = stopparen;
3553 newframe->prev = frame;
3563 else if (OP(scan) == EXACT) {
3564 I32 l = STR_LEN(scan);
3567 const U8 * const s = (U8*)STRING(scan);
3568 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3569 l = utf8_length(s, s + l);
3571 uc = *((U8*)STRING(scan));
3574 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3575 /* The code below prefers earlier match for fixed
3576 offset, later match for variable offset. */
3577 if (data->last_end == -1) { /* Update the start info. */
3578 data->last_start_min = data->pos_min;
3579 data->last_start_max = is_inf
3580 ? I32_MAX : data->pos_min + data->pos_delta;
3582 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3584 SvUTF8_on(data->last_found);
3586 SV * const sv = data->last_found;
3587 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3588 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3589 if (mg && mg->mg_len >= 0)
3590 mg->mg_len += utf8_length((U8*)STRING(scan),
3591 (U8*)STRING(scan)+STR_LEN(scan));
3593 data->last_end = data->pos_min + l;
3594 data->pos_min += l; /* As in the first entry. */
3595 data->flags &= ~SF_BEFORE_EOL;
3597 if (flags & SCF_DO_STCLASS_AND) {
3598 /* Check whether it is compatible with what we know already! */
3602 /* If compatible, we or it in below. It is compatible if is
3603 * in the bitmp and either 1) its bit or its fold is set, or 2)
3604 * it's for a locale. Even if there isn't unicode semantics
3605 * here, at runtime there may be because of matching against a
3606 * utf8 string, so accept a possible false positive for
3607 * latin1-range folds */
3609 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3610 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3611 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3612 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3617 ANYOF_CLASS_ZERO(data->start_class);
3618 ANYOF_BITMAP_ZERO(data->start_class);
3620 ANYOF_BITMAP_SET(data->start_class, uc);
3621 else if (uc >= 0x100) {
3624 /* Some Unicode code points fold to the Latin1 range; as
3625 * XXX temporary code, instead of figuring out if this is
3626 * one, just assume it is and set all the start class bits
3627 * that could be some such above 255 code point's fold
3628 * which will generate fals positives. As the code
3629 * elsewhere that does compute the fold settles down, it
3630 * can be extracted out and re-used here */
3631 for (i = 0; i < 256; i++){
3632 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3633 ANYOF_BITMAP_SET(data->start_class, i);
3637 data->start_class->flags &= ~ANYOF_EOS;
3639 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3641 else if (flags & SCF_DO_STCLASS_OR) {
3642 /* false positive possible if the class is case-folded */
3644 ANYOF_BITMAP_SET(data->start_class, uc);
3646 data->start_class->flags |= ANYOF_UNICODE_ALL;
3647 data->start_class->flags &= ~ANYOF_EOS;
3648 cl_and(data->start_class, and_withp);
3650 flags &= ~SCF_DO_STCLASS;
3652 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3653 I32 l = STR_LEN(scan);
3654 UV uc = *((U8*)STRING(scan));
3656 /* Search for fixed substrings supports EXACT only. */
3657 if (flags & SCF_DO_SUBSTR) {
3659 SCAN_COMMIT(pRExC_state, data, minlenp);
3662 const U8 * const s = (U8 *)STRING(scan);
3663 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3664 l = utf8_length(s, s + l);
3666 if (has_exactf_sharp_s) {
3667 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3669 min += l - min_subtract;
3673 delta += min_subtract;
3674 if (flags & SCF_DO_SUBSTR) {
3675 data->pos_min += l - min_subtract;
3676 if (data->pos_min < 0) {
3679 data->pos_delta += min_subtract;
3681 data->longest = &(data->longest_float);
3684 if (flags & SCF_DO_STCLASS_AND) {
3685 /* Check whether it is compatible with what we know already! */
3688 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3689 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3690 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3694 ANYOF_CLASS_ZERO(data->start_class);
3695 ANYOF_BITMAP_ZERO(data->start_class);
3697 ANYOF_BITMAP_SET(data->start_class, uc);
3698 data->start_class->flags &= ~ANYOF_EOS;
3699 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3700 if (OP(scan) == EXACTFL) {
3701 /* XXX This set is probably no longer necessary, and
3702 * probably wrong as LOCALE now is on in the initial
3704 data->start_class->flags |= ANYOF_LOCALE;
3708 /* Also set the other member of the fold pair. In case
3709 * that unicode semantics is called for at runtime, use
3710 * the full latin1 fold. (Can't do this for locale,
3711 * because not known until runtime) */
3712 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3714 /* All other (EXACTFL handled above) folds except under
3715 * /iaa that include s, S, and sharp_s also may include
3717 if (OP(scan) != EXACTFA) {
3718 if (uc == 's' || uc == 'S') {
3719 ANYOF_BITMAP_SET(data->start_class,
3720 LATIN_SMALL_LETTER_SHARP_S);
3722 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3723 ANYOF_BITMAP_SET(data->start_class, 's');
3724 ANYOF_BITMAP_SET(data->start_class, 'S');
3729 else if (uc >= 0x100) {
3731 for (i = 0; i < 256; i++){
3732 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3733 ANYOF_BITMAP_SET(data->start_class, i);
3738 else if (flags & SCF_DO_STCLASS_OR) {
3739 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3740 /* false positive possible if the class is case-folded.
3741 Assume that the locale settings are the same... */
3743 ANYOF_BITMAP_SET(data->start_class, uc);
3744 if (OP(scan) != EXACTFL) {
3746 /* And set the other member of the fold pair, but
3747 * can't do that in locale because not known until
3749 ANYOF_BITMAP_SET(data->start_class,
3750 PL_fold_latin1[uc]);
3752 /* All folds except under /iaa that include s, S,
3753 * and sharp_s also may include the others */
3754 if (OP(scan) != EXACTFA) {
3755 if (uc == 's' || uc == 'S') {
3756 ANYOF_BITMAP_SET(data->start_class,
3757 LATIN_SMALL_LETTER_SHARP_S);
3759 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3760 ANYOF_BITMAP_SET(data->start_class, 's');
3761 ANYOF_BITMAP_SET(data->start_class, 'S');
3766 data->start_class->flags &= ~ANYOF_EOS;
3768 cl_and(data->start_class, and_withp);
3770 flags &= ~SCF_DO_STCLASS;
3772 else if (REGNODE_VARIES(OP(scan))) {
3773 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3774 I32 f = flags, pos_before = 0;
3775 regnode * const oscan = scan;
3776 struct regnode_charclass_class this_class;
3777 struct regnode_charclass_class *oclass = NULL;
3778 I32 next_is_eval = 0;
3780 switch (PL_regkind[OP(scan)]) {
3781 case WHILEM: /* End of (?:...)* . */
3782 scan = NEXTOPER(scan);
3785 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3786 next = NEXTOPER(scan);
3787 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3789 maxcount = REG_INFTY;
3790 next = regnext(scan);
3791 scan = NEXTOPER(scan);
3795 if (flags & SCF_DO_SUBSTR)
3800 if (flags & SCF_DO_STCLASS) {
3802 maxcount = REG_INFTY;
3803 next = regnext(scan);
3804 scan = NEXTOPER(scan);
3807 is_inf = is_inf_internal = 1;
3808 scan = regnext(scan);
3809 if (flags & SCF_DO_SUBSTR) {
3810 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3811 data->longest = &(data->longest_float);
3813 goto optimize_curly_tail;
3815 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3816 && (scan->flags == stopparen))
3821 mincount = ARG1(scan);
3822 maxcount = ARG2(scan);
3824 next = regnext(scan);
3825 if (OP(scan) == CURLYX) {
3826 I32 lp = (data ? *(data->last_closep) : 0);
3827 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3829 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3830 next_is_eval = (OP(scan) == EVAL);
3832 if (flags & SCF_DO_SUBSTR) {
3833 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3834 pos_before = data->pos_min;
3838 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3840 data->flags |= SF_IS_INF;
3842 if (flags & SCF_DO_STCLASS) {
3843 cl_init(pRExC_state, &this_class);
3844 oclass = data->start_class;
3845 data->start_class = &this_class;
3846 f |= SCF_DO_STCLASS_AND;
3847 f &= ~SCF_DO_STCLASS_OR;
3849 /* Exclude from super-linear cache processing any {n,m}
3850 regops for which the combination of input pos and regex
3851 pos is not enough information to determine if a match
3854 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3855 regex pos at the \s*, the prospects for a match depend not
3856 only on the input position but also on how many (bar\s*)
3857 repeats into the {4,8} we are. */
3858 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3859 f &= ~SCF_WHILEM_VISITED_POS;
3861 /* This will finish on WHILEM, setting scan, or on NULL: */
3862 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3863 last, data, stopparen, recursed, NULL,
3865 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3867 if (flags & SCF_DO_STCLASS)
3868 data->start_class = oclass;
3869 if (mincount == 0 || minnext == 0) {
3870 if (flags & SCF_DO_STCLASS_OR) {
3871 cl_or(pRExC_state, data->start_class, &this_class);
3873 else if (flags & SCF_DO_STCLASS_AND) {
3874 /* Switch to OR mode: cache the old value of
3875 * data->start_class */
3877 StructCopy(data->start_class, and_withp,
3878 struct regnode_charclass_class);
3879 flags &= ~SCF_DO_STCLASS_AND;
3880 StructCopy(&this_class, data->start_class,
3881 struct regnode_charclass_class);
3882 flags |= SCF_DO_STCLASS_OR;
3883 data->start_class->flags |= ANYOF_EOS;
3885 } else { /* Non-zero len */
3886 if (flags & SCF_DO_STCLASS_OR) {
3887 cl_or(pRExC_state, data->start_class, &this_class);
3888 cl_and(data->start_class, and_withp);
3890 else if (flags & SCF_DO_STCLASS_AND)
3891 cl_and(data->start_class, &this_class);
3892 flags &= ~SCF_DO_STCLASS;
3894 if (!scan) /* It was not CURLYX, but CURLY. */
3896 if ( /* ? quantifier ok, except for (?{ ... }) */
3897 (next_is_eval || !(mincount == 0 && maxcount == 1))
3898 && (minnext == 0) && (deltanext == 0)
3899 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3900 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3902 ckWARNreg(RExC_parse,
3903 "Quantifier unexpected on zero-length expression");
3906 min += minnext * mincount;
3907 is_inf_internal |= ((maxcount == REG_INFTY
3908 && (minnext + deltanext) > 0)
3909 || deltanext == I32_MAX);
3910 is_inf |= is_inf_internal;
3911 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3913 /* Try powerful optimization CURLYX => CURLYN. */
3914 if ( OP(oscan) == CURLYX && data
3915 && data->flags & SF_IN_PAR
3916 && !(data->flags & SF_HAS_EVAL)
3917 && !deltanext && minnext == 1 ) {
3918 /* Try to optimize to CURLYN. */
3919 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3920 regnode * const nxt1 = nxt;
3927 if (!REGNODE_SIMPLE(OP(nxt))
3928 && !(PL_regkind[OP(nxt)] == EXACT
3929 && STR_LEN(nxt) == 1))
3935 if (OP(nxt) != CLOSE)
3937 if (RExC_open_parens) {
3938 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3939 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3941 /* Now we know that nxt2 is the only contents: */
3942 oscan->flags = (U8)ARG(nxt);
3944 OP(nxt1) = NOTHING; /* was OPEN. */
3947 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3948 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3949 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3950 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3951 OP(nxt + 1) = OPTIMIZED; /* was count. */
3952 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3957 /* Try optimization CURLYX => CURLYM. */
3958 if ( OP(oscan) == CURLYX && data
3959 && !(data->flags & SF_HAS_PAR)
3960 && !(data->flags & SF_HAS_EVAL)
3961 && !deltanext /* atom is fixed width */
3962 && minnext != 0 /* CURLYM can't handle zero width */
3963 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3965 /* XXXX How to optimize if data == 0? */
3966 /* Optimize to a simpler form. */
3967 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3971 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3972 && (OP(nxt2) != WHILEM))
3974 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3975 /* Need to optimize away parenths. */
3976 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3977 /* Set the parenth number. */
3978 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3980 oscan->flags = (U8)ARG(nxt);
3981 if (RExC_open_parens) {
3982 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3983 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3985 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3986 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3989 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3990 OP(nxt + 1) = OPTIMIZED; /* was count. */
3991 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3992 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3995 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3996 regnode *nnxt = regnext(nxt1);
3998 if (reg_off_by_arg[OP(nxt1)])
3999 ARG_SET(nxt1, nxt2 - nxt1);
4000 else if (nxt2 - nxt1 < U16_MAX)
4001 NEXT_OFF(nxt1) = nxt2 - nxt1;
4003 OP(nxt) = NOTHING; /* Cannot beautify */
4008 /* Optimize again: */
4009 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4010 NULL, stopparen, recursed, NULL, 0,depth+1);
4015 else if ((OP(oscan) == CURLYX)
4016 && (flags & SCF_WHILEM_VISITED_POS)
4017 /* See the comment on a similar expression above.
4018 However, this time it's not a subexpression
4019 we care about, but the expression itself. */
4020 && (maxcount == REG_INFTY)
4021 && data && ++data->whilem_c < 16) {
4022 /* This stays as CURLYX, we can put the count/of pair. */
4023 /* Find WHILEM (as in regexec.c) */
4024 regnode *nxt = oscan + NEXT_OFF(oscan);
4026 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4028 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4029 | (RExC_whilem_seen << 4)); /* On WHILEM */
4031 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4033 if (flags & SCF_DO_SUBSTR) {
4034 SV *last_str = NULL;
4035 int counted = mincount != 0;
4037 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4038 #if defined(SPARC64_GCC_WORKAROUND)
4041 const char *s = NULL;
4044 if (pos_before >= data->last_start_min)
4047 b = data->last_start_min;
4050 s = SvPV_const(data->last_found, l);
4051 old = b - data->last_start_min;
4054 I32 b = pos_before >= data->last_start_min
4055 ? pos_before : data->last_start_min;
4057 const char * const s = SvPV_const(data->last_found, l);
4058 I32 old = b - data->last_start_min;
4062 old = utf8_hop((U8*)s, old) - (U8*)s;
4064 /* Get the added string: */
4065 last_str = newSVpvn_utf8(s + old, l, UTF);
4066 if (deltanext == 0 && pos_before == b) {
4067 /* What was added is a constant string */
4069 SvGROW(last_str, (mincount * l) + 1);
4070 repeatcpy(SvPVX(last_str) + l,
4071 SvPVX_const(last_str), l, mincount - 1);
4072 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4073 /* Add additional parts. */
4074 SvCUR_set(data->last_found,
4075 SvCUR(data->last_found) - l);
4076 sv_catsv(data->last_found, last_str);
4078 SV * sv = data->last_found;
4080 SvUTF8(sv) && SvMAGICAL(sv) ?
4081 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4082 if (mg && mg->mg_len >= 0)
4083 mg->mg_len += CHR_SVLEN(last_str) - l;
4085 data->last_end += l * (mincount - 1);
4088 /* start offset must point into the last copy */
4089 data->last_start_min += minnext * (mincount - 1);
4090 data->last_start_max += is_inf ? I32_MAX
4091 : (maxcount - 1) * (minnext + data->pos_delta);
4094 /* It is counted once already... */
4095 data->pos_min += minnext * (mincount - counted);
4096 data->pos_delta += - counted * deltanext +
4097 (minnext + deltanext) * maxcount - minnext * mincount;
4098 if (mincount != maxcount) {
4099 /* Cannot extend fixed substrings found inside
4101 SCAN_COMMIT(pRExC_state,data,minlenp);
4102 if (mincount && last_str) {
4103 SV * const sv = data->last_found;
4104 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4105 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4109 sv_setsv(sv, last_str);
4110 data->last_end = data->pos_min;
4111 data->last_start_min =
4112 data->pos_min - CHR_SVLEN(last_str);
4113 data->last_start_max = is_inf
4115 : data->pos_min + data->pos_delta
4116 - CHR_SVLEN(last_str);
4118 data->longest = &(data->longest_float);
4120 SvREFCNT_dec(last_str);
4122 if (data && (fl & SF_HAS_EVAL))
4123 data->flags |= SF_HAS_EVAL;
4124 optimize_curly_tail:
4125 if (OP(oscan) != CURLYX) {
4126 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4128 NEXT_OFF(oscan) += NEXT_OFF(next);
4131 default: /* REF, ANYOFV, and CLUMP only? */
4132 if (flags & SCF_DO_SUBSTR) {
4133 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4134 data->longest = &(data->longest_float);
4136 is_inf = is_inf_internal = 1;
4137 if (flags & SCF_DO_STCLASS_OR)
4138 cl_anything(pRExC_state, data->start_class);
4139 flags &= ~SCF_DO_STCLASS;
4143 else if (OP(scan) == LNBREAK) {
4144 if (flags & SCF_DO_STCLASS) {
4146 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4147 if (flags & SCF_DO_STCLASS_AND) {
4148 for (value = 0; value < 256; value++)
4149 if (!is_VERTWS_cp(value))
4150 ANYOF_BITMAP_CLEAR(data->start_class, value);
4153 for (value = 0; value < 256; value++)
4154 if (is_VERTWS_cp(value))
4155 ANYOF_BITMAP_SET(data->start_class, value);
4157 if (flags & SCF_DO_STCLASS_OR)
4158 cl_and(data->start_class, and_withp);
4159 flags &= ~SCF_DO_STCLASS;
4163 if (flags & SCF_DO_SUBSTR) {
4164 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4166 data->pos_delta += 1;
4167 data->longest = &(data->longest_float);
4170 else if (REGNODE_SIMPLE(OP(scan))) {
4173 if (flags & SCF_DO_SUBSTR) {
4174 SCAN_COMMIT(pRExC_state,data,minlenp);
4178 if (flags & SCF_DO_STCLASS) {
4179 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4181 /* Some of the logic below assumes that switching
4182 locale on will only add false positives. */
4183 switch (PL_regkind[OP(scan)]) {
4187 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4188 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4189 cl_anything(pRExC_state, data->start_class);
4192 if (OP(scan) == SANY)
4194 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4195 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4196 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4197 cl_anything(pRExC_state, data->start_class);
4199 if (flags & SCF_DO_STCLASS_AND || !value)
4200 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4203 if (flags & SCF_DO_STCLASS_AND)
4204 cl_and(data->start_class,
4205 (struct regnode_charclass_class*)scan);
4207 cl_or(pRExC_state, data->start_class,
4208 (struct regnode_charclass_class*)scan);
4211 if (flags & SCF_DO_STCLASS_AND) {
4212 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4213 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4214 if (OP(scan) == ALNUMU) {
4215 for (value = 0; value < 256; value++) {
4216 if (!isWORDCHAR_L1(value)) {
4217 ANYOF_BITMAP_CLEAR(data->start_class, value);
4221 for (value = 0; value < 256; value++) {
4222 if (!isALNUM(value)) {
4223 ANYOF_BITMAP_CLEAR(data->start_class, value);
4230 if (data->start_class->flags & ANYOF_LOCALE)
4231 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4233 /* Even if under locale, set the bits for non-locale
4234 * in case it isn't a true locale-node. This will
4235 * create false positives if it truly is locale */
4236 if (OP(scan) == ALNUMU) {
4237 for (value = 0; value < 256; value++) {
4238 if (isWORDCHAR_L1(value)) {
4239 ANYOF_BITMAP_SET(data->start_class, value);
4243 for (value = 0; value < 256; value++) {
4244 if (isALNUM(value)) {
4245 ANYOF_BITMAP_SET(data->start_class, value);
4252 if (flags & SCF_DO_STCLASS_AND) {
4253 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4254 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4255 if (OP(scan) == NALNUMU) {
4256 for (value = 0; value < 256; value++) {
4257 if (isWORDCHAR_L1(value)) {
4258 ANYOF_BITMAP_CLEAR(data->start_class, value);
4262 for (value = 0; value < 256; value++) {
4263 if (isALNUM(value)) {
4264 ANYOF_BITMAP_CLEAR(data->start_class, value);
4271 if (data->start_class->flags & ANYOF_LOCALE)
4272 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4274 /* Even if under locale, set the bits for non-locale in
4275 * case it isn't a true locale-node. This will create
4276 * false positives if it truly is locale */
4277 if (OP(scan) == NALNUMU) {
4278 for (value = 0; value < 256; value++) {
4279 if (! isWORDCHAR_L1(value)) {
4280 ANYOF_BITMAP_SET(data->start_class, value);
4284 for (value = 0; value < 256; value++) {
4285 if (! isALNUM(value)) {
4286 ANYOF_BITMAP_SET(data->start_class, value);
4293 if (flags & SCF_DO_STCLASS_AND) {
4294 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4295 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4296 if (OP(scan) == SPACEU) {
4297 for (value = 0; value < 256; value++) {
4298 if (!isSPACE_L1(value)) {
4299 ANYOF_BITMAP_CLEAR(data->start_class, value);
4303 for (value = 0; value < 256; value++) {
4304 if (!isSPACE(value)) {
4305 ANYOF_BITMAP_CLEAR(data->start_class, value);
4312 if (data->start_class->flags & ANYOF_LOCALE) {
4313 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4315 if (OP(scan) == SPACEU) {
4316 for (value = 0; value < 256; value++) {
4317 if (isSPACE_L1(value)) {
4318 ANYOF_BITMAP_SET(data->start_class, value);
4322 for (value = 0; value < 256; value++) {
4323 if (isSPACE(value)) {
4324 ANYOF_BITMAP_SET(data->start_class, value);
4331 if (flags & SCF_DO_STCLASS_AND) {
4332 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4334 if (OP(scan) == NSPACEU) {
4335 for (value = 0; value < 256; value++) {
4336 if (isSPACE_L1(value)) {
4337 ANYOF_BITMAP_CLEAR(data->start_class, value);
4341 for (value = 0; value < 256; value++) {
4342 if (isSPACE(value)) {
4343 ANYOF_BITMAP_CLEAR(data->start_class, value);
4350 if (data->start_class->flags & ANYOF_LOCALE)
4351 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4352 if (OP(scan) == NSPACEU) {
4353 for (value = 0; value < 256; value++) {
4354 if (!isSPACE_L1(value)) {
4355 ANYOF_BITMAP_SET(data->start_class, value);
4360 for (value = 0; value < 256; value++) {
4361 if (!isSPACE(value)) {
4362 ANYOF_BITMAP_SET(data->start_class, value);
4369 if (flags & SCF_DO_STCLASS_AND) {
4370 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4371 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4372 for (value = 0; value < 256; value++)
4373 if (!isDIGIT(value))
4374 ANYOF_BITMAP_CLEAR(data->start_class, value);
4378 if (data->start_class->flags & ANYOF_LOCALE)
4379 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4380 for (value = 0; value < 256; value++)
4382 ANYOF_BITMAP_SET(data->start_class, value);
4386 if (flags & SCF_DO_STCLASS_AND) {
4387 if (!(data->start_class->flags & ANYOF_LOCALE))
4388 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4389 for (value = 0; value < 256; value++)
4391 ANYOF_BITMAP_CLEAR(data->start_class, value);
4394 if (data->start_class->flags & ANYOF_LOCALE)
4395 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4396 for (value = 0; value < 256; value++)
4397 if (!isDIGIT(value))
4398 ANYOF_BITMAP_SET(data->start_class, value);
4401 CASE_SYNST_FNC(VERTWS);
4402 CASE_SYNST_FNC(HORIZWS);
4405 if (flags & SCF_DO_STCLASS_OR)
4406 cl_and(data->start_class, and_withp);
4407 flags &= ~SCF_DO_STCLASS;
4410 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4411 data->flags |= (OP(scan) == MEOL
4414 SCAN_COMMIT(pRExC_state, data, minlenp);
4417 else if ( PL_regkind[OP(scan)] == BRANCHJ
4418 /* Lookbehind, or need to calculate parens/evals/stclass: */
4419 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4420 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4421 if ( OP(scan) == UNLESSM &&
4423 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4424 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4427 regnode *upto= regnext(scan);
4429 SV * const mysv_val=sv_newmortal();
4430 DEBUG_STUDYDATA("OPFAIL",data,depth);
4432 /*DEBUG_PARSE_MSG("opfail");*/
4433 regprop(RExC_rx, mysv_val, upto);
4434 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4435 SvPV_nolen_const(mysv_val),
4436 (IV)REG_NODE_NUM(upto),
4441 NEXT_OFF(scan) = upto - scan;
4442 for (opt= scan + 1; opt < upto ; opt++)
4443 OP(opt) = OPTIMIZED;
4447 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4448 || OP(scan) == UNLESSM )
4450 /* Negative Lookahead/lookbehind
4451 In this case we can't do fixed string optimisation.
4454 I32 deltanext, minnext, fake = 0;
4456 struct regnode_charclass_class intrnl;
4459 data_fake.flags = 0;
4461 data_fake.whilem_c = data->whilem_c;
4462 data_fake.last_closep = data->last_closep;
4465 data_fake.last_closep = &fake;
4466 data_fake.pos_delta = delta;
4467 if ( flags & SCF_DO_STCLASS && !scan->flags
4468 && OP(scan) == IFMATCH ) { /* Lookahead */
4469 cl_init(pRExC_state, &intrnl);
4470 data_fake.start_class = &intrnl;
4471 f |= SCF_DO_STCLASS_AND;
4473 if (flags & SCF_WHILEM_VISITED_POS)
4474 f |= SCF_WHILEM_VISITED_POS;
4475 next = regnext(scan);
4476 nscan = NEXTOPER(NEXTOPER(scan));
4477 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4478 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4481 FAIL("Variable length lookbehind not implemented");
4483 else if (minnext > (I32)U8_MAX) {
4484 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4486 scan->flags = (U8)minnext;
4489 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4491 if (data_fake.flags & SF_HAS_EVAL)
4492 data->flags |= SF_HAS_EVAL;
4493 data->whilem_c = data_fake.whilem_c;
4495 if (f & SCF_DO_STCLASS_AND) {
4496 if (flags & SCF_DO_STCLASS_OR) {
4497 /* OR before, AND after: ideally we would recurse with
4498 * data_fake to get the AND applied by study of the
4499 * remainder of the pattern, and then derecurse;
4500 * *** HACK *** for now just treat as "no information".
4501 * See [perl #56690].
4503 cl_init(pRExC_state, data->start_class);
4505 /* AND before and after: combine and continue */
4506 const int was = (data->start_class->flags & ANYOF_EOS);
4508 cl_and(data->start_class, &intrnl);
4510 data->start_class->flags |= ANYOF_EOS;
4514 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4516 /* Positive Lookahead/lookbehind
4517 In this case we can do fixed string optimisation,
4518 but we must be careful about it. Note in the case of
4519 lookbehind the positions will be offset by the minimum
4520 length of the pattern, something we won't know about
4521 until after the recurse.
4523 I32 deltanext, fake = 0;
4525 struct regnode_charclass_class intrnl;
4527 /* We use SAVEFREEPV so that when the full compile
4528 is finished perl will clean up the allocated
4529 minlens when it's all done. This way we don't
4530 have to worry about freeing them when we know
4531 they wont be used, which would be a pain.
4534 Newx( minnextp, 1, I32 );
4535 SAVEFREEPV(minnextp);
4538 StructCopy(data, &data_fake, scan_data_t);
4539 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4542 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4543 data_fake.last_found=newSVsv(data->last_found);
4547 data_fake.last_closep = &fake;
4548 data_fake.flags = 0;
4549 data_fake.pos_delta = delta;
4551 data_fake.flags |= SF_IS_INF;
4552 if ( flags & SCF_DO_STCLASS && !scan->flags
4553 && OP(scan) == IFMATCH ) { /* Lookahead */
4554 cl_init(pRExC_state, &intrnl);
4555 data_fake.start_class = &intrnl;
4556 f |= SCF_DO_STCLASS_AND;
4558 if (flags & SCF_WHILEM_VISITED_POS)
4559 f |= SCF_WHILEM_VISITED_POS;
4560 next = regnext(scan);
4561 nscan = NEXTOPER(NEXTOPER(scan));
4563 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4564 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4567 FAIL("Variable length lookbehind not implemented");
4569 else if (*minnextp > (I32)U8_MAX) {
4570 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4572 scan->flags = (U8)*minnextp;
4577 if (f & SCF_DO_STCLASS_AND) {
4578 const int was = (data->start_class->flags & ANYOF_EOS);
4580 cl_and(data->start_class, &intrnl);
4582 data->start_class->flags |= ANYOF_EOS;
4585 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4587 if (data_fake.flags & SF_HAS_EVAL)
4588 data->flags |= SF_HAS_EVAL;
4589 data->whilem_c = data_fake.whilem_c;
4590 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4591 if (RExC_rx->minlen<*minnextp)
4592 RExC_rx->minlen=*minnextp;
4593 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4594 SvREFCNT_dec(data_fake.last_found);
4596 if ( data_fake.minlen_fixed != minlenp )
4598 data->offset_fixed= data_fake.offset_fixed;
4599 data->minlen_fixed= data_fake.minlen_fixed;
4600 data->lookbehind_fixed+= scan->flags;
4602 if ( data_fake.minlen_float != minlenp )
4604 data->minlen_float= data_fake.minlen_float;
4605 data->offset_float_min=data_fake.offset_float_min;
4606 data->offset_float_max=data_fake.offset_float_max;
4607 data->lookbehind_float+= scan->flags;
4614 else if (OP(scan) == OPEN) {
4615 if (stopparen != (I32)ARG(scan))
4618 else if (OP(scan) == CLOSE) {
4619 if (stopparen == (I32)ARG(scan)) {
4622 if ((I32)ARG(scan) == is_par) {
4623 next = regnext(scan);
4625 if ( next && (OP(next) != WHILEM) && next < last)
4626 is_par = 0; /* Disable optimization */
4629 *(data->last_closep) = ARG(scan);
4631 else if (OP(scan) == EVAL) {
4633 data->flags |= SF_HAS_EVAL;
4635 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4636 if (flags & SCF_DO_SUBSTR) {
4637 SCAN_COMMIT(pRExC_state,data,minlenp);
4638 flags &= ~SCF_DO_SUBSTR;
4640 if (data && OP(scan)==ACCEPT) {
4641 data->flags |= SCF_SEEN_ACCEPT;
4646 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4648 if (flags & SCF_DO_SUBSTR) {
4649 SCAN_COMMIT(pRExC_state,data,minlenp);
4650 data->longest = &(data->longest_float);
4652 is_inf = is_inf_internal = 1;
4653 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4654 cl_anything(pRExC_state, data->start_class);
4655 flags &= ~SCF_DO_STCLASS;
4657 else if (OP(scan) == GPOS) {
4658 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4659 !(delta || is_inf || (data && data->pos_delta)))
4661 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4662 RExC_rx->extflags |= RXf_ANCH_GPOS;
4663 if (RExC_rx->gofs < (U32)min)
4664 RExC_rx->gofs = min;
4666 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4670 #ifdef TRIE_STUDY_OPT
4671 #ifdef FULL_TRIE_STUDY
4672 else if (PL_regkind[OP(scan)] == TRIE) {
4673 /* NOTE - There is similar code to this block above for handling
4674 BRANCH nodes on the initial study. If you change stuff here
4676 regnode *trie_node= scan;
4677 regnode *tail= regnext(scan);
4678 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4679 I32 max1 = 0, min1 = I32_MAX;
4680 struct regnode_charclass_class accum;
4682 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4683 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4684 if (flags & SCF_DO_STCLASS)
4685 cl_init_zero(pRExC_state, &accum);
4691 const regnode *nextbranch= NULL;
4694 for ( word=1 ; word <= trie->wordcount ; word++)
4696 I32 deltanext=0, minnext=0, f = 0, fake;
4697 struct regnode_charclass_class this_class;
4699 data_fake.flags = 0;
4701 data_fake.whilem_c = data->whilem_c;
4702 data_fake.last_closep = data->last_closep;
4705 data_fake.last_closep = &fake;
4706 data_fake.pos_delta = delta;
4707 if (flags & SCF_DO_STCLASS) {
4708 cl_init(pRExC_state, &this_class);
4709 data_fake.start_class = &this_class;
4710 f = SCF_DO_STCLASS_AND;
4712 if (flags & SCF_WHILEM_VISITED_POS)
4713 f |= SCF_WHILEM_VISITED_POS;
4715 if (trie->jump[word]) {
4717 nextbranch = trie_node + trie->jump[0];
4718 scan= trie_node + trie->jump[word];
4719 /* We go from the jump point to the branch that follows
4720 it. Note this means we need the vestigal unused branches
4721 even though they arent otherwise used.
4723 minnext = study_chunk(pRExC_state, &scan, minlenp,
4724 &deltanext, (regnode *)nextbranch, &data_fake,
4725 stopparen, recursed, NULL, f,depth+1);
4727 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4728 nextbranch= regnext((regnode*)nextbranch);
4730 if (min1 > (I32)(minnext + trie->minlen))
4731 min1 = minnext + trie->minlen;
4732 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4733 max1 = minnext + deltanext + trie->maxlen;
4734 if (deltanext == I32_MAX)
4735 is_inf = is_inf_internal = 1;
4737 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4739 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4740 if ( stopmin > min + min1)
4741 stopmin = min + min1;
4742 flags &= ~SCF_DO_SUBSTR;
4744 data->flags |= SCF_SEEN_ACCEPT;
4747 if (data_fake.flags & SF_HAS_EVAL)
4748 data->flags |= SF_HAS_EVAL;
4749 data->whilem_c = data_fake.whilem_c;
4751 if (flags & SCF_DO_STCLASS)
4752 cl_or(pRExC_state, &accum, &this_class);
4755 if (flags & SCF_DO_SUBSTR) {
4756 data->pos_min += min1;
4757 data->pos_delta += max1 - min1;
4758 if (max1 != min1 || is_inf)
4759 data->longest = &(data->longest_float);
4762 delta += max1 - min1;
4763 if (flags & SCF_DO_STCLASS_OR) {
4764 cl_or(pRExC_state, data->start_class, &accum);
4766 cl_and(data->start_class, and_withp);
4767 flags &= ~SCF_DO_STCLASS;
4770 else if (flags & SCF_DO_STCLASS_AND) {
4772 cl_and(data->start_class, &accum);
4773 flags &= ~SCF_DO_STCLASS;
4776 /* Switch to OR mode: cache the old value of
4777 * data->start_class */
4779 StructCopy(data->start_class, and_withp,
4780 struct regnode_charclass_class);
4781 flags &= ~SCF_DO_STCLASS_AND;
4782 StructCopy(&accum, data->start_class,
4783 struct regnode_charclass_class);
4784 flags |= SCF_DO_STCLASS_OR;
4785 data->start_class->flags |= ANYOF_EOS;
4792 else if (PL_regkind[OP(scan)] == TRIE) {
4793 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4796 min += trie->minlen;
4797 delta += (trie->maxlen - trie->minlen);
4798 flags &= ~SCF_DO_STCLASS; /* xxx */
4799 if (flags & SCF_DO_SUBSTR) {
4800 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4801 data->pos_min += trie->minlen;
4802 data->pos_delta += (trie->maxlen - trie->minlen);
4803 if (trie->maxlen != trie->minlen)
4804 data->longest = &(data->longest_float);
4806 if (trie->jump) /* no more substrings -- for now /grr*/
4807 flags &= ~SCF_DO_SUBSTR;
4809 #endif /* old or new */
4810 #endif /* TRIE_STUDY_OPT */
4812 /* Else: zero-length, ignore. */
4813 scan = regnext(scan);
4818 stopparen = frame->stop;
4819 frame = frame->prev;
4820 goto fake_study_recurse;
4825 DEBUG_STUDYDATA("pre-fin:",data,depth);
4828 *deltap = is_inf_internal ? I32_MAX : delta;
4829 if (flags & SCF_DO_SUBSTR && is_inf)
4830 data->pos_delta = I32_MAX - data->pos_min;
4831 if (is_par > (I32)U8_MAX)
4833 if (is_par && pars==1 && data) {
4834 data->flags |= SF_IN_PAR;
4835 data->flags &= ~SF_HAS_PAR;
4837 else if (pars && data) {
4838 data->flags |= SF_HAS_PAR;
4839 data->flags &= ~SF_IN_PAR;
4841 if (flags & SCF_DO_STCLASS_OR)
4842 cl_and(data->start_class, and_withp);
4843 if (flags & SCF_TRIE_RESTUDY)
4844 data->flags |= SCF_TRIE_RESTUDY;
4846 DEBUG_STUDYDATA("post-fin:",data,depth);
4848 return min < stopmin ? min : stopmin;
4852 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4854 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4856 PERL_ARGS_ASSERT_ADD_DATA;
4858 Renewc(RExC_rxi->data,
4859 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4860 char, struct reg_data);
4862 Renew(RExC_rxi->data->what, count + n, U8);
4864 Newx(RExC_rxi->data->what, n, U8);
4865 RExC_rxi->data->count = count + n;
4866 Copy(s, RExC_rxi->data->what + count, n, U8);
4870 /*XXX: todo make this not included in a non debugging perl */
4871 #ifndef PERL_IN_XSUB_RE
4873 Perl_reginitcolors(pTHX)
4876 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4878 char *t = savepv(s);
4882 t = strchr(t, '\t');
4888 PL_colors[i] = t = (char *)"";
4893 PL_colors[i++] = (char *)"";
4900 #ifdef TRIE_STUDY_OPT
4901 #define CHECK_RESTUDY_GOTO \
4903 (data.flags & SCF_TRIE_RESTUDY) \
4907 #define CHECK_RESTUDY_GOTO
4911 * pregcomp - compile a regular expression into internal code
4913 * Decides which engine's compiler to call based on the hint currently in
4917 #ifndef PERL_IN_XSUB_RE
4919 /* return the currently in-scope regex engine (or the default if none) */
4921 regexp_engine const *
4922 Perl_current_re_engine(pTHX)
4926 if (IN_PERL_COMPILETIME) {
4927 HV * const table = GvHV(PL_hintgv);
4931 return &reh_regexp_engine;
4932 ptr = hv_fetchs(table, "regcomp", FALSE);
4933 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4934 return &reh_regexp_engine;
4935 return INT2PTR(regexp_engine*,SvIV(*ptr));
4939 if (!PL_curcop->cop_hints_hash)
4940 return &reh_regexp_engine;
4941 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4942 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4943 return &reh_regexp_engine;
4944 return INT2PTR(regexp_engine*,SvIV(ptr));
4950 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4953 regexp_engine const *eng = current_re_engine();
4954 GET_RE_DEBUG_FLAGS_DECL;
4956 PERL_ARGS_ASSERT_PREGCOMP;
4958 /* Dispatch a request to compile a regexp to correct regexp engine. */
4960 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4963 return CALLREGCOMP_ENG(eng, pattern, flags);
4967 /* public(ish) entry point for the perl core's own regex compiling code.
4968 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4969 * pattern rather than a list of OPs, and uses the internal engine rather
4970 * than the current one */
4973 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4975 SV *pat = pattern; /* defeat constness! */
4976 PERL_ARGS_ASSERT_RE_COMPILE;
4977 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4978 #ifdef PERL_IN_XSUB_RE
4983 NULL, NULL, rx_flags, 0);
4986 /* see if there are any run-time code blocks in the pattern.
4987 * False positives are allowed */
4990 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4991 U32 pm_flags, char *pat, STRLEN plen)
4996 /* avoid infinitely recursing when we recompile the pattern parcelled up
4997 * as qr'...'. A single constant qr// string can't have have any
4998 * run-time component in it, and thus, no runtime code. (A non-qr
4999 * string, however, can, e.g. $x =~ '(?{})') */
5000 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
5003 for (s = 0; s < plen; s++) {
5004 if (n < pRExC_state->num_code_blocks
5005 && s == pRExC_state->code_blocks[n].start)
5007 s = pRExC_state->code_blocks[n].end;
5011 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5013 if (pat[s] == '(' && pat[s+1] == '?' &&
5014 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
5021 /* Handle run-time code blocks. We will already have compiled any direct
5022 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5023 * copy of it, but with any literal code blocks blanked out and
5024 * appropriate chars escaped; then feed it into
5026 * eval "qr'modified_pattern'"
5030 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5034 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5036 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5037 * and merge them with any code blocks of the original regexp.
5039 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5040 * instead, just save the qr and return FALSE; this tells our caller that
5041 * the original pattern needs upgrading to utf8.
5045 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5046 char *pat, STRLEN plen)
5050 GET_RE_DEBUG_FLAGS_DECL;
5052 if (pRExC_state->runtime_code_qr) {
5053 /* this is the second time we've been called; this should
5054 * only happen if the main pattern got upgraded to utf8
5055 * during compilation; re-use the qr we compiled first time
5056 * round (which should be utf8 too)
5058 qr = pRExC_state->runtime_code_qr;
5059 pRExC_state->runtime_code_qr = NULL;
5060 assert(RExC_utf8 && SvUTF8(qr));
5066 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5070 /* determine how many extra chars we need for ' and \ escaping */
5071 for (s = 0; s < plen; s++) {
5072 if (pat[s] == '\'' || pat[s] == '\\')
5076 Newx(newpat, newlen, char);
5078 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5080 for (s = 0; s < plen; s++) {
5081 if (n < pRExC_state->num_code_blocks
5082 && s == pRExC_state->code_blocks[n].start)
5084 /* blank out literal code block */
5085 assert(pat[s] == '(');
5086 while (s <= pRExC_state->code_blocks[n].end) {
5094 if (pat[s] == '\'' || pat[s] == '\\')
5099 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5103 PerlIO_printf(Perl_debug_log,
5104 "%sre-parsing pattern for runtime code:%s %s\n",
5105 PL_colors[4],PL_colors[5],newpat);
5108 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5114 PUSHSTACKi(PERLSI_REQUIRE);
5115 /* this causes the toker to collapse \\ into \ when parsing
5116 * qr''; normally only q'' does this. It also alters hints
5118 PL_reg_state.re_reparsing = TRUE;
5119 eval_sv(sv, G_SCALAR);
5125 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5126 assert(SvROK(qr_ref));
5128 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5129 /* the leaving below frees the tmp qr_ref.
5130 * Give qr a life of its own */
5138 if (!RExC_utf8 && SvUTF8(qr)) {
5139 /* first time through; the pattern got upgraded; save the
5140 * qr for the next time through */
5141 assert(!pRExC_state->runtime_code_qr);
5142 pRExC_state->runtime_code_qr = qr;
5147 /* extract any code blocks within the returned qr// */
5150 /* merge the main (r1) and run-time (r2) code blocks into one */
5152 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5153 struct reg_code_block *new_block, *dst;
5154 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5157 if (!r2->num_code_blocks) /* we guessed wrong */
5161 r1->num_code_blocks + r2->num_code_blocks,
5162 struct reg_code_block);
5165 while ( i1 < r1->num_code_blocks
5166 || i2 < r2->num_code_blocks)
5168 struct reg_code_block *src;
5171 if (i1 == r1->num_code_blocks) {
5172 src = &r2->code_blocks[i2++];
5175 else if (i2 == r2->num_code_blocks)
5176 src = &r1->code_blocks[i1++];
5177 else if ( r1->code_blocks[i1].start
5178 < r2->code_blocks[i2].start)
5180 src = &r1->code_blocks[i1++];
5181 assert(src->end < r2->code_blocks[i2].start);
5184 assert( r1->code_blocks[i1].start
5185 > r2->code_blocks[i2].start);
5186 src = &r2->code_blocks[i2++];
5188 assert(src->end < r1->code_blocks[i1].start);
5191 assert(pat[src->start] == '(');
5192 assert(pat[src->end] == ')');
5193 dst->start = src->start;
5194 dst->end = src->end;
5195 dst->block = src->block;
5196 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5200 r1->num_code_blocks += r2->num_code_blocks;
5201 Safefree(r1->code_blocks);
5202 r1->code_blocks = new_block;
5211 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5213 /* This is the common code for setting up the floating and fixed length
5214 * string data extracted from Perlre_op_compile() below. Returns a boolean
5215 * as to whether succeeded or not */
5219 if (! (longest_length
5220 || (eol /* Can't have SEOL and MULTI */
5221 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5223 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5224 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5229 /* copy the information about the longest from the reg_scan_data
5230 over to the program. */
5231 if (SvUTF8(sv_longest)) {
5232 *rx_utf8 = sv_longest;
5235 *rx_substr = sv_longest;
5238 /* end_shift is how many chars that must be matched that
5239 follow this item. We calculate it ahead of time as once the
5240 lookbehind offset is added in we lose the ability to correctly
5242 ml = minlen ? *(minlen) : (I32)longest_length;
5243 *rx_end_shift = ml - offset
5244 - longest_length + (SvTAIL(sv_longest) != 0)
5247 t = (eol/* Can't have SEOL and MULTI */
5248 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5249 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5255 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5256 * regular expression into internal code.
5257 * The pattern may be passed either as:
5258 * a list of SVs (patternp plus pat_count)
5259 * a list of OPs (expr)
5260 * If both are passed, the SV list is used, but the OP list indicates
5261 * which SVs are actually pre-compiled code blocks
5263 * The SVs in the list have magic and qr overloading applied to them (and
5264 * the list may be modified in-place with replacement SVs in the latter
5267 * If the pattern hasn't changed from old_re, then old_re will be
5270 * eng is the current engine. If that engine has an op_comp method, then
5271 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5272 * do the initial concatenation of arguments and pass on to the external
5275 * If is_bare_re is not null, set it to a boolean indicating whether the
5276 * arg list reduced (after overloading) to a single bare regex which has
5277 * been returned (i.e. /$qr/).
5279 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5281 * pm_flags contains the PMf_* flags, typically based on those from the
5282 * pm_flags field of the related PMOP. Currently we're only interested in
5283 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5285 * We can't allocate space until we know how big the compiled form will be,
5286 * but we can't compile it (and thus know how big it is) until we've got a
5287 * place to put the code. So we cheat: we compile it twice, once with code
5288 * generation turned off and size counting turned on, and once "for real".
5289 * This also means that we don't allocate space until we are sure that the
5290 * thing really will compile successfully, and we never have to move the
5291 * code and thus invalidate pointers into it. (Note that it has to be in
5292 * one piece because free() must be able to free it all.) [NB: not true in perl]
5294 * Beware that the optimization-preparation code in here knows about some
5295 * of the structure of the compiled regexp. [I'll say.]
5299 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5300 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5301 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5306 regexp_internal *ri;
5316 /* these are all flags - maybe they should be turned
5317 * into a single int with different bit masks */
5318 I32 sawlookahead = 0;
5321 bool used_setjump = FALSE;
5322 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5323 bool code_is_utf8 = 0;
5324 bool VOL recompile = 0;
5325 bool runtime_code = 0;
5329 RExC_state_t RExC_state;
5330 RExC_state_t * const pRExC_state = &RExC_state;
5331 #ifdef TRIE_STUDY_OPT
5333 RExC_state_t copyRExC_state;
5335 GET_RE_DEBUG_FLAGS_DECL;
5337 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5339 DEBUG_r(if (!PL_colorset) reginitcolors());
5341 #ifndef PERL_IN_XSUB_RE
5342 /* Initialize these here instead of as-needed, as is quick and avoids
5343 * having to test them each time otherwise */
5344 if (! PL_AboveLatin1) {
5345 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5346 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5347 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5349 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5350 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5352 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5353 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5355 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5356 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5358 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5360 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5361 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5363 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5365 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5366 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5368 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5369 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5371 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5372 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5374 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5375 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5377 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5378 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5380 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5381 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5383 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5384 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5386 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5388 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5389 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5391 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5392 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5396 pRExC_state->code_blocks = NULL;
5397 pRExC_state->num_code_blocks = 0;
5400 *is_bare_re = FALSE;
5402 if (expr && (expr->op_type == OP_LIST ||
5403 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5405 /* is the source UTF8, and how many code blocks are there? */
5409 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5410 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5412 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5413 /* count of DO blocks */
5417 pRExC_state->num_code_blocks = ncode;
5418 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5423 /* handle a list of SVs */
5427 /* apply magic and RE overloading to each arg */
5428 for (svp = patternp; svp < patternp + pat_count; svp++) {
5431 if (SvROK(rx) && SvAMAGIC(rx)) {
5432 SV *sv = AMG_CALLunary(rx, regexp_amg);
5436 if (SvTYPE(sv) != SVt_REGEXP)
5437 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5443 if (pat_count > 1) {
5444 /* concat multiple args and find any code block indexes */
5449 STRLEN orig_patlen = 0;
5451 if (pRExC_state->num_code_blocks) {
5452 o = cLISTOPx(expr)->op_first;
5453 assert(o->op_type == OP_PUSHMARK);
5457 pat = newSVpvn("", 0);
5460 /* determine if the pattern is going to be utf8 (needed
5461 * in advance to align code block indices correctly).
5462 * XXX This could fail to be detected for an arg with
5463 * overloading but not concat overloading; but the main effect
5464 * in this obscure case is to need a 'use re eval' for a
5465 * literal code block */
5466 for (svp = patternp; svp < patternp + pat_count; svp++) {
5473 for (svp = patternp; svp < patternp + pat_count; svp++) {
5474 SV *sv, *msv = *svp;
5478 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5479 assert(n < pRExC_state->num_code_blocks);
5480 pRExC_state->code_blocks[n].start = SvCUR(pat);
5481 pRExC_state->code_blocks[n].block = o;
5482 pRExC_state->code_blocks[n].src_regex = NULL;
5485 o = o->op_sibling; /* skip CONST */
5491 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5492 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5495 /* overloading involved: all bets are off over literal
5496 * code. Pretend we haven't seen it */
5497 pRExC_state->num_code_blocks -= n;
5503 while (SvAMAGIC(msv)
5504 && (sv = AMG_CALLunary(msv, string_amg))
5508 && SvRV(msv) == SvRV(sv))
5513 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5515 orig_patlen = SvCUR(pat);
5516 sv_catsv_nomg(pat, msv);
5519 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5522 /* extract any code blocks within any embedded qr//'s */
5523 if (rx && SvTYPE(rx) == SVt_REGEXP
5524 && RX_ENGINE((REGEXP*)rx)->op_comp)
5527 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5528 if (ri->num_code_blocks) {
5530 /* the presence of an embedded qr// with code means
5531 * we should always recompile: the text of the
5532 * qr// may not have changed, but it may be a
5533 * different closure than last time */
5535 Renew(pRExC_state->code_blocks,
5536 pRExC_state->num_code_blocks + ri->num_code_blocks,
5537 struct reg_code_block);
5538 pRExC_state->num_code_blocks += ri->num_code_blocks;
5539 for (i=0; i < ri->num_code_blocks; i++) {
5540 struct reg_code_block *src, *dst;
5541 STRLEN offset = orig_patlen
5542 + ((struct regexp *)SvANY(rx))->pre_prefix;
5543 assert(n < pRExC_state->num_code_blocks);
5544 src = &ri->code_blocks[i];
5545 dst = &pRExC_state->code_blocks[n];
5546 dst->start = src->start + offset;
5547 dst->end = src->end + offset;
5548 dst->block = src->block;
5549 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5563 while (SvAMAGIC(pat)
5564 && (sv = AMG_CALLunary(pat, string_amg))
5572 /* handle bare regex: foo =~ $re */
5577 if (SvTYPE(re) == SVt_REGEXP) {
5581 Safefree(pRExC_state->code_blocks);
5587 /* not a list of SVs, so must be a list of OPs */
5589 if (expr->op_type == OP_LIST) {
5594 pat = newSVpvn("", 0);
5599 /* given a list of CONSTs and DO blocks in expr, append all
5600 * the CONSTs to pat, and record the start and end of each
5601 * code block in code_blocks[] (each DO{} op is followed by an
5602 * OP_CONST containing the corresponding literal '(?{...})
5605 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5606 if (o->op_type == OP_CONST) {
5607 sv_catsv(pat, cSVOPo_sv);
5609 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5613 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5614 assert(i+1 < pRExC_state->num_code_blocks);
5615 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5616 pRExC_state->code_blocks[i].block = o;
5617 pRExC_state->code_blocks[i].src_regex = NULL;
5623 assert(expr->op_type == OP_CONST);
5624 pat = cSVOPx_sv(expr);
5628 exp = SvPV_nomg(pat, plen);
5630 if (!eng->op_comp) {
5631 if ((SvUTF8(pat) && IN_BYTES)
5632 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5634 /* make a temporary copy; either to convert to bytes,
5635 * or to avoid repeating get-magic / overloaded stringify */
5636 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5637 (IN_BYTES ? 0 : SvUTF8(pat)));
5639 Safefree(pRExC_state->code_blocks);
5640 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5643 /* ignore the utf8ness if the pattern is 0 length */
5644 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5645 RExC_uni_semantics = 0;
5646 RExC_contains_locale = 0;
5647 pRExC_state->runtime_code_qr = NULL;
5649 /****************** LONG JUMP TARGET HERE***********************/
5650 /* Longjmp back to here if have to switch in midstream to utf8 */
5651 if (! RExC_orig_utf8) {
5652 JMPENV_PUSH(jump_ret);
5653 used_setjump = TRUE;
5656 if (jump_ret == 0) { /* First time through */
5660 SV *dsv= sv_newmortal();
5661 RE_PV_QUOTED_DECL(s, RExC_utf8,
5662 dsv, exp, plen, 60);
5663 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5664 PL_colors[4],PL_colors[5],s);
5667 else { /* longjumped back */
5670 STRLEN s = 0, d = 0;
5673 /* If the cause for the longjmp was other than changing to utf8, pop
5674 * our own setjmp, and longjmp to the correct handler */
5675 if (jump_ret != UTF8_LONGJMP) {
5677 JMPENV_JUMP(jump_ret);
5682 /* It's possible to write a regexp in ascii that represents Unicode
5683 codepoints outside of the byte range, such as via \x{100}. If we
5684 detect such a sequence we have to convert the entire pattern to utf8
5685 and then recompile, as our sizing calculation will have been based
5686 on 1 byte == 1 character, but we will need to use utf8 to encode
5687 at least some part of the pattern, and therefore must convert the whole
5690 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5691 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5693 /* upgrade pattern to UTF8, and if there are code blocks,
5694 * recalculate the indices.
5695 * This is essentially an unrolled Perl_bytes_to_utf8() */
5697 src = (U8*)SvPV_nomg(pat, plen);
5698 Newx(dst, plen * 2 + 1, U8);
5701 const UV uv = NATIVE_TO_ASCII(src[s]);
5702 if (UNI_IS_INVARIANT(uv))
5703 dst[d] = (U8)UTF_TO_NATIVE(uv);
5705 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5706 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5708 if (n < pRExC_state->num_code_blocks) {
5709 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5710 pRExC_state->code_blocks[n].start = d;
5711 assert(dst[d] == '(');
5714 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5715 pRExC_state->code_blocks[n].end = d;
5716 assert(dst[d] == ')');
5729 RExC_orig_utf8 = RExC_utf8 = 1;
5732 /* return old regex if pattern hasn't changed */
5736 && !!RX_UTF8(old_re) == !!RExC_utf8
5737 && RX_PRECOMP(old_re)
5738 && RX_PRELEN(old_re) == plen
5739 && memEQ(RX_PRECOMP(old_re), exp, plen))
5741 /* with runtime code, always recompile */
5742 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5744 if (!runtime_code) {
5748 Safefree(pRExC_state->code_blocks);
5752 else if ((pm_flags & PMf_USE_RE_EVAL)
5753 /* this second condition covers the non-regex literal case,
5754 * i.e. $foo =~ '(?{})'. */
5755 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5756 && (PL_hints & HINT_RE_EVAL))
5758 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5761 #ifdef TRIE_STUDY_OPT
5765 rx_flags = orig_rx_flags;
5767 if (initial_charset == REGEX_LOCALE_CHARSET) {
5768 RExC_contains_locale = 1;
5770 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5772 /* Set to use unicode semantics if the pattern is in utf8 and has the
5773 * 'depends' charset specified, as it means unicode when utf8 */
5774 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5778 RExC_flags = rx_flags;
5779 RExC_pm_flags = pm_flags;
5782 if (PL_tainting && PL_tainted)
5783 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5785 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5786 /* whoops, we have a non-utf8 pattern, whilst run-time code
5787 * got compiled as utf8. Try again with a utf8 pattern */
5788 JMPENV_JUMP(UTF8_LONGJMP);
5791 assert(!pRExC_state->runtime_code_qr);
5796 RExC_in_lookbehind = 0;
5797 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5799 RExC_override_recoding = 0;
5801 /* First pass: determine size, legality. */
5809 RExC_emit = &PL_regdummy;
5810 RExC_whilem_seen = 0;
5811 RExC_open_parens = NULL;
5812 RExC_close_parens = NULL;
5814 RExC_paren_names = NULL;
5816 RExC_paren_name_list = NULL;
5818 RExC_recurse = NULL;
5819 RExC_recurse_count = 0;
5820 pRExC_state->code_index = 0;
5822 #if 0 /* REGC() is (currently) a NOP at the first pass.
5823 * Clever compilers notice this and complain. --jhi */
5824 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5827 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5829 RExC_lastparse=NULL;
5831 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5832 RExC_precomp = NULL;
5833 Safefree(pRExC_state->code_blocks);
5837 /* Here, finished first pass. Get rid of any added setjmp */
5843 PerlIO_printf(Perl_debug_log,
5844 "Required size %"IVdf" nodes\n"
5845 "Starting second pass (creation)\n",
5848 RExC_lastparse=NULL;
5851 /* The first pass could have found things that force Unicode semantics */
5852 if ((RExC_utf8 || RExC_uni_semantics)
5853 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5855 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5858 /* Small enough for pointer-storage convention?
5859 If extralen==0, this means that we will not need long jumps. */
5860 if (RExC_size >= 0x10000L && RExC_extralen)
5861 RExC_size += RExC_extralen;
5864 if (RExC_whilem_seen > 15)
5865 RExC_whilem_seen = 15;
5867 /* Allocate space and zero-initialize. Note, the two step process
5868 of zeroing when in debug mode, thus anything assigned has to
5869 happen after that */
5870 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5871 r = (struct regexp*)SvANY(rx);
5872 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5873 char, regexp_internal);
5874 if ( r == NULL || ri == NULL )
5875 FAIL("Regexp out of space");
5877 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5878 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5880 /* bulk initialize base fields with 0. */
5881 Zero(ri, sizeof(regexp_internal), char);
5884 /* non-zero initialization begins here */
5887 r->extflags = rx_flags;
5888 if (pm_flags & PMf_IS_QR) {
5889 ri->code_blocks = pRExC_state->code_blocks;
5890 ri->num_code_blocks = pRExC_state->num_code_blocks;
5893 SAVEFREEPV(pRExC_state->code_blocks);
5896 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5897 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5899 /* The caret is output if there are any defaults: if not all the STD
5900 * flags are set, or if no character set specifier is needed */
5902 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5904 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5905 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5906 >> RXf_PMf_STD_PMMOD_SHIFT);
5907 const char *fptr = STD_PAT_MODS; /*"msix"*/
5909 /* Allocate for the worst case, which is all the std flags are turned
5910 * on. If more precision is desired, we could do a population count of
5911 * the flags set. This could be done with a small lookup table, or by
5912 * shifting, masking and adding, or even, when available, assembly
5913 * language for a machine-language population count.
5914 * We never output a minus, as all those are defaults, so are
5915 * covered by the caret */
5916 const STRLEN wraplen = plen + has_p + has_runon
5917 + has_default /* If needs a caret */
5919 /* If needs a character set specifier */
5920 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5921 + (sizeof(STD_PAT_MODS) - 1)
5922 + (sizeof("(?:)") - 1);
5924 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5927 SvFLAGS(rx) |= SVf_UTF8;
5930 /* If a default, cover it using the caret */
5932 *p++= DEFAULT_PAT_MOD;
5936 const char* const name = get_regex_charset_name(r->extflags, &len);
5937 Copy(name, p, len, char);
5941 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5944 while((ch = *fptr++)) {
5952 Copy(RExC_precomp, p, plen, char);
5953 assert ((RX_WRAPPED(rx) - p) < 16);
5954 r->pre_prefix = p - RX_WRAPPED(rx);
5960 SvCUR_set(rx, p - SvPVX_const(rx));
5964 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5966 if (RExC_seen & REG_SEEN_RECURSE) {
5967 Newxz(RExC_open_parens, RExC_npar,regnode *);
5968 SAVEFREEPV(RExC_open_parens);
5969 Newxz(RExC_close_parens,RExC_npar,regnode *);
5970 SAVEFREEPV(RExC_close_parens);
5973 /* Useful during FAIL. */
5974 #ifdef RE_TRACK_PATTERN_OFFSETS
5975 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5976 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5977 "%s %"UVuf" bytes for offset annotations.\n",
5978 ri->u.offsets ? "Got" : "Couldn't get",
5979 (UV)((2*RExC_size+1) * sizeof(U32))));
5981 SetProgLen(ri,RExC_size);
5985 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5987 /* Second pass: emit code. */
5988 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5989 RExC_pm_flags = pm_flags;
5994 RExC_emit_start = ri->program;
5995 RExC_emit = ri->program;
5996 RExC_emit_bound = ri->program + RExC_size + 1;
5997 pRExC_state->code_index = 0;
5999 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6000 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6004 /* XXXX To minimize changes to RE engine we always allocate
6005 3-units-long substrs field. */
6006 Newx(r->substrs, 1, struct reg_substr_data);
6007 if (RExC_recurse_count) {
6008 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6009 SAVEFREEPV(RExC_recurse);
6013 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6014 Zero(r->substrs, 1, struct reg_substr_data);
6016 #ifdef TRIE_STUDY_OPT
6018 StructCopy(&zero_scan_data, &data, scan_data_t);
6019 copyRExC_state = RExC_state;
6022 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6024 RExC_state = copyRExC_state;
6025 if (seen & REG_TOP_LEVEL_BRANCHES)
6026 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6028 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6029 if (data.last_found) {
6030 SvREFCNT_dec(data.longest_fixed);
6031 SvREFCNT_dec(data.longest_float);
6032 SvREFCNT_dec(data.last_found);
6034 StructCopy(&zero_scan_data, &data, scan_data_t);
6037 StructCopy(&zero_scan_data, &data, scan_data_t);
6040 /* Dig out information for optimizations. */
6041 r->extflags = RExC_flags; /* was pm_op */
6042 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6045 SvUTF8_on(rx); /* Unicode in it? */
6046 ri->regstclass = NULL;
6047 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6048 r->intflags |= PREGf_NAUGHTY;
6049 scan = ri->program + 1; /* First BRANCH. */
6051 /* testing for BRANCH here tells us whether there is "must appear"
6052 data in the pattern. If there is then we can use it for optimisations */
6053 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6055 STRLEN longest_float_length, longest_fixed_length;
6056 struct regnode_charclass_class ch_class; /* pointed to by data */
6058 I32 last_close = 0; /* pointed to by data */
6059 regnode *first= scan;
6060 regnode *first_next= regnext(first);
6062 * Skip introductions and multiplicators >= 1
6063 * so that we can extract the 'meat' of the pattern that must
6064 * match in the large if() sequence following.
6065 * NOTE that EXACT is NOT covered here, as it is normally
6066 * picked up by the optimiser separately.
6068 * This is unfortunate as the optimiser isnt handling lookahead
6069 * properly currently.
6072 while ((OP(first) == OPEN && (sawopen = 1)) ||
6073 /* An OR of *one* alternative - should not happen now. */
6074 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6075 /* for now we can't handle lookbehind IFMATCH*/
6076 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6077 (OP(first) == PLUS) ||
6078 (OP(first) == MINMOD) ||
6079 /* An {n,m} with n>0 */
6080 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6081 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6084 * the only op that could be a regnode is PLUS, all the rest
6085 * will be regnode_1 or regnode_2.
6088 if (OP(first) == PLUS)
6091 first += regarglen[OP(first)];
6093 first = NEXTOPER(first);
6094 first_next= regnext(first);
6097 /* Starting-point info. */
6099 DEBUG_PEEP("first:",first,0);
6100 /* Ignore EXACT as we deal with it later. */
6101 if (PL_regkind[OP(first)] == EXACT) {
6102 if (OP(first) == EXACT)
6103 NOOP; /* Empty, get anchored substr later. */
6105 ri->regstclass = first;
6108 else if (PL_regkind[OP(first)] == TRIE &&
6109 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6112 /* this can happen only on restudy */
6113 if ( OP(first) == TRIE ) {
6114 struct regnode_1 *trieop = (struct regnode_1 *)
6115 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6116 StructCopy(first,trieop,struct regnode_1);
6117 trie_op=(regnode *)trieop;
6119 struct regnode_charclass *trieop = (struct regnode_charclass *)
6120 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6121 StructCopy(first,trieop,struct regnode_charclass);
6122 trie_op=(regnode *)trieop;
6125 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6126 ri->regstclass = trie_op;
6129 else if (REGNODE_SIMPLE(OP(first)))
6130 ri->regstclass = first;
6131 else if (PL_regkind[OP(first)] == BOUND ||
6132 PL_regkind[OP(first)] == NBOUND)
6133 ri->regstclass = first;
6134 else if (PL_regkind[OP(first)] == BOL) {
6135 r->extflags |= (OP(first) == MBOL
6137 : (OP(first) == SBOL
6140 first = NEXTOPER(first);
6143 else if (OP(first) == GPOS) {
6144 r->extflags |= RXf_ANCH_GPOS;
6145 first = NEXTOPER(first);
6148 else if ((!sawopen || !RExC_sawback) &&
6149 (OP(first) == STAR &&
6150 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6151 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6153 /* turn .* into ^.* with an implied $*=1 */
6155 (OP(NEXTOPER(first)) == REG_ANY)
6158 r->extflags |= type;
6159 r->intflags |= PREGf_IMPLICIT;
6160 first = NEXTOPER(first);
6163 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6164 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6165 /* x+ must match at the 1st pos of run of x's */
6166 r->intflags |= PREGf_SKIP;
6168 /* Scan is after the zeroth branch, first is atomic matcher. */
6169 #ifdef TRIE_STUDY_OPT
6172 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6173 (IV)(first - scan + 1))
6177 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6178 (IV)(first - scan + 1))
6184 * If there's something expensive in the r.e., find the
6185 * longest literal string that must appear and make it the
6186 * regmust. Resolve ties in favor of later strings, since
6187 * the regstart check works with the beginning of the r.e.
6188 * and avoiding duplication strengthens checking. Not a
6189 * strong reason, but sufficient in the absence of others.
6190 * [Now we resolve ties in favor of the earlier string if
6191 * it happens that c_offset_min has been invalidated, since the
6192 * earlier string may buy us something the later one won't.]
6195 data.longest_fixed = newSVpvs("");
6196 data.longest_float = newSVpvs("");
6197 data.last_found = newSVpvs("");
6198 data.longest = &(data.longest_fixed);
6200 if (!ri->regstclass) {
6201 cl_init(pRExC_state, &ch_class);
6202 data.start_class = &ch_class;
6203 stclass_flag = SCF_DO_STCLASS_AND;
6204 } else /* XXXX Check for BOUND? */
6206 data.last_closep = &last_close;
6208 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6209 &data, -1, NULL, NULL,
6210 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6216 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6217 && data.last_start_min == 0 && data.last_end > 0
6218 && !RExC_seen_zerolen
6219 && !(RExC_seen & REG_SEEN_VERBARG)
6220 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6221 r->extflags |= RXf_CHECK_ALL;
6222 scan_commit(pRExC_state, &data,&minlen,0);
6223 SvREFCNT_dec(data.last_found);
6225 longest_float_length = CHR_SVLEN(data.longest_float);
6227 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6228 && data.offset_fixed == data.offset_float_min
6229 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6230 && S_setup_longest (aTHX_ pRExC_state,
6234 &(r->float_end_shift),
6235 data.lookbehind_float,
6236 data.offset_float_min,
6238 longest_float_length,
6239 data.flags & SF_FL_BEFORE_EOL,
6240 data.flags & SF_FL_BEFORE_MEOL))
6242 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6243 r->float_max_offset = data.offset_float_max;
6244 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6245 r->float_max_offset -= data.lookbehind_float;
6248 r->float_substr = r->float_utf8 = NULL;
6249 SvREFCNT_dec(data.longest_float);
6250 longest_float_length = 0;
6253 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6255 if (S_setup_longest (aTHX_ pRExC_state,
6257 &(r->anchored_utf8),
6258 &(r->anchored_substr),
6259 &(r->anchored_end_shift),
6260 data.lookbehind_fixed,
6263 longest_fixed_length,
6264 data.flags & SF_FIX_BEFORE_EOL,
6265 data.flags & SF_FIX_BEFORE_MEOL))
6267 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6270 r->anchored_substr = r->anchored_utf8 = NULL;
6271 SvREFCNT_dec(data.longest_fixed);
6272 longest_fixed_length = 0;
6276 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6277 ri->regstclass = NULL;
6279 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6281 && !(data.start_class->flags & ANYOF_EOS)
6282 && !cl_is_anything(data.start_class))
6284 const U32 n = add_data(pRExC_state, 1, "f");
6285 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6287 Newx(RExC_rxi->data->data[n], 1,
6288 struct regnode_charclass_class);
6289 StructCopy(data.start_class,
6290 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6291 struct regnode_charclass_class);
6292 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6293 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6294 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6295 regprop(r, sv, (regnode*)data.start_class);
6296 PerlIO_printf(Perl_debug_log,
6297 "synthetic stclass \"%s\".\n",
6298 SvPVX_const(sv));});
6301 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6302 if (longest_fixed_length > longest_float_length) {
6303 r->check_end_shift = r->anchored_end_shift;
6304 r->check_substr = r->anchored_substr;
6305 r->check_utf8 = r->anchored_utf8;
6306 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6307 if (r->extflags & RXf_ANCH_SINGLE)
6308 r->extflags |= RXf_NOSCAN;
6311 r->check_end_shift = r->float_end_shift;
6312 r->check_substr = r->float_substr;
6313 r->check_utf8 = r->float_utf8;
6314 r->check_offset_min = r->float_min_offset;
6315 r->check_offset_max = r->float_max_offset;
6317 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6318 This should be changed ASAP! */
6319 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6320 r->extflags |= RXf_USE_INTUIT;
6321 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6322 r->extflags |= RXf_INTUIT_TAIL;
6324 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6325 if ( (STRLEN)minlen < longest_float_length )
6326 minlen= longest_float_length;
6327 if ( (STRLEN)minlen < longest_fixed_length )
6328 minlen= longest_fixed_length;
6332 /* Several toplevels. Best we can is to set minlen. */
6334 struct regnode_charclass_class ch_class;
6337 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6339 scan = ri->program + 1;
6340 cl_init(pRExC_state, &ch_class);
6341 data.start_class = &ch_class;
6342 data.last_closep = &last_close;
6345 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6346 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6350 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6351 = r->float_substr = r->float_utf8 = NULL;
6353 if (!(data.start_class->flags & ANYOF_EOS)
6354 && !cl_is_anything(data.start_class))
6356 const U32 n = add_data(pRExC_state, 1, "f");
6357 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6359 Newx(RExC_rxi->data->data[n], 1,
6360 struct regnode_charclass_class);
6361 StructCopy(data.start_class,
6362 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6363 struct regnode_charclass_class);
6364 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6365 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6366 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6367 regprop(r, sv, (regnode*)data.start_class);
6368 PerlIO_printf(Perl_debug_log,
6369 "synthetic stclass \"%s\".\n",
6370 SvPVX_const(sv));});
6374 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6375 the "real" pattern. */
6377 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6378 (IV)minlen, (IV)r->minlen);
6380 r->minlenret = minlen;
6381 if (r->minlen < minlen)
6384 if (RExC_seen & REG_SEEN_GPOS)
6385 r->extflags |= RXf_GPOS_SEEN;
6386 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6387 r->extflags |= RXf_LOOKBEHIND_SEEN;
6388 if (pRExC_state->num_code_blocks)
6389 r->extflags |= RXf_EVAL_SEEN;
6390 if (RExC_seen & REG_SEEN_CANY)
6391 r->extflags |= RXf_CANY_SEEN;
6392 if (RExC_seen & REG_SEEN_VERBARG)
6393 r->intflags |= PREGf_VERBARG_SEEN;
6394 if (RExC_seen & REG_SEEN_CUTGROUP)
6395 r->intflags |= PREGf_CUTGROUP_SEEN;
6396 if (pm_flags & PMf_USE_RE_EVAL)
6397 r->intflags |= PREGf_USE_RE_EVAL;
6398 if (RExC_paren_names)
6399 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6401 RXp_PAREN_NAMES(r) = NULL;
6403 #ifdef STUPID_PATTERN_CHECKS
6404 if (RX_PRELEN(rx) == 0)
6405 r->extflags |= RXf_NULL;
6406 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6407 /* XXX: this should happen BEFORE we compile */
6408 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6409 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6410 r->extflags |= RXf_WHITE;
6411 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6412 r->extflags |= RXf_START_ONLY;
6414 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6415 /* XXX: this should happen BEFORE we compile */
6416 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6418 regnode *first = ri->program + 1;
6421 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6422 r->extflags |= RXf_NULL;
6423 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6424 r->extflags |= RXf_START_ONLY;
6425 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6426 && OP(regnext(first)) == END)
6427 r->extflags |= RXf_WHITE;
6431 if (RExC_paren_names) {
6432 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6433 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6436 ri->name_list_idx = 0;
6438 if (RExC_recurse_count) {
6439 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6440 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6441 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6444 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6445 /* assume we don't need to swap parens around before we match */
6448 PerlIO_printf(Perl_debug_log,"Final program:\n");
6451 #ifdef RE_TRACK_PATTERN_OFFSETS
6452 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6453 const U32 len = ri->u.offsets[0];
6455 GET_RE_DEBUG_FLAGS_DECL;
6456 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6457 for (i = 1; i <= len; i++) {
6458 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6459 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6460 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6462 PerlIO_printf(Perl_debug_log, "\n");
6470 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6473 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6475 PERL_UNUSED_ARG(value);
6477 if (flags & RXapif_FETCH) {
6478 return reg_named_buff_fetch(rx, key, flags);
6479 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6480 Perl_croak_no_modify(aTHX);
6482 } else if (flags & RXapif_EXISTS) {
6483 return reg_named_buff_exists(rx, key, flags)
6486 } else if (flags & RXapif_REGNAMES) {
6487 return reg_named_buff_all(rx, flags);
6488 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6489 return reg_named_buff_scalar(rx, flags);
6491 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6497 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6500 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6501 PERL_UNUSED_ARG(lastkey);
6503 if (flags & RXapif_FIRSTKEY)
6504 return reg_named_buff_firstkey(rx, flags);
6505 else if (flags & RXapif_NEXTKEY)
6506 return reg_named_buff_nextkey(rx, flags);
6508 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6514 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6517 AV *retarray = NULL;
6519 struct regexp *const rx = (struct regexp *)SvANY(r);
6521 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6523 if (flags & RXapif_ALL)
6526 if (rx && RXp_PAREN_NAMES(rx)) {
6527 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6530 SV* sv_dat=HeVAL(he_str);
6531 I32 *nums=(I32*)SvPVX(sv_dat);
6532 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6533 if ((I32)(rx->nparens) >= nums[i]
6534 && rx->offs[nums[i]].start != -1
6535 && rx->offs[nums[i]].end != -1)
6538 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6543 ret = newSVsv(&PL_sv_undef);
6546 av_push(retarray, ret);
6549 return newRV_noinc(MUTABLE_SV(retarray));
6556 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6559 struct regexp *const rx = (struct regexp *)SvANY(r);
6561 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6563 if (rx && RXp_PAREN_NAMES(rx)) {
6564 if (flags & RXapif_ALL) {
6565 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6567 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6581 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6583 struct regexp *const rx = (struct regexp *)SvANY(r);
6585 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6587 if ( rx && RXp_PAREN_NAMES(rx) ) {
6588 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6590 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6597 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6599 struct regexp *const rx = (struct regexp *)SvANY(r);
6600 GET_RE_DEBUG_FLAGS_DECL;
6602 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6604 if (rx && RXp_PAREN_NAMES(rx)) {
6605 HV *hv = RXp_PAREN_NAMES(rx);
6607 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6610 SV* sv_dat = HeVAL(temphe);
6611 I32 *nums = (I32*)SvPVX(sv_dat);
6612 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6613 if ((I32)(rx->lastparen) >= nums[i] &&
6614 rx->offs[nums[i]].start != -1 &&
6615 rx->offs[nums[i]].end != -1)
6621 if (parno || flags & RXapif_ALL) {
6622 return newSVhek(HeKEY_hek(temphe));
6630 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6635 struct regexp *const rx = (struct regexp *)SvANY(r);
6637 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6639 if (rx && RXp_PAREN_NAMES(rx)) {
6640 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6641 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6642 } else if (flags & RXapif_ONE) {
6643 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6644 av = MUTABLE_AV(SvRV(ret));
6645 length = av_len(av);
6647 return newSViv(length + 1);
6649 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6653 return &PL_sv_undef;
6657 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6659 struct regexp *const rx = (struct regexp *)SvANY(r);
6662 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6664 if (rx && RXp_PAREN_NAMES(rx)) {
6665 HV *hv= RXp_PAREN_NAMES(rx);
6667 (void)hv_iterinit(hv);
6668 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6671 SV* sv_dat = HeVAL(temphe);
6672 I32 *nums = (I32*)SvPVX(sv_dat);
6673 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6674 if ((I32)(rx->lastparen) >= nums[i] &&
6675 rx->offs[nums[i]].start != -1 &&
6676 rx->offs[nums[i]].end != -1)
6682 if (parno || flags & RXapif_ALL) {
6683 av_push(av, newSVhek(HeKEY_hek(temphe)));
6688 return newRV_noinc(MUTABLE_SV(av));
6692 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6695 struct regexp *const rx = (struct regexp *)SvANY(r);
6701 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6703 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6704 || n == RX_BUFF_IDX_CARET_FULLMATCH
6705 || n == RX_BUFF_IDX_CARET_POSTMATCH
6707 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6714 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6715 /* no need to distinguish between them any more */
6716 n = RX_BUFF_IDX_FULLMATCH;
6718 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6719 && rx->offs[0].start != -1)
6721 /* $`, ${^PREMATCH} */
6722 i = rx->offs[0].start;
6726 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6727 && rx->offs[0].end != -1)
6729 /* $', ${^POSTMATCH} */
6730 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6731 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6734 if ( 0 <= n && n <= (I32)rx->nparens &&
6735 (s1 = rx->offs[n].start) != -1 &&
6736 (t1 = rx->offs[n].end) != -1)
6738 /* $&, ${^MATCH}, $1 ... */
6740 s = rx->subbeg + s1 - rx->suboffset;
6745 assert(s >= rx->subbeg);
6746 assert(rx->sublen >= (s - rx->subbeg) + i );
6748 const int oldtainted = PL_tainted;
6750 sv_setpvn(sv, s, i);
6751 PL_tainted = oldtainted;
6752 if ( (rx->extflags & RXf_CANY_SEEN)
6753 ? (RXp_MATCH_UTF8(rx)
6754 && (!i || is_utf8_string((U8*)s, i)))
6755 : (RXp_MATCH_UTF8(rx)) )
6762 if (RXp_MATCH_TAINTED(rx)) {
6763 if (SvTYPE(sv) >= SVt_PVMG) {
6764 MAGIC* const mg = SvMAGIC(sv);
6767 SvMAGIC_set(sv, mg->mg_moremagic);
6769 if ((mgt = SvMAGIC(sv))) {
6770 mg->mg_moremagic = mgt;
6771 SvMAGIC_set(sv, mg);
6782 sv_setsv(sv,&PL_sv_undef);
6788 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6789 SV const * const value)
6791 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6793 PERL_UNUSED_ARG(rx);
6794 PERL_UNUSED_ARG(paren);
6795 PERL_UNUSED_ARG(value);
6798 Perl_croak_no_modify(aTHX);
6802 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6805 struct regexp *const rx = (struct regexp *)SvANY(r);
6809 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6811 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6813 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6814 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6818 case RX_BUFF_IDX_PREMATCH: /* $` */
6819 if (rx->offs[0].start != -1) {
6820 i = rx->offs[0].start;
6829 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6830 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6832 case RX_BUFF_IDX_POSTMATCH: /* $' */
6833 if (rx->offs[0].end != -1) {
6834 i = rx->sublen - rx->offs[0].end;
6836 s1 = rx->offs[0].end;
6843 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6844 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6848 /* $& / ${^MATCH}, $1, $2, ... */
6850 if (paren <= (I32)rx->nparens &&
6851 (s1 = rx->offs[paren].start) != -1 &&
6852 (t1 = rx->offs[paren].end) != -1)
6858 if (ckWARN(WARN_UNINITIALIZED))
6859 report_uninit((const SV *)sv);
6864 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6865 const char * const s = rx->subbeg - rx->suboffset + s1;
6870 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6877 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6879 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6880 PERL_UNUSED_ARG(rx);
6884 return newSVpvs("Regexp");
6887 /* Scans the name of a named buffer from the pattern.
6888 * If flags is REG_RSN_RETURN_NULL returns null.
6889 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6890 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6891 * to the parsed name as looked up in the RExC_paren_names hash.
6892 * If there is an error throws a vFAIL().. type exception.
6895 #define REG_RSN_RETURN_NULL 0
6896 #define REG_RSN_RETURN_NAME 1
6897 #define REG_RSN_RETURN_DATA 2
6900 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6902 char *name_start = RExC_parse;
6904 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6906 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6907 /* skip IDFIRST by using do...while */
6910 RExC_parse += UTF8SKIP(RExC_parse);
6911 } while (isALNUM_utf8((U8*)RExC_parse));
6915 } while (isALNUM(*RExC_parse));
6917 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6918 vFAIL("Group name must start with a non-digit word character");
6922 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6923 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6924 if ( flags == REG_RSN_RETURN_NAME)
6926 else if (flags==REG_RSN_RETURN_DATA) {
6929 if ( ! sv_name ) /* should not happen*/
6930 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6931 if (RExC_paren_names)
6932 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6934 sv_dat = HeVAL(he_str);
6936 vFAIL("Reference to nonexistent named group");
6940 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6941 (unsigned long) flags);
6943 assert(0); /* NOT REACHED */
6948 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6949 int rem=(int)(RExC_end - RExC_parse); \
6958 if (RExC_lastparse!=RExC_parse) \
6959 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6962 iscut ? "..." : "<" \
6965 PerlIO_printf(Perl_debug_log,"%16s",""); \
6968 num = RExC_size + 1; \
6970 num=REG_NODE_NUM(RExC_emit); \
6971 if (RExC_lastnum!=num) \
6972 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6974 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6975 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6976 (int)((depth*2)), "", \
6980 RExC_lastparse=RExC_parse; \
6985 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6986 DEBUG_PARSE_MSG((funcname)); \
6987 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6989 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6990 DEBUG_PARSE_MSG((funcname)); \
6991 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6994 /* This section of code defines the inversion list object and its methods. The
6995 * interfaces are highly subject to change, so as much as possible is static to
6996 * this file. An inversion list is here implemented as a malloc'd C UV array
6997 * with some added info that is placed as UVs at the beginning in a header
6998 * portion. An inversion list for Unicode is an array of code points, sorted
6999 * by ordinal number. The zeroth element is the first code point in the list.
7000 * The 1th element is the first element beyond that not in the list. In other
7001 * words, the first range is
7002 * invlist[0]..(invlist[1]-1)
7003 * The other ranges follow. Thus every element whose index is divisible by two
7004 * marks the beginning of a range that is in the list, and every element not
7005 * divisible by two marks the beginning of a range not in the list. A single
7006 * element inversion list that contains the single code point N generally
7007 * consists of two elements
7010 * (The exception is when N is the highest representable value on the
7011 * machine, in which case the list containing just it would be a single
7012 * element, itself. By extension, if the last range in the list extends to
7013 * infinity, then the first element of that range will be in the inversion list
7014 * at a position that is divisible by two, and is the final element in the
7016 * Taking the complement (inverting) an inversion list is quite simple, if the
7017 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7018 * This implementation reserves an element at the beginning of each inversion
7019 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
7020 * actual beginning of the list is either that element if 0, or the next one if
7023 * More about inversion lists can be found in "Unicode Demystified"
7024 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7025 * More will be coming when functionality is added later.
7027 * The inversion list data structure is currently implemented as an SV pointing
7028 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7029 * array of UV whose memory management is automatically handled by the existing
7030 * facilities for SV's.
7032 * Some of the methods should always be private to the implementation, and some
7033 * should eventually be made public */
7035 /* The header definitions are in F<inline_invlist.c> */
7037 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7038 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7040 #define INVLIST_INITIAL_LEN 10
7042 PERL_STATIC_INLINE UV*
7043 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7045 /* Returns a pointer to the first element in the inversion list's array.
7046 * This is called upon initialization of an inversion list. Where the
7047 * array begins depends on whether the list has the code point U+0000
7048 * in it or not. The other parameter tells it whether the code that
7049 * follows this call is about to put a 0 in the inversion list or not.
7050 * The first element is either the element with 0, if 0, or the next one,
7053 UV* zero = get_invlist_zero_addr(invlist);
7055 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7058 assert(! *_get_invlist_len_addr(invlist));
7060 /* 1^1 = 0; 1^0 = 1 */
7061 *zero = 1 ^ will_have_0;
7062 return zero + *zero;
7065 PERL_STATIC_INLINE UV*
7066 S_invlist_array(pTHX_ SV* const invlist)
7068 /* Returns the pointer to the inversion list's array. Every time the
7069 * length changes, this needs to be called in case malloc or realloc moved
7072 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7074 /* Must not be empty. If these fail, you probably didn't check for <len>
7075 * being non-zero before trying to get the array */
7076 assert(*_get_invlist_len_addr(invlist));
7077 assert(*get_invlist_zero_addr(invlist) == 0
7078 || *get_invlist_zero_addr(invlist) == 1);
7080 /* The array begins either at the element reserved for zero if the
7081 * list contains 0 (that element will be set to 0), or otherwise the next
7082 * element (in which case the reserved element will be set to 1). */
7083 return (UV *) (get_invlist_zero_addr(invlist)
7084 + *get_invlist_zero_addr(invlist));
7087 PERL_STATIC_INLINE void
7088 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7090 /* Sets the current number of elements stored in the inversion list */
7092 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7094 *_get_invlist_len_addr(invlist) = len;
7096 assert(len <= SvLEN(invlist));
7098 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7099 /* If the list contains U+0000, that element is part of the header,
7100 * and should not be counted as part of the array. It will contain
7101 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7103 * SvCUR_set(invlist,
7104 * TO_INTERNAL_SIZE(len
7105 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7106 * But, this is only valid if len is not 0. The consequences of not doing
7107 * this is that the memory allocation code may think that 1 more UV is
7108 * being used than actually is, and so might do an unnecessary grow. That
7109 * seems worth not bothering to make this the precise amount.
7111 * Note that when inverting, SvCUR shouldn't change */
7114 PERL_STATIC_INLINE IV*
7115 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7117 /* Return the address of the UV that is reserved to hold the cached index
7120 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7122 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7125 PERL_STATIC_INLINE IV
7126 S_invlist_previous_index(pTHX_ SV* const invlist)
7128 /* Returns cached index of previous search */
7130 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7132 return *get_invlist_previous_index_addr(invlist);
7135 PERL_STATIC_INLINE void
7136 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7138 /* Caches <index> for later retrieval */
7140 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7142 assert(index == 0 || index < (int) _invlist_len(invlist));
7144 *get_invlist_previous_index_addr(invlist) = index;
7147 PERL_STATIC_INLINE UV
7148 S_invlist_max(pTHX_ SV* const invlist)
7150 /* Returns the maximum number of elements storable in the inversion list's
7151 * array, without having to realloc() */
7153 PERL_ARGS_ASSERT_INVLIST_MAX;
7155 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7158 PERL_STATIC_INLINE UV*
7159 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7161 /* Return the address of the UV that is reserved to hold 0 if the inversion
7162 * list contains 0. This has to be the last element of the heading, as the
7163 * list proper starts with either it if 0, or the next element if not.
7164 * (But we force it to contain either 0 or 1) */
7166 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7168 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7171 #ifndef PERL_IN_XSUB_RE
7173 Perl__new_invlist(pTHX_ IV initial_size)
7176 /* Return a pointer to a newly constructed inversion list, with enough
7177 * space to store 'initial_size' elements. If that number is negative, a
7178 * system default is used instead */
7182 if (initial_size < 0) {
7183 initial_size = INVLIST_INITIAL_LEN;
7186 /* Allocate the initial space */
7187 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7188 invlist_set_len(new_list, 0);
7190 /* Force iterinit() to be used to get iteration to work */
7191 *get_invlist_iter_addr(new_list) = UV_MAX;
7193 /* This should force a segfault if a method doesn't initialize this
7195 *get_invlist_zero_addr(new_list) = UV_MAX;
7197 *get_invlist_previous_index_addr(new_list) = 0;
7198 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7199 #if HEADER_LENGTH != 5
7200 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7208 S__new_invlist_C_array(pTHX_ UV* list)
7210 /* Return a pointer to a newly constructed inversion list, initialized to
7211 * point to <list>, which has to be in the exact correct inversion list
7212 * form, including internal fields. Thus this is a dangerous routine that
7213 * should not be used in the wrong hands */
7215 SV* invlist = newSV_type(SVt_PV);
7217 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7219 SvPV_set(invlist, (char *) list);
7220 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7221 shouldn't touch it */
7222 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7224 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7225 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7232 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7234 /* Grow the maximum size of an inversion list */
7236 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7238 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7241 PERL_STATIC_INLINE void
7242 S_invlist_trim(pTHX_ SV* const invlist)
7244 PERL_ARGS_ASSERT_INVLIST_TRIM;
7246 /* Change the length of the inversion list to how many entries it currently
7249 SvPV_shrink_to_cur((SV *) invlist);
7252 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7255 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7257 /* Subject to change or removal. Append the range from 'start' to 'end' at
7258 * the end of the inversion list. The range must be above any existing
7262 UV max = invlist_max(invlist);
7263 UV len = _invlist_len(invlist);
7265 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7267 if (len == 0) { /* Empty lists must be initialized */
7268 array = _invlist_array_init(invlist, start == 0);
7271 /* Here, the existing list is non-empty. The current max entry in the
7272 * list is generally the first value not in the set, except when the
7273 * set extends to the end of permissible values, in which case it is
7274 * the first entry in that final set, and so this call is an attempt to
7275 * append out-of-order */
7277 UV final_element = len - 1;
7278 array = invlist_array(invlist);
7279 if (array[final_element] > start
7280 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7282 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7283 array[final_element], start,
7284 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7287 /* Here, it is a legal append. If the new range begins with the first
7288 * value not in the set, it is extending the set, so the new first
7289 * value not in the set is one greater than the newly extended range.
7291 if (array[final_element] == start) {
7292 if (end != UV_MAX) {
7293 array[final_element] = end + 1;
7296 /* But if the end is the maximum representable on the machine,
7297 * just let the range that this would extend to have no end */
7298 invlist_set_len(invlist, len - 1);
7304 /* Here the new range doesn't extend any existing set. Add it */
7306 len += 2; /* Includes an element each for the start and end of range */
7308 /* If overflows the existing space, extend, which may cause the array to be
7311 invlist_extend(invlist, len);
7312 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7313 failure in invlist_array() */
7314 array = invlist_array(invlist);
7317 invlist_set_len(invlist, len);
7320 /* The next item on the list starts the range, the one after that is
7321 * one past the new range. */
7322 array[len - 2] = start;
7323 if (end != UV_MAX) {
7324 array[len - 1] = end + 1;
7327 /* But if the end is the maximum representable on the machine, just let
7328 * the range have no end */
7329 invlist_set_len(invlist, len - 1);
7333 #ifndef PERL_IN_XSUB_RE
7336 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7338 /* Searches the inversion list for the entry that contains the input code
7339 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7340 * return value is the index into the list's array of the range that
7345 IV high = _invlist_len(invlist);
7346 const IV highest_element = high - 1;
7349 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7351 /* If list is empty, return failure. */
7356 /* If the code point is before the first element, return failure. (We
7357 * can't combine this with the test above, because we can't get the array
7358 * unless we know the list is non-empty) */
7359 array = invlist_array(invlist);
7361 mid = invlist_previous_index(invlist);
7362 assert(mid >=0 && mid <= highest_element);
7364 /* <mid> contains the cache of the result of the previous call to this
7365 * function (0 the first time). See if this call is for the same result,
7366 * or if it is for mid-1. This is under the theory that calls to this
7367 * function will often be for related code points that are near each other.
7368 * And benchmarks show that caching gives better results. We also test
7369 * here if the code point is within the bounds of the list. These tests
7370 * replace others that would have had to be made anyway to make sure that
7371 * the array bounds were not exceeded, and give us extra information at the
7373 if (cp >= array[mid]) {
7374 if (cp >= array[highest_element]) {
7375 return highest_element;
7378 /* Here, array[mid] <= cp < array[highest_element]. This means that
7379 * the final element is not the answer, so can exclude it; it also
7380 * means that <mid> is not the final element, so can refer to 'mid + 1'
7382 if (cp < array[mid + 1]) {
7388 else { /* cp < aray[mid] */
7389 if (cp < array[0]) { /* Fail if outside the array */
7393 if (cp >= array[mid - 1]) {
7398 /* Binary search. What we are looking for is <i> such that
7399 * array[i] <= cp < array[i+1]
7400 * The loop below converges on the i+1. Note that there may not be an
7401 * (i+1)th element in the array, and things work nonetheless */
7402 while (low < high) {
7403 mid = (low + high) / 2;
7404 assert(mid <= highest_element);
7405 if (array[mid] <= cp) { /* cp >= array[mid] */
7408 /* We could do this extra test to exit the loop early.
7409 if (cp < array[low]) {
7414 else { /* cp < array[mid] */
7421 invlist_set_previous_index(invlist, high);
7426 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7428 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7429 * but is used when the swash has an inversion list. This makes this much
7430 * faster, as it uses a binary search instead of a linear one. This is
7431 * intimately tied to that function, and perhaps should be in utf8.c,
7432 * except it is intimately tied to inversion lists as well. It assumes
7433 * that <swatch> is all 0's on input */
7436 const IV len = _invlist_len(invlist);
7440 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7442 if (len == 0) { /* Empty inversion list */
7446 array = invlist_array(invlist);
7448 /* Find which element it is */
7449 i = _invlist_search(invlist, start);
7451 /* We populate from <start> to <end> */
7452 while (current < end) {
7455 /* The inversion list gives the results for every possible code point
7456 * after the first one in the list. Only those ranges whose index is
7457 * even are ones that the inversion list matches. For the odd ones,
7458 * and if the initial code point is not in the list, we have to skip
7459 * forward to the next element */
7460 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7462 if (i >= len) { /* Finished if beyond the end of the array */
7466 if (current >= end) { /* Finished if beyond the end of what we
7468 if (LIKELY(end < UV_MAX)) {
7472 /* We get here when the upper bound is the maximum
7473 * representable on the machine, and we are looking for just
7474 * that code point. Have to special case it */
7476 goto join_end_of_list;
7479 assert(current >= start);
7481 /* The current range ends one below the next one, except don't go past
7484 upper = (i < len && array[i] < end) ? array[i] : end;
7486 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7487 * for each code point in it */
7488 for (; current < upper; current++) {
7489 const STRLEN offset = (STRLEN)(current - start);
7490 swatch[offset >> 3] |= 1 << (offset & 7);
7495 /* Quit if at the end of the list */
7498 /* But first, have to deal with the highest possible code point on
7499 * the platform. The previous code assumes that <end> is one
7500 * beyond where we want to populate, but that is impossible at the
7501 * platform's infinity, so have to handle it specially */
7502 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7504 const STRLEN offset = (STRLEN)(end - start);
7505 swatch[offset >> 3] |= 1 << (offset & 7);
7510 /* Advance to the next range, which will be for code points not in the
7519 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7521 /* Take the union of two inversion lists and point <output> to it. *output
7522 * should be defined upon input, and if it points to one of the two lists,
7523 * the reference count to that list will be decremented. The first list,
7524 * <a>, may be NULL, in which case a copy of the second list is returned.
7525 * If <complement_b> is TRUE, the union is taken of the complement
7526 * (inversion) of <b> instead of b itself.
7528 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7529 * Richard Gillam, published by Addison-Wesley, and explained at some
7530 * length there. The preface says to incorporate its examples into your
7531 * code at your own risk.
7533 * The algorithm is like a merge sort.
7535 * XXX A potential performance improvement is to keep track as we go along
7536 * if only one of the inputs contributes to the result, meaning the other
7537 * is a subset of that one. In that case, we can skip the final copy and
7538 * return the larger of the input lists, but then outside code might need
7539 * to keep track of whether to free the input list or not */
7541 UV* array_a; /* a's array */
7543 UV len_a; /* length of a's array */
7546 SV* u; /* the resulting union */
7550 UV i_a = 0; /* current index into a's array */
7554 /* running count, as explained in the algorithm source book; items are
7555 * stopped accumulating and are output when the count changes to/from 0.
7556 * The count is incremented when we start a range that's in the set, and
7557 * decremented when we start a range that's not in the set. So its range
7558 * is 0 to 2. Only when the count is zero is something not in the set.
7562 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7565 /* If either one is empty, the union is the other one */
7566 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7573 *output = invlist_clone(b);
7575 _invlist_invert(*output);
7577 } /* else *output already = b; */
7580 else if ((len_b = _invlist_len(b)) == 0) {
7585 /* The complement of an empty list is a list that has everything in it,
7586 * so the union with <a> includes everything too */
7591 *output = _new_invlist(1);
7592 _append_range_to_invlist(*output, 0, UV_MAX);
7594 else if (*output != a) {
7595 *output = invlist_clone(a);
7597 /* else *output already = a; */
7601 /* Here both lists exist and are non-empty */
7602 array_a = invlist_array(a);
7603 array_b = invlist_array(b);
7605 /* If are to take the union of 'a' with the complement of b, set it
7606 * up so are looking at b's complement. */
7609 /* To complement, we invert: if the first element is 0, remove it. To
7610 * do this, we just pretend the array starts one later, and clear the
7611 * flag as we don't have to do anything else later */
7612 if (array_b[0] == 0) {
7615 complement_b = FALSE;
7619 /* But if the first element is not zero, we unshift a 0 before the
7620 * array. The data structure reserves a space for that 0 (which
7621 * should be a '1' right now), so physical shifting is unneeded,
7622 * but temporarily change that element to 0. Before exiting the
7623 * routine, we must restore the element to '1' */
7630 /* Size the union for the worst case: that the sets are completely
7632 u = _new_invlist(len_a + len_b);
7634 /* Will contain U+0000 if either component does */
7635 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7636 || (len_b > 0 && array_b[0] == 0));
7638 /* Go through each list item by item, stopping when exhausted one of
7640 while (i_a < len_a && i_b < len_b) {
7641 UV cp; /* The element to potentially add to the union's array */
7642 bool cp_in_set; /* is it in the the input list's set or not */
7644 /* We need to take one or the other of the two inputs for the union.
7645 * Since we are merging two sorted lists, we take the smaller of the
7646 * next items. In case of a tie, we take the one that is in its set
7647 * first. If we took one not in the set first, it would decrement the
7648 * count, possibly to 0 which would cause it to be output as ending the
7649 * range, and the next time through we would take the same number, and
7650 * output it again as beginning the next range. By doing it the
7651 * opposite way, there is no possibility that the count will be
7652 * momentarily decremented to 0, and thus the two adjoining ranges will
7653 * be seamlessly merged. (In a tie and both are in the set or both not
7654 * in the set, it doesn't matter which we take first.) */
7655 if (array_a[i_a] < array_b[i_b]
7656 || (array_a[i_a] == array_b[i_b]
7657 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7659 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7663 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7667 /* Here, have chosen which of the two inputs to look at. Only output
7668 * if the running count changes to/from 0, which marks the
7669 * beginning/end of a range in that's in the set */
7672 array_u[i_u++] = cp;
7679 array_u[i_u++] = cp;
7684 /* Here, we are finished going through at least one of the lists, which
7685 * means there is something remaining in at most one. We check if the list
7686 * that hasn't been exhausted is positioned such that we are in the middle
7687 * of a range in its set or not. (i_a and i_b point to the element beyond
7688 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7689 * is potentially more to output.
7690 * There are four cases:
7691 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7692 * in the union is entirely from the non-exhausted set.
7693 * 2) Both were in their sets, count is 2. Nothing further should
7694 * be output, as everything that remains will be in the exhausted
7695 * list's set, hence in the union; decrementing to 1 but not 0 insures
7697 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7698 * Nothing further should be output because the union includes
7699 * everything from the exhausted set. Not decrementing ensures that.
7700 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7701 * decrementing to 0 insures that we look at the remainder of the
7702 * non-exhausted set */
7703 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7704 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7709 /* The final length is what we've output so far, plus what else is about to
7710 * be output. (If 'count' is non-zero, then the input list we exhausted
7711 * has everything remaining up to the machine's limit in its set, and hence
7712 * in the union, so there will be no further output. */
7715 /* At most one of the subexpressions will be non-zero */
7716 len_u += (len_a - i_a) + (len_b - i_b);
7719 /* Set result to final length, which can change the pointer to array_u, so
7721 if (len_u != _invlist_len(u)) {
7722 invlist_set_len(u, len_u);
7724 array_u = invlist_array(u);
7727 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7728 * the other) ended with everything above it not in its set. That means
7729 * that the remaining part of the union is precisely the same as the
7730 * non-exhausted list, so can just copy it unchanged. (If both list were
7731 * exhausted at the same time, then the operations below will be both 0.)
7734 IV copy_count; /* At most one will have a non-zero copy count */
7735 if ((copy_count = len_a - i_a) > 0) {
7736 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7738 else if ((copy_count = len_b - i_b) > 0) {
7739 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7743 /* We may be removing a reference to one of the inputs */
7744 if (a == *output || b == *output) {
7745 SvREFCNT_dec(*output);
7748 /* If we've changed b, restore it */
7758 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7760 /* Take the intersection of two inversion lists and point <i> to it. *i
7761 * should be defined upon input, and if it points to one of the two lists,
7762 * the reference count to that list will be decremented.
7763 * If <complement_b> is TRUE, the result will be the intersection of <a>
7764 * and the complement (or inversion) of <b> instead of <b> directly.
7766 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7767 * Richard Gillam, published by Addison-Wesley, and explained at some
7768 * length there. The preface says to incorporate its examples into your
7769 * code at your own risk. In fact, it had bugs
7771 * The algorithm is like a merge sort, and is essentially the same as the
7775 UV* array_a; /* a's array */
7777 UV len_a; /* length of a's array */
7780 SV* r; /* the resulting intersection */
7784 UV i_a = 0; /* current index into a's array */
7788 /* running count, as explained in the algorithm source book; items are
7789 * stopped accumulating and are output when the count changes to/from 2.
7790 * The count is incremented when we start a range that's in the set, and
7791 * decremented when we start a range that's not in the set. So its range
7792 * is 0 to 2. Only when the count is 2 is something in the intersection.
7796 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7799 /* Special case if either one is empty */
7800 len_a = _invlist_len(a);
7801 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7803 if (len_a != 0 && complement_b) {
7805 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7806 * be empty. Here, also we are using 'b's complement, which hence
7807 * must be every possible code point. Thus the intersection is
7810 *i = invlist_clone(a);
7816 /* else *i is already 'a' */
7820 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7821 * intersection must be empty */
7828 *i = _new_invlist(0);
7832 /* Here both lists exist and are non-empty */
7833 array_a = invlist_array(a);
7834 array_b = invlist_array(b);
7836 /* If are to take the intersection of 'a' with the complement of b, set it
7837 * up so are looking at b's complement. */
7840 /* To complement, we invert: if the first element is 0, remove it. To
7841 * do this, we just pretend the array starts one later, and clear the
7842 * flag as we don't have to do anything else later */
7843 if (array_b[0] == 0) {
7846 complement_b = FALSE;
7850 /* But if the first element is not zero, we unshift a 0 before the
7851 * array. The data structure reserves a space for that 0 (which
7852 * should be a '1' right now), so physical shifting is unneeded,
7853 * but temporarily change that element to 0. Before exiting the
7854 * routine, we must restore the element to '1' */
7861 /* Size the intersection for the worst case: that the intersection ends up
7862 * fragmenting everything to be completely disjoint */
7863 r= _new_invlist(len_a + len_b);
7865 /* Will contain U+0000 iff both components do */
7866 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7867 && len_b > 0 && array_b[0] == 0);
7869 /* Go through each list item by item, stopping when exhausted one of
7871 while (i_a < len_a && i_b < len_b) {
7872 UV cp; /* The element to potentially add to the intersection's
7874 bool cp_in_set; /* Is it in the input list's set or not */
7876 /* We need to take one or the other of the two inputs for the
7877 * intersection. Since we are merging two sorted lists, we take the
7878 * smaller of the next items. In case of a tie, we take the one that
7879 * is not in its set first (a difference from the union algorithm). If
7880 * we took one in the set first, it would increment the count, possibly
7881 * to 2 which would cause it to be output as starting a range in the
7882 * intersection, and the next time through we would take that same
7883 * number, and output it again as ending the set. By doing it the
7884 * opposite of this, there is no possibility that the count will be
7885 * momentarily incremented to 2. (In a tie and both are in the set or
7886 * both not in the set, it doesn't matter which we take first.) */
7887 if (array_a[i_a] < array_b[i_b]
7888 || (array_a[i_a] == array_b[i_b]
7889 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7891 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7895 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7899 /* Here, have chosen which of the two inputs to look at. Only output
7900 * if the running count changes to/from 2, which marks the
7901 * beginning/end of a range that's in the intersection */
7905 array_r[i_r++] = cp;
7910 array_r[i_r++] = cp;
7916 /* Here, we are finished going through at least one of the lists, which
7917 * means there is something remaining in at most one. We check if the list
7918 * that has been exhausted is positioned such that we are in the middle
7919 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7920 * the ones we care about.) There are four cases:
7921 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7922 * nothing left in the intersection.
7923 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7924 * above 2. What should be output is exactly that which is in the
7925 * non-exhausted set, as everything it has is also in the intersection
7926 * set, and everything it doesn't have can't be in the intersection
7927 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7928 * gets incremented to 2. Like the previous case, the intersection is
7929 * everything that remains in the non-exhausted set.
7930 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7931 * remains 1. And the intersection has nothing more. */
7932 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7933 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7938 /* The final length is what we've output so far plus what else is in the
7939 * intersection. At most one of the subexpressions below will be non-zero */
7942 len_r += (len_a - i_a) + (len_b - i_b);
7945 /* Set result to final length, which can change the pointer to array_r, so
7947 if (len_r != _invlist_len(r)) {
7948 invlist_set_len(r, len_r);
7950 array_r = invlist_array(r);
7953 /* Finish outputting any remaining */
7954 if (count >= 2) { /* At most one will have a non-zero copy count */
7956 if ((copy_count = len_a - i_a) > 0) {
7957 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7959 else if ((copy_count = len_b - i_b) > 0) {
7960 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7964 /* We may be removing a reference to one of the inputs */
7965 if (a == *i || b == *i) {
7969 /* If we've changed b, restore it */
7979 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7981 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7982 * set. A pointer to the inversion list is returned. This may actually be
7983 * a new list, in which case the passed in one has been destroyed. The
7984 * passed in inversion list can be NULL, in which case a new one is created
7985 * with just the one range in it */
7990 if (invlist == NULL) {
7991 invlist = _new_invlist(2);
7995 len = _invlist_len(invlist);
7998 /* If comes after the final entry, can just append it to the end */
8000 || start >= invlist_array(invlist)
8001 [_invlist_len(invlist) - 1])
8003 _append_range_to_invlist(invlist, start, end);
8007 /* Here, can't just append things, create and return a new inversion list
8008 * which is the union of this range and the existing inversion list */
8009 range_invlist = _new_invlist(2);
8010 _append_range_to_invlist(range_invlist, start, end);
8012 _invlist_union(invlist, range_invlist, &invlist);
8014 /* The temporary can be freed */
8015 SvREFCNT_dec(range_invlist);
8022 PERL_STATIC_INLINE SV*
8023 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8024 return _add_range_to_invlist(invlist, cp, cp);
8027 #ifndef PERL_IN_XSUB_RE
8029 Perl__invlist_invert(pTHX_ SV* const invlist)
8031 /* Complement the input inversion list. This adds a 0 if the list didn't
8032 * have a zero; removes it otherwise. As described above, the data
8033 * structure is set up so that this is very efficient */
8035 UV* len_pos = _get_invlist_len_addr(invlist);
8037 PERL_ARGS_ASSERT__INVLIST_INVERT;
8039 /* The inverse of matching nothing is matching everything */
8040 if (*len_pos == 0) {
8041 _append_range_to_invlist(invlist, 0, UV_MAX);
8045 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8046 * zero element was a 0, so it is being removed, so the length decrements
8047 * by 1; and vice-versa. SvCUR is unaffected */
8048 if (*get_invlist_zero_addr(invlist) ^= 1) {
8057 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8059 /* Complement the input inversion list (which must be a Unicode property,
8060 * all of which don't match above the Unicode maximum code point.) And
8061 * Perl has chosen to not have the inversion match above that either. This
8062 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8068 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8070 _invlist_invert(invlist);
8072 len = _invlist_len(invlist);
8074 if (len != 0) { /* If empty do nothing */
8075 array = invlist_array(invlist);
8076 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8077 /* Add 0x110000. First, grow if necessary */
8079 if (invlist_max(invlist) < len) {
8080 invlist_extend(invlist, len);
8081 array = invlist_array(invlist);
8083 invlist_set_len(invlist, len);
8084 array[len - 1] = PERL_UNICODE_MAX + 1;
8086 else { /* Remove the 0x110000 */
8087 invlist_set_len(invlist, len - 1);
8095 PERL_STATIC_INLINE SV*
8096 S_invlist_clone(pTHX_ SV* const invlist)
8099 /* Return a new inversion list that is a copy of the input one, which is
8102 /* Need to allocate extra space to accommodate Perl's addition of a
8103 * trailing NUL to SvPV's, since it thinks they are always strings */
8104 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8105 STRLEN length = SvCUR(invlist);
8107 PERL_ARGS_ASSERT_INVLIST_CLONE;
8109 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8110 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8115 PERL_STATIC_INLINE UV*
8116 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8118 /* Return the address of the UV that contains the current iteration
8121 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8123 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8126 PERL_STATIC_INLINE UV*
8127 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8129 /* Return the address of the UV that contains the version id. */
8131 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8133 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8136 PERL_STATIC_INLINE void
8137 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8139 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8141 *get_invlist_iter_addr(invlist) = 0;
8145 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8147 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8148 * This call sets in <*start> and <*end>, the next range in <invlist>.
8149 * Returns <TRUE> if successful and the next call will return the next
8150 * range; <FALSE> if was already at the end of the list. If the latter,
8151 * <*start> and <*end> are unchanged, and the next call to this function
8152 * will start over at the beginning of the list */
8154 UV* pos = get_invlist_iter_addr(invlist);
8155 UV len = _invlist_len(invlist);
8158 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8161 *pos = UV_MAX; /* Force iternit() to be required next time */
8165 array = invlist_array(invlist);
8167 *start = array[(*pos)++];
8173 *end = array[(*pos)++] - 1;
8179 PERL_STATIC_INLINE UV
8180 S_invlist_highest(pTHX_ SV* const invlist)
8182 /* Returns the highest code point that matches an inversion list. This API
8183 * has an ambiguity, as it returns 0 under either the highest is actually
8184 * 0, or if the list is empty. If this distinction matters to you, check
8185 * for emptiness before calling this function */
8187 UV len = _invlist_len(invlist);
8190 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8196 array = invlist_array(invlist);
8198 /* The last element in the array in the inversion list always starts a
8199 * range that goes to infinity. That range may be for code points that are
8200 * matched in the inversion list, or it may be for ones that aren't
8201 * matched. In the latter case, the highest code point in the set is one
8202 * less than the beginning of this range; otherwise it is the final element
8203 * of this range: infinity */
8204 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8206 : array[len - 1] - 1;
8209 #ifndef PERL_IN_XSUB_RE
8211 Perl__invlist_contents(pTHX_ SV* const invlist)
8213 /* Get the contents of an inversion list into a string SV so that they can
8214 * be printed out. It uses the format traditionally done for debug tracing
8218 SV* output = newSVpvs("\n");
8220 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8222 invlist_iterinit(invlist);
8223 while (invlist_iternext(invlist, &start, &end)) {
8224 if (end == UV_MAX) {
8225 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8227 else if (end != start) {
8228 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8232 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8242 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8244 /* Dumps out the ranges in an inversion list. The string 'header'
8245 * if present is output on a line before the first range */
8249 if (header && strlen(header)) {
8250 PerlIO_printf(Perl_debug_log, "%s\n", header);
8252 invlist_iterinit(invlist);
8253 while (invlist_iternext(invlist, &start, &end)) {
8254 if (end == UV_MAX) {
8255 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8258 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8266 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8268 /* Return a boolean as to if the two passed in inversion lists are
8269 * identical. The final argument, if TRUE, says to take the complement of
8270 * the second inversion list before doing the comparison */
8272 UV* array_a = invlist_array(a);
8273 UV* array_b = invlist_array(b);
8274 UV len_a = _invlist_len(a);
8275 UV len_b = _invlist_len(b);
8277 UV i = 0; /* current index into the arrays */
8278 bool retval = TRUE; /* Assume are identical until proven otherwise */
8280 PERL_ARGS_ASSERT__INVLISTEQ;
8282 /* If are to compare 'a' with the complement of b, set it
8283 * up so are looking at b's complement. */
8286 /* The complement of nothing is everything, so <a> would have to have
8287 * just one element, starting at zero (ending at infinity) */
8289 return (len_a == 1 && array_a[0] == 0);
8291 else if (array_b[0] == 0) {
8293 /* Otherwise, to complement, we invert. Here, the first element is
8294 * 0, just remove it. To do this, we just pretend the array starts
8295 * one later, and clear the flag as we don't have to do anything
8300 complement_b = FALSE;
8304 /* But if the first element is not zero, we unshift a 0 before the
8305 * array. The data structure reserves a space for that 0 (which
8306 * should be a '1' right now), so physical shifting is unneeded,
8307 * but temporarily change that element to 0. Before exiting the
8308 * routine, we must restore the element to '1' */
8315 /* Make sure that the lengths are the same, as well as the final element
8316 * before looping through the remainder. (Thus we test the length, final,
8317 * and first elements right off the bat) */
8318 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8321 else for (i = 0; i < len_a - 1; i++) {
8322 if (array_a[i] != array_b[i]) {
8335 #undef HEADER_LENGTH
8336 #undef INVLIST_INITIAL_LENGTH
8337 #undef TO_INTERNAL_SIZE
8338 #undef FROM_INTERNAL_SIZE
8339 #undef INVLIST_LEN_OFFSET
8340 #undef INVLIST_ZERO_OFFSET
8341 #undef INVLIST_ITER_OFFSET
8342 #undef INVLIST_VERSION_ID
8344 /* End of inversion list object */
8347 - reg - regular expression, i.e. main body or parenthesized thing
8349 * Caller must absorb opening parenthesis.
8351 * Combining parenthesis handling with the base level of regular expression
8352 * is a trifle forced, but the need to tie the tails of the branches to what
8353 * follows makes it hard to avoid.
8355 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8357 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8359 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8363 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8364 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8367 regnode *ret; /* Will be the head of the group. */
8370 regnode *ender = NULL;
8373 U32 oregflags = RExC_flags;
8374 bool have_branch = 0;
8376 I32 freeze_paren = 0;
8377 I32 after_freeze = 0;
8379 /* for (?g), (?gc), and (?o) warnings; warning
8380 about (?c) will warn about (?g) -- japhy */
8382 #define WASTED_O 0x01
8383 #define WASTED_G 0x02
8384 #define WASTED_C 0x04
8385 #define WASTED_GC (0x02|0x04)
8386 I32 wastedflags = 0x00;
8388 char * parse_start = RExC_parse; /* MJD */
8389 char * const oregcomp_parse = RExC_parse;
8391 GET_RE_DEBUG_FLAGS_DECL;
8393 PERL_ARGS_ASSERT_REG;
8394 DEBUG_PARSE("reg ");
8396 *flagp = 0; /* Tentatively. */
8399 /* Make an OPEN node, if parenthesized. */
8401 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8402 char *start_verb = RExC_parse;
8403 STRLEN verb_len = 0;
8404 char *start_arg = NULL;
8405 unsigned char op = 0;
8407 int internal_argval = 0; /* internal_argval is only useful if !argok */
8408 while ( *RExC_parse && *RExC_parse != ')' ) {
8409 if ( *RExC_parse == ':' ) {
8410 start_arg = RExC_parse + 1;
8416 verb_len = RExC_parse - start_verb;
8419 while ( *RExC_parse && *RExC_parse != ')' )
8421 if ( *RExC_parse != ')' )
8422 vFAIL("Unterminated verb pattern argument");
8423 if ( RExC_parse == start_arg )
8426 if ( *RExC_parse != ')' )
8427 vFAIL("Unterminated verb pattern");
8430 switch ( *start_verb ) {
8431 case 'A': /* (*ACCEPT) */
8432 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8434 internal_argval = RExC_nestroot;
8437 case 'C': /* (*COMMIT) */
8438 if ( memEQs(start_verb,verb_len,"COMMIT") )
8441 case 'F': /* (*FAIL) */
8442 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8447 case ':': /* (*:NAME) */
8448 case 'M': /* (*MARK:NAME) */
8449 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8454 case 'P': /* (*PRUNE) */
8455 if ( memEQs(start_verb,verb_len,"PRUNE") )
8458 case 'S': /* (*SKIP) */
8459 if ( memEQs(start_verb,verb_len,"SKIP") )
8462 case 'T': /* (*THEN) */
8463 /* [19:06] <TimToady> :: is then */
8464 if ( memEQs(start_verb,verb_len,"THEN") ) {
8466 RExC_seen |= REG_SEEN_CUTGROUP;
8472 vFAIL3("Unknown verb pattern '%.*s'",
8473 verb_len, start_verb);
8476 if ( start_arg && internal_argval ) {
8477 vFAIL3("Verb pattern '%.*s' may not have an argument",
8478 verb_len, start_verb);
8479 } else if ( argok < 0 && !start_arg ) {
8480 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8481 verb_len, start_verb);
8483 ret = reganode(pRExC_state, op, internal_argval);
8484 if ( ! internal_argval && ! SIZE_ONLY ) {
8486 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8487 ARG(ret) = add_data( pRExC_state, 1, "S" );
8488 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8495 if (!internal_argval)
8496 RExC_seen |= REG_SEEN_VERBARG;
8497 } else if ( start_arg ) {
8498 vFAIL3("Verb pattern '%.*s' may not have an argument",
8499 verb_len, start_verb);
8501 ret = reg_node(pRExC_state, op);
8503 nextchar(pRExC_state);
8506 if (*RExC_parse == '?') { /* (?...) */
8507 bool is_logical = 0;
8508 const char * const seqstart = RExC_parse;
8509 bool has_use_defaults = FALSE;
8512 paren = *RExC_parse++;
8513 ret = NULL; /* For look-ahead/behind. */
8516 case 'P': /* (?P...) variants for those used to PCRE/Python */
8517 paren = *RExC_parse++;
8518 if ( paren == '<') /* (?P<...>) named capture */
8520 else if (paren == '>') { /* (?P>name) named recursion */
8521 goto named_recursion;
8523 else if (paren == '=') { /* (?P=...) named backref */
8524 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8525 you change this make sure you change that */
8526 char* name_start = RExC_parse;
8528 SV *sv_dat = reg_scan_name(pRExC_state,
8529 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8530 if (RExC_parse == name_start || *RExC_parse != ')')
8531 vFAIL2("Sequence %.3s... not terminated",parse_start);
8534 num = add_data( pRExC_state, 1, "S" );
8535 RExC_rxi->data->data[num]=(void*)sv_dat;
8536 SvREFCNT_inc_simple_void(sv_dat);
8539 ret = reganode(pRExC_state,
8542 : (ASCII_FOLD_RESTRICTED)
8544 : (AT_LEAST_UNI_SEMANTICS)
8552 Set_Node_Offset(ret, parse_start+1);
8553 Set_Node_Cur_Length(ret); /* MJD */
8555 nextchar(pRExC_state);
8559 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8561 case '<': /* (?<...) */
8562 if (*RExC_parse == '!')
8564 else if (*RExC_parse != '=')
8570 case '\'': /* (?'...') */
8571 name_start= RExC_parse;
8572 svname = reg_scan_name(pRExC_state,
8573 SIZE_ONLY ? /* reverse test from the others */
8574 REG_RSN_RETURN_NAME :
8575 REG_RSN_RETURN_NULL);
8576 if (RExC_parse == name_start) {
8578 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8581 if (*RExC_parse != paren)
8582 vFAIL2("Sequence (?%c... not terminated",
8583 paren=='>' ? '<' : paren);
8587 if (!svname) /* shouldn't happen */
8589 "panic: reg_scan_name returned NULL");
8590 if (!RExC_paren_names) {
8591 RExC_paren_names= newHV();
8592 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8594 RExC_paren_name_list= newAV();
8595 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8598 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8600 sv_dat = HeVAL(he_str);
8602 /* croak baby croak */
8604 "panic: paren_name hash element allocation failed");
8605 } else if ( SvPOK(sv_dat) ) {
8606 /* (?|...) can mean we have dupes so scan to check
8607 its already been stored. Maybe a flag indicating
8608 we are inside such a construct would be useful,
8609 but the arrays are likely to be quite small, so
8610 for now we punt -- dmq */
8611 IV count = SvIV(sv_dat);
8612 I32 *pv = (I32*)SvPVX(sv_dat);
8614 for ( i = 0 ; i < count ; i++ ) {
8615 if ( pv[i] == RExC_npar ) {
8621 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8622 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8623 pv[count] = RExC_npar;
8624 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8627 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8628 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8630 SvIV_set(sv_dat, 1);
8633 /* Yes this does cause a memory leak in debugging Perls */
8634 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8635 SvREFCNT_dec(svname);
8638 /*sv_dump(sv_dat);*/
8640 nextchar(pRExC_state);
8642 goto capturing_parens;
8644 RExC_seen |= REG_SEEN_LOOKBEHIND;
8645 RExC_in_lookbehind++;
8647 case '=': /* (?=...) */
8648 RExC_seen_zerolen++;
8650 case '!': /* (?!...) */
8651 RExC_seen_zerolen++;
8652 if (*RExC_parse == ')') {
8653 ret=reg_node(pRExC_state, OPFAIL);
8654 nextchar(pRExC_state);
8658 case '|': /* (?|...) */
8659 /* branch reset, behave like a (?:...) except that
8660 buffers in alternations share the same numbers */
8662 after_freeze = freeze_paren = RExC_npar;
8664 case ':': /* (?:...) */
8665 case '>': /* (?>...) */
8667 case '$': /* (?$...) */
8668 case '@': /* (?@...) */
8669 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8671 case '#': /* (?#...) */
8672 while (*RExC_parse && *RExC_parse != ')')
8674 if (*RExC_parse != ')')
8675 FAIL("Sequence (?#... not terminated");
8676 nextchar(pRExC_state);
8679 case '0' : /* (?0) */
8680 case 'R' : /* (?R) */
8681 if (*RExC_parse != ')')
8682 FAIL("Sequence (?R) not terminated");
8683 ret = reg_node(pRExC_state, GOSTART);
8684 *flagp |= POSTPONED;
8685 nextchar(pRExC_state);
8688 { /* named and numeric backreferences */
8690 case '&': /* (?&NAME) */
8691 parse_start = RExC_parse - 1;
8694 SV *sv_dat = reg_scan_name(pRExC_state,
8695 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8696 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8698 goto gen_recurse_regop;
8699 assert(0); /* NOT REACHED */
8701 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8703 vFAIL("Illegal pattern");
8705 goto parse_recursion;
8707 case '-': /* (?-1) */
8708 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8709 RExC_parse--; /* rewind to let it be handled later */
8713 case '1': case '2': case '3': case '4': /* (?1) */
8714 case '5': case '6': case '7': case '8': case '9':
8717 num = atoi(RExC_parse);
8718 parse_start = RExC_parse - 1; /* MJD */
8719 if (*RExC_parse == '-')
8721 while (isDIGIT(*RExC_parse))
8723 if (*RExC_parse!=')')
8724 vFAIL("Expecting close bracket");
8727 if ( paren == '-' ) {
8729 Diagram of capture buffer numbering.
8730 Top line is the normal capture buffer numbers
8731 Bottom line is the negative indexing as from
8735 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8739 num = RExC_npar + num;
8742 vFAIL("Reference to nonexistent group");
8744 } else if ( paren == '+' ) {
8745 num = RExC_npar + num - 1;
8748 ret = reganode(pRExC_state, GOSUB, num);
8750 if (num > (I32)RExC_rx->nparens) {
8752 vFAIL("Reference to nonexistent group");
8754 ARG2L_SET( ret, RExC_recurse_count++);
8756 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8757 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8761 RExC_seen |= REG_SEEN_RECURSE;
8762 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8763 Set_Node_Offset(ret, parse_start); /* MJD */
8765 *flagp |= POSTPONED;
8766 nextchar(pRExC_state);
8768 } /* named and numeric backreferences */
8769 assert(0); /* NOT REACHED */
8771 case '?': /* (??...) */
8773 if (*RExC_parse != '{') {
8775 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8778 *flagp |= POSTPONED;
8779 paren = *RExC_parse++;
8781 case '{': /* (?{...}) */
8784 struct reg_code_block *cb;
8786 RExC_seen_zerolen++;
8788 if ( !pRExC_state->num_code_blocks
8789 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8790 || pRExC_state->code_blocks[pRExC_state->code_index].start
8791 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8794 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8795 FAIL("panic: Sequence (?{...}): no code block found\n");
8796 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8798 /* this is a pre-compiled code block (?{...}) */
8799 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8800 RExC_parse = RExC_start + cb->end;
8803 if (cb->src_regex) {
8804 n = add_data(pRExC_state, 2, "rl");
8805 RExC_rxi->data->data[n] =
8806 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8807 RExC_rxi->data->data[n+1] = (void*)o;
8810 n = add_data(pRExC_state, 1,
8811 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8812 RExC_rxi->data->data[n] = (void*)o;
8815 pRExC_state->code_index++;
8816 nextchar(pRExC_state);
8820 ret = reg_node(pRExC_state, LOGICAL);
8821 eval = reganode(pRExC_state, EVAL, n);
8824 /* for later propagation into (??{}) return value */
8825 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8827 REGTAIL(pRExC_state, ret, eval);
8828 /* deal with the length of this later - MJD */
8831 ret = reganode(pRExC_state, EVAL, n);
8832 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8833 Set_Node_Offset(ret, parse_start);
8836 case '(': /* (?(?{...})...) and (?(?=...)...) */
8839 if (RExC_parse[0] == '?') { /* (?(?...)) */
8840 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8841 || RExC_parse[1] == '<'
8842 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8845 ret = reg_node(pRExC_state, LOGICAL);
8848 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8852 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8853 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8855 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8856 char *name_start= RExC_parse++;
8858 SV *sv_dat=reg_scan_name(pRExC_state,
8859 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8860 if (RExC_parse == name_start || *RExC_parse != ch)
8861 vFAIL2("Sequence (?(%c... not terminated",
8862 (ch == '>' ? '<' : ch));
8865 num = add_data( pRExC_state, 1, "S" );
8866 RExC_rxi->data->data[num]=(void*)sv_dat;
8867 SvREFCNT_inc_simple_void(sv_dat);
8869 ret = reganode(pRExC_state,NGROUPP,num);
8870 goto insert_if_check_paren;
8872 else if (RExC_parse[0] == 'D' &&
8873 RExC_parse[1] == 'E' &&
8874 RExC_parse[2] == 'F' &&
8875 RExC_parse[3] == 'I' &&
8876 RExC_parse[4] == 'N' &&
8877 RExC_parse[5] == 'E')
8879 ret = reganode(pRExC_state,DEFINEP,0);
8882 goto insert_if_check_paren;
8884 else if (RExC_parse[0] == 'R') {
8887 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8888 parno = atoi(RExC_parse++);
8889 while (isDIGIT(*RExC_parse))
8891 } else if (RExC_parse[0] == '&') {
8894 sv_dat = reg_scan_name(pRExC_state,
8895 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8896 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8898 ret = reganode(pRExC_state,INSUBP,parno);
8899 goto insert_if_check_paren;
8901 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8904 parno = atoi(RExC_parse++);
8906 while (isDIGIT(*RExC_parse))
8908 ret = reganode(pRExC_state, GROUPP, parno);
8910 insert_if_check_paren:
8911 if ((c = *nextchar(pRExC_state)) != ')')
8912 vFAIL("Switch condition not recognized");
8914 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8915 br = regbranch(pRExC_state, &flags, 1,depth+1);
8917 br = reganode(pRExC_state, LONGJMP, 0);
8919 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8920 c = *nextchar(pRExC_state);
8925 vFAIL("(?(DEFINE)....) does not allow branches");
8926 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8927 regbranch(pRExC_state, &flags, 1,depth+1);
8928 REGTAIL(pRExC_state, ret, lastbr);
8931 c = *nextchar(pRExC_state);
8936 vFAIL("Switch (?(condition)... contains too many branches");
8937 ender = reg_node(pRExC_state, TAIL);
8938 REGTAIL(pRExC_state, br, ender);
8940 REGTAIL(pRExC_state, lastbr, ender);
8941 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8944 REGTAIL(pRExC_state, ret, ender);
8945 RExC_size++; /* XXX WHY do we need this?!!
8946 For large programs it seems to be required
8947 but I can't figure out why. -- dmq*/
8951 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8955 RExC_parse--; /* for vFAIL to print correctly */
8956 vFAIL("Sequence (? incomplete");
8958 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8960 has_use_defaults = TRUE;
8961 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8962 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8963 ? REGEX_UNICODE_CHARSET
8964 : REGEX_DEPENDS_CHARSET);
8968 parse_flags: /* (?i) */
8970 U32 posflags = 0, negflags = 0;
8971 U32 *flagsp = &posflags;
8972 char has_charset_modifier = '\0';
8973 regex_charset cs = get_regex_charset(RExC_flags);
8974 if (cs == REGEX_DEPENDS_CHARSET
8975 && (RExC_utf8 || RExC_uni_semantics))
8977 cs = REGEX_UNICODE_CHARSET;
8980 while (*RExC_parse) {
8981 /* && strchr("iogcmsx", *RExC_parse) */
8982 /* (?g), (?gc) and (?o) are useless here
8983 and must be globally applied -- japhy */
8984 switch (*RExC_parse) {
8985 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8986 case LOCALE_PAT_MOD:
8987 if (has_charset_modifier) {
8988 goto excess_modifier;
8990 else if (flagsp == &negflags) {
8993 cs = REGEX_LOCALE_CHARSET;
8994 has_charset_modifier = LOCALE_PAT_MOD;
8995 RExC_contains_locale = 1;
8997 case UNICODE_PAT_MOD:
8998 if (has_charset_modifier) {
8999 goto excess_modifier;
9001 else if (flagsp == &negflags) {
9004 cs = REGEX_UNICODE_CHARSET;
9005 has_charset_modifier = UNICODE_PAT_MOD;
9007 case ASCII_RESTRICT_PAT_MOD:
9008 if (flagsp == &negflags) {
9011 if (has_charset_modifier) {
9012 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9013 goto excess_modifier;
9015 /* Doubled modifier implies more restricted */
9016 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9019 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9021 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9023 case DEPENDS_PAT_MOD:
9024 if (has_use_defaults) {
9025 goto fail_modifiers;
9027 else if (flagsp == &negflags) {
9030 else if (has_charset_modifier) {
9031 goto excess_modifier;
9034 /* The dual charset means unicode semantics if the
9035 * pattern (or target, not known until runtime) are
9036 * utf8, or something in the pattern indicates unicode
9038 cs = (RExC_utf8 || RExC_uni_semantics)
9039 ? REGEX_UNICODE_CHARSET
9040 : REGEX_DEPENDS_CHARSET;
9041 has_charset_modifier = DEPENDS_PAT_MOD;
9045 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9046 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9048 else if (has_charset_modifier == *(RExC_parse - 1)) {
9049 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9052 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9057 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9059 case ONCE_PAT_MOD: /* 'o' */
9060 case GLOBAL_PAT_MOD: /* 'g' */
9061 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9062 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9063 if (! (wastedflags & wflagbit) ) {
9064 wastedflags |= wflagbit;
9067 "Useless (%s%c) - %suse /%c modifier",
9068 flagsp == &negflags ? "?-" : "?",
9070 flagsp == &negflags ? "don't " : "",
9077 case CONTINUE_PAT_MOD: /* 'c' */
9078 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9079 if (! (wastedflags & WASTED_C) ) {
9080 wastedflags |= WASTED_GC;
9083 "Useless (%sc) - %suse /gc modifier",
9084 flagsp == &negflags ? "?-" : "?",
9085 flagsp == &negflags ? "don't " : ""
9090 case KEEPCOPY_PAT_MOD: /* 'p' */
9091 if (flagsp == &negflags) {
9093 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9095 *flagsp |= RXf_PMf_KEEPCOPY;
9099 /* A flag is a default iff it is following a minus, so
9100 * if there is a minus, it means will be trying to
9101 * re-specify a default which is an error */
9102 if (has_use_defaults || flagsp == &negflags) {
9105 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9109 wastedflags = 0; /* reset so (?g-c) warns twice */
9115 RExC_flags |= posflags;
9116 RExC_flags &= ~negflags;
9117 set_regex_charset(&RExC_flags, cs);
9119 oregflags |= posflags;
9120 oregflags &= ~negflags;
9121 set_regex_charset(&oregflags, cs);
9123 nextchar(pRExC_state);
9134 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9139 }} /* one for the default block, one for the switch */
9146 ret = reganode(pRExC_state, OPEN, parno);
9149 RExC_nestroot = parno;
9150 if (RExC_seen & REG_SEEN_RECURSE
9151 && !RExC_open_parens[parno-1])
9153 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9154 "Setting open paren #%"IVdf" to %d\n",
9155 (IV)parno, REG_NODE_NUM(ret)));
9156 RExC_open_parens[parno-1]= ret;
9159 Set_Node_Length(ret, 1); /* MJD */
9160 Set_Node_Offset(ret, RExC_parse); /* MJD */
9168 /* Pick up the branches, linking them together. */
9169 parse_start = RExC_parse; /* MJD */
9170 br = regbranch(pRExC_state, &flags, 1,depth+1);
9172 /* branch_len = (paren != 0); */
9176 if (*RExC_parse == '|') {
9177 if (!SIZE_ONLY && RExC_extralen) {
9178 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9181 reginsert(pRExC_state, BRANCH, br, depth+1);
9182 Set_Node_Length(br, paren != 0);
9183 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9187 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9189 else if (paren == ':') {
9190 *flagp |= flags&SIMPLE;
9192 if (is_open) { /* Starts with OPEN. */
9193 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9195 else if (paren != '?') /* Not Conditional */
9197 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9199 while (*RExC_parse == '|') {
9200 if (!SIZE_ONLY && RExC_extralen) {
9201 ender = reganode(pRExC_state, LONGJMP,0);
9202 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9205 RExC_extralen += 2; /* Account for LONGJMP. */
9206 nextchar(pRExC_state);
9208 if (RExC_npar > after_freeze)
9209 after_freeze = RExC_npar;
9210 RExC_npar = freeze_paren;
9212 br = regbranch(pRExC_state, &flags, 0, depth+1);
9216 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9218 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9221 if (have_branch || paren != ':') {
9222 /* Make a closing node, and hook it on the end. */
9225 ender = reg_node(pRExC_state, TAIL);
9228 ender = reganode(pRExC_state, CLOSE, parno);
9229 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9230 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9231 "Setting close paren #%"IVdf" to %d\n",
9232 (IV)parno, REG_NODE_NUM(ender)));
9233 RExC_close_parens[parno-1]= ender;
9234 if (RExC_nestroot == parno)
9237 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9238 Set_Node_Length(ender,1); /* MJD */
9244 *flagp &= ~HASWIDTH;
9247 ender = reg_node(pRExC_state, SUCCEED);
9250 ender = reg_node(pRExC_state, END);
9252 assert(!RExC_opend); /* there can only be one! */
9257 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9258 SV * const mysv_val1=sv_newmortal();
9259 SV * const mysv_val2=sv_newmortal();
9260 DEBUG_PARSE_MSG("lsbr");
9261 regprop(RExC_rx, mysv_val1, lastbr);
9262 regprop(RExC_rx, mysv_val2, ender);
9263 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9264 SvPV_nolen_const(mysv_val1),
9265 (IV)REG_NODE_NUM(lastbr),
9266 SvPV_nolen_const(mysv_val2),
9267 (IV)REG_NODE_NUM(ender),
9268 (IV)(ender - lastbr)
9271 REGTAIL(pRExC_state, lastbr, ender);
9273 if (have_branch && !SIZE_ONLY) {
9276 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9278 /* Hook the tails of the branches to the closing node. */
9279 for (br = ret; br; br = regnext(br)) {
9280 const U8 op = PL_regkind[OP(br)];
9282 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9283 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9286 else if (op == BRANCHJ) {
9287 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9288 /* for now we always disable this optimisation * /
9289 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9295 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9296 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9297 SV * const mysv_val1=sv_newmortal();
9298 SV * const mysv_val2=sv_newmortal();
9299 DEBUG_PARSE_MSG("NADA");
9300 regprop(RExC_rx, mysv_val1, ret);
9301 regprop(RExC_rx, mysv_val2, ender);
9302 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9303 SvPV_nolen_const(mysv_val1),
9304 (IV)REG_NODE_NUM(ret),
9305 SvPV_nolen_const(mysv_val2),
9306 (IV)REG_NODE_NUM(ender),
9311 if (OP(ender) == TAIL) {
9316 for ( opt= br + 1; opt < ender ; opt++ )
9318 NEXT_OFF(br)= ender - br;
9326 static const char parens[] = "=!<,>";
9328 if (paren && (p = strchr(parens, paren))) {
9329 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9330 int flag = (p - parens) > 1;
9333 node = SUSPEND, flag = 0;
9334 reginsert(pRExC_state, node,ret, depth+1);
9335 Set_Node_Cur_Length(ret);
9336 Set_Node_Offset(ret, parse_start + 1);
9338 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9342 /* Check for proper termination. */
9344 RExC_flags = oregflags;
9345 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9346 RExC_parse = oregcomp_parse;
9347 vFAIL("Unmatched (");
9350 else if (!paren && RExC_parse < RExC_end) {
9351 if (*RExC_parse == ')') {
9353 vFAIL("Unmatched )");
9356 FAIL("Junk on end of regexp"); /* "Can't happen". */
9357 assert(0); /* NOTREACHED */
9360 if (RExC_in_lookbehind) {
9361 RExC_in_lookbehind--;
9363 if (after_freeze > RExC_npar)
9364 RExC_npar = after_freeze;
9369 - regbranch - one alternative of an | operator
9371 * Implements the concatenation operator.
9374 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9378 regnode *chain = NULL;
9380 I32 flags = 0, c = 0;
9381 GET_RE_DEBUG_FLAGS_DECL;
9383 PERL_ARGS_ASSERT_REGBRANCH;
9385 DEBUG_PARSE("brnc");
9390 if (!SIZE_ONLY && RExC_extralen)
9391 ret = reganode(pRExC_state, BRANCHJ,0);
9393 ret = reg_node(pRExC_state, BRANCH);
9394 Set_Node_Length(ret, 1);
9398 if (!first && SIZE_ONLY)
9399 RExC_extralen += 1; /* BRANCHJ */
9401 *flagp = WORST; /* Tentatively. */
9404 nextchar(pRExC_state);
9405 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9407 latest = regpiece(pRExC_state, &flags,depth+1);
9408 if (latest == NULL) {
9409 if (flags & TRYAGAIN)
9413 else if (ret == NULL)
9415 *flagp |= flags&(HASWIDTH|POSTPONED);
9416 if (chain == NULL) /* First piece. */
9417 *flagp |= flags&SPSTART;
9420 REGTAIL(pRExC_state, chain, latest);
9425 if (chain == NULL) { /* Loop ran zero times. */
9426 chain = reg_node(pRExC_state, NOTHING);
9431 *flagp |= flags&SIMPLE;
9438 - regpiece - something followed by possible [*+?]
9440 * Note that the branching code sequences used for ? and the general cases
9441 * of * and + are somewhat optimized: they use the same NOTHING node as
9442 * both the endmarker for their branch list and the body of the last branch.
9443 * It might seem that this node could be dispensed with entirely, but the
9444 * endmarker role is not redundant.
9447 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9454 const char * const origparse = RExC_parse;
9456 I32 max = REG_INFTY;
9457 #ifdef RE_TRACK_PATTERN_OFFSETS
9460 const char *maxpos = NULL;
9462 /* Save the original in case we change the emitted regop to a FAIL. */
9463 regnode * const orig_emit = RExC_emit;
9465 GET_RE_DEBUG_FLAGS_DECL;
9467 PERL_ARGS_ASSERT_REGPIECE;
9469 DEBUG_PARSE("piec");
9471 ret = regatom(pRExC_state, &flags,depth+1);
9473 if (flags & TRYAGAIN)
9480 if (op == '{' && regcurly(RExC_parse)) {
9482 #ifdef RE_TRACK_PATTERN_OFFSETS
9483 parse_start = RExC_parse; /* MJD */
9485 next = RExC_parse + 1;
9486 while (isDIGIT(*next) || *next == ',') {
9495 if (*next == '}') { /* got one */
9499 min = atoi(RExC_parse);
9503 maxpos = RExC_parse;
9505 if (!max && *maxpos != '0')
9506 max = REG_INFTY; /* meaning "infinity" */
9507 else if (max >= REG_INFTY)
9508 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9510 nextchar(pRExC_state);
9511 if (max < min) { /* If can't match, warn and optimize to fail
9514 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9516 /* We can't back off the size because we have to reserve
9517 * enough space for all the things we are about to throw
9518 * away, but we can shrink it by the ammount we are about
9520 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9523 RExC_emit = orig_emit;
9525 ret = reg_node(pRExC_state, OPFAIL);
9530 if ((flags&SIMPLE)) {
9531 RExC_naughty += 2 + RExC_naughty / 2;
9532 reginsert(pRExC_state, CURLY, ret, depth+1);
9533 Set_Node_Offset(ret, parse_start+1); /* MJD */
9534 Set_Node_Cur_Length(ret);
9537 regnode * const w = reg_node(pRExC_state, WHILEM);
9540 REGTAIL(pRExC_state, ret, w);
9541 if (!SIZE_ONLY && RExC_extralen) {
9542 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9543 reginsert(pRExC_state, NOTHING,ret, depth+1);
9544 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9546 reginsert(pRExC_state, CURLYX,ret, depth+1);
9548 Set_Node_Offset(ret, parse_start+1);
9549 Set_Node_Length(ret,
9550 op == '{' ? (RExC_parse - parse_start) : 1);
9552 if (!SIZE_ONLY && RExC_extralen)
9553 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9554 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9556 RExC_whilem_seen++, RExC_extralen += 3;
9557 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9566 ARG1_SET(ret, (U16)min);
9567 ARG2_SET(ret, (U16)max);
9579 #if 0 /* Now runtime fix should be reliable. */
9581 /* if this is reinstated, don't forget to put this back into perldiag:
9583 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9585 (F) The part of the regexp subject to either the * or + quantifier
9586 could match an empty string. The {#} shows in the regular
9587 expression about where the problem was discovered.
9591 if (!(flags&HASWIDTH) && op != '?')
9592 vFAIL("Regexp *+ operand could be empty");
9595 #ifdef RE_TRACK_PATTERN_OFFSETS
9596 parse_start = RExC_parse;
9598 nextchar(pRExC_state);
9600 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9602 if (op == '*' && (flags&SIMPLE)) {
9603 reginsert(pRExC_state, STAR, ret, depth+1);
9607 else if (op == '*') {
9611 else if (op == '+' && (flags&SIMPLE)) {
9612 reginsert(pRExC_state, PLUS, ret, depth+1);
9616 else if (op == '+') {
9620 else if (op == '?') {
9625 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9626 ckWARN3reg(RExC_parse,
9627 "%.*s matches null string many times",
9628 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9632 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9633 nextchar(pRExC_state);
9634 reginsert(pRExC_state, MINMOD, ret, depth+1);
9635 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9637 #ifndef REG_ALLOW_MINMOD_SUSPEND
9640 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9642 nextchar(pRExC_state);
9643 ender = reg_node(pRExC_state, SUCCEED);
9644 REGTAIL(pRExC_state, ret, ender);
9645 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9647 ender = reg_node(pRExC_state, TAIL);
9648 REGTAIL(pRExC_state, ret, ender);
9652 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9654 vFAIL("Nested quantifiers");
9661 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9664 /* This is expected to be called by a parser routine that has recognized '\N'
9665 and needs to handle the rest. RExC_parse is expected to point at the first
9666 char following the N at the time of the call. On successful return,
9667 RExC_parse has been updated to point to just after the sequence identified
9668 by this routine, and <*flagp> has been updated.
9670 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9673 \N may begin either a named sequence, or if outside a character class, mean
9674 to match a non-newline. For non single-quoted regexes, the tokenizer has
9675 attempted to decide which, and in the case of a named sequence, converted it
9676 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9677 where c1... are the characters in the sequence. For single-quoted regexes,
9678 the tokenizer passes the \N sequence through unchanged; this code will not
9679 attempt to determine this nor expand those, instead raising a syntax error.
9680 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9681 or there is no '}', it signals that this \N occurrence means to match a
9684 Only the \N{U+...} form should occur in a character class, for the same
9685 reason that '.' inside a character class means to just match a period: it
9686 just doesn't make sense.
9688 The function raises an error (via vFAIL), and doesn't return for various
9689 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9690 success; it returns FALSE otherwise.
9692 If <valuep> is non-null, it means the caller can accept an input sequence
9693 consisting of a just a single code point; <*valuep> is set to that value
9694 if the input is such.
9696 If <node_p> is non-null it signifies that the caller can accept any other
9697 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9699 1) \N means not-a-NL: points to a newly created REG_ANY node;
9700 2) \N{}: points to a new NOTHING node;
9701 3) otherwise: points to a new EXACT node containing the resolved
9703 Note that FALSE is returned for single code point sequences if <valuep> is
9707 char * endbrace; /* '}' following the name */
9709 char *endchar; /* Points to '.' or '}' ending cur char in the input
9711 bool has_multiple_chars; /* true if the input stream contains a sequence of
9712 more than one character */
9714 GET_RE_DEBUG_FLAGS_DECL;
9716 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9720 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9722 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9723 * modifier. The other meaning does not */
9724 p = (RExC_flags & RXf_PMf_EXTENDED)
9725 ? regwhite( pRExC_state, RExC_parse )
9728 /* Disambiguate between \N meaning a named character versus \N meaning
9729 * [^\n]. The former is assumed when it can't be the latter. */
9730 if (*p != '{' || regcurly(p)) {
9733 /* no bare \N in a charclass */
9734 if (in_char_class) {
9735 vFAIL("\\N in a character class must be a named character: \\N{...}");
9739 nextchar(pRExC_state);
9740 *node_p = reg_node(pRExC_state, REG_ANY);
9741 *flagp |= HASWIDTH|SIMPLE;
9744 Set_Node_Length(*node_p, 1); /* MJD */
9748 /* Here, we have decided it should be a named character or sequence */
9750 /* The test above made sure that the next real character is a '{', but
9751 * under the /x modifier, it could be separated by space (or a comment and
9752 * \n) and this is not allowed (for consistency with \x{...} and the
9753 * tokenizer handling of \N{NAME}). */
9754 if (*RExC_parse != '{') {
9755 vFAIL("Missing braces on \\N{}");
9758 RExC_parse++; /* Skip past the '{' */
9760 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9761 || ! (endbrace == RExC_parse /* nothing between the {} */
9762 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9763 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9765 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9766 vFAIL("\\N{NAME} must be resolved by the lexer");
9769 if (endbrace == RExC_parse) { /* empty: \N{} */
9772 *node_p = reg_node(pRExC_state,NOTHING);
9774 else if (in_char_class) {
9775 if (SIZE_ONLY && in_char_class) {
9776 ckWARNreg(RExC_parse,
9777 "Ignoring zero length \\N{} in character class"
9785 nextchar(pRExC_state);
9789 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9790 RExC_parse += 2; /* Skip past the 'U+' */
9792 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9794 /* Code points are separated by dots. If none, there is only one code
9795 * point, and is terminated by the brace */
9796 has_multiple_chars = (endchar < endbrace);
9798 if (valuep && (! has_multiple_chars || in_char_class)) {
9799 /* We only pay attention to the first char of
9800 multichar strings being returned in char classes. I kinda wonder
9801 if this makes sense as it does change the behaviour
9802 from earlier versions, OTOH that behaviour was broken
9803 as well. XXX Solution is to recharacterize as
9804 [rest-of-class]|multi1|multi2... */
9806 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9807 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9808 | PERL_SCAN_DISALLOW_PREFIX
9809 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9811 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9813 /* The tokenizer should have guaranteed validity, but it's possible to
9814 * bypass it by using single quoting, so check */
9815 if (length_of_hex == 0
9816 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9818 RExC_parse += length_of_hex; /* Includes all the valid */
9819 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9820 ? UTF8SKIP(RExC_parse)
9822 /* Guard against malformed utf8 */
9823 if (RExC_parse >= endchar) {
9824 RExC_parse = endchar;
9826 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9829 if (in_char_class && has_multiple_chars) {
9830 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9832 RExC_parse = endbrace + 1;
9834 else if (! node_p || ! has_multiple_chars) {
9836 /* Here, the input is legal, but not according to the caller's
9837 * options. We fail without advancing the parse, so that the
9838 * caller can try again */
9844 /* What is done here is to convert this to a sub-pattern of the form
9845 * (?:\x{char1}\x{char2}...)
9846 * and then call reg recursively. That way, it retains its atomicness,
9847 * while not having to worry about special handling that some code
9848 * points may have. toke.c has converted the original Unicode values
9849 * to native, so that we can just pass on the hex values unchanged. We
9850 * do have to set a flag to keep recoding from happening in the
9853 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9855 char *orig_end = RExC_end;
9858 while (RExC_parse < endbrace) {
9860 /* Convert to notation the rest of the code understands */
9861 sv_catpv(substitute_parse, "\\x{");
9862 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9863 sv_catpv(substitute_parse, "}");
9865 /* Point to the beginning of the next character in the sequence. */
9866 RExC_parse = endchar + 1;
9867 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9869 sv_catpv(substitute_parse, ")");
9871 RExC_parse = SvPV(substitute_parse, len);
9873 /* Don't allow empty number */
9875 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9877 RExC_end = RExC_parse + len;
9879 /* The values are Unicode, and therefore not subject to recoding */
9880 RExC_override_recoding = 1;
9882 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9883 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9885 RExC_parse = endbrace;
9886 RExC_end = orig_end;
9887 RExC_override_recoding = 0;
9889 nextchar(pRExC_state);
9899 * It returns the code point in utf8 for the value in *encp.
9900 * value: a code value in the source encoding
9901 * encp: a pointer to an Encode object
9903 * If the result from Encode is not a single character,
9904 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9907 S_reg_recode(pTHX_ const char value, SV **encp)
9910 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9911 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9912 const STRLEN newlen = SvCUR(sv);
9913 UV uv = UNICODE_REPLACEMENT;
9915 PERL_ARGS_ASSERT_REG_RECODE;
9919 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9922 if (!newlen || numlen != newlen) {
9923 uv = UNICODE_REPLACEMENT;
9929 PERL_STATIC_INLINE U8
9930 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9934 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9940 op = get_regex_charset(RExC_flags);
9941 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9942 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9943 been, so there is no hole */
9949 PERL_STATIC_INLINE void
9950 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9952 /* This knows the details about sizing an EXACTish node, setting flags for
9953 * it (by setting <*flagp>, and potentially populating it with a single
9956 * If <len> is non-zero, this function assumes that the node has already
9957 * been populated, and just does the sizing. In this case <code_point>
9958 * should be the final code point that has already been placed into the
9959 * node. This value will be ignored except that under some circumstances
9960 * <*flagp> is set based on it.
9962 * If <len is zero, the function assumes that the node is to contain only
9963 * the single character given by <code_point> and calculates what <len>
9964 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9965 * additionally will populate the node's STRING with <code_point>, if <len>
9966 * is 0. In both cases <*flagp> is appropriately set
9968 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9969 * folded (the latter only when the rules indicate it can match 'ss') */
9971 bool len_passed_in = cBOOL(len != 0);
9972 U8 character[UTF8_MAXBYTES_CASE+1];
9974 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9976 if (! len_passed_in) {
9979 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9982 uvchr_to_utf8( character, code_point);
9983 len = UTF8SKIP(character);
9987 || code_point != LATIN_SMALL_LETTER_SHARP_S
9988 || ASCII_FOLD_RESTRICTED
9989 || ! AT_LEAST_UNI_SEMANTICS)
9991 *character = (U8) code_point;
9996 *(character + 1) = 's';
10002 RExC_size += STR_SZ(len);
10005 RExC_emit += STR_SZ(len);
10006 STR_LEN(node) = len;
10007 if (! len_passed_in) {
10008 Copy((char *) character, STRING(node), len, char);
10012 *flagp |= HASWIDTH;
10013 if (len == 1 && UNI_IS_INVARIANT(code_point))
10018 - regatom - the lowest level
10020 Try to identify anything special at the start of the pattern. If there
10021 is, then handle it as required. This may involve generating a single regop,
10022 such as for an assertion; or it may involve recursing, such as to
10023 handle a () structure.
10025 If the string doesn't start with something special then we gobble up
10026 as much literal text as we can.
10028 Once we have been able to handle whatever type of thing started the
10029 sequence, we return.
10031 Note: we have to be careful with escapes, as they can be both literal
10032 and special, and in the case of \10 and friends, context determines which.
10034 A summary of the code structure is:
10036 switch (first_byte) {
10037 cases for each special:
10038 handle this special;
10041 switch (2nd byte) {
10042 cases for each unambiguous special:
10043 handle this special;
10045 cases for each ambigous special/literal:
10047 if (special) handle here
10049 default: // unambiguously literal:
10052 default: // is a literal char
10055 create EXACTish node for literal;
10056 while (more input and node isn't full) {
10057 switch (input_byte) {
10058 cases for each special;
10059 make sure parse pointer is set so that the next call to
10060 regatom will see this special first
10061 goto loopdone; // EXACTish node terminated by prev. char
10063 append char to EXACTISH node;
10065 get next input byte;
10069 return the generated node;
10071 Specifically there are two separate switches for handling
10072 escape sequences, with the one for handling literal escapes requiring
10073 a dummy entry for all of the special escapes that are actually handled
10078 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10081 regnode *ret = NULL;
10083 char *parse_start = RExC_parse;
10085 GET_RE_DEBUG_FLAGS_DECL;
10086 DEBUG_PARSE("atom");
10087 *flagp = WORST; /* Tentatively. */
10089 PERL_ARGS_ASSERT_REGATOM;
10092 switch ((U8)*RExC_parse) {
10094 RExC_seen_zerolen++;
10095 nextchar(pRExC_state);
10096 if (RExC_flags & RXf_PMf_MULTILINE)
10097 ret = reg_node(pRExC_state, MBOL);
10098 else if (RExC_flags & RXf_PMf_SINGLELINE)
10099 ret = reg_node(pRExC_state, SBOL);
10101 ret = reg_node(pRExC_state, BOL);
10102 Set_Node_Length(ret, 1); /* MJD */
10105 nextchar(pRExC_state);
10107 RExC_seen_zerolen++;
10108 if (RExC_flags & RXf_PMf_MULTILINE)
10109 ret = reg_node(pRExC_state, MEOL);
10110 else if (RExC_flags & RXf_PMf_SINGLELINE)
10111 ret = reg_node(pRExC_state, SEOL);
10113 ret = reg_node(pRExC_state, EOL);
10114 Set_Node_Length(ret, 1); /* MJD */
10117 nextchar(pRExC_state);
10118 if (RExC_flags & RXf_PMf_SINGLELINE)
10119 ret = reg_node(pRExC_state, SANY);
10121 ret = reg_node(pRExC_state, REG_ANY);
10122 *flagp |= HASWIDTH|SIMPLE;
10124 Set_Node_Length(ret, 1); /* MJD */
10128 char * const oregcomp_parse = ++RExC_parse;
10129 ret = regclass(pRExC_state, flagp,depth+1);
10130 if (*RExC_parse != ']') {
10131 RExC_parse = oregcomp_parse;
10132 vFAIL("Unmatched [");
10134 nextchar(pRExC_state);
10135 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10139 nextchar(pRExC_state);
10140 ret = reg(pRExC_state, 1, &flags,depth+1);
10142 if (flags & TRYAGAIN) {
10143 if (RExC_parse == RExC_end) {
10144 /* Make parent create an empty node if needed. */
10145 *flagp |= TRYAGAIN;
10152 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10156 if (flags & TRYAGAIN) {
10157 *flagp |= TRYAGAIN;
10160 vFAIL("Internal urp");
10161 /* Supposed to be caught earlier. */
10167 vFAIL("Quantifier follows nothing");
10172 This switch handles escape sequences that resolve to some kind
10173 of special regop and not to literal text. Escape sequnces that
10174 resolve to literal text are handled below in the switch marked
10177 Every entry in this switch *must* have a corresponding entry
10178 in the literal escape switch. However, the opposite is not
10179 required, as the default for this switch is to jump to the
10180 literal text handling code.
10182 switch ((U8)*++RExC_parse) {
10183 /* Special Escapes */
10185 RExC_seen_zerolen++;
10186 ret = reg_node(pRExC_state, SBOL);
10188 goto finish_meta_pat;
10190 ret = reg_node(pRExC_state, GPOS);
10191 RExC_seen |= REG_SEEN_GPOS;
10193 goto finish_meta_pat;
10195 RExC_seen_zerolen++;
10196 ret = reg_node(pRExC_state, KEEPS);
10198 /* XXX:dmq : disabling in-place substitution seems to
10199 * be necessary here to avoid cases of memory corruption, as
10200 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10202 RExC_seen |= REG_SEEN_LOOKBEHIND;
10203 goto finish_meta_pat;
10205 ret = reg_node(pRExC_state, SEOL);
10207 RExC_seen_zerolen++; /* Do not optimize RE away */
10208 goto finish_meta_pat;
10210 ret = reg_node(pRExC_state, EOS);
10212 RExC_seen_zerolen++; /* Do not optimize RE away */
10213 goto finish_meta_pat;
10215 ret = reg_node(pRExC_state, CANY);
10216 RExC_seen |= REG_SEEN_CANY;
10217 *flagp |= HASWIDTH|SIMPLE;
10218 goto finish_meta_pat;
10220 ret = reg_node(pRExC_state, CLUMP);
10221 *flagp |= HASWIDTH;
10222 goto finish_meta_pat;
10224 op = ALNUM + get_regex_charset(RExC_flags);
10225 if (op > ALNUMA) { /* /aa is same as /a */
10228 ret = reg_node(pRExC_state, op);
10229 *flagp |= HASWIDTH|SIMPLE;
10230 goto finish_meta_pat;
10232 op = NALNUM + get_regex_charset(RExC_flags);
10233 if (op > NALNUMA) { /* /aa is same as /a */
10236 ret = reg_node(pRExC_state, op);
10237 *flagp |= HASWIDTH|SIMPLE;
10238 goto finish_meta_pat;
10240 RExC_seen_zerolen++;
10241 RExC_seen |= REG_SEEN_LOOKBEHIND;
10242 op = BOUND + get_regex_charset(RExC_flags);
10243 if (op > BOUNDA) { /* /aa is same as /a */
10246 ret = reg_node(pRExC_state, op);
10247 FLAGS(ret) = get_regex_charset(RExC_flags);
10249 goto finish_meta_pat;
10251 RExC_seen_zerolen++;
10252 RExC_seen |= REG_SEEN_LOOKBEHIND;
10253 op = NBOUND + get_regex_charset(RExC_flags);
10254 if (op > NBOUNDA) { /* /aa is same as /a */
10257 ret = reg_node(pRExC_state, op);
10258 FLAGS(ret) = get_regex_charset(RExC_flags);
10260 goto finish_meta_pat;
10262 op = SPACE + get_regex_charset(RExC_flags);
10263 if (op > SPACEA) { /* /aa is same as /a */
10266 ret = reg_node(pRExC_state, op);
10267 *flagp |= HASWIDTH|SIMPLE;
10268 goto finish_meta_pat;
10270 op = NSPACE + get_regex_charset(RExC_flags);
10271 if (op > NSPACEA) { /* /aa is same as /a */
10274 ret = reg_node(pRExC_state, op);
10275 *flagp |= HASWIDTH|SIMPLE;
10276 goto finish_meta_pat;
10284 U8 offset = get_regex_charset(RExC_flags);
10285 if (offset == REGEX_UNICODE_CHARSET) {
10286 offset = REGEX_DEPENDS_CHARSET;
10288 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10289 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10293 ret = reg_node(pRExC_state, op);
10294 *flagp |= HASWIDTH|SIMPLE;
10295 goto finish_meta_pat;
10297 ret = reg_node(pRExC_state, LNBREAK);
10298 *flagp |= HASWIDTH|SIMPLE;
10299 goto finish_meta_pat;
10301 ret = reg_node(pRExC_state, HORIZWS);
10302 *flagp |= HASWIDTH|SIMPLE;
10303 goto finish_meta_pat;
10305 ret = reg_node(pRExC_state, NHORIZWS);
10306 *flagp |= HASWIDTH|SIMPLE;
10307 goto finish_meta_pat;
10309 ret = reg_node(pRExC_state, VERTWS);
10310 *flagp |= HASWIDTH|SIMPLE;
10311 goto finish_meta_pat;
10313 ret = reg_node(pRExC_state, NVERTWS);
10314 *flagp |= HASWIDTH|SIMPLE;
10316 nextchar(pRExC_state);
10317 Set_Node_Length(ret, 2); /* MJD */
10322 char* const oldregxend = RExC_end;
10324 char* parse_start = RExC_parse - 2;
10327 if (RExC_parse[1] == '{') {
10328 /* a lovely hack--pretend we saw [\pX] instead */
10329 RExC_end = strchr(RExC_parse, '}');
10331 const U8 c = (U8)*RExC_parse;
10333 RExC_end = oldregxend;
10334 vFAIL2("Missing right brace on \\%c{}", c);
10339 RExC_end = RExC_parse + 2;
10340 if (RExC_end > oldregxend)
10341 RExC_end = oldregxend;
10345 ret = regclass(pRExC_state, flagp,depth+1);
10347 RExC_end = oldregxend;
10350 Set_Node_Offset(ret, parse_start + 2);
10351 Set_Node_Cur_Length(ret);
10352 nextchar(pRExC_state);
10356 /* Handle \N and \N{NAME} with multiple code points here and not
10357 * below because it can be multicharacter. join_exact() will join
10358 * them up later on. Also this makes sure that things like
10359 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10360 * The options to the grok function call causes it to fail if the
10361 * sequence is just a single code point. We then go treat it as
10362 * just another character in the current EXACT node, and hence it
10363 * gets uniform treatment with all the other characters. The
10364 * special treatment for quantifiers is not needed for such single
10365 * character sequences */
10367 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10372 case 'k': /* Handle \k<NAME> and \k'NAME' */
10375 char ch= RExC_parse[1];
10376 if (ch != '<' && ch != '\'' && ch != '{') {
10378 vFAIL2("Sequence %.2s... not terminated",parse_start);
10380 /* this pretty much dupes the code for (?P=...) in reg(), if
10381 you change this make sure you change that */
10382 char* name_start = (RExC_parse += 2);
10384 SV *sv_dat = reg_scan_name(pRExC_state,
10385 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10386 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10387 if (RExC_parse == name_start || *RExC_parse != ch)
10388 vFAIL2("Sequence %.3s... not terminated",parse_start);
10391 num = add_data( pRExC_state, 1, "S" );
10392 RExC_rxi->data->data[num]=(void*)sv_dat;
10393 SvREFCNT_inc_simple_void(sv_dat);
10397 ret = reganode(pRExC_state,
10400 : (ASCII_FOLD_RESTRICTED)
10402 : (AT_LEAST_UNI_SEMANTICS)
10408 *flagp |= HASWIDTH;
10410 /* override incorrect value set in reganode MJD */
10411 Set_Node_Offset(ret, parse_start+1);
10412 Set_Node_Cur_Length(ret); /* MJD */
10413 nextchar(pRExC_state);
10419 case '1': case '2': case '3': case '4':
10420 case '5': case '6': case '7': case '8': case '9':
10423 bool isg = *RExC_parse == 'g';
10428 if (*RExC_parse == '{') {
10432 if (*RExC_parse == '-') {
10436 if (hasbrace && !isDIGIT(*RExC_parse)) {
10437 if (isrel) RExC_parse--;
10439 goto parse_named_seq;
10441 num = atoi(RExC_parse);
10442 if (isg && num == 0)
10443 vFAIL("Reference to invalid group 0");
10445 num = RExC_npar - num;
10447 vFAIL("Reference to nonexistent or unclosed group");
10449 if (!isg && num > 9 && num >= RExC_npar)
10450 /* Probably a character specified in octal, e.g. \35 */
10453 char * const parse_start = RExC_parse - 1; /* MJD */
10454 while (isDIGIT(*RExC_parse))
10456 if (parse_start == RExC_parse - 1)
10457 vFAIL("Unterminated \\g... pattern");
10459 if (*RExC_parse != '}')
10460 vFAIL("Unterminated \\g{...} pattern");
10464 if (num > (I32)RExC_rx->nparens)
10465 vFAIL("Reference to nonexistent group");
10468 ret = reganode(pRExC_state,
10471 : (ASCII_FOLD_RESTRICTED)
10473 : (AT_LEAST_UNI_SEMANTICS)
10479 *flagp |= HASWIDTH;
10481 /* override incorrect value set in reganode MJD */
10482 Set_Node_Offset(ret, parse_start+1);
10483 Set_Node_Cur_Length(ret); /* MJD */
10485 nextchar(pRExC_state);
10490 if (RExC_parse >= RExC_end)
10491 FAIL("Trailing \\");
10494 /* Do not generate "unrecognized" warnings here, we fall
10495 back into the quick-grab loop below */
10502 if (RExC_flags & RXf_PMf_EXTENDED) {
10503 if ( reg_skipcomment( pRExC_state ) )
10510 parse_start = RExC_parse - 1;
10519 #define MAX_NODE_STRING_SIZE 127
10520 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10522 U8 upper_parse = MAX_NODE_STRING_SIZE;
10525 bool next_is_quantifier;
10526 char * oldp = NULL;
10529 node_type = compute_EXACTish(pRExC_state);
10530 ret = reg_node(pRExC_state, node_type);
10532 /* In pass1, folded, we use a temporary buffer instead of the
10533 * actual node, as the node doesn't exist yet */
10534 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10540 /* XXX The node can hold up to 255 bytes, yet this only goes to
10541 * 127. I (khw) do not know why. Keeping it somewhat less than
10542 * 255 allows us to not have to worry about overflow due to
10543 * converting to utf8 and fold expansion, but that value is
10544 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10545 * split up by this limit into a single one using the real max of
10546 * 255. Even at 127, this breaks under rare circumstances. If
10547 * folding, we do not want to split a node at a character that is a
10548 * non-final in a multi-char fold, as an input string could just
10549 * happen to want to match across the node boundary. The join
10550 * would solve that problem if the join actually happens. But a
10551 * series of more than two nodes in a row each of 127 would cause
10552 * the first join to succeed to get to 254, but then there wouldn't
10553 * be room for the next one, which could at be one of those split
10554 * multi-char folds. I don't know of any fool-proof solution. One
10555 * could back off to end with only a code point that isn't such a
10556 * non-final, but it is possible for there not to be any in the
10558 for (p = RExC_parse - 1;
10559 len < upper_parse && p < RExC_end;
10564 if (RExC_flags & RXf_PMf_EXTENDED)
10565 p = regwhite( pRExC_state, p );
10576 /* Literal Escapes Switch
10578 This switch is meant to handle escape sequences that
10579 resolve to a literal character.
10581 Every escape sequence that represents something
10582 else, like an assertion or a char class, is handled
10583 in the switch marked 'Special Escapes' above in this
10584 routine, but also has an entry here as anything that
10585 isn't explicitly mentioned here will be treated as
10586 an unescaped equivalent literal.
10589 switch ((U8)*++p) {
10590 /* These are all the special escapes. */
10591 case 'A': /* Start assertion */
10592 case 'b': case 'B': /* Word-boundary assertion*/
10593 case 'C': /* Single char !DANGEROUS! */
10594 case 'd': case 'D': /* digit class */
10595 case 'g': case 'G': /* generic-backref, pos assertion */
10596 case 'h': case 'H': /* HORIZWS */
10597 case 'k': case 'K': /* named backref, keep marker */
10598 case 'p': case 'P': /* Unicode property */
10599 case 'R': /* LNBREAK */
10600 case 's': case 'S': /* space class */
10601 case 'v': case 'V': /* VERTWS */
10602 case 'w': case 'W': /* word class */
10603 case 'X': /* eXtended Unicode "combining character sequence" */
10604 case 'z': case 'Z': /* End of line/string assertion */
10608 /* Anything after here is an escape that resolves to a
10609 literal. (Except digits, which may or may not)
10615 case 'N': /* Handle a single-code point named character. */
10616 /* The options cause it to fail if a multiple code
10617 * point sequence. Handle those in the switch() above
10619 RExC_parse = p + 1;
10620 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10621 flagp, depth, FALSE))
10623 RExC_parse = p = oldp;
10627 if (ender > 0xff) {
10644 ender = ASCII_TO_NATIVE('\033');
10648 ender = ASCII_TO_NATIVE('\007');
10653 STRLEN brace_len = len;
10655 const char* error_msg;
10657 bool valid = grok_bslash_o(p,
10664 RExC_parse = p; /* going to die anyway; point
10665 to exact spot of failure */
10672 if (PL_encoding && ender < 0x100) {
10673 goto recode_encoding;
10675 if (ender > 0xff) {
10682 STRLEN brace_len = len;
10684 const char* error_msg;
10686 bool valid = grok_bslash_x(p,
10693 RExC_parse = p; /* going to die anyway; point
10694 to exact spot of failure */
10700 if (PL_encoding && ender < 0x100) {
10701 goto recode_encoding;
10703 if (ender > 0xff) {
10710 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10712 case '0': case '1': case '2': case '3':case '4':
10713 case '5': case '6': case '7':
10715 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10717 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10719 ender = grok_oct(p, &numlen, &flags, NULL);
10720 if (ender > 0xff) {
10729 if (PL_encoding && ender < 0x100)
10730 goto recode_encoding;
10733 if (! RExC_override_recoding) {
10734 SV* enc = PL_encoding;
10735 ender = reg_recode((const char)(U8)ender, &enc);
10736 if (!enc && SIZE_ONLY)
10737 ckWARNreg(p, "Invalid escape in the specified encoding");
10743 FAIL("Trailing \\");
10746 if (!SIZE_ONLY&& isALNUMC(*p)) {
10747 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10749 goto normal_default;
10753 /* Currently we don't warn when the lbrace is at the start
10754 * of a construct. This catches it in the middle of a
10755 * literal string, or when its the first thing after
10756 * something like "\b" */
10758 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10760 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10765 if (UTF8_IS_START(*p) && UTF) {
10767 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10768 &numlen, UTF8_ALLOW_DEFAULT);
10774 } /* End of switch on the literal */
10776 /* Here, have looked at the literal character and <ender>
10777 * contains its ordinal, <p> points to the character after it
10780 if ( RExC_flags & RXf_PMf_EXTENDED)
10781 p = regwhite( pRExC_state, p );
10783 /* If the next thing is a quantifier, it applies to this
10784 * character only, which means that this character has to be in
10785 * its own node and can't just be appended to the string in an
10786 * existing node, so if there are already other characters in
10787 * the node, close the node with just them, and set up to do
10788 * this character again next time through, when it will be the
10789 * only thing in its new node */
10790 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10798 /* See comments for join_exact() as to why we fold
10799 * this non-UTF at compile time */
10800 || (node_type == EXACTFU
10801 && ender == LATIN_SMALL_LETTER_SHARP_S))
10805 /* Prime the casefolded buffer. Locale rules, which
10806 * apply only to code points < 256, aren't known until
10807 * execution, so for them, just output the original
10808 * character using utf8. If we start to fold non-UTF
10809 * patterns, be sure to update join_exact() */
10810 if (LOC && ender < 256) {
10811 if (UNI_IS_INVARIANT(ender)) {
10815 *s = UTF8_TWO_BYTE_HI(ender);
10816 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10821 ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
10823 | ((LOC) ? FOLD_FLAGS_LOCALE
10824 : (ASCII_FOLD_RESTRICTED)
10825 ? FOLD_FLAGS_NOMIX_ASCII
10831 /* The loop increments <len> each time, as all but this
10832 * path (and the one just below for UTF) through it add
10833 * a single byte to the EXACTish node. But this one
10834 * has changed len to be the correct final value, so
10835 * subtract one to cancel out the increment that
10837 len += foldlen - 1;
10844 const STRLEN unilen = reguni(pRExC_state, ender, s);
10850 /* See comment just above for - 1 */
10854 REGC((char)ender, s++);
10857 if (next_is_quantifier) {
10859 /* Here, the next input is a quantifier, and to get here,
10860 * the current character is the only one in the node.
10861 * Also, here <len> doesn't include the final byte for this
10867 } /* End of loop through literal characters */
10869 /* Here we have either exhausted the input or ran out of room in
10870 * the node. (If we encountered a character that can't be in the
10871 * node, transfer is made directly to <loopdone>, and so we
10872 * wouldn't have fallen off the end of the loop.) In the latter
10873 * case, we artificially have to split the node into two, because
10874 * we just don't have enough space to hold everything. This
10875 * creates a problem if the final character participates in a
10876 * multi-character fold in the non-final position, as a match that
10877 * should have occurred won't, due to the way nodes are matched,
10878 * and our artificial boundary. So back off until we find a non-
10879 * problematic character -- one that isn't at the beginning or
10880 * middle of such a fold. (Either it doesn't participate in any
10881 * folds, or appears only in the final position of all the folds it
10882 * does participate in.) A better solution with far fewer false
10883 * positives, and that would fill the nodes more completely, would
10884 * be to actually have available all the multi-character folds to
10885 * test against, and to back-off only far enough to be sure that
10886 * this node isn't ending with a partial one. <upper_parse> is set
10887 * further below (if we need to reparse the node) to include just
10888 * up through that final non-problematic character that this code
10889 * identifies, so when it is set to less than the full node, we can
10890 * skip the rest of this */
10891 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10893 const STRLEN full_len = len;
10895 assert(len >= MAX_NODE_STRING_SIZE);
10897 /* Here, <s> points to the final byte of the final character.
10898 * Look backwards through the string until find a non-
10899 * problematic character */
10903 /* These two have no multi-char folds to non-UTF characters
10905 if (ASCII_FOLD_RESTRICTED || LOC) {
10909 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10913 if (! PL_NonL1NonFinalFold) {
10914 PL_NonL1NonFinalFold = _new_invlist_C_array(
10915 NonL1_Perl_Non_Final_Folds_invlist);
10918 /* Point to the first byte of the final character */
10919 s = (char *) utf8_hop((U8 *) s, -1);
10921 while (s >= s0) { /* Search backwards until find
10922 non-problematic char */
10923 if (UTF8_IS_INVARIANT(*s)) {
10925 /* There are no ascii characters that participate
10926 * in multi-char folds under /aa. In EBCDIC, the
10927 * non-ascii invariants are all control characters,
10928 * so don't ever participate in any folds. */
10929 if (ASCII_FOLD_RESTRICTED
10930 || ! IS_NON_FINAL_FOLD(*s))
10935 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10937 /* No Latin1 characters participate in multi-char
10938 * folds under /l */
10940 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10946 else if (! _invlist_contains_cp(
10947 PL_NonL1NonFinalFold,
10948 valid_utf8_to_uvchr((U8 *) s, NULL)))
10953 /* Here, the current character is problematic in that
10954 * it does occur in the non-final position of some
10955 * fold, so try the character before it, but have to
10956 * special case the very first byte in the string, so
10957 * we don't read outside the string */
10958 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10959 } /* End of loop backwards through the string */
10961 /* If there were only problematic characters in the string,
10962 * <s> will point to before s0, in which case the length
10963 * should be 0, otherwise include the length of the
10964 * non-problematic character just found */
10965 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10968 /* Here, have found the final character, if any, that is
10969 * non-problematic as far as ending the node without splitting
10970 * it across a potential multi-char fold. <len> contains the
10971 * number of bytes in the node up-to and including that
10972 * character, or is 0 if there is no such character, meaning
10973 * the whole node contains only problematic characters. In
10974 * this case, give up and just take the node as-is. We can't
10980 /* Here, the node does contain some characters that aren't
10981 * problematic. If one such is the final character in the
10982 * node, we are done */
10983 if (len == full_len) {
10986 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
10988 /* If the final character is problematic, but the
10989 * penultimate is not, back-off that last character to
10990 * later start a new node with it */
10995 /* Here, the final non-problematic character is earlier
10996 * in the input than the penultimate character. What we do
10997 * is reparse from the beginning, going up only as far as
10998 * this final ok one, thus guaranteeing that the node ends
10999 * in an acceptable character. The reason we reparse is
11000 * that we know how far in the character is, but we don't
11001 * know how to correlate its position with the input parse.
11002 * An alternate implementation would be to build that
11003 * correlation as we go along during the original parse,
11004 * but that would entail extra work for every node, whereas
11005 * this code gets executed only when the string is too
11006 * large for the node, and the final two characters are
11007 * problematic, an infrequent occurrence. Yet another
11008 * possible strategy would be to save the tail of the
11009 * string, and the next time regatom is called, initialize
11010 * with that. The problem with this is that unless you
11011 * back off one more character, you won't be guaranteed
11012 * regatom will get called again, unless regbranch,
11013 * regpiece ... are also changed. If you do back off that
11014 * extra character, so that there is input guaranteed to
11015 * force calling regatom, you can't handle the case where
11016 * just the first character in the node is acceptable. I
11017 * (khw) decided to try this method which doesn't have that
11018 * pitfall; if performance issues are found, we can do a
11019 * combination of the current approach plus that one */
11025 } /* End of verifying node ends with an appropriate char */
11027 loopdone: /* Jumped to when encounters something that shouldn't be in
11030 /* I (khw) don't know if you can get here with zero length, but the
11031 * old code handled this situation by creating a zero-length EXACT
11032 * node. Might as well be NOTHING instead */
11037 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11040 RExC_parse = p - 1;
11041 Set_Node_Cur_Length(ret); /* MJD */
11042 nextchar(pRExC_state);
11044 /* len is STRLEN which is unsigned, need to copy to signed */
11047 vFAIL("Internal disaster");
11050 } /* End of label 'defchar:' */
11052 } /* End of giant switch on input character */
11058 S_regwhite( RExC_state_t *pRExC_state, char *p )
11060 const char *e = RExC_end;
11062 PERL_ARGS_ASSERT_REGWHITE;
11067 else if (*p == '#') {
11070 if (*p++ == '\n') {
11076 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11084 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11085 Character classes ([:foo:]) can also be negated ([:^foo:]).
11086 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11087 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11088 but trigger failures because they are currently unimplemented. */
11090 #define POSIXCC_DONE(c) ((c) == ':')
11091 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11092 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11095 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11098 I32 namedclass = OOB_NAMEDCLASS;
11100 PERL_ARGS_ASSERT_REGPPOSIXCC;
11102 if (value == '[' && RExC_parse + 1 < RExC_end &&
11103 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11104 POSIXCC(UCHARAT(RExC_parse))) {
11105 const char c = UCHARAT(RExC_parse);
11106 char* const s = RExC_parse++;
11108 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11110 if (RExC_parse == RExC_end)
11111 /* Grandfather lone [:, [=, [. */
11114 const char* const t = RExC_parse++; /* skip over the c */
11117 if (UCHARAT(RExC_parse) == ']') {
11118 const char *posixcc = s + 1;
11119 RExC_parse++; /* skip over the ending ] */
11122 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11123 const I32 skip = t - posixcc;
11125 /* Initially switch on the length of the name. */
11128 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11129 namedclass = ANYOF_ALNUM;
11132 /* Names all of length 5. */
11133 /* alnum alpha ascii blank cntrl digit graph lower
11134 print punct space upper */
11135 /* Offset 4 gives the best switch position. */
11136 switch (posixcc[4]) {
11138 if (memEQ(posixcc, "alph", 4)) /* alpha */
11139 namedclass = ANYOF_ALPHA;
11142 if (memEQ(posixcc, "spac", 4)) /* space */
11143 namedclass = ANYOF_PSXSPC;
11146 if (memEQ(posixcc, "grap", 4)) /* graph */
11147 namedclass = ANYOF_GRAPH;
11150 if (memEQ(posixcc, "asci", 4)) /* ascii */
11151 namedclass = ANYOF_ASCII;
11154 if (memEQ(posixcc, "blan", 4)) /* blank */
11155 namedclass = ANYOF_BLANK;
11158 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11159 namedclass = ANYOF_CNTRL;
11162 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11163 namedclass = ANYOF_ALNUMC;
11166 if (memEQ(posixcc, "lowe", 4)) /* lower */
11167 namedclass = ANYOF_LOWER;
11168 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11169 namedclass = ANYOF_UPPER;
11172 if (memEQ(posixcc, "digi", 4)) /* digit */
11173 namedclass = ANYOF_DIGIT;
11174 else if (memEQ(posixcc, "prin", 4)) /* print */
11175 namedclass = ANYOF_PRINT;
11176 else if (memEQ(posixcc, "punc", 4)) /* punct */
11177 namedclass = ANYOF_PUNCT;
11182 if (memEQ(posixcc, "xdigit", 6))
11183 namedclass = ANYOF_XDIGIT;
11187 if (namedclass == OOB_NAMEDCLASS)
11188 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11191 /* The #defines are structured so each complement is +1 to
11192 * the normal one */
11196 assert (posixcc[skip] == ':');
11197 assert (posixcc[skip+1] == ']');
11198 } else if (!SIZE_ONLY) {
11199 /* [[=foo=]] and [[.foo.]] are still future. */
11201 /* adjust RExC_parse so the warning shows after
11202 the class closes */
11203 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11205 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11208 /* Maternal grandfather:
11209 * "[:" ending in ":" but not in ":]" */
11219 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11223 PERL_ARGS_ASSERT_CHECKPOSIXCC;
11225 if (POSIXCC(UCHARAT(RExC_parse))) {
11226 const char *s = RExC_parse;
11227 const char c = *s++;
11229 while (isALNUM(*s))
11231 if (*s && c == *s && s[1] == ']') {
11233 "POSIX syntax [%c %c] belongs inside character classes",
11236 /* [[=foo=]] and [[.foo.]] are still future. */
11237 if (POSIXCC_NOTYET(c)) {
11238 /* adjust RExC_parse so the error shows after
11239 the class closes */
11240 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11242 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11248 /* Generate the code to add a full posix character <class> to the bracketed
11249 * character class given by <node>. (<node> is needed only under locale rules)
11250 * destlist is the inversion list for non-locale rules that this class is
11252 * sourcelist is the ASCII-range inversion list to add under /a rules
11253 * Xsourcelist is the full Unicode range list to use otherwise. */
11254 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11256 SV* scratch_list = NULL; \
11258 /* Set this class in the node for runtime matching */ \
11259 ANYOF_CLASS_SET(node, class); \
11261 /* For above Latin1 code points, we use the full Unicode range */ \
11262 _invlist_intersection(PL_AboveLatin1, \
11265 /* And set the output to it, adding instead if there already is an \
11266 * output. Checking if <destlist> is NULL first saves an extra \
11267 * clone. Its reference count will be decremented at the next \
11268 * union, etc, or if this is the only instance, at the end of the \
11270 if (! destlist) { \
11271 destlist = scratch_list; \
11274 _invlist_union(destlist, scratch_list, &destlist); \
11275 SvREFCNT_dec(scratch_list); \
11279 /* For non-locale, just add it to any existing list */ \
11280 _invlist_union(destlist, \
11281 (AT_LEAST_ASCII_RESTRICTED) \
11287 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11289 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11291 SV* scratch_list = NULL; \
11292 ANYOF_CLASS_SET(node, class); \
11293 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11294 if (! destlist) { \
11295 destlist = scratch_list; \
11298 _invlist_union(destlist, scratch_list, &destlist); \
11299 SvREFCNT_dec(scratch_list); \
11303 _invlist_union_complement_2nd(destlist, \
11304 (AT_LEAST_ASCII_RESTRICTED) \
11308 /* Under /d, everything in the upper half of the Latin1 range \
11309 * matches this complement */ \
11310 if (DEPENDS_SEMANTICS) { \
11311 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11315 /* Generate the code to add a posix character <class> to the bracketed
11316 * character class given by <node>. (<node> is needed only under locale rules)
11317 * destlist is the inversion list for non-locale rules that this class is
11319 * sourcelist is the ASCII-range inversion list to add under /a rules
11320 * l1_sourcelist is the Latin1 range list to use otherwise.
11321 * Xpropertyname is the name to add to <run_time_list> of the property to
11322 * specify the code points above Latin1 that will have to be
11323 * determined at run-time
11324 * run_time_list is a SV* that contains text names of properties that are to
11325 * be computed at run time. This concatenates <Xpropertyname>
11326 * to it, appropriately
11327 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11329 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11330 l1_sourcelist, Xpropertyname, run_time_list) \
11331 /* First, resolve whether to use the ASCII-only list or the L1 \
11333 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11334 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11335 Xpropertyname, run_time_list)
11337 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11338 Xpropertyname, run_time_list) \
11339 /* If not /a matching, there are going to be code points we will have \
11340 * to defer to runtime to look-up */ \
11341 if (! AT_LEAST_ASCII_RESTRICTED) { \
11342 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11345 ANYOF_CLASS_SET(node, class); \
11348 _invlist_union(destlist, sourcelist, &destlist); \
11351 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11352 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11354 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11355 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11356 if (AT_LEAST_ASCII_RESTRICTED) { \
11357 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11360 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11361 matches_above_unicode = TRUE; \
11363 ANYOF_CLASS_SET(node, namedclass); \
11366 SV* scratch_list = NULL; \
11367 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11368 if (! destlist) { \
11369 destlist = scratch_list; \
11372 _invlist_union(destlist, scratch_list, &destlist); \
11373 SvREFCNT_dec(scratch_list); \
11375 if (DEPENDS_SEMANTICS) { \
11376 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11382 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
11384 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
11385 * alternate list, pointed to by 'alternate_ptr'. This is an array of
11386 * the multi-character folds of characters in the node */
11389 PERL_ARGS_ASSERT_ADD_ALTERNATE;
11391 if (! *alternate_ptr) {
11392 *alternate_ptr = newAV();
11394 sv = newSVpvn_utf8((char*)string, len, TRUE);
11395 av_push(*alternate_ptr, sv);
11399 /* The names of properties whose definitions are not known at compile time are
11400 * stored in this SV, after a constant heading. So if the length has been
11401 * changed since initialization, then there is a run-time definition. */
11402 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11404 /* This converts the named class defined in regcomp.h to its equivalent class
11405 * number defined in handy.h. */
11406 #define namedclass_to_classnum(class) ((class) / 2)
11409 parse a class specification and produce either an ANYOF node that
11410 matches the pattern or perhaps will be optimized into an EXACTish node
11411 instead. The node contains a bit map for the first 256 characters, with the
11412 corresponding bit set if that character is in the list. For characters
11413 above 255, a range list is used */
11416 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11420 UV prevvalue = OOB_UNICODE;
11425 IV namedclass = OOB_NAMEDCLASS;
11426 char *rangebegin = NULL;
11427 bool need_class = 0;
11428 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
11430 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11431 than just initialized. */
11432 SV* properties = NULL; /* Code points that match \p{} \P{} */
11433 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11434 extended beyond the Latin1 range */
11435 UV element_count = 0; /* Number of distinct elements in the class.
11436 Optimizations may be possible if this is tiny */
11439 /* Unicode properties are stored in a swash; this holds the current one
11440 * being parsed. If this swash is the only above-latin1 component of the
11441 * character class, an optimization is to pass it directly on to the
11442 * execution engine. Otherwise, it is set to NULL to indicate that there
11443 * are other things in the class that have to be dealt with at execution
11445 SV* swash = NULL; /* Code points that match \p{} \P{} */
11447 /* Set if a component of this character class is user-defined; just passed
11448 * on to the engine */
11449 bool has_user_defined_property = FALSE;
11451 /* inversion list of code points this node matches only when the target
11452 * string is in UTF-8. (Because is under /d) */
11453 SV* depends_list = NULL;
11455 /* inversion list of code points this node matches. For much of the
11456 * function, it includes only those that match regardless of the utf8ness
11457 * of the target string */
11458 SV* cp_list = NULL;
11460 /* List of multi-character folds that are matched by this node */
11461 AV* unicode_alternate = NULL;
11463 /* In a range, counts how many 0-2 of the ends of it came from literals,
11464 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11465 UV literal_endpoint = 0;
11467 bool invert = FALSE; /* Is this class to be complemented */
11469 /* Is there any thing like \W or [:^digit:] that matches above the legal
11470 * Unicode range? */
11471 bool runtime_posix_matches_above_Unicode = FALSE;
11473 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11474 case we need to change the emitted regop to an EXACT. */
11475 const char * orig_parse = RExC_parse;
11476 const I32 orig_size = RExC_size;
11477 GET_RE_DEBUG_FLAGS_DECL;
11479 PERL_ARGS_ASSERT_REGCLASS;
11481 PERL_UNUSED_ARG(depth);
11484 DEBUG_PARSE("clas");
11486 /* Assume we are going to generate an ANYOF node. */
11487 ret = reganode(pRExC_state, ANYOF, 0);
11491 ANYOF_FLAGS(ret) = 0;
11494 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11499 /* We have decided to not allow multi-char folds in inverted character
11500 * classes, due to the confusion that can happen, especially with
11501 * classes that are designed for a non-Unicode world: You have the
11502 * peculiar case that:
11503 "s s" =~ /^[^\xDF]+$/i => Y
11504 "ss" =~ /^[^\xDF]+$/i => N
11506 * See [perl #89750] */
11507 allow_full_fold = FALSE;
11511 RExC_size += ANYOF_SKIP;
11512 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11515 RExC_emit += ANYOF_SKIP;
11517 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11519 listsv = newSVpvs("# comment\n");
11520 initial_listsv_len = SvCUR(listsv);
11523 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11525 if (!SIZE_ONLY && POSIXCC(nextvalue))
11526 checkposixcc(pRExC_state);
11528 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11529 if (UCHARAT(RExC_parse) == ']')
11530 goto charclassloop;
11533 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11537 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11540 rangebegin = RExC_parse;
11544 value = utf8n_to_uvchr((U8*)RExC_parse,
11545 RExC_end - RExC_parse,
11546 &numlen, UTF8_ALLOW_DEFAULT);
11547 RExC_parse += numlen;
11550 value = UCHARAT(RExC_parse++);
11552 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11553 if (value == '[' && POSIXCC(nextvalue))
11554 namedclass = regpposixcc(pRExC_state, value);
11555 else if (value == '\\') {
11557 value = utf8n_to_uvchr((U8*)RExC_parse,
11558 RExC_end - RExC_parse,
11559 &numlen, UTF8_ALLOW_DEFAULT);
11560 RExC_parse += numlen;
11563 value = UCHARAT(RExC_parse++);
11564 /* Some compilers cannot handle switching on 64-bit integer
11565 * values, therefore value cannot be an UV. Yes, this will
11566 * be a problem later if we want switch on Unicode.
11567 * A similar issue a little bit later when switching on
11568 * namedclass. --jhi */
11569 switch ((I32)value) {
11570 case 'w': namedclass = ANYOF_ALNUM; break;
11571 case 'W': namedclass = ANYOF_NALNUM; break;
11572 case 's': namedclass = ANYOF_SPACE; break;
11573 case 'S': namedclass = ANYOF_NSPACE; break;
11574 case 'd': namedclass = ANYOF_DIGIT; break;
11575 case 'D': namedclass = ANYOF_NDIGIT; break;
11576 case 'v': namedclass = ANYOF_VERTWS; break;
11577 case 'V': namedclass = ANYOF_NVERTWS; break;
11578 case 'h': namedclass = ANYOF_HORIZWS; break;
11579 case 'H': namedclass = ANYOF_NHORIZWS; break;
11580 case 'N': /* Handle \N{NAME} in class */
11582 /* We only pay attention to the first char of
11583 multichar strings being returned. I kinda wonder
11584 if this makes sense as it does change the behaviour
11585 from earlier versions, OTOH that behaviour was broken
11587 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11588 TRUE /* => charclass */))
11599 /* This routine will handle any undefined properties */
11600 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11602 if (RExC_parse >= RExC_end)
11603 vFAIL2("Empty \\%c{}", (U8)value);
11604 if (*RExC_parse == '{') {
11605 const U8 c = (U8)value;
11606 e = strchr(RExC_parse++, '}');
11608 vFAIL2("Missing right brace on \\%c{}", c);
11609 while (isSPACE(UCHARAT(RExC_parse)))
11611 if (e == RExC_parse)
11612 vFAIL2("Empty \\%c{}", c);
11613 n = e - RExC_parse;
11614 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11625 if (UCHARAT(RExC_parse) == '^') {
11628 value = value == 'p' ? 'P' : 'p'; /* toggle */
11629 while (isSPACE(UCHARAT(RExC_parse))) {
11634 /* Try to get the definition of the property into
11635 * <invlist>. If /i is in effect, the effective property
11636 * will have its name be <__NAME_i>. The design is
11637 * discussed in commit
11638 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11639 Newx(name, n + sizeof("_i__\n"), char);
11641 sprintf(name, "%s%.*s%s\n",
11642 (FOLD) ? "__" : "",
11648 /* Look up the property name, and get its swash and
11649 * inversion list, if the property is found */
11651 SvREFCNT_dec(swash);
11653 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11656 NULL, /* No inversion list */
11659 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11661 SvREFCNT_dec(swash);
11665 /* Here didn't find it. It could be a user-defined
11666 * property that will be available at run-time. Add it
11667 * to the list to look up then */
11668 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11669 (value == 'p' ? '+' : '!'),
11671 has_user_defined_property = TRUE;
11673 /* We don't know yet, so have to assume that the
11674 * property could match something in the Latin1 range,
11675 * hence something that isn't utf8. Note that this
11676 * would cause things in <depends_list> to match
11677 * inappropriately, except that any \p{}, including
11678 * this one forces Unicode semantics, which means there
11679 * is <no depends_list> */
11680 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11684 /* Here, did get the swash and its inversion list. If
11685 * the swash is from a user-defined property, then this
11686 * whole character class should be regarded as such */
11687 has_user_defined_property =
11689 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11691 /* Invert if asking for the complement */
11692 if (value == 'P') {
11693 _invlist_union_complement_2nd(properties,
11697 /* The swash can't be used as-is, because we've
11698 * inverted things; delay removing it to here after
11699 * have copied its invlist above */
11700 SvREFCNT_dec(swash);
11704 _invlist_union(properties, invlist, &properties);
11709 RExC_parse = e + 1;
11710 namedclass = ANYOF_MAX; /* no official name, but it's named */
11712 /* \p means they want Unicode semantics */
11713 RExC_uni_semantics = 1;
11716 case 'n': value = '\n'; break;
11717 case 'r': value = '\r'; break;
11718 case 't': value = '\t'; break;
11719 case 'f': value = '\f'; break;
11720 case 'b': value = '\b'; break;
11721 case 'e': value = ASCII_TO_NATIVE('\033');break;
11722 case 'a': value = ASCII_TO_NATIVE('\007');break;
11724 RExC_parse--; /* function expects to be pointed at the 'o' */
11726 const char* error_msg;
11727 bool valid = grok_bslash_o(RExC_parse,
11732 RExC_parse += numlen;
11737 if (PL_encoding && value < 0x100) {
11738 goto recode_encoding;
11742 RExC_parse--; /* function expects to be pointed at the 'x' */
11744 const char* error_msg;
11745 bool valid = grok_bslash_x(RExC_parse,
11750 RExC_parse += numlen;
11755 if (PL_encoding && value < 0x100)
11756 goto recode_encoding;
11759 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11761 case '0': case '1': case '2': case '3': case '4':
11762 case '5': case '6': case '7':
11764 /* Take 1-3 octal digits */
11765 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11767 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11768 RExC_parse += numlen;
11769 if (PL_encoding && value < 0x100)
11770 goto recode_encoding;
11774 if (! RExC_override_recoding) {
11775 SV* enc = PL_encoding;
11776 value = reg_recode((const char)(U8)value, &enc);
11777 if (!enc && SIZE_ONLY)
11778 ckWARNreg(RExC_parse,
11779 "Invalid escape in the specified encoding");
11783 /* Allow \_ to not give an error */
11784 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11785 ckWARN2reg(RExC_parse,
11786 "Unrecognized escape \\%c in character class passed through",
11791 } /* end of \blah */
11794 literal_endpoint++;
11797 /* What matches in a locale is not known until runtime. This
11798 * includes what the Posix classes (like \w, [:space:]) match.
11799 * Room must be reserved (one time per class) to store such
11800 * classes, either if Perl is compiled so that locale nodes always
11801 * should have this space, or if there is such class info to be
11802 * stored. The space will contain a bit for each named class that
11803 * is to be matched against. This isn't needed for \p{} and
11804 * pseudo-classes, as they are not affected by locale, and hence
11805 * are dealt with separately */
11808 && (ANYOF_LOCALE == ANYOF_CLASS
11809 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11813 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11816 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11817 ANYOF_CLASS_ZERO(ret);
11819 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11822 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11824 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11825 * literal, as is the character that began the false range, i.e.
11826 * the 'a' in the examples */
11830 RExC_parse >= rangebegin ?
11831 RExC_parse - rangebegin : 0;
11832 ckWARN4reg(RExC_parse,
11833 "False [] range \"%*.*s\"",
11835 cp_list = add_cp_to_invlist(cp_list, '-');
11836 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11839 range = 0; /* this was not a true range */
11840 element_count += 2; /* So counts for three values */
11844 switch ((I32)namedclass) {
11846 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11847 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11848 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11850 case ANYOF_NALNUMC:
11851 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11852 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11853 runtime_posix_matches_above_Unicode);
11856 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11857 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11860 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11861 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11862 runtime_posix_matches_above_Unicode);
11866 ANYOF_CLASS_SET(ret, namedclass);
11869 _invlist_union(posixes, PL_ASCII, &posixes);
11874 ANYOF_CLASS_SET(ret, namedclass);
11877 _invlist_union_complement_2nd(posixes,
11878 PL_ASCII, &posixes);
11879 if (DEPENDS_SEMANTICS) {
11880 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11885 DO_POSIX(ret, namedclass, posixes,
11886 PL_PosixBlank, PL_XPosixBlank);
11889 DO_N_POSIX(ret, namedclass, posixes,
11890 PL_PosixBlank, PL_XPosixBlank);
11893 DO_POSIX(ret, namedclass, posixes,
11894 PL_PosixCntrl, PL_XPosixCntrl);
11897 DO_N_POSIX(ret, namedclass, posixes,
11898 PL_PosixCntrl, PL_XPosixCntrl);
11901 /* There are no digits in the Latin1 range outside of
11902 * ASCII, so call the macro that doesn't have to resolve
11904 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11905 PL_PosixDigit, "XPosixDigit", listsv);
11908 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11909 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11910 runtime_posix_matches_above_Unicode);
11913 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11914 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11917 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11918 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11919 runtime_posix_matches_above_Unicode);
11921 case ANYOF_HORIZWS:
11922 /* For these, we use the cp_list, as /d doesn't make a
11923 * difference in what these match. There would be problems
11924 * if these characters had folds other than themselves, as
11925 * cp_list is subject to folding. It turns out that \h
11926 * is just a synonym for XPosixBlank */
11927 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11929 case ANYOF_NHORIZWS:
11930 _invlist_union_complement_2nd(cp_list,
11931 PL_XPosixBlank, &cp_list);
11935 { /* These require special handling, as they differ under
11936 folding, matching Cased there (which in the ASCII range
11937 is the same as Alpha */
11943 if (FOLD && ! LOC) {
11944 ascii_source = PL_PosixAlpha;
11945 l1_source = PL_L1Cased;
11949 ascii_source = PL_PosixLower;
11950 l1_source = PL_L1PosixLower;
11951 Xname = "XPosixLower";
11953 if (namedclass == ANYOF_LOWER) {
11954 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11955 ascii_source, l1_source, Xname, listsv);
11958 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11959 posixes, ascii_source, l1_source, Xname, listsv,
11960 runtime_posix_matches_above_Unicode);
11965 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11966 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11969 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11970 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
11971 runtime_posix_matches_above_Unicode);
11974 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11975 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11978 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11979 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
11980 runtime_posix_matches_above_Unicode);
11983 DO_POSIX(ret, namedclass, posixes,
11984 PL_PosixSpace, PL_XPosixSpace);
11986 case ANYOF_NPSXSPC:
11987 DO_N_POSIX(ret, namedclass, posixes,
11988 PL_PosixSpace, PL_XPosixSpace);
11991 DO_POSIX(ret, namedclass, posixes,
11992 PL_PerlSpace, PL_XPerlSpace);
11995 DO_N_POSIX(ret, namedclass, posixes,
11996 PL_PerlSpace, PL_XPerlSpace);
11998 case ANYOF_UPPER: /* Same as LOWER, above */
12005 if (FOLD && ! LOC) {
12006 ascii_source = PL_PosixAlpha;
12007 l1_source = PL_L1Cased;
12011 ascii_source = PL_PosixUpper;
12012 l1_source = PL_L1PosixUpper;
12013 Xname = "XPosixUpper";
12015 if (namedclass == ANYOF_UPPER) {
12016 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12017 ascii_source, l1_source, Xname, listsv);
12020 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12021 posixes, ascii_source, l1_source, Xname, listsv,
12022 runtime_posix_matches_above_Unicode);
12026 case ANYOF_ALNUM: /* Really is 'Word' */
12027 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12028 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12031 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12032 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12033 runtime_posix_matches_above_Unicode);
12036 /* For these, we use the cp_list, as /d doesn't make a
12037 * difference in what these match. There would be problems
12038 * if these characters had folds other than themselves, as
12039 * cp_list is subject to folding */
12040 _invlist_union(cp_list, PL_VertSpace, &cp_list);
12042 case ANYOF_NVERTWS:
12043 _invlist_union_complement_2nd(cp_list,
12044 PL_VertSpace, &cp_list);
12047 DO_POSIX(ret, namedclass, posixes,
12048 PL_PosixXDigit, PL_XPosixXDigit);
12050 case ANYOF_NXDIGIT:
12051 DO_N_POSIX(ret, namedclass, posixes,
12052 PL_PosixXDigit, PL_XPosixXDigit);
12055 /* this is to handle \p and \P */
12058 vFAIL("Invalid [::] class");
12062 continue; /* Go get next character */
12064 } /* end of namedclass \blah */
12067 if (prevvalue > value) /* b-a */ {
12068 const int w = RExC_parse - rangebegin;
12069 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12070 range = 0; /* not a valid range */
12074 prevvalue = value; /* save the beginning of the potential range */
12075 if (RExC_parse+1 < RExC_end
12076 && *RExC_parse == '-'
12077 && RExC_parse[1] != ']')
12081 /* a bad range like \w-, [:word:]- ? */
12082 if (namedclass > OOB_NAMEDCLASS) {
12083 if (ckWARN(WARN_REGEXP)) {
12085 RExC_parse >= rangebegin ?
12086 RExC_parse - rangebegin : 0;
12088 "False [] range \"%*.*s\"",
12092 cp_list = add_cp_to_invlist(cp_list, '-');
12096 range = 1; /* yeah, it's a range! */
12097 continue; /* but do it the next time */
12101 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12104 /* non-Latin1 code point implies unicode semantics. Must be set in
12105 * pass1 so is there for the whole of pass 2 */
12107 RExC_uni_semantics = 1;
12110 /* Ready to process either the single value, or the completed range */
12113 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12115 UV* this_range = _new_invlist(1);
12116 _append_range_to_invlist(this_range, prevvalue, value);
12118 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12119 * If this range was specified using something like 'i-j', we want
12120 * to include only the 'i' and the 'j', and not anything in
12121 * between, so exclude non-ASCII, non-alphabetics from it.
12122 * However, if the range was specified with something like
12123 * [\x89-\x91] or [\x89-j], all code points within it should be
12124 * included. literal_endpoint==2 means both ends of the range used
12125 * a literal character, not \x{foo} */
12126 if (literal_endpoint == 2
12127 && (prevvalue >= 'a' && value <= 'z')
12128 || (prevvalue >= 'A' && value <= 'Z'))
12130 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12131 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12133 _invlist_union(cp_list, this_range, &cp_list);
12134 literal_endpoint = 0;
12138 range = 0; /* this range (if it was one) is done now */
12139 } /* End of loop through all the text within the brackets */
12141 /* If the character class contains only a single element, it may be
12142 * optimizable into another node type which is smaller and runs faster.
12143 * Check if this is the case for this class */
12144 if (element_count == 1) {
12148 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12149 [:digit:] or \p{foo} */
12151 /* Certain named classes have equivalents that can appear outside a
12152 * character class, e.g. \w, \H. We use these instead of a
12153 * character class. */
12154 switch ((I32)namedclass) {
12157 /* The first group is for node types that depend on the charset
12158 * modifier to the regex. We first calculate the base node
12159 * type, and if it should be inverted */
12166 goto join_charset_classes;
12173 goto join_charset_classes;
12181 join_charset_classes:
12183 /* Now that we have the base node type, we take advantage
12184 * of the enum ordering of the charset modifiers to get the
12185 * exact node type, For example the base SPACE also has
12186 * SPACEL, SPACEU, and SPACEA */
12188 offset = get_regex_charset(RExC_flags);
12190 /* /aa is the same as /a for these */
12191 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12192 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12194 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12195 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12200 /* The number of varieties of each of these is the same,
12201 * hence, so is the delta between the normal and
12202 * complemented nodes */
12204 op += NALNUM - ALNUM;
12206 *flagp |= HASWIDTH|SIMPLE;
12209 /* The second group doesn't depend of the charset modifiers.
12210 * We just have normal and complemented */
12211 case ANYOF_NHORIZWS:
12214 case ANYOF_HORIZWS:
12216 op = (invert) ? NHORIZWS : HORIZWS;
12217 *flagp |= HASWIDTH|SIMPLE;
12220 case ANYOF_NVERTWS:
12224 op = (invert) ? NVERTWS : VERTWS;
12225 *flagp |= HASWIDTH|SIMPLE;
12235 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12240 /* A generic posix class. All the /a ones can be handled
12241 * by the POSIXA opcode. And all are closed under folding
12242 * in the ASCII range, so FOLD doesn't matter */
12243 if (AT_LEAST_ASCII_RESTRICTED
12244 || (! LOC && namedclass == ANYOF_ASCII))
12246 /* The odd numbered ones are the complements of the
12247 * next-lower even number one */
12248 if (namedclass % 2 == 1) {
12252 arg = namedclass_to_classnum(namedclass);
12253 op = (invert) ? NPOSIXA : POSIXA;
12258 else if (value == prevvalue) {
12260 /* Here, the class consists of just a single code point */
12263 if (! LOC && value == '\n') {
12264 op = REG_ANY; /* Optimize [^\n] */
12265 *flagp |= HASWIDTH|SIMPLE;
12269 else if (value < 256 || UTF) {
12271 /* Optimize a single value into an EXACTish node, but not if it
12272 * would require converting the pattern to UTF-8. */
12273 op = compute_EXACTish(pRExC_state);
12275 } /* Otherwise is a range */
12276 else if (! LOC) { /* locale could vary these */
12277 if (prevvalue == '0') {
12278 if (value == '9') {
12279 op = (invert) ? NDIGITA : DIGITA;
12280 *flagp |= HASWIDTH|SIMPLE;
12285 /* Here, we have changed <op> away from its initial value iff we found
12286 * an optimization */
12289 /* Throw away this ANYOF regnode, and emit the calculated one,
12290 * which should correspond to the beginning, not current, state of
12292 const char * cur_parse = RExC_parse;
12293 RExC_parse = (char *)orig_parse;
12297 /* To get locale nodes to not use the full ANYOF size would
12298 * require moving the code above that writes the portions
12299 * of it that aren't in other nodes to after this point.
12300 * e.g. ANYOF_CLASS_SET */
12301 RExC_size = orig_size;
12305 RExC_emit = (regnode *)orig_emit;
12308 ret = reg_node(pRExC_state, op);
12310 if (PL_regkind[op] == POSIXD) {
12314 *flagp |= HASWIDTH|SIMPLE;
12316 else if (PL_regkind[op] == EXACT) {
12317 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12320 RExC_parse = (char *) cur_parse;
12322 SvREFCNT_dec(listsv);
12329 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12331 /* If folding, we calculate all characters that could fold to or from the
12332 * ones already on the list */
12333 if (FOLD && cp_list) {
12334 UV start, end; /* End points of code point ranges */
12336 SV* fold_intersection = NULL;
12338 /* In the Latin1 range, the characters that can be folded-to or -from
12339 * are precisely the alphabetic characters. If the highest code point
12340 * is within Latin1, we can use the compiled-in list, and not have to
12341 * go out to disk. */
12342 if (invlist_highest(cp_list) < 256) {
12343 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12347 /* Here, there are non-Latin1 code points, so we will have to go
12348 * fetch the list of all the characters that participate in folds
12350 if (! PL_utf8_foldable) {
12351 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12352 &PL_sv_undef, 1, 0);
12353 PL_utf8_foldable = _get_swash_invlist(swash);
12354 SvREFCNT_dec(swash);
12357 /* This is a hash that for a particular fold gives all characters
12358 * that are involved in it */
12359 if (! PL_utf8_foldclosures) {
12361 /* If we were unable to find any folds, then we likely won't be
12362 * able to find the closures. So just create an empty list.
12363 * Folding will effectively be restricted to the non-Unicode
12364 * rules hard-coded into Perl. (This case happens legitimately
12365 * during compilation of Perl itself before the Unicode tables
12366 * are generated) */
12367 if (_invlist_len(PL_utf8_foldable) == 0) {
12368 PL_utf8_foldclosures = newHV();
12371 /* If the folds haven't been read in, call a fold function
12373 if (! PL_utf8_tofold) {
12374 U8 dummy[UTF8_MAXBYTES+1];
12377 /* This string is just a short named one above \xff */
12378 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
12379 assert(PL_utf8_tofold); /* Verify that worked */
12381 PL_utf8_foldclosures =
12382 _swash_inversion_hash(PL_utf8_tofold);
12386 /* Only the characters in this class that participate in folds need
12387 * be checked. Get the intersection of this class and all the
12388 * possible characters that are foldable. This can quickly narrow
12389 * down a large class */
12390 _invlist_intersection(PL_utf8_foldable, cp_list,
12391 &fold_intersection);
12394 /* Now look at the foldable characters in this class individually */
12395 invlist_iterinit(fold_intersection);
12396 while (invlist_iternext(fold_intersection, &start, &end)) {
12399 /* Locale folding for Latin1 characters is deferred until runtime */
12400 if (LOC && start < 256) {
12404 /* Look at every character in the range */
12405 for (j = start; j <= end; j++) {
12407 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12413 /* We have the latin1 folding rules hard-coded here so that
12414 * an innocent-looking character class, like /[ks]/i won't
12415 * have to go out to disk to find the possible matches.
12416 * XXX It would be better to generate these via regen, in
12417 * case a new version of the Unicode standard adds new
12418 * mappings, though that is not really likely, and may be
12419 * caught by the default: case of the switch below. */
12421 if (PL_fold_latin1[j] != j) {
12423 /* ASCII is always matched; non-ASCII is matched only
12424 * under Unicode rules */
12425 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12427 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12431 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12435 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12436 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12438 /* Certain Latin1 characters have matches outside
12439 * Latin1, or are multi-character. To get here, 'j' is
12440 * one of those characters. None of these matches is
12441 * valid for ASCII characters under /aa, which is why
12442 * the 'if' just above excludes those. The matches
12443 * fall into three categories:
12444 * 1) They are singly folded-to or -from an above 255
12445 * character, e.g., LATIN SMALL LETTER Y WITH
12446 * DIAERESIS and LATIN CAPITAL LETTER Y WITH
12448 * 2) They are part of a multi-char fold with another
12449 * latin1 character; only LATIN SMALL LETTER
12450 * SHARP S => "ss" fits this;
12451 * 3) They are part of a multi-char fold with a
12452 * character outside of Latin1, such as various
12454 * We aren't dealing fully with multi-char folds, except
12455 * we do deal with the pattern containing a character
12456 * that has a multi-char fold (not so much the inverse).
12457 * For types 1) and 3), the matches only happen when the
12458 * target string is utf8; that's not true for 2), and we
12459 * set a flag for it.
12461 * The code below adds the single fold closures for 'j'
12462 * to the inversion list. */
12467 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12471 cp_list = add_cp_to_invlist(cp_list,
12472 LATIN_SMALL_LETTER_LONG_S);
12475 cp_list = add_cp_to_invlist(cp_list,
12476 GREEK_CAPITAL_LETTER_MU);
12477 cp_list = add_cp_to_invlist(cp_list,
12478 GREEK_SMALL_LETTER_MU);
12480 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12481 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12483 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12485 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12486 cp_list = add_cp_to_invlist(cp_list,
12487 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12489 case LATIN_SMALL_LETTER_SHARP_S:
12490 cp_list = add_cp_to_invlist(cp_list,
12491 LATIN_CAPITAL_LETTER_SHARP_S);
12493 /* Under /a, /d, and /u, this can match the two
12495 if (! ASCII_FOLD_RESTRICTED) {
12496 add_alternate(&unicode_alternate,
12499 /* And under /u or /a, it can match even if
12500 * the target is not utf8 */
12501 if (AT_LEAST_UNI_SEMANTICS) {
12502 ANYOF_FLAGS(ret) |=
12503 ANYOF_NONBITMAP_NON_UTF8;
12507 case 'F': case 'f':
12508 case 'I': case 'i':
12509 case 'L': case 'l':
12510 case 'T': case 't':
12511 case 'A': case 'a':
12512 case 'H': case 'h':
12513 case 'J': case 'j':
12514 case 'N': case 'n':
12515 case 'W': case 'w':
12516 case 'Y': case 'y':
12517 /* These all are targets of multi-character
12518 * folds from code points that require UTF8 to
12519 * express, so they can't match unless the
12520 * target string is in UTF-8, so no action here
12521 * is necessary, as regexec.c properly handles
12522 * the general case for UTF-8 matching */
12525 /* Use deprecated warning to increase the
12526 * chances of this being output */
12527 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12534 /* Here is an above Latin1 character. We don't have the rules
12535 * hard-coded for it. First, get its fold */
12536 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12537 ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12539 ? FOLD_FLAGS_LOCALE
12540 : (ASCII_FOLD_RESTRICTED)
12541 ? FOLD_FLAGS_NOMIX_ASCII
12544 if (foldlen > (STRLEN)UNISKIP(f)) {
12546 /* Any multicharacter foldings (disallowed in lookbehind
12547 * patterns) require the following transform: [ABCDEF] ->
12548 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12549 * folds into "rst", all other characters fold to single
12550 * characters. We save away these multicharacter foldings,
12551 * to be later saved as part of the additional "s" data. */
12552 if (! RExC_in_lookbehind) {
12554 U8* e = foldbuf + foldlen;
12556 /* If any of the folded characters of this are in the
12557 * Latin1 range, tell the regex engine that this can
12558 * match a non-utf8 target string. */
12560 if (UTF8_IS_INVARIANT(*loc)
12561 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12564 |= ANYOF_NONBITMAP_NON_UTF8;
12567 loc += UTF8SKIP(loc);
12570 add_alternate(&unicode_alternate, foldbuf, foldlen);
12574 /* Single character fold of above Latin1. Add everything
12575 * in its fold closure to the list that this node should
12579 /* The fold closures data structure is a hash with the keys
12580 * being every character that is folded to, like 'k', and
12581 * the values each an array of everything that folds to its
12582 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
12583 if ((listp = hv_fetch(PL_utf8_foldclosures,
12584 (char *) foldbuf, foldlen, FALSE)))
12586 AV* list = (AV*) *listp;
12588 for (k = 0; k <= av_len(list); k++) {
12589 SV** c_p = av_fetch(list, k, FALSE);
12592 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12596 /* /aa doesn't allow folds between ASCII and non-;
12597 * /l doesn't allow them between above and below
12599 if ((ASCII_FOLD_RESTRICTED
12600 && (isASCII(c) != isASCII(j)))
12601 || (LOC && ((c < 256) != (j < 256))))
12606 /* Folds involving non-ascii Latin1 characters
12607 * under /d are added to a separate list */
12608 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12610 cp_list = add_cp_to_invlist(cp_list, c);
12613 depends_list = add_cp_to_invlist(depends_list, c);
12620 SvREFCNT_dec(fold_intersection);
12623 /* And combine the result (if any) with any inversion list from posix
12624 * classes. The lists are kept separate up to now because we don't want to
12625 * fold the classes (folding of those is automatically handled by the swash
12626 * fetching code) */
12628 if (! DEPENDS_SEMANTICS) {
12630 _invlist_union(cp_list, posixes, &cp_list);
12631 SvREFCNT_dec(posixes);
12638 /* Under /d, we put into a separate list the Latin1 things that
12639 * match only when the target string is utf8 */
12640 SV* nonascii_but_latin1_properties = NULL;
12641 _invlist_intersection(posixes, PL_Latin1,
12642 &nonascii_but_latin1_properties);
12643 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12644 &nonascii_but_latin1_properties);
12645 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12648 _invlist_union(cp_list, posixes, &cp_list);
12649 SvREFCNT_dec(posixes);
12655 if (depends_list) {
12656 _invlist_union(depends_list, nonascii_but_latin1_properties,
12658 SvREFCNT_dec(nonascii_but_latin1_properties);
12661 depends_list = nonascii_but_latin1_properties;
12666 /* And combine the result (if any) with any inversion list from properties.
12667 * The lists are kept separate up to now so that we can distinguish the two
12668 * in regards to matching above-Unicode. A run-time warning is generated
12669 * if a Unicode property is matched against a non-Unicode code point. But,
12670 * we allow user-defined properties to match anything, without any warning,
12671 * and we also suppress the warning if there is a portion of the character
12672 * class that isn't a Unicode property, and which matches above Unicode, \W
12673 * or [\x{110000}] for example.
12674 * (Note that in this case, unlike the Posix one above, there is no
12675 * <depends_list>, because having a Unicode property forces Unicode
12678 bool warn_super = ! has_user_defined_property;
12681 /* If it matters to the final outcome, see if a non-property
12682 * component of the class matches above Unicode. If so, the
12683 * warning gets suppressed. This is true even if just a single
12684 * such code point is specified, as though not strictly correct if
12685 * another such code point is matched against, the fact that they
12686 * are using above-Unicode code points indicates they should know
12687 * the issues involved */
12689 bool non_prop_matches_above_Unicode =
12690 runtime_posix_matches_above_Unicode
12691 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12693 non_prop_matches_above_Unicode =
12694 ! non_prop_matches_above_Unicode;
12696 warn_super = ! non_prop_matches_above_Unicode;
12699 _invlist_union(properties, cp_list, &cp_list);
12700 SvREFCNT_dec(properties);
12703 cp_list = properties;
12707 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12711 /* Here, we have calculated what code points should be in the character
12714 * Now we can see about various optimizations. Fold calculation (which we
12715 * did above) needs to take place before inversion. Otherwise /[^k]/i
12716 * would invert to include K, which under /i would match k, which it
12717 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12718 * folded until runtime */
12720 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12721 * at compile time. Besides not inverting folded locale now, we can't invert
12722 * if there are things such as \w, which aren't known until runtime */
12724 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12726 && ! unicode_alternate
12727 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12729 _invlist_invert(cp_list);
12731 /* Any swash can't be used as-is, because we've inverted things */
12733 SvREFCNT_dec(swash);
12737 /* Clear the invert flag since have just done it here */
12741 /* If we didn't do folding, it's because some information isn't available
12742 * until runtime; set the run-time fold flag for these. (We don't have to
12743 * worry about properties folding, as that is taken care of by the swash
12745 if (FOLD && (LOC || unicode_alternate))
12747 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12750 /* Some character classes are equivalent to other nodes. Such nodes take
12751 * up less room and generally fewer operations to execute than ANYOF nodes.
12752 * Above, we checked for and optimized into some such equivalents for
12753 * certain common classes that are easy to test. Getting to this point in
12754 * the code means that the class didn't get optimized there. Since this
12755 * code is only executed in Pass 2, it is too late to save space--it has
12756 * been allocated in Pass 1, and currently isn't given back. But turning
12757 * things into an EXACTish node can allow the optimizer to join it to any
12758 * adjacent such nodes. And if the class is equivalent to things like /./,
12759 * expensive run-time swashes can be avoided. Now that we have more
12760 * complete information, we can find things necessarily missed by the
12761 * earlier code. I (khw) am not sure how much to look for here. It would
12762 * be easy, but perhaps too slow, to check any candidates against all the
12763 * node types they could possibly match using _invlistEQ(). */
12766 && ! unicode_alternate
12769 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12770 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12773 U8 op = END; /* The optimzation node-type */
12774 const char * cur_parse= RExC_parse;
12776 invlist_iterinit(cp_list);
12777 if (! invlist_iternext(cp_list, &start, &end)) {
12779 /* Here, the list is empty. This happens, for example, when a
12780 * Unicode property is the only thing in the character class, and
12781 * it doesn't match anything. (perluniprops.pod notes such
12784 *flagp |= HASWIDTH|SIMPLE;
12786 else if (start == end) { /* The range is a single code point */
12787 if (! invlist_iternext(cp_list, &start, &end)
12789 /* Don't do this optimization if it would require changing
12790 * the pattern to UTF-8 */
12791 && (start < 256 || UTF))
12793 /* Here, the list contains a single code point. Can optimize
12794 * into an EXACT node */
12803 /* A locale node under folding with one code point can be
12804 * an EXACTFL, as its fold won't be calculated until
12810 /* Here, we are generally folding, but there is only one
12811 * code point to match. If we have to, we use an EXACT
12812 * node, but it would be better for joining with adjacent
12813 * nodes in the optimization pass if we used the same
12814 * EXACTFish node that any such are likely to be. We can
12815 * do this iff the code point doesn't participate in any
12816 * folds. For example, an EXACTF of a colon is the same as
12817 * an EXACT one, since nothing folds to or from a colon.
12818 * In the Latin1 range, being an alpha means that the
12819 * character participates in a fold (except for the
12820 * feminine and masculine ordinals, which I (khw) don't
12821 * think are worrying about optimizing for). */
12823 if (isALPHA_L1(value)) {
12828 if (! PL_utf8_foldable) {
12829 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12830 &PL_sv_undef, 1, 0);
12831 PL_utf8_foldable = _get_swash_invlist(swash);
12832 SvREFCNT_dec(swash);
12834 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
12839 /* If we haven't found the node type, above, it means we
12840 * can use the prevailing one */
12842 op = compute_EXACTish(pRExC_state);
12847 else if (start == 0) {
12848 if (end == UV_MAX) {
12850 *flagp |= HASWIDTH|SIMPLE;
12853 else if (end == '\n' - 1
12854 && invlist_iternext(cp_list, &start, &end)
12855 && start == '\n' + 1 && end == UV_MAX)
12858 *flagp |= HASWIDTH|SIMPLE;
12864 RExC_parse = (char *)orig_parse;
12865 RExC_emit = (regnode *)orig_emit;
12867 ret = reg_node(pRExC_state, op);
12869 RExC_parse = (char *)cur_parse;
12871 if (PL_regkind[op] == EXACT) {
12872 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12875 SvREFCNT_dec(listsv);
12880 /* Here, <cp_list> contains all the code points we can determine at
12881 * compile time that match under all conditions. Go through it, and
12882 * for things that belong in the bitmap, put them there, and delete from
12883 * <cp_list>. While we are at it, see if everything above 255 is in the
12884 * list, and if so, set a flag to speed up execution */
12885 ANYOF_BITMAP_ZERO(ret);
12888 /* This gets set if we actually need to modify things */
12889 bool change_invlist = FALSE;
12893 /* Start looking through <cp_list> */
12894 invlist_iterinit(cp_list);
12895 while (invlist_iternext(cp_list, &start, &end)) {
12899 if (end == UV_MAX && start <= 256) {
12900 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12903 /* Quit if are above what we should change */
12908 change_invlist = TRUE;
12910 /* Set all the bits in the range, up to the max that we are doing */
12911 high = (end < 255) ? end : 255;
12912 for (i = start; i <= (int) high; i++) {
12913 if (! ANYOF_BITMAP_TEST(ret, i)) {
12914 ANYOF_BITMAP_SET(ret, i);
12921 /* Done with loop; remove any code points that are in the bitmap from
12923 if (change_invlist) {
12924 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12927 /* If have completely emptied it, remove it completely */
12928 if (_invlist_len(cp_list) == 0) {
12929 SvREFCNT_dec(cp_list);
12935 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12938 /* Here, the bitmap has been populated with all the Latin1 code points that
12939 * always match. Can now add to the overall list those that match only
12940 * when the target string is UTF-8 (<depends_list>). */
12941 if (depends_list) {
12943 _invlist_union(cp_list, depends_list, &cp_list);
12944 SvREFCNT_dec(depends_list);
12947 cp_list = depends_list;
12951 /* If there is a swash and more than one element, we can't use the swash in
12952 * the optimization below. */
12953 if (swash && element_count > 1) {
12954 SvREFCNT_dec(swash);
12959 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12960 && ! unicode_alternate)
12962 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12963 SvREFCNT_dec(listsv);
12964 SvREFCNT_dec(unicode_alternate);
12967 /* av[0] stores the character class description in its textual form:
12968 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12969 * appropriate swash, and is also useful for dumping the regnode.
12970 * av[1] if NULL, is a placeholder to later contain the swash computed
12971 * from av[0]. But if no further computation need be done, the
12972 * swash is stored there now.
12973 * av[2] stores the multicharacter foldings, used later in
12974 * regexec.c:S_reginclass().
12975 * av[3] stores the cp_list inversion list for use in addition or
12976 * instead of av[0]; used only if av[1] is NULL
12977 * av[4] is set if any component of the class is from a user-defined
12978 * property; used only if av[1] is NULL */
12979 AV * const av = newAV();
12982 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12986 av_store(av, 1, swash);
12987 SvREFCNT_dec(cp_list);
12990 av_store(av, 1, NULL);
12992 av_store(av, 3, cp_list);
12993 av_store(av, 4, newSVuv(has_user_defined_property));
12997 /* Store any computed multi-char folds only if we are allowing
12999 if (allow_full_fold) {
13000 av_store(av, 2, MUTABLE_SV(unicode_alternate));
13001 if (unicode_alternate) { /* This node is variable length */
13006 av_store(av, 2, NULL);
13008 rv = newRV_noinc(MUTABLE_SV(av));
13009 n = add_data(pRExC_state, 1, "s");
13010 RExC_rxi->data->data[n] = (void*)rv;
13014 *flagp |= HASWIDTH|SIMPLE;
13017 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13020 /* reg_skipcomment()
13022 Absorbs an /x style # comments from the input stream.
13023 Returns true if there is more text remaining in the stream.
13024 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13025 terminates the pattern without including a newline.
13027 Note its the callers responsibility to ensure that we are
13028 actually in /x mode
13033 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13037 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13039 while (RExC_parse < RExC_end)
13040 if (*RExC_parse++ == '\n') {
13045 /* we ran off the end of the pattern without ending
13046 the comment, so we have to add an \n when wrapping */
13047 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13055 Advances the parse position, and optionally absorbs
13056 "whitespace" from the inputstream.
13058 Without /x "whitespace" means (?#...) style comments only,
13059 with /x this means (?#...) and # comments and whitespace proper.
13061 Returns the RExC_parse point from BEFORE the scan occurs.
13063 This is the /x friendly way of saying RExC_parse++.
13067 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13069 char* const retval = RExC_parse++;
13071 PERL_ARGS_ASSERT_NEXTCHAR;
13074 if (RExC_end - RExC_parse >= 3
13075 && *RExC_parse == '('
13076 && RExC_parse[1] == '?'
13077 && RExC_parse[2] == '#')
13079 while (*RExC_parse != ')') {
13080 if (RExC_parse == RExC_end)
13081 FAIL("Sequence (?#... not terminated");
13087 if (RExC_flags & RXf_PMf_EXTENDED) {
13088 if (isSPACE(*RExC_parse)) {
13092 else if (*RExC_parse == '#') {
13093 if ( reg_skipcomment( pRExC_state ) )
13102 - reg_node - emit a node
13104 STATIC regnode * /* Location. */
13105 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13109 regnode * const ret = RExC_emit;
13110 GET_RE_DEBUG_FLAGS_DECL;
13112 PERL_ARGS_ASSERT_REG_NODE;
13115 SIZE_ALIGN(RExC_size);
13119 if (RExC_emit >= RExC_emit_bound)
13120 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13121 op, RExC_emit, RExC_emit_bound);
13123 NODE_ALIGN_FILL(ret);
13125 FILL_ADVANCE_NODE(ptr, op);
13126 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13127 #ifdef RE_TRACK_PATTERN_OFFSETS
13128 if (RExC_offsets) { /* MJD */
13129 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13130 "reg_node", __LINE__,
13132 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13133 ? "Overwriting end of array!\n" : "OK",
13134 (UV)(RExC_emit - RExC_emit_start),
13135 (UV)(RExC_parse - RExC_start),
13136 (UV)RExC_offsets[0]));
13137 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13145 - reganode - emit a node with an argument
13147 STATIC regnode * /* Location. */
13148 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13152 regnode * const ret = RExC_emit;
13153 GET_RE_DEBUG_FLAGS_DECL;
13155 PERL_ARGS_ASSERT_REGANODE;
13158 SIZE_ALIGN(RExC_size);
13163 assert(2==regarglen[op]+1);
13165 Anything larger than this has to allocate the extra amount.
13166 If we changed this to be:
13168 RExC_size += (1 + regarglen[op]);
13170 then it wouldn't matter. Its not clear what side effect
13171 might come from that so its not done so far.
13176 if (RExC_emit >= RExC_emit_bound)
13177 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13178 op, RExC_emit, RExC_emit_bound);
13180 NODE_ALIGN_FILL(ret);
13182 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13183 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13184 #ifdef RE_TRACK_PATTERN_OFFSETS
13185 if (RExC_offsets) { /* MJD */
13186 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13190 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13191 "Overwriting end of array!\n" : "OK",
13192 (UV)(RExC_emit - RExC_emit_start),
13193 (UV)(RExC_parse - RExC_start),
13194 (UV)RExC_offsets[0]));
13195 Set_Cur_Node_Offset;
13203 - reguni - emit (if appropriate) a Unicode character
13206 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13210 PERL_ARGS_ASSERT_REGUNI;
13212 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13216 - reginsert - insert an operator in front of already-emitted operand
13218 * Means relocating the operand.
13221 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13227 const int offset = regarglen[(U8)op];
13228 const int size = NODE_STEP_REGNODE + offset;
13229 GET_RE_DEBUG_FLAGS_DECL;
13231 PERL_ARGS_ASSERT_REGINSERT;
13232 PERL_UNUSED_ARG(depth);
13233 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13234 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13243 if (RExC_open_parens) {
13245 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13246 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13247 if ( RExC_open_parens[paren] >= opnd ) {
13248 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13249 RExC_open_parens[paren] += size;
13251 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13253 if ( RExC_close_parens[paren] >= opnd ) {
13254 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13255 RExC_close_parens[paren] += size;
13257 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13262 while (src > opnd) {
13263 StructCopy(--src, --dst, regnode);
13264 #ifdef RE_TRACK_PATTERN_OFFSETS
13265 if (RExC_offsets) { /* MJD 20010112 */
13266 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13270 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13271 ? "Overwriting end of array!\n" : "OK",
13272 (UV)(src - RExC_emit_start),
13273 (UV)(dst - RExC_emit_start),
13274 (UV)RExC_offsets[0]));
13275 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13276 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13282 place = opnd; /* Op node, where operand used to be. */
13283 #ifdef RE_TRACK_PATTERN_OFFSETS
13284 if (RExC_offsets) { /* MJD */
13285 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13289 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13290 ? "Overwriting end of array!\n" : "OK",
13291 (UV)(place - RExC_emit_start),
13292 (UV)(RExC_parse - RExC_start),
13293 (UV)RExC_offsets[0]));
13294 Set_Node_Offset(place, RExC_parse);
13295 Set_Node_Length(place, 1);
13298 src = NEXTOPER(place);
13299 FILL_ADVANCE_NODE(place, op);
13300 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13301 Zero(src, offset, regnode);
13305 - regtail - set the next-pointer at the end of a node chain of p to val.
13306 - SEE ALSO: regtail_study
13308 /* TODO: All three parms should be const */
13310 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13314 GET_RE_DEBUG_FLAGS_DECL;
13316 PERL_ARGS_ASSERT_REGTAIL;
13318 PERL_UNUSED_ARG(depth);
13324 /* Find last node. */
13327 regnode * const temp = regnext(scan);
13329 SV * const mysv=sv_newmortal();
13330 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13331 regprop(RExC_rx, mysv, scan);
13332 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13333 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13334 (temp == NULL ? "->" : ""),
13335 (temp == NULL ? PL_reg_name[OP(val)] : "")
13343 if (reg_off_by_arg[OP(scan)]) {
13344 ARG_SET(scan, val - scan);
13347 NEXT_OFF(scan) = val - scan;
13353 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13354 - Look for optimizable sequences at the same time.
13355 - currently only looks for EXACT chains.
13357 This is experimental code. The idea is to use this routine to perform
13358 in place optimizations on branches and groups as they are constructed,
13359 with the long term intention of removing optimization from study_chunk so
13360 that it is purely analytical.
13362 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13363 to control which is which.
13366 /* TODO: All four parms should be const */
13369 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13374 #ifdef EXPERIMENTAL_INPLACESCAN
13377 GET_RE_DEBUG_FLAGS_DECL;
13379 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13385 /* Find last node. */
13389 regnode * const temp = regnext(scan);
13390 #ifdef EXPERIMENTAL_INPLACESCAN
13391 if (PL_regkind[OP(scan)] == EXACT) {
13392 bool has_exactf_sharp_s; /* Unexamined in this routine */
13393 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13398 switch (OP(scan)) {
13404 case EXACTFU_TRICKYFOLD:
13406 if( exact == PSEUDO )
13408 else if ( exact != OP(scan) )
13417 SV * const mysv=sv_newmortal();
13418 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13419 regprop(RExC_rx, mysv, scan);
13420 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13421 SvPV_nolen_const(mysv),
13422 REG_NODE_NUM(scan),
13423 PL_reg_name[exact]);
13430 SV * const mysv_val=sv_newmortal();
13431 DEBUG_PARSE_MSG("");
13432 regprop(RExC_rx, mysv_val, val);
13433 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13434 SvPV_nolen_const(mysv_val),
13435 (IV)REG_NODE_NUM(val),
13439 if (reg_off_by_arg[OP(scan)]) {
13440 ARG_SET(scan, val - scan);
13443 NEXT_OFF(scan) = val - scan;
13451 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13455 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13461 for (bit=0; bit<32; bit++) {
13462 if (flags & (1<<bit)) {
13463 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13466 if (!set++ && lead)
13467 PerlIO_printf(Perl_debug_log, "%s",lead);
13468 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13471 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13472 if (!set++ && lead) {
13473 PerlIO_printf(Perl_debug_log, "%s",lead);
13476 case REGEX_UNICODE_CHARSET:
13477 PerlIO_printf(Perl_debug_log, "UNICODE");
13479 case REGEX_LOCALE_CHARSET:
13480 PerlIO_printf(Perl_debug_log, "LOCALE");
13482 case REGEX_ASCII_RESTRICTED_CHARSET:
13483 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13485 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13486 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13489 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13495 PerlIO_printf(Perl_debug_log, "\n");
13497 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13503 Perl_regdump(pTHX_ const regexp *r)
13507 SV * const sv = sv_newmortal();
13508 SV *dsv= sv_newmortal();
13509 RXi_GET_DECL(r,ri);
13510 GET_RE_DEBUG_FLAGS_DECL;
13512 PERL_ARGS_ASSERT_REGDUMP;
13514 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13516 /* Header fields of interest. */
13517 if (r->anchored_substr) {
13518 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13519 RE_SV_DUMPLEN(r->anchored_substr), 30);
13520 PerlIO_printf(Perl_debug_log,
13521 "anchored %s%s at %"IVdf" ",
13522 s, RE_SV_TAIL(r->anchored_substr),
13523 (IV)r->anchored_offset);
13524 } else if (r->anchored_utf8) {
13525 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13526 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13527 PerlIO_printf(Perl_debug_log,
13528 "anchored utf8 %s%s at %"IVdf" ",
13529 s, RE_SV_TAIL(r->anchored_utf8),
13530 (IV)r->anchored_offset);
13532 if (r->float_substr) {
13533 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13534 RE_SV_DUMPLEN(r->float_substr), 30);
13535 PerlIO_printf(Perl_debug_log,
13536 "floating %s%s at %"IVdf"..%"UVuf" ",
13537 s, RE_SV_TAIL(r->float_substr),
13538 (IV)r->float_min_offset, (UV)r->float_max_offset);
13539 } else if (r->float_utf8) {
13540 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13541 RE_SV_DUMPLEN(r->float_utf8), 30);
13542 PerlIO_printf(Perl_debug_log,
13543 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13544 s, RE_SV_TAIL(r->float_utf8),
13545 (IV)r->float_min_offset, (UV)r->float_max_offset);
13547 if (r->check_substr || r->check_utf8)
13548 PerlIO_printf(Perl_debug_log,
13550 (r->check_substr == r->float_substr
13551 && r->check_utf8 == r->float_utf8
13552 ? "(checking floating" : "(checking anchored"));
13553 if (r->extflags & RXf_NOSCAN)
13554 PerlIO_printf(Perl_debug_log, " noscan");
13555 if (r->extflags & RXf_CHECK_ALL)
13556 PerlIO_printf(Perl_debug_log, " isall");
13557 if (r->check_substr || r->check_utf8)
13558 PerlIO_printf(Perl_debug_log, ") ");
13560 if (ri->regstclass) {
13561 regprop(r, sv, ri->regstclass);
13562 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13564 if (r->extflags & RXf_ANCH) {
13565 PerlIO_printf(Perl_debug_log, "anchored");
13566 if (r->extflags & RXf_ANCH_BOL)
13567 PerlIO_printf(Perl_debug_log, "(BOL)");
13568 if (r->extflags & RXf_ANCH_MBOL)
13569 PerlIO_printf(Perl_debug_log, "(MBOL)");
13570 if (r->extflags & RXf_ANCH_SBOL)
13571 PerlIO_printf(Perl_debug_log, "(SBOL)");
13572 if (r->extflags & RXf_ANCH_GPOS)
13573 PerlIO_printf(Perl_debug_log, "(GPOS)");
13574 PerlIO_putc(Perl_debug_log, ' ');
13576 if (r->extflags & RXf_GPOS_SEEN)
13577 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13578 if (r->intflags & PREGf_SKIP)
13579 PerlIO_printf(Perl_debug_log, "plus ");
13580 if (r->intflags & PREGf_IMPLICIT)
13581 PerlIO_printf(Perl_debug_log, "implicit ");
13582 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13583 if (r->extflags & RXf_EVAL_SEEN)
13584 PerlIO_printf(Perl_debug_log, "with eval ");
13585 PerlIO_printf(Perl_debug_log, "\n");
13586 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13588 PERL_ARGS_ASSERT_REGDUMP;
13589 PERL_UNUSED_CONTEXT;
13590 PERL_UNUSED_ARG(r);
13591 #endif /* DEBUGGING */
13595 - regprop - printable representation of opcode
13597 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13600 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13601 if (flags & ANYOF_INVERT) \
13602 /*make sure the invert info is in each */ \
13603 sv_catpvs(sv, "^"); \
13609 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13615 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13616 static const char * const anyofs[] = {
13648 RXi_GET_DECL(prog,progi);
13649 GET_RE_DEBUG_FLAGS_DECL;
13651 PERL_ARGS_ASSERT_REGPROP;
13655 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13656 /* It would be nice to FAIL() here, but this may be called from
13657 regexec.c, and it would be hard to supply pRExC_state. */
13658 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13659 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13661 k = PL_regkind[OP(o)];
13664 sv_catpvs(sv, " ");
13665 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13666 * is a crude hack but it may be the best for now since
13667 * we have no flag "this EXACTish node was UTF-8"
13669 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13670 PERL_PV_ESCAPE_UNI_DETECT |
13671 PERL_PV_ESCAPE_NONASCII |
13672 PERL_PV_PRETTY_ELLIPSES |
13673 PERL_PV_PRETTY_LTGT |
13674 PERL_PV_PRETTY_NOCLEAR
13676 } else if (k == TRIE) {
13677 /* print the details of the trie in dumpuntil instead, as
13678 * progi->data isn't available here */
13679 const char op = OP(o);
13680 const U32 n = ARG(o);
13681 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13682 (reg_ac_data *)progi->data->data[n] :
13684 const reg_trie_data * const trie
13685 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13687 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13688 DEBUG_TRIE_COMPILE_r(
13689 Perl_sv_catpvf(aTHX_ sv,
13690 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13691 (UV)trie->startstate,
13692 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13693 (UV)trie->wordcount,
13696 (UV)TRIE_CHARCOUNT(trie),
13697 (UV)trie->uniquecharcount
13700 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13702 int rangestart = -1;
13703 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13704 sv_catpvs(sv, "[");
13705 for (i = 0; i <= 256; i++) {
13706 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13707 if (rangestart == -1)
13709 } else if (rangestart != -1) {
13710 if (i <= rangestart + 3)
13711 for (; rangestart < i; rangestart++)
13712 put_byte(sv, rangestart);
13714 put_byte(sv, rangestart);
13715 sv_catpvs(sv, "-");
13716 put_byte(sv, i - 1);
13721 sv_catpvs(sv, "]");
13724 } else if (k == CURLY) {
13725 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13726 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13727 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13729 else if (k == WHILEM && o->flags) /* Ordinal/of */
13730 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13731 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13732 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13733 if ( RXp_PAREN_NAMES(prog) ) {
13734 if ( k != REF || (OP(o) < NREF)) {
13735 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13736 SV **name= av_fetch(list, ARG(o), 0 );
13738 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13741 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13742 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13743 I32 *nums=(I32*)SvPVX(sv_dat);
13744 SV **name= av_fetch(list, nums[0], 0 );
13747 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13748 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13749 (n ? "," : ""), (IV)nums[n]);
13751 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13755 } else if (k == GOSUB)
13756 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13757 else if (k == VERB) {
13759 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13760 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13761 } else if (k == LOGICAL)
13762 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13763 else if (k == ANYOF) {
13764 int i, rangestart = -1;
13765 const U8 flags = ANYOF_FLAGS(o);
13769 if (flags & ANYOF_LOCALE)
13770 sv_catpvs(sv, "{loc}");
13771 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13772 sv_catpvs(sv, "{i}");
13773 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13774 if (flags & ANYOF_INVERT)
13775 sv_catpvs(sv, "^");
13777 /* output what the standard cp 0-255 bitmap matches */
13778 for (i = 0; i <= 256; i++) {
13779 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13780 if (rangestart == -1)
13782 } else if (rangestart != -1) {
13783 if (i <= rangestart + 3)
13784 for (; rangestart < i; rangestart++)
13785 put_byte(sv, rangestart);
13787 put_byte(sv, rangestart);
13788 sv_catpvs(sv, "-");
13789 put_byte(sv, i - 1);
13796 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13797 /* output any special charclass tests (used entirely under use locale) */
13798 if (ANYOF_CLASS_TEST_ANY_SET(o))
13799 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13800 if (ANYOF_CLASS_TEST(o,i)) {
13801 sv_catpv(sv, anyofs[i]);
13805 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13807 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13808 sv_catpvs(sv, "{non-utf8-latin1-all}");
13811 /* output information about the unicode matching */
13812 if (flags & ANYOF_UNICODE_ALL)
13813 sv_catpvs(sv, "{unicode_all}");
13814 else if (ANYOF_NONBITMAP(o))
13815 sv_catpvs(sv, "{unicode}");
13816 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13817 sv_catpvs(sv, "{outside bitmap}");
13819 if (ANYOF_NONBITMAP(o)) {
13820 SV *lv; /* Set if there is something outside the bit map */
13821 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13822 bool byte_output = FALSE; /* If something in the bitmap has been
13825 if (lv && lv != &PL_sv_undef) {
13827 U8 s[UTF8_MAXBYTES_CASE+1];
13829 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13830 uvchr_to_utf8(s, i);
13833 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13837 && swash_fetch(sw, s, TRUE))
13839 if (rangestart == -1)
13841 } else if (rangestart != -1) {
13842 byte_output = TRUE;
13843 if (i <= rangestart + 3)
13844 for (; rangestart < i; rangestart++) {
13845 put_byte(sv, rangestart);
13848 put_byte(sv, rangestart);
13849 sv_catpvs(sv, "-");
13858 char *s = savesvpv(lv);
13859 char * const origs = s;
13861 while (*s && *s != '\n')
13865 const char * const t = ++s;
13868 sv_catpvs(sv, " ");
13874 /* Truncate very long output */
13875 if (s - origs > 256) {
13876 Perl_sv_catpvf(aTHX_ sv,
13878 (int) (s - origs - 1),
13884 else if (*s == '\t') {
13903 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13905 else if (k == POSIXD) {
13906 U8 index = FLAGS(o) * 2;
13907 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
13908 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
13911 sv_catpv(sv, anyofs[index]);
13914 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13915 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13917 PERL_UNUSED_CONTEXT;
13918 PERL_UNUSED_ARG(sv);
13919 PERL_UNUSED_ARG(o);
13920 PERL_UNUSED_ARG(prog);
13921 #endif /* DEBUGGING */
13925 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13926 { /* Assume that RE_INTUIT is set */
13928 struct regexp *const prog = (struct regexp *)SvANY(r);
13929 GET_RE_DEBUG_FLAGS_DECL;
13931 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13932 PERL_UNUSED_CONTEXT;
13936 const char * const s = SvPV_nolen_const(prog->check_substr
13937 ? prog->check_substr : prog->check_utf8);
13939 if (!PL_colorset) reginitcolors();
13940 PerlIO_printf(Perl_debug_log,
13941 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13943 prog->check_substr ? "" : "utf8 ",
13944 PL_colors[5],PL_colors[0],
13947 (strlen(s) > 60 ? "..." : ""));
13950 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13956 handles refcounting and freeing the perl core regexp structure. When
13957 it is necessary to actually free the structure the first thing it
13958 does is call the 'free' method of the regexp_engine associated to
13959 the regexp, allowing the handling of the void *pprivate; member
13960 first. (This routine is not overridable by extensions, which is why
13961 the extensions free is called first.)
13963 See regdupe and regdupe_internal if you change anything here.
13965 #ifndef PERL_IN_XSUB_RE
13967 Perl_pregfree(pTHX_ REGEXP *r)
13973 Perl_pregfree2(pTHX_ REGEXP *rx)
13976 struct regexp *const r = (struct regexp *)SvANY(rx);
13977 GET_RE_DEBUG_FLAGS_DECL;
13979 PERL_ARGS_ASSERT_PREGFREE2;
13981 if (r->mother_re) {
13982 ReREFCNT_dec(r->mother_re);
13984 CALLREGFREE_PVT(rx); /* free the private data */
13985 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13988 SvREFCNT_dec(r->anchored_substr);
13989 SvREFCNT_dec(r->anchored_utf8);
13990 SvREFCNT_dec(r->float_substr);
13991 SvREFCNT_dec(r->float_utf8);
13992 Safefree(r->substrs);
13994 RX_MATCH_COPY_FREE(rx);
13995 #ifdef PERL_OLD_COPY_ON_WRITE
13996 SvREFCNT_dec(r->saved_copy);
13999 SvREFCNT_dec(r->qr_anoncv);
14004 This is a hacky workaround to the structural issue of match results
14005 being stored in the regexp structure which is in turn stored in
14006 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14007 could be PL_curpm in multiple contexts, and could require multiple
14008 result sets being associated with the pattern simultaneously, such
14009 as when doing a recursive match with (??{$qr})
14011 The solution is to make a lightweight copy of the regexp structure
14012 when a qr// is returned from the code executed by (??{$qr}) this
14013 lightweight copy doesn't actually own any of its data except for
14014 the starp/end and the actual regexp structure itself.
14020 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14022 struct regexp *ret;
14023 struct regexp *const r = (struct regexp *)SvANY(rx);
14025 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14028 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14029 ret = (struct regexp *)SvANY(ret_x);
14031 (void)ReREFCNT_inc(rx);
14032 /* We can take advantage of the existing "copied buffer" mechanism in SVs
14033 by pointing directly at the buffer, but flagging that the allocated
14034 space in the copy is zero. As we've just done a struct copy, it's now
14035 a case of zero-ing that, rather than copying the current length. */
14036 SvPV_set(ret_x, RX_WRAPPED(rx));
14037 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
14038 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14039 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14040 SvLEN_set(ret_x, 0);
14041 SvSTASH_set(ret_x, NULL);
14042 SvMAGIC_set(ret_x, NULL);
14044 const I32 npar = r->nparens+1;
14045 Newx(ret->offs, npar, regexp_paren_pair);
14046 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14049 Newx(ret->substrs, 1, struct reg_substr_data);
14050 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14052 SvREFCNT_inc_void(ret->anchored_substr);
14053 SvREFCNT_inc_void(ret->anchored_utf8);
14054 SvREFCNT_inc_void(ret->float_substr);
14055 SvREFCNT_inc_void(ret->float_utf8);
14057 /* check_substr and check_utf8, if non-NULL, point to either their
14058 anchored or float namesakes, and don't hold a second reference. */
14060 RX_MATCH_COPIED_off(ret_x);
14061 #ifdef PERL_OLD_COPY_ON_WRITE
14062 ret->saved_copy = NULL;
14064 ret->mother_re = rx;
14065 SvREFCNT_inc_void(ret->qr_anoncv);
14071 /* regfree_internal()
14073 Free the private data in a regexp. This is overloadable by
14074 extensions. Perl takes care of the regexp structure in pregfree(),
14075 this covers the *pprivate pointer which technically perl doesn't
14076 know about, however of course we have to handle the
14077 regexp_internal structure when no extension is in use.
14079 Note this is called before freeing anything in the regexp
14084 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14087 struct regexp *const r = (struct regexp *)SvANY(rx);
14088 RXi_GET_DECL(r,ri);
14089 GET_RE_DEBUG_FLAGS_DECL;
14091 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14097 SV *dsv= sv_newmortal();
14098 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14099 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14100 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14101 PL_colors[4],PL_colors[5],s);
14104 #ifdef RE_TRACK_PATTERN_OFFSETS
14106 Safefree(ri->u.offsets); /* 20010421 MJD */
14108 if (ri->code_blocks) {
14110 for (n = 0; n < ri->num_code_blocks; n++)
14111 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14112 Safefree(ri->code_blocks);
14116 int n = ri->data->count;
14119 /* If you add a ->what type here, update the comment in regcomp.h */
14120 switch (ri->data->what[n]) {
14126 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14129 Safefree(ri->data->data[n]);
14135 { /* Aho Corasick add-on structure for a trie node.
14136 Used in stclass optimization only */
14138 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14140 refcount = --aho->refcount;
14143 PerlMemShared_free(aho->states);
14144 PerlMemShared_free(aho->fail);
14145 /* do this last!!!! */
14146 PerlMemShared_free(ri->data->data[n]);
14147 PerlMemShared_free(ri->regstclass);
14153 /* trie structure. */
14155 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14157 refcount = --trie->refcount;
14160 PerlMemShared_free(trie->charmap);
14161 PerlMemShared_free(trie->states);
14162 PerlMemShared_free(trie->trans);
14164 PerlMemShared_free(trie->bitmap);
14166 PerlMemShared_free(trie->jump);
14167 PerlMemShared_free(trie->wordinfo);
14168 /* do this last!!!! */
14169 PerlMemShared_free(ri->data->data[n]);
14174 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14177 Safefree(ri->data->what);
14178 Safefree(ri->data);
14184 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14185 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14186 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14189 re_dup - duplicate a regexp.
14191 This routine is expected to clone a given regexp structure. It is only
14192 compiled under USE_ITHREADS.
14194 After all of the core data stored in struct regexp is duplicated
14195 the regexp_engine.dupe method is used to copy any private data
14196 stored in the *pprivate pointer. This allows extensions to handle
14197 any duplication it needs to do.
14199 See pregfree() and regfree_internal() if you change anything here.
14201 #if defined(USE_ITHREADS)
14202 #ifndef PERL_IN_XSUB_RE
14204 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14208 const struct regexp *r = (const struct regexp *)SvANY(sstr);
14209 struct regexp *ret = (struct regexp *)SvANY(dstr);
14211 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14213 npar = r->nparens+1;
14214 Newx(ret->offs, npar, regexp_paren_pair);
14215 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14217 /* no need to copy these */
14218 Newx(ret->swap, npar, regexp_paren_pair);
14221 if (ret->substrs) {
14222 /* Do it this way to avoid reading from *r after the StructCopy().
14223 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14224 cache, it doesn't matter. */
14225 const bool anchored = r->check_substr
14226 ? r->check_substr == r->anchored_substr
14227 : r->check_utf8 == r->anchored_utf8;
14228 Newx(ret->substrs, 1, struct reg_substr_data);
14229 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14231 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14232 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14233 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14234 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14236 /* check_substr and check_utf8, if non-NULL, point to either their
14237 anchored or float namesakes, and don't hold a second reference. */
14239 if (ret->check_substr) {
14241 assert(r->check_utf8 == r->anchored_utf8);
14242 ret->check_substr = ret->anchored_substr;
14243 ret->check_utf8 = ret->anchored_utf8;
14245 assert(r->check_substr == r->float_substr);
14246 assert(r->check_utf8 == r->float_utf8);
14247 ret->check_substr = ret->float_substr;
14248 ret->check_utf8 = ret->float_utf8;
14250 } else if (ret->check_utf8) {
14252 ret->check_utf8 = ret->anchored_utf8;
14254 ret->check_utf8 = ret->float_utf8;
14259 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14260 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14263 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14265 if (RX_MATCH_COPIED(dstr))
14266 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14268 ret->subbeg = NULL;
14269 #ifdef PERL_OLD_COPY_ON_WRITE
14270 ret->saved_copy = NULL;
14273 if (ret->mother_re) {
14274 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14275 /* Our storage points directly to our mother regexp, but that's
14276 1: a buffer in a different thread
14277 2: something we no longer hold a reference on
14278 so we need to copy it locally. */
14279 /* Note we need to use SvCUR(), rather than
14280 SvLEN(), on our mother_re, because it, in
14281 turn, may well be pointing to its own mother_re. */
14282 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14283 SvCUR(ret->mother_re)+1));
14284 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14286 ret->mother_re = NULL;
14290 #endif /* PERL_IN_XSUB_RE */
14295 This is the internal complement to regdupe() which is used to copy
14296 the structure pointed to by the *pprivate pointer in the regexp.
14297 This is the core version of the extension overridable cloning hook.
14298 The regexp structure being duplicated will be copied by perl prior
14299 to this and will be provided as the regexp *r argument, however
14300 with the /old/ structures pprivate pointer value. Thus this routine
14301 may override any copying normally done by perl.
14303 It returns a pointer to the new regexp_internal structure.
14307 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14310 struct regexp *const r = (struct regexp *)SvANY(rx);
14311 regexp_internal *reti;
14313 RXi_GET_DECL(r,ri);
14315 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14319 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14320 Copy(ri->program, reti->program, len+1, regnode);
14322 reti->num_code_blocks = ri->num_code_blocks;
14323 if (ri->code_blocks) {
14325 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14326 struct reg_code_block);
14327 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14328 struct reg_code_block);
14329 for (n = 0; n < ri->num_code_blocks; n++)
14330 reti->code_blocks[n].src_regex = (REGEXP*)
14331 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14334 reti->code_blocks = NULL;
14336 reti->regstclass = NULL;
14339 struct reg_data *d;
14340 const int count = ri->data->count;
14343 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14344 char, struct reg_data);
14345 Newx(d->what, count, U8);
14348 for (i = 0; i < count; i++) {
14349 d->what[i] = ri->data->what[i];
14350 switch (d->what[i]) {
14351 /* see also regcomp.h and regfree_internal() */
14352 case 'a': /* actually an AV, but the dup function is identical. */
14356 case 'u': /* actually an HV, but the dup function is identical. */
14357 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14360 /* This is cheating. */
14361 Newx(d->data[i], 1, struct regnode_charclass_class);
14362 StructCopy(ri->data->data[i], d->data[i],
14363 struct regnode_charclass_class);
14364 reti->regstclass = (regnode*)d->data[i];
14367 /* Trie stclasses are readonly and can thus be shared
14368 * without duplication. We free the stclass in pregfree
14369 * when the corresponding reg_ac_data struct is freed.
14371 reti->regstclass= ri->regstclass;
14375 ((reg_trie_data*)ri->data->data[i])->refcount++;
14380 d->data[i] = ri->data->data[i];
14383 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14392 reti->name_list_idx = ri->name_list_idx;
14394 #ifdef RE_TRACK_PATTERN_OFFSETS
14395 if (ri->u.offsets) {
14396 Newx(reti->u.offsets, 2*len+1, U32);
14397 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14400 SetProgLen(reti,len);
14403 return (void*)reti;
14406 #endif /* USE_ITHREADS */
14408 #ifndef PERL_IN_XSUB_RE
14411 - regnext - dig the "next" pointer out of a node
14414 Perl_regnext(pTHX_ register regnode *p)
14422 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14423 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14426 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14435 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14438 STRLEN l1 = strlen(pat1);
14439 STRLEN l2 = strlen(pat2);
14442 const char *message;
14444 PERL_ARGS_ASSERT_RE_CROAK2;
14450 Copy(pat1, buf, l1 , char);
14451 Copy(pat2, buf + l1, l2 , char);
14452 buf[l1 + l2] = '\n';
14453 buf[l1 + l2 + 1] = '\0';
14455 /* ANSI variant takes additional second argument */
14456 va_start(args, pat2);
14460 msv = vmess(buf, &args);
14462 message = SvPV_const(msv,l1);
14465 Copy(message, buf, l1 , char);
14466 buf[l1-1] = '\0'; /* Overwrite \n */
14467 Perl_croak(aTHX_ "%s", buf);
14470 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14472 #ifndef PERL_IN_XSUB_RE
14474 Perl_save_re_context(pTHX)
14478 struct re_save_state *state;
14480 SAVEVPTR(PL_curcop);
14481 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14483 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14484 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14485 SSPUSHUV(SAVEt_RE_STATE);
14487 Copy(&PL_reg_state, state, 1, struct re_save_state);
14489 PL_reg_oldsaved = NULL;
14490 PL_reg_oldsavedlen = 0;
14491 PL_reg_oldsavedoffset = 0;
14492 PL_reg_oldsavedcoffset = 0;
14493 PL_reg_maxiter = 0;
14494 PL_reg_leftiter = 0;
14495 PL_reg_poscache = NULL;
14496 PL_reg_poscache_size = 0;
14497 #ifdef PERL_OLD_COPY_ON_WRITE
14501 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14503 const REGEXP * const rx = PM_GETRE(PL_curpm);
14506 for (i = 1; i <= RX_NPARENS(rx); i++) {
14507 char digits[TYPE_CHARS(long)];
14508 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14509 GV *const *const gvp
14510 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14513 GV * const gv = *gvp;
14514 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14524 clear_re(pTHX_ void *r)
14527 ReREFCNT_dec((REGEXP *)r);
14533 S_put_byte(pTHX_ SV *sv, int c)
14535 PERL_ARGS_ASSERT_PUT_BYTE;
14537 /* Our definition of isPRINT() ignores locales, so only bytes that are
14538 not part of UTF-8 are considered printable. I assume that the same
14539 holds for UTF-EBCDIC.
14540 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14541 which Wikipedia says:
14543 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14544 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14545 identical, to the ASCII delete (DEL) or rubout control character.
14546 ) So the old condition can be simplified to !isPRINT(c) */
14549 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14552 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14556 const char string = c;
14557 if (c == '-' || c == ']' || c == '\\' || c == '^')
14558 sv_catpvs(sv, "\\");
14559 sv_catpvn(sv, &string, 1);
14564 #define CLEAR_OPTSTART \
14565 if (optstart) STMT_START { \
14566 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14570 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14572 STATIC const regnode *
14573 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14574 const regnode *last, const regnode *plast,
14575 SV* sv, I32 indent, U32 depth)
14578 U8 op = PSEUDO; /* Arbitrary non-END op. */
14579 const regnode *next;
14580 const regnode *optstart= NULL;
14582 RXi_GET_DECL(r,ri);
14583 GET_RE_DEBUG_FLAGS_DECL;
14585 PERL_ARGS_ASSERT_DUMPUNTIL;
14587 #ifdef DEBUG_DUMPUNTIL
14588 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14589 last ? last-start : 0,plast ? plast-start : 0);
14592 if (plast && plast < last)
14595 while (PL_regkind[op] != END && (!last || node < last)) {
14596 /* While that wasn't END last time... */
14599 if (op == CLOSE || op == WHILEM)
14601 next = regnext((regnode *)node);
14604 if (OP(node) == OPTIMIZED) {
14605 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14612 regprop(r, sv, node);
14613 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14614 (int)(2*indent + 1), "", SvPVX_const(sv));
14616 if (OP(node) != OPTIMIZED) {
14617 if (next == NULL) /* Next ptr. */
14618 PerlIO_printf(Perl_debug_log, " (0)");
14619 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14620 PerlIO_printf(Perl_debug_log, " (FAIL)");
14622 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14623 (void)PerlIO_putc(Perl_debug_log, '\n');
14627 if (PL_regkind[(U8)op] == BRANCHJ) {
14630 const regnode *nnode = (OP(next) == LONGJMP
14631 ? regnext((regnode *)next)
14633 if (last && nnode > last)
14635 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14638 else if (PL_regkind[(U8)op] == BRANCH) {
14640 DUMPUNTIL(NEXTOPER(node), next);
14642 else if ( PL_regkind[(U8)op] == TRIE ) {
14643 const regnode *this_trie = node;
14644 const char op = OP(node);
14645 const U32 n = ARG(node);
14646 const reg_ac_data * const ac = op>=AHOCORASICK ?
14647 (reg_ac_data *)ri->data->data[n] :
14649 const reg_trie_data * const trie =
14650 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14652 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14654 const regnode *nextbranch= NULL;
14657 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14658 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14660 PerlIO_printf(Perl_debug_log, "%*s%s ",
14661 (int)(2*(indent+3)), "",
14662 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14663 PL_colors[0], PL_colors[1],
14664 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14665 PERL_PV_PRETTY_ELLIPSES |
14666 PERL_PV_PRETTY_LTGT
14671 U16 dist= trie->jump[word_idx+1];
14672 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14673 (UV)((dist ? this_trie + dist : next) - start));
14676 nextbranch= this_trie + trie->jump[0];
14677 DUMPUNTIL(this_trie + dist, nextbranch);
14679 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14680 nextbranch= regnext((regnode *)nextbranch);
14682 PerlIO_printf(Perl_debug_log, "\n");
14685 if (last && next > last)
14690 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14691 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14692 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14694 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14696 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14698 else if ( op == PLUS || op == STAR) {
14699 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14701 else if (PL_regkind[(U8)op] == ANYOF) {
14702 /* arglen 1 + class block */
14703 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14704 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14705 node = NEXTOPER(node);
14707 else if (PL_regkind[(U8)op] == EXACT) {
14708 /* Literal string, where present. */
14709 node += NODE_SZ_STR(node) - 1;
14710 node = NEXTOPER(node);
14713 node = NEXTOPER(node);
14714 node += regarglen[(U8)op];
14716 if (op == CURLYX || op == OPEN)
14720 #ifdef DEBUG_DUMPUNTIL
14721 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14726 #endif /* DEBUGGING */
14730 * c-indentation-style: bsd
14731 * c-basic-offset: 4
14732 * indent-tabs-mode: nil
14735 * ex: set ts=8 sts=4 sw=4 et: