5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
89 #ifndef PERL_IN_XSUB_RE
90 # include "charclass_invlists.h"
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 REGEXP *rx_sv; /* The SV that is the regexp. */
114 regexp *rx; /* perl core regexp structure */
115 regexp_internal *rxi; /* internal data for regexp object pprivate field */
116 char *start; /* Start of input for compile */
117 char *end; /* End of input for compile */
118 char *parse; /* Input-scan pointer. */
119 I32 whilem_seen; /* number of WHILEM in this expr */
120 regnode *emit_start; /* Start of emitted-code area */
121 regnode *emit_bound; /* First regnode outside of the allocated space */
122 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
123 I32 naughty; /* How bad is this pattern? */
124 I32 sawback; /* Did we see \1, ...? */
126 I32 size; /* Code size. */
127 I32 npar; /* Capture buffer count, (OPEN). */
128 I32 cpar; /* Capture buffer count, (CLOSE). */
129 I32 nestroot; /* root parens we are in - used by accept */
133 regnode **open_parens; /* pointers to open parens */
134 regnode **close_parens; /* pointers to close parens */
135 regnode *opend; /* END node in program */
136 I32 utf8; /* whether the pattern is utf8 or not */
137 I32 orig_utf8; /* whether the pattern was originally in utf8 */
138 /* XXX use this for future optimisation of case
139 * where pattern must be upgraded to utf8. */
140 I32 uni_semantics; /* If a d charset modifier should use unicode
141 rules, even if the pattern is not in
143 HV *paren_names; /* Paren names */
145 regnode **recurse; /* Recurse regops */
146 I32 recurse_count; /* Number of recurse regops */
149 I32 override_recoding;
151 char *starttry; /* -Dr: where regtry was called. */
152 #define RExC_starttry (pRExC_state->starttry)
155 const char *lastparse;
157 AV *paren_name_list; /* idx -> name */
158 #define RExC_lastparse (pRExC_state->lastparse)
159 #define RExC_lastnum (pRExC_state->lastnum)
160 #define RExC_paren_name_list (pRExC_state->paren_name_list)
164 #define RExC_flags (pRExC_state->flags)
165 #define RExC_precomp (pRExC_state->precomp)
166 #define RExC_rx_sv (pRExC_state->rx_sv)
167 #define RExC_rx (pRExC_state->rx)
168 #define RExC_rxi (pRExC_state->rxi)
169 #define RExC_start (pRExC_state->start)
170 #define RExC_end (pRExC_state->end)
171 #define RExC_parse (pRExC_state->parse)
172 #define RExC_whilem_seen (pRExC_state->whilem_seen)
173 #ifdef RE_TRACK_PATTERN_OFFSETS
174 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
176 #define RExC_emit (pRExC_state->emit)
177 #define RExC_emit_start (pRExC_state->emit_start)
178 #define RExC_emit_bound (pRExC_state->emit_bound)
179 #define RExC_naughty (pRExC_state->naughty)
180 #define RExC_sawback (pRExC_state->sawback)
181 #define RExC_seen (pRExC_state->seen)
182 #define RExC_size (pRExC_state->size)
183 #define RExC_npar (pRExC_state->npar)
184 #define RExC_nestroot (pRExC_state->nestroot)
185 #define RExC_extralen (pRExC_state->extralen)
186 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
187 #define RExC_seen_evals (pRExC_state->seen_evals)
188 #define RExC_utf8 (pRExC_state->utf8)
189 #define RExC_uni_semantics (pRExC_state->uni_semantics)
190 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
191 #define RExC_open_parens (pRExC_state->open_parens)
192 #define RExC_close_parens (pRExC_state->close_parens)
193 #define RExC_opend (pRExC_state->opend)
194 #define RExC_paren_names (pRExC_state->paren_names)
195 #define RExC_recurse (pRExC_state->recurse)
196 #define RExC_recurse_count (pRExC_state->recurse_count)
197 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
198 #define RExC_contains_locale (pRExC_state->contains_locale)
199 #define RExC_override_recoding (pRExC_state->override_recoding)
202 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
203 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
204 ((*s) == '{' && regcurly(s)))
207 #undef SPSTART /* dratted cpp namespace... */
210 * Flags to be passed up and down.
212 #define WORST 0 /* Worst case. */
213 #define HASWIDTH 0x01 /* Known to match non-null strings. */
215 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
216 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
218 #define SPSTART 0x04 /* Starts with * or +. */
219 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
220 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
224 /* whether trie related optimizations are enabled */
225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
226 #define TRIE_STUDY_OPT
227 #define FULL_TRIE_STUDY
233 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
234 #define PBITVAL(paren) (1 << ((paren) & 7))
235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
236 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
237 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
239 /* If not already in utf8, do a longjmp back to the beginning */
240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
241 #define REQUIRE_UTF8 STMT_START { \
242 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
245 /* About scan_data_t.
247 During optimisation we recurse through the regexp program performing
248 various inplace (keyhole style) optimisations. In addition study_chunk
249 and scan_commit populate this data structure with information about
250 what strings MUST appear in the pattern. We look for the longest
251 string that must appear at a fixed location, and we look for the
252 longest string that may appear at a floating location. So for instance
257 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
258 strings (because they follow a .* construct). study_chunk will identify
259 both FOO and BAR as being the longest fixed and floating strings respectively.
261 The strings can be composites, for instance
265 will result in a composite fixed substring 'foo'.
267 For each string some basic information is maintained:
269 - offset or min_offset
270 This is the position the string must appear at, or not before.
271 It also implicitly (when combined with minlenp) tells us how many
272 characters must match before the string we are searching for.
273 Likewise when combined with minlenp and the length of the string it
274 tells us how many characters must appear after the string we have
278 Only used for floating strings. This is the rightmost point that
279 the string can appear at. If set to I32 max it indicates that the
280 string can occur infinitely far to the right.
283 A pointer to the minimum length of the pattern that the string
284 was found inside. This is important as in the case of positive
285 lookahead or positive lookbehind we can have multiple patterns
290 The minimum length of the pattern overall is 3, the minimum length
291 of the lookahead part is 3, but the minimum length of the part that
292 will actually match is 1. So 'FOO's minimum length is 3, but the
293 minimum length for the F is 1. This is important as the minimum length
294 is used to determine offsets in front of and behind the string being
295 looked for. Since strings can be composites this is the length of the
296 pattern at the time it was committed with a scan_commit. Note that
297 the length is calculated by study_chunk, so that the minimum lengths
298 are not known until the full pattern has been compiled, thus the
299 pointer to the value.
303 In the case of lookbehind the string being searched for can be
304 offset past the start point of the final matching string.
305 If this value was just blithely removed from the min_offset it would
306 invalidate some of the calculations for how many chars must match
307 before or after (as they are derived from min_offset and minlen and
308 the length of the string being searched for).
309 When the final pattern is compiled and the data is moved from the
310 scan_data_t structure into the regexp structure the information
311 about lookbehind is factored in, with the information that would
312 have been lost precalculated in the end_shift field for the
315 The fields pos_min and pos_delta are used to store the minimum offset
316 and the delta to the maximum offset at the current point in the pattern.
320 typedef struct scan_data_t {
321 /*I32 len_min; unused */
322 /*I32 len_delta; unused */
326 I32 last_end; /* min value, <0 unless valid. */
329 SV **longest; /* Either &l_fixed, or &l_float. */
330 SV *longest_fixed; /* longest fixed string found in pattern */
331 I32 offset_fixed; /* offset where it starts */
332 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
333 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
334 SV *longest_float; /* longest floating string found in pattern */
335 I32 offset_float_min; /* earliest point in string it can appear */
336 I32 offset_float_max; /* latest point in string it can appear */
337 I32 *minlen_float; /* pointer to the minlen relevant to the string */
338 I32 lookbehind_float; /* is the position of the string modified by LB */
342 struct regnode_charclass_class *start_class;
346 * Forward declarations for pregcomp()'s friends.
349 static const scan_data_t zero_scan_data =
350 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
352 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
353 #define SF_BEFORE_SEOL 0x0001
354 #define SF_BEFORE_MEOL 0x0002
355 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
356 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
359 # define SF_FIX_SHIFT_EOL (0+2)
360 # define SF_FL_SHIFT_EOL (0+4)
362 # define SF_FIX_SHIFT_EOL (+2)
363 # define SF_FL_SHIFT_EOL (+4)
366 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
367 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
369 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
370 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
371 #define SF_IS_INF 0x0040
372 #define SF_HAS_PAR 0x0080
373 #define SF_IN_PAR 0x0100
374 #define SF_HAS_EVAL 0x0200
375 #define SCF_DO_SUBSTR 0x0400
376 #define SCF_DO_STCLASS_AND 0x0800
377 #define SCF_DO_STCLASS_OR 0x1000
378 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
379 #define SCF_WHILEM_VISITED_POS 0x2000
381 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
382 #define SCF_SEEN_ACCEPT 0x8000
384 #define UTF cBOOL(RExC_utf8)
385 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
386 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
387 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
388 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
389 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
390 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
391 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
393 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
395 #define OOB_UNICODE 12345678
396 #define OOB_NAMEDCLASS -1
398 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
399 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
402 /* length of regex to show in messages that don't mark a position within */
403 #define RegexLengthToShowInErrorMessages 127
406 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
407 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
408 * op/pragma/warn/regcomp.
410 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
411 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
413 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
416 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
417 * arg. Show regex, up to a maximum length. If it's too long, chop and add
420 #define _FAIL(code) STMT_START { \
421 const char *ellipses = ""; \
422 IV len = RExC_end - RExC_precomp; \
425 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
426 if (len > RegexLengthToShowInErrorMessages) { \
427 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
428 len = RegexLengthToShowInErrorMessages - 10; \
434 #define FAIL(msg) _FAIL( \
435 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
436 msg, (int)len, RExC_precomp, ellipses))
438 #define FAIL2(msg,arg) _FAIL( \
439 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
440 arg, (int)len, RExC_precomp, ellipses))
443 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
445 #define Simple_vFAIL(m) STMT_START { \
446 const IV offset = RExC_parse - RExC_precomp; \
447 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
448 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
452 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
454 #define vFAIL(m) STMT_START { \
456 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
461 * Like Simple_vFAIL(), but accepts two arguments.
463 #define Simple_vFAIL2(m,a1) STMT_START { \
464 const IV offset = RExC_parse - RExC_precomp; \
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
472 #define vFAIL2(m,a1) STMT_START { \
474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
475 Simple_vFAIL2(m, a1); \
480 * Like Simple_vFAIL(), but accepts three arguments.
482 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
483 const IV offset = RExC_parse - RExC_precomp; \
484 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
485 (int)offset, RExC_precomp, RExC_precomp + offset); \
489 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
491 #define vFAIL3(m,a1,a2) STMT_START { \
493 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
494 Simple_vFAIL3(m, a1, a2); \
498 * Like Simple_vFAIL(), but accepts four arguments.
500 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
501 const IV offset = RExC_parse - RExC_precomp; \
502 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
503 (int)offset, RExC_precomp, RExC_precomp + offset); \
506 #define ckWARNreg(loc,m) STMT_START { \
507 const IV offset = loc - RExC_precomp; \
508 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARNregdep(loc,m) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
516 (int)offset, RExC_precomp, RExC_precomp + offset); \
519 #define ckWARN2regdep(loc,m, a1) STMT_START { \
520 const IV offset = loc - RExC_precomp; \
521 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
523 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
526 #define ckWARN2reg(loc, m, a1) STMT_START { \
527 const IV offset = loc - RExC_precomp; \
528 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
529 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
532 #define vWARN3(loc, m, a1, a2) STMT_START { \
533 const IV offset = loc - RExC_precomp; \
534 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
535 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
538 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
539 const IV offset = loc - RExC_precomp; \
540 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
541 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
544 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
545 const IV offset = loc - RExC_precomp; \
546 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
547 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
550 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
551 const IV offset = loc - RExC_precomp; \
552 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
553 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
556 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
557 const IV offset = loc - RExC_precomp; \
558 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
559 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
563 /* Allow for side effects in s */
564 #define REGC(c,s) STMT_START { \
565 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
568 /* Macros for recording node offsets. 20001227 mjd@plover.com
569 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
570 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
571 * Element 0 holds the number n.
572 * Position is 1 indexed.
574 #ifndef RE_TRACK_PATTERN_OFFSETS
575 #define Set_Node_Offset_To_R(node,byte)
576 #define Set_Node_Offset(node,byte)
577 #define Set_Cur_Node_Offset
578 #define Set_Node_Length_To_R(node,len)
579 #define Set_Node_Length(node,len)
580 #define Set_Node_Cur_Length(node)
581 #define Node_Offset(n)
582 #define Node_Length(n)
583 #define Set_Node_Offset_Length(node,offset,len)
584 #define ProgLen(ri) ri->u.proglen
585 #define SetProgLen(ri,x) ri->u.proglen = x
587 #define ProgLen(ri) ri->u.offsets[0]
588 #define SetProgLen(ri,x) ri->u.offsets[0] = x
589 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
591 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
592 __LINE__, (int)(node), (int)(byte))); \
594 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
596 RExC_offsets[2*(node)-1] = (byte); \
601 #define Set_Node_Offset(node,byte) \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
603 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
605 #define Set_Node_Length_To_R(node,len) STMT_START { \
607 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
608 __LINE__, (int)(node), (int)(len))); \
610 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
612 RExC_offsets[2*(node)] = (len); \
617 #define Set_Node_Length(node,len) \
618 Set_Node_Length_To_R((node)-RExC_emit_start, len)
619 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
620 #define Set_Node_Cur_Length(node) \
621 Set_Node_Length(node, RExC_parse - parse_start)
623 /* Get offsets and lengths */
624 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
625 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
627 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
628 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
629 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
633 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
634 #define EXPERIMENTAL_INPLACESCAN
635 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
637 #define DEBUG_STUDYDATA(str,data,depth) \
638 DEBUG_OPTIMISE_MORE_r(if(data){ \
639 PerlIO_printf(Perl_debug_log, \
640 "%*s" str "Pos:%"IVdf"/%"IVdf \
641 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
642 (int)(depth)*2, "", \
643 (IV)((data)->pos_min), \
644 (IV)((data)->pos_delta), \
645 (UV)((data)->flags), \
646 (IV)((data)->whilem_c), \
647 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
648 is_inf ? "INF " : "" \
650 if ((data)->last_found) \
651 PerlIO_printf(Perl_debug_log, \
652 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
653 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
654 SvPVX_const((data)->last_found), \
655 (IV)((data)->last_end), \
656 (IV)((data)->last_start_min), \
657 (IV)((data)->last_start_max), \
658 ((data)->longest && \
659 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
660 SvPVX_const((data)->longest_fixed), \
661 (IV)((data)->offset_fixed), \
662 ((data)->longest && \
663 (data)->longest==&((data)->longest_float)) ? "*" : "", \
664 SvPVX_const((data)->longest_float), \
665 (IV)((data)->offset_float_min), \
666 (IV)((data)->offset_float_max) \
668 PerlIO_printf(Perl_debug_log,"\n"); \
671 static void clear_re(pTHX_ void *r);
673 /* Mark that we cannot extend a found fixed substring at this point.
674 Update the longest found anchored substring and the longest found
675 floating substrings if needed. */
678 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
680 const STRLEN l = CHR_SVLEN(data->last_found);
681 const STRLEN old_l = CHR_SVLEN(*data->longest);
682 GET_RE_DEBUG_FLAGS_DECL;
684 PERL_ARGS_ASSERT_SCAN_COMMIT;
686 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
687 SvSetMagicSV(*data->longest, data->last_found);
688 if (*data->longest == data->longest_fixed) {
689 data->offset_fixed = l ? data->last_start_min : data->pos_min;
690 if (data->flags & SF_BEFORE_EOL)
692 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
694 data->flags &= ~SF_FIX_BEFORE_EOL;
695 data->minlen_fixed=minlenp;
696 data->lookbehind_fixed=0;
698 else { /* *data->longest == data->longest_float */
699 data->offset_float_min = l ? data->last_start_min : data->pos_min;
700 data->offset_float_max = (l
701 ? data->last_start_max
702 : data->pos_min + data->pos_delta);
703 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
704 data->offset_float_max = I32_MAX;
705 if (data->flags & SF_BEFORE_EOL)
707 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
709 data->flags &= ~SF_FL_BEFORE_EOL;
710 data->minlen_float=minlenp;
711 data->lookbehind_float=0;
714 SvCUR_set(data->last_found, 0);
716 SV * const sv = data->last_found;
717 if (SvUTF8(sv) && SvMAGICAL(sv)) {
718 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
724 data->flags &= ~SF_BEFORE_EOL;
725 DEBUG_STUDYDATA("commit: ",data,0);
728 /* Can match anything (initialization) */
730 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
732 PERL_ARGS_ASSERT_CL_ANYTHING;
734 ANYOF_BITMAP_SETALL(cl);
735 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
736 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
738 /* If any portion of the regex is to operate under locale rules,
739 * initialization includes it. The reason this isn't done for all regexes
740 * is that the optimizer was written under the assumption that locale was
741 * all-or-nothing. Given the complexity and lack of documentation in the
742 * optimizer, and that there are inadequate test cases for locale, so many
743 * parts of it may not work properly, it is safest to avoid locale unless
745 if (RExC_contains_locale) {
746 ANYOF_CLASS_SETALL(cl); /* /l uses class */
747 cl->flags |= ANYOF_LOCALE;
750 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
754 /* Can match anything (initialization) */
756 S_cl_is_anything(const struct regnode_charclass_class *cl)
760 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
762 for (value = 0; value <= ANYOF_MAX; value += 2)
763 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
765 if (!(cl->flags & ANYOF_UNICODE_ALL))
767 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
772 /* Can match anything (initialization) */
774 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
776 PERL_ARGS_ASSERT_CL_INIT;
778 Zero(cl, 1, struct regnode_charclass_class);
780 cl_anything(pRExC_state, cl);
781 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
784 /* These two functions currently do the exact same thing */
785 #define cl_init_zero S_cl_init
787 /* 'AND' a given class with another one. Can create false positives. 'cl'
788 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
789 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
791 S_cl_and(struct regnode_charclass_class *cl,
792 const struct regnode_charclass_class *and_with)
794 PERL_ARGS_ASSERT_CL_AND;
796 assert(and_with->type == ANYOF);
798 /* I (khw) am not sure all these restrictions are necessary XXX */
799 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
800 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
801 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
802 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
803 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
806 if (and_with->flags & ANYOF_INVERT)
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] &= ~and_with->bitmap[i];
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] &= and_with->bitmap[i];
812 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
814 if (and_with->flags & ANYOF_INVERT) {
816 /* Here, the and'ed node is inverted. Get the AND of the flags that
817 * aren't affected by the inversion. Those that are affected are
818 * handled individually below */
819 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
820 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
821 cl->flags |= affected_flags;
823 /* We currently don't know how to deal with things that aren't in the
824 * bitmap, but we know that the intersection is no greater than what
825 * is already in cl, so let there be false positives that get sorted
826 * out after the synthetic start class succeeds, and the node is
827 * matched for real. */
829 /* The inversion of these two flags indicate that the resulting
830 * intersection doesn't have them */
831 if (and_with->flags & ANYOF_UNICODE_ALL) {
832 cl->flags &= ~ANYOF_UNICODE_ALL;
834 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
835 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
838 else { /* and'd node is not inverted */
839 U8 outside_bitmap_but_not_utf8; /* Temp variable */
841 if (! ANYOF_NONBITMAP(and_with)) {
843 /* Here 'and_with' doesn't match anything outside the bitmap
844 * (except possibly ANYOF_UNICODE_ALL), which means the
845 * intersection can't either, except for ANYOF_UNICODE_ALL, in
846 * which case we don't know what the intersection is, but it's no
847 * greater than what cl already has, so can just leave it alone,
848 * with possible false positives */
849 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
850 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
851 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
854 else if (! ANYOF_NONBITMAP(cl)) {
856 /* Here, 'and_with' does match something outside the bitmap, and cl
857 * doesn't have a list of things to match outside the bitmap. If
858 * cl can match all code points above 255, the intersection will
859 * be those above-255 code points that 'and_with' matches. If cl
860 * can't match all Unicode code points, it means that it can't
861 * match anything outside the bitmap (since the 'if' that got us
862 * into this block tested for that), so we leave the bitmap empty.
864 if (cl->flags & ANYOF_UNICODE_ALL) {
865 ARG_SET(cl, ARG(and_with));
867 /* and_with's ARG may match things that don't require UTF8.
868 * And now cl's will too, in spite of this being an 'and'. See
869 * the comments below about the kludge */
870 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
874 /* Here, both 'and_with' and cl match something outside the
875 * bitmap. Currently we do not do the intersection, so just match
876 * whatever cl had at the beginning. */
880 /* Take the intersection of the two sets of flags. However, the
881 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
882 * kludge around the fact that this flag is not treated like the others
883 * which are initialized in cl_anything(). The way the optimizer works
884 * is that the synthetic start class (SSC) is initialized to match
885 * anything, and then the first time a real node is encountered, its
886 * values are AND'd with the SSC's with the result being the values of
887 * the real node. However, there are paths through the optimizer where
888 * the AND never gets called, so those initialized bits are set
889 * inappropriately, which is not usually a big deal, as they just cause
890 * false positives in the SSC, which will just mean a probably
891 * imperceptible slow down in execution. However this bit has a
892 * higher false positive consequence in that it can cause utf8.pm,
893 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
894 * bigger slowdown and also causes significant extra memory to be used.
895 * In order to prevent this, the code now takes a different tack. The
896 * bit isn't set unless some part of the regular expression needs it,
897 * but once set it won't get cleared. This means that these extra
898 * modules won't get loaded unless there was some path through the
899 * pattern that would have required them anyway, and so any false
900 * positives that occur by not ANDing them out when they could be
901 * aren't as severe as they would be if we treated this bit like all
903 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
904 & ANYOF_NONBITMAP_NON_UTF8;
905 cl->flags &= and_with->flags;
906 cl->flags |= outside_bitmap_but_not_utf8;
910 /* 'OR' a given class with another one. Can create false positives. 'cl'
911 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
912 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
914 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
916 PERL_ARGS_ASSERT_CL_OR;
918 if (or_with->flags & ANYOF_INVERT) {
920 /* Here, the or'd node is to be inverted. This means we take the
921 * complement of everything not in the bitmap, but currently we don't
922 * know what that is, so give up and match anything */
923 if (ANYOF_NONBITMAP(or_with)) {
924 cl_anything(pRExC_state, cl);
927 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
928 * <= (B1 | !B2) | (CL1 | !CL2)
929 * which is wasteful if CL2 is small, but we ignore CL2:
930 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
931 * XXXX Can we handle case-fold? Unclear:
932 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
933 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
935 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
936 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
937 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
940 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
941 cl->bitmap[i] |= ~or_with->bitmap[i];
942 } /* XXXX: logic is complicated otherwise */
944 cl_anything(pRExC_state, cl);
947 /* And, we can just take the union of the flags that aren't affected
948 * by the inversion */
949 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
951 /* For the remaining flags:
952 ANYOF_UNICODE_ALL and inverted means to not match anything above
953 255, which means that the union with cl should just be
954 what cl has in it, so can ignore this flag
955 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
956 is 127-255 to match them, but then invert that, so the
957 union with cl should just be what cl has in it, so can
960 } else { /* 'or_with' is not inverted */
961 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
962 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
963 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
964 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
967 /* OR char bitmap and class bitmap separately */
968 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
969 cl->bitmap[i] |= or_with->bitmap[i];
970 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
971 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
972 cl->classflags[i] |= or_with->classflags[i];
973 cl->flags |= ANYOF_CLASS;
976 else { /* XXXX: logic is complicated, leave it along for a moment. */
977 cl_anything(pRExC_state, cl);
980 if (ANYOF_NONBITMAP(or_with)) {
982 /* Use the added node's outside-the-bit-map match if there isn't a
983 * conflict. If there is a conflict (both nodes match something
984 * outside the bitmap, but what they match outside is not the same
985 * pointer, and hence not easily compared until XXX we extend
986 * inversion lists this far), give up and allow the start class to
987 * match everything outside the bitmap. If that stuff is all above
988 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
989 if (! ANYOF_NONBITMAP(cl)) {
990 ARG_SET(cl, ARG(or_with));
992 else if (ARG(cl) != ARG(or_with)) {
994 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
995 cl_anything(pRExC_state, cl);
998 cl->flags |= ANYOF_UNICODE_ALL;
1003 /* Take the union */
1004 cl->flags |= or_with->flags;
1008 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1009 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1010 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1011 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1016 dump_trie(trie,widecharmap,revcharmap)
1017 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1018 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1020 These routines dump out a trie in a somewhat readable format.
1021 The _interim_ variants are used for debugging the interim
1022 tables that are used to generate the final compressed
1023 representation which is what dump_trie expects.
1025 Part of the reason for their existence is to provide a form
1026 of documentation as to how the different representations function.
1031 Dumps the final compressed table form of the trie to Perl_debug_log.
1032 Used for debugging make_trie().
1036 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1037 AV *revcharmap, U32 depth)
1040 SV *sv=sv_newmortal();
1041 int colwidth= widecharmap ? 6 : 4;
1043 GET_RE_DEBUG_FLAGS_DECL;
1045 PERL_ARGS_ASSERT_DUMP_TRIE;
1047 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1048 (int)depth * 2 + 2,"",
1049 "Match","Base","Ofs" );
1051 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1052 SV ** const tmp = av_fetch( revcharmap, state, 0);
1054 PerlIO_printf( Perl_debug_log, "%*s",
1056 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1057 PL_colors[0], PL_colors[1],
1058 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1059 PERL_PV_ESCAPE_FIRSTCHAR
1064 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1065 (int)depth * 2 + 2,"");
1067 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1068 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1069 PerlIO_printf( Perl_debug_log, "\n");
1071 for( state = 1 ; state < trie->statecount ; state++ ) {
1072 const U32 base = trie->states[ state ].trans.base;
1074 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1076 if ( trie->states[ state ].wordnum ) {
1077 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1079 PerlIO_printf( Perl_debug_log, "%6s", "" );
1082 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1087 while( ( base + ofs < trie->uniquecharcount ) ||
1088 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1089 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1092 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1094 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1095 if ( ( base + ofs >= trie->uniquecharcount ) &&
1096 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1097 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1099 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1101 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1103 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1107 PerlIO_printf( Perl_debug_log, "]");
1110 PerlIO_printf( Perl_debug_log, "\n" );
1112 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1113 for (word=1; word <= trie->wordcount; word++) {
1114 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1115 (int)word, (int)(trie->wordinfo[word].prev),
1116 (int)(trie->wordinfo[word].len));
1118 PerlIO_printf(Perl_debug_log, "\n" );
1121 Dumps a fully constructed but uncompressed trie in list form.
1122 List tries normally only are used for construction when the number of
1123 possible chars (trie->uniquecharcount) is very high.
1124 Used for debugging make_trie().
1127 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1128 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1132 SV *sv=sv_newmortal();
1133 int colwidth= widecharmap ? 6 : 4;
1134 GET_RE_DEBUG_FLAGS_DECL;
1136 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1138 /* print out the table precompression. */
1139 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1140 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1141 "------:-----+-----------------\n" );
1143 for( state=1 ; state < next_alloc ; state ++ ) {
1146 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1147 (int)depth * 2 + 2,"", (UV)state );
1148 if ( ! trie->states[ state ].wordnum ) {
1149 PerlIO_printf( Perl_debug_log, "%5s| ","");
1151 PerlIO_printf( Perl_debug_log, "W%4x| ",
1152 trie->states[ state ].wordnum
1155 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1156 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1158 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1160 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1161 PL_colors[0], PL_colors[1],
1162 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1163 PERL_PV_ESCAPE_FIRSTCHAR
1165 TRIE_LIST_ITEM(state,charid).forid,
1166 (UV)TRIE_LIST_ITEM(state,charid).newstate
1169 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1170 (int)((depth * 2) + 14), "");
1173 PerlIO_printf( Perl_debug_log, "\n");
1178 Dumps a fully constructed but uncompressed trie in table form.
1179 This is the normal DFA style state transition table, with a few
1180 twists to facilitate compression later.
1181 Used for debugging make_trie().
1184 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1185 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1190 SV *sv=sv_newmortal();
1191 int colwidth= widecharmap ? 6 : 4;
1192 GET_RE_DEBUG_FLAGS_DECL;
1194 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1197 print out the table precompression so that we can do a visual check
1198 that they are identical.
1201 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1203 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1206 PerlIO_printf( Perl_debug_log, "%*s",
1208 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1209 PL_colors[0], PL_colors[1],
1210 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1211 PERL_PV_ESCAPE_FIRSTCHAR
1217 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1219 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1220 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1223 PerlIO_printf( Perl_debug_log, "\n" );
1225 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1227 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1228 (int)depth * 2 + 2,"",
1229 (UV)TRIE_NODENUM( state ) );
1231 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1232 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1234 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1236 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1238 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1239 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1241 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1242 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1250 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1251 startbranch: the first branch in the whole branch sequence
1252 first : start branch of sequence of branch-exact nodes.
1253 May be the same as startbranch
1254 last : Thing following the last branch.
1255 May be the same as tail.
1256 tail : item following the branch sequence
1257 count : words in the sequence
1258 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1259 depth : indent depth
1261 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1263 A trie is an N'ary tree where the branches are determined by digital
1264 decomposition of the key. IE, at the root node you look up the 1st character and
1265 follow that branch repeat until you find the end of the branches. Nodes can be
1266 marked as "accepting" meaning they represent a complete word. Eg:
1270 would convert into the following structure. Numbers represent states, letters
1271 following numbers represent valid transitions on the letter from that state, if
1272 the number is in square brackets it represents an accepting state, otherwise it
1273 will be in parenthesis.
1275 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1279 (1) +-i->(6)-+-s->[7]
1281 +-s->(3)-+-h->(4)-+-e->[5]
1283 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1285 This shows that when matching against the string 'hers' we will begin at state 1
1286 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1287 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1288 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1289 single traverse. We store a mapping from accepting to state to which word was
1290 matched, and then when we have multiple possibilities we try to complete the
1291 rest of the regex in the order in which they occured in the alternation.
1293 The only prior NFA like behaviour that would be changed by the TRIE support is
1294 the silent ignoring of duplicate alternations which are of the form:
1296 / (DUPE|DUPE) X? (?{ ... }) Y /x
1298 Thus EVAL blocks following a trie may be called a different number of times with
1299 and without the optimisation. With the optimisations dupes will be silently
1300 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1301 the following demonstrates:
1303 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1305 which prints out 'word' three times, but
1307 'words'=~/(word|word|word)(?{ print $1 })S/
1309 which doesnt print it out at all. This is due to other optimisations kicking in.
1311 Example of what happens on a structural level:
1313 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1315 1: CURLYM[1] {1,32767}(18)
1326 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1327 and should turn into:
1329 1: CURLYM[1] {1,32767}(18)
1331 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1339 Cases where tail != last would be like /(?foo|bar)baz/:
1349 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1350 and would end up looking like:
1353 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1360 d = uvuni_to_utf8_flags(d, uv, 0);
1362 is the recommended Unicode-aware way of saying
1367 #define TRIE_STORE_REVCHAR \
1370 SV *zlopp = newSV(2); \
1371 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1372 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1373 SvCUR_set(zlopp, kapow - flrbbbbb); \
1376 av_push(revcharmap, zlopp); \
1378 char ooooff = (char)uvc; \
1379 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1383 #define TRIE_READ_CHAR STMT_START { \
1387 if ( foldlen > 0 ) { \
1388 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1393 len = UTF8SKIP(uc);\
1394 uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
1395 foldlen -= UNISKIP( uvc ); \
1396 scan = foldbuf + UNISKIP( uvc ); \
1399 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1409 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1410 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1411 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1412 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1414 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1415 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1416 TRIE_LIST_CUR( state )++; \
1419 #define TRIE_LIST_NEW(state) STMT_START { \
1420 Newxz( trie->states[ state ].trans.list, \
1421 4, reg_trie_trans_le ); \
1422 TRIE_LIST_CUR( state ) = 1; \
1423 TRIE_LIST_LEN( state ) = 4; \
1426 #define TRIE_HANDLE_WORD(state) STMT_START { \
1427 U16 dupe= trie->states[ state ].wordnum; \
1428 regnode * const noper_next = regnext( noper ); \
1431 /* store the word for dumping */ \
1433 if (OP(noper) != NOTHING) \
1434 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1436 tmp = newSVpvn_utf8( "", 0, UTF ); \
1437 av_push( trie_words, tmp ); \
1441 trie->wordinfo[curword].prev = 0; \
1442 trie->wordinfo[curword].len = wordlen; \
1443 trie->wordinfo[curword].accept = state; \
1445 if ( noper_next < tail ) { \
1447 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1448 trie->jump[curword] = (U16)(noper_next - convert); \
1450 jumper = noper_next; \
1452 nextbranch= regnext(cur); \
1456 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1457 /* chain, so that when the bits of chain are later */\
1458 /* linked together, the dups appear in the chain */\
1459 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1460 trie->wordinfo[dupe].prev = curword; \
1462 /* we haven't inserted this word yet. */ \
1463 trie->states[ state ].wordnum = curword; \
1468 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1469 ( ( base + charid >= ucharcount \
1470 && base + charid < ubound \
1471 && state == trie->trans[ base - ucharcount + charid ].check \
1472 && trie->trans[ base - ucharcount + charid ].next ) \
1473 ? trie->trans[ base - ucharcount + charid ].next \
1474 : ( state==1 ? special : 0 ) \
1478 #define MADE_JUMP_TRIE 2
1479 #define MADE_EXACT_TRIE 4
1482 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1485 /* first pass, loop through and scan words */
1486 reg_trie_data *trie;
1487 HV *widecharmap = NULL;
1488 AV *revcharmap = newAV();
1490 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1495 regnode *jumper = NULL;
1496 regnode *nextbranch = NULL;
1497 regnode *convert = NULL;
1498 U32 *prev_states; /* temp array mapping each state to previous one */
1499 /* we just use folder as a flag in utf8 */
1500 const U8 * folder = NULL;
1503 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1504 AV *trie_words = NULL;
1505 /* along with revcharmap, this only used during construction but both are
1506 * useful during debugging so we store them in the struct when debugging.
1509 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1510 STRLEN trie_charcount=0;
1512 SV *re_trie_maxbuff;
1513 GET_RE_DEBUG_FLAGS_DECL;
1515 PERL_ARGS_ASSERT_MAKE_TRIE;
1517 PERL_UNUSED_ARG(depth);
1523 case EXACTFU: folder = PL_fold_latin1; break;
1524 case EXACTF: folder = PL_fold; break;
1525 case EXACTFL: folder = PL_fold_locale; break;
1526 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1529 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1531 trie->startstate = 1;
1532 trie->wordcount = word_count;
1533 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1534 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1535 if (!(UTF && folder))
1536 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1537 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1538 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1541 trie_words = newAV();
1544 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1545 if (!SvIOK(re_trie_maxbuff)) {
1546 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1549 PerlIO_printf( Perl_debug_log,
1550 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1551 (int)depth * 2 + 2, "",
1552 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1553 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1557 /* Find the node we are going to overwrite */
1558 if ( first == startbranch && OP( last ) != BRANCH ) {
1559 /* whole branch chain */
1562 /* branch sub-chain */
1563 convert = NEXTOPER( first );
1566 /* -- First loop and Setup --
1568 We first traverse the branches and scan each word to determine if it
1569 contains widechars, and how many unique chars there are, this is
1570 important as we have to build a table with at least as many columns as we
1573 We use an array of integers to represent the character codes 0..255
1574 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1575 native representation of the character value as the key and IV's for the
1578 *TODO* If we keep track of how many times each character is used we can
1579 remap the columns so that the table compression later on is more
1580 efficient in terms of memory by ensuring the most common value is in the
1581 middle and the least common are on the outside. IMO this would be better
1582 than a most to least common mapping as theres a decent chance the most
1583 common letter will share a node with the least common, meaning the node
1584 will not be compressible. With a middle is most common approach the worst
1585 case is when we have the least common nodes twice.
1589 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1590 regnode * const noper = NEXTOPER( cur );
1591 const U8 *uc = (U8*)STRING( noper );
1592 const U8 * const e = uc + STR_LEN( noper );
1594 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1595 const U8 *scan = (U8*)NULL;
1596 U32 wordlen = 0; /* required init */
1598 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1600 if (OP(noper) == NOTHING) {
1604 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1605 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1606 regardless of encoding */
1608 for ( ; uc < e ; uc += len ) {
1609 TRIE_CHARCOUNT(trie)++;
1613 if ( !trie->charmap[ uvc ] ) {
1614 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1616 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1620 /* store the codepoint in the bitmap, and its folded
1622 TRIE_BITMAP_SET(trie,uvc);
1624 /* store the folded codepoint */
1625 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1628 /* store first byte of utf8 representation of
1629 variant codepoints */
1630 if (! UNI_IS_INVARIANT(uvc)) {
1631 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1634 set_bit = 0; /* We've done our bit :-) */
1639 widecharmap = newHV();
1641 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1644 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1646 if ( !SvTRUE( *svpp ) ) {
1647 sv_setiv( *svpp, ++trie->uniquecharcount );
1652 if( cur == first ) {
1655 } else if (chars < trie->minlen) {
1657 } else if (chars > trie->maxlen) {
1661 } /* end first pass */
1662 DEBUG_TRIE_COMPILE_r(
1663 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1664 (int)depth * 2 + 2,"",
1665 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1666 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1667 (int)trie->minlen, (int)trie->maxlen )
1671 We now know what we are dealing with in terms of unique chars and
1672 string sizes so we can calculate how much memory a naive
1673 representation using a flat table will take. If it's over a reasonable
1674 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1675 conservative but potentially much slower representation using an array
1678 At the end we convert both representations into the same compressed
1679 form that will be used in regexec.c for matching with. The latter
1680 is a form that cannot be used to construct with but has memory
1681 properties similar to the list form and access properties similar
1682 to the table form making it both suitable for fast searches and
1683 small enough that its feasable to store for the duration of a program.
1685 See the comment in the code where the compressed table is produced
1686 inplace from the flat tabe representation for an explanation of how
1687 the compression works.
1692 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1695 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1697 Second Pass -- Array Of Lists Representation
1699 Each state will be represented by a list of charid:state records
1700 (reg_trie_trans_le) the first such element holds the CUR and LEN
1701 points of the allocated array. (See defines above).
1703 We build the initial structure using the lists, and then convert
1704 it into the compressed table form which allows faster lookups
1705 (but cant be modified once converted).
1708 STRLEN transcount = 1;
1710 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1711 "%*sCompiling trie using list compiler\n",
1712 (int)depth * 2 + 2, ""));
1714 trie->states = (reg_trie_state *)
1715 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1716 sizeof(reg_trie_state) );
1720 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1722 regnode * const noper = NEXTOPER( cur );
1723 U8 *uc = (U8*)STRING( noper );
1724 const U8 * const e = uc + STR_LEN( noper );
1725 U32 state = 1; /* required init */
1726 U16 charid = 0; /* sanity init */
1727 U8 *scan = (U8*)NULL; /* sanity init */
1728 STRLEN foldlen = 0; /* required init */
1729 U32 wordlen = 0; /* required init */
1730 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1732 if (OP(noper) != NOTHING) {
1733 for ( ; uc < e ; uc += len ) {
1738 charid = trie->charmap[ uvc ];
1740 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1744 charid=(U16)SvIV( *svpp );
1747 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1754 if ( !trie->states[ state ].trans.list ) {
1755 TRIE_LIST_NEW( state );
1757 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1758 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1759 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1764 newstate = next_alloc++;
1765 prev_states[newstate] = state;
1766 TRIE_LIST_PUSH( state, charid, newstate );
1771 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1775 TRIE_HANDLE_WORD(state);
1777 } /* end second pass */
1779 /* next alloc is the NEXT state to be allocated */
1780 trie->statecount = next_alloc;
1781 trie->states = (reg_trie_state *)
1782 PerlMemShared_realloc( trie->states,
1784 * sizeof(reg_trie_state) );
1786 /* and now dump it out before we compress it */
1787 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1788 revcharmap, next_alloc,
1792 trie->trans = (reg_trie_trans *)
1793 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1800 for( state=1 ; state < next_alloc ; state ++ ) {
1804 DEBUG_TRIE_COMPILE_MORE_r(
1805 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1809 if (trie->states[state].trans.list) {
1810 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1814 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1815 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1816 if ( forid < minid ) {
1818 } else if ( forid > maxid ) {
1822 if ( transcount < tp + maxid - minid + 1) {
1824 trie->trans = (reg_trie_trans *)
1825 PerlMemShared_realloc( trie->trans,
1827 * sizeof(reg_trie_trans) );
1828 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1830 base = trie->uniquecharcount + tp - minid;
1831 if ( maxid == minid ) {
1833 for ( ; zp < tp ; zp++ ) {
1834 if ( ! trie->trans[ zp ].next ) {
1835 base = trie->uniquecharcount + zp - minid;
1836 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1837 trie->trans[ zp ].check = state;
1843 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1844 trie->trans[ tp ].check = state;
1849 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1850 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1851 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1852 trie->trans[ tid ].check = state;
1854 tp += ( maxid - minid + 1 );
1856 Safefree(trie->states[ state ].trans.list);
1859 DEBUG_TRIE_COMPILE_MORE_r(
1860 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1863 trie->states[ state ].trans.base=base;
1865 trie->lasttrans = tp + 1;
1869 Second Pass -- Flat Table Representation.
1871 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1872 We know that we will need Charcount+1 trans at most to store the data
1873 (one row per char at worst case) So we preallocate both structures
1874 assuming worst case.
1876 We then construct the trie using only the .next slots of the entry
1879 We use the .check field of the first entry of the node temporarily to
1880 make compression both faster and easier by keeping track of how many non
1881 zero fields are in the node.
1883 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1886 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1887 number representing the first entry of the node, and state as a
1888 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1889 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1890 are 2 entrys per node. eg:
1898 The table is internally in the right hand, idx form. However as we also
1899 have to deal with the states array which is indexed by nodenum we have to
1900 use TRIE_NODENUM() to convert.
1903 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1904 "%*sCompiling trie using table compiler\n",
1905 (int)depth * 2 + 2, ""));
1907 trie->trans = (reg_trie_trans *)
1908 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1909 * trie->uniquecharcount + 1,
1910 sizeof(reg_trie_trans) );
1911 trie->states = (reg_trie_state *)
1912 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1913 sizeof(reg_trie_state) );
1914 next_alloc = trie->uniquecharcount + 1;
1917 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1919 regnode * const noper = NEXTOPER( cur );
1920 const U8 *uc = (U8*)STRING( noper );
1921 const U8 * const e = uc + STR_LEN( noper );
1923 U32 state = 1; /* required init */
1925 U16 charid = 0; /* sanity init */
1926 U32 accept_state = 0; /* sanity init */
1927 U8 *scan = (U8*)NULL; /* sanity init */
1929 STRLEN foldlen = 0; /* required init */
1930 U32 wordlen = 0; /* required init */
1931 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1933 if ( OP(noper) != NOTHING ) {
1934 for ( ; uc < e ; uc += len ) {
1939 charid = trie->charmap[ uvc ];
1941 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1942 charid = svpp ? (U16)SvIV(*svpp) : 0;
1946 if ( !trie->trans[ state + charid ].next ) {
1947 trie->trans[ state + charid ].next = next_alloc;
1948 trie->trans[ state ].check++;
1949 prev_states[TRIE_NODENUM(next_alloc)]
1950 = TRIE_NODENUM(state);
1951 next_alloc += trie->uniquecharcount;
1953 state = trie->trans[ state + charid ].next;
1955 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1957 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1960 accept_state = TRIE_NODENUM( state );
1961 TRIE_HANDLE_WORD(accept_state);
1963 } /* end second pass */
1965 /* and now dump it out before we compress it */
1966 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1968 next_alloc, depth+1));
1972 * Inplace compress the table.*
1974 For sparse data sets the table constructed by the trie algorithm will
1975 be mostly 0/FAIL transitions or to put it another way mostly empty.
1976 (Note that leaf nodes will not contain any transitions.)
1978 This algorithm compresses the tables by eliminating most such
1979 transitions, at the cost of a modest bit of extra work during lookup:
1981 - Each states[] entry contains a .base field which indicates the
1982 index in the state[] array wheres its transition data is stored.
1984 - If .base is 0 there are no valid transitions from that node.
1986 - If .base is nonzero then charid is added to it to find an entry in
1989 -If trans[states[state].base+charid].check!=state then the
1990 transition is taken to be a 0/Fail transition. Thus if there are fail
1991 transitions at the front of the node then the .base offset will point
1992 somewhere inside the previous nodes data (or maybe even into a node
1993 even earlier), but the .check field determines if the transition is
1997 The following process inplace converts the table to the compressed
1998 table: We first do not compress the root node 1,and mark all its
1999 .check pointers as 1 and set its .base pointer as 1 as well. This
2000 allows us to do a DFA construction from the compressed table later,
2001 and ensures that any .base pointers we calculate later are greater
2004 - We set 'pos' to indicate the first entry of the second node.
2006 - We then iterate over the columns of the node, finding the first and
2007 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2008 and set the .check pointers accordingly, and advance pos
2009 appropriately and repreat for the next node. Note that when we copy
2010 the next pointers we have to convert them from the original
2011 NODEIDX form to NODENUM form as the former is not valid post
2014 - If a node has no transitions used we mark its base as 0 and do not
2015 advance the pos pointer.
2017 - If a node only has one transition we use a second pointer into the
2018 structure to fill in allocated fail transitions from other states.
2019 This pointer is independent of the main pointer and scans forward
2020 looking for null transitions that are allocated to a state. When it
2021 finds one it writes the single transition into the "hole". If the
2022 pointer doesnt find one the single transition is appended as normal.
2024 - Once compressed we can Renew/realloc the structures to release the
2027 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2028 specifically Fig 3.47 and the associated pseudocode.
2032 const U32 laststate = TRIE_NODENUM( next_alloc );
2035 trie->statecount = laststate;
2037 for ( state = 1 ; state < laststate ; state++ ) {
2039 const U32 stateidx = TRIE_NODEIDX( state );
2040 const U32 o_used = trie->trans[ stateidx ].check;
2041 U32 used = trie->trans[ stateidx ].check;
2042 trie->trans[ stateidx ].check = 0;
2044 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2045 if ( flag || trie->trans[ stateidx + charid ].next ) {
2046 if ( trie->trans[ stateidx + charid ].next ) {
2048 for ( ; zp < pos ; zp++ ) {
2049 if ( ! trie->trans[ zp ].next ) {
2053 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2054 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2055 trie->trans[ zp ].check = state;
2056 if ( ++zp > pos ) pos = zp;
2063 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2065 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2066 trie->trans[ pos ].check = state;
2071 trie->lasttrans = pos + 1;
2072 trie->states = (reg_trie_state *)
2073 PerlMemShared_realloc( trie->states, laststate
2074 * sizeof(reg_trie_state) );
2075 DEBUG_TRIE_COMPILE_MORE_r(
2076 PerlIO_printf( Perl_debug_log,
2077 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2078 (int)depth * 2 + 2,"",
2079 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2082 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2085 } /* end table compress */
2087 DEBUG_TRIE_COMPILE_MORE_r(
2088 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2089 (int)depth * 2 + 2, "",
2090 (UV)trie->statecount,
2091 (UV)trie->lasttrans)
2093 /* resize the trans array to remove unused space */
2094 trie->trans = (reg_trie_trans *)
2095 PerlMemShared_realloc( trie->trans, trie->lasttrans
2096 * sizeof(reg_trie_trans) );
2098 { /* Modify the program and insert the new TRIE node */
2099 U8 nodetype =(U8)(flags & 0xFF);
2103 regnode *optimize = NULL;
2104 #ifdef RE_TRACK_PATTERN_OFFSETS
2107 U32 mjd_nodelen = 0;
2108 #endif /* RE_TRACK_PATTERN_OFFSETS */
2109 #endif /* DEBUGGING */
2111 This means we convert either the first branch or the first Exact,
2112 depending on whether the thing following (in 'last') is a branch
2113 or not and whther first is the startbranch (ie is it a sub part of
2114 the alternation or is it the whole thing.)
2115 Assuming its a sub part we convert the EXACT otherwise we convert
2116 the whole branch sequence, including the first.
2118 /* Find the node we are going to overwrite */
2119 if ( first != startbranch || OP( last ) == BRANCH ) {
2120 /* branch sub-chain */
2121 NEXT_OFF( first ) = (U16)(last - first);
2122 #ifdef RE_TRACK_PATTERN_OFFSETS
2124 mjd_offset= Node_Offset((convert));
2125 mjd_nodelen= Node_Length((convert));
2128 /* whole branch chain */
2130 #ifdef RE_TRACK_PATTERN_OFFSETS
2133 const regnode *nop = NEXTOPER( convert );
2134 mjd_offset= Node_Offset((nop));
2135 mjd_nodelen= Node_Length((nop));
2139 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2140 (int)depth * 2 + 2, "",
2141 (UV)mjd_offset, (UV)mjd_nodelen)
2144 /* But first we check to see if there is a common prefix we can
2145 split out as an EXACT and put in front of the TRIE node. */
2146 trie->startstate= 1;
2147 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2149 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2153 const U32 base = trie->states[ state ].trans.base;
2155 if ( trie->states[state].wordnum )
2158 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2159 if ( ( base + ofs >= trie->uniquecharcount ) &&
2160 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2161 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2163 if ( ++count > 1 ) {
2164 SV **tmp = av_fetch( revcharmap, ofs, 0);
2165 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2166 if ( state == 1 ) break;
2168 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2170 PerlIO_printf(Perl_debug_log,
2171 "%*sNew Start State=%"UVuf" Class: [",
2172 (int)depth * 2 + 2, "",
2175 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2176 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2178 TRIE_BITMAP_SET(trie,*ch);
2180 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2182 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2186 TRIE_BITMAP_SET(trie,*ch);
2188 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2189 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2195 SV **tmp = av_fetch( revcharmap, idx, 0);
2197 char *ch = SvPV( *tmp, len );
2199 SV *sv=sv_newmortal();
2200 PerlIO_printf( Perl_debug_log,
2201 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2202 (int)depth * 2 + 2, "",
2204 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2205 PL_colors[0], PL_colors[1],
2206 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2207 PERL_PV_ESCAPE_FIRSTCHAR
2212 OP( convert ) = nodetype;
2213 str=STRING(convert);
2216 STR_LEN(convert) += len;
2222 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2227 trie->prefixlen = (state-1);
2229 regnode *n = convert+NODE_SZ_STR(convert);
2230 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2231 trie->startstate = state;
2232 trie->minlen -= (state - 1);
2233 trie->maxlen -= (state - 1);
2235 /* At least the UNICOS C compiler choked on this
2236 * being argument to DEBUG_r(), so let's just have
2239 #ifdef PERL_EXT_RE_BUILD
2245 regnode *fix = convert;
2246 U32 word = trie->wordcount;
2248 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2249 while( ++fix < n ) {
2250 Set_Node_Offset_Length(fix, 0, 0);
2253 SV ** const tmp = av_fetch( trie_words, word, 0 );
2255 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2256 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2258 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2266 NEXT_OFF(convert) = (U16)(tail - convert);
2267 DEBUG_r(optimize= n);
2273 if ( trie->maxlen ) {
2274 NEXT_OFF( convert ) = (U16)(tail - convert);
2275 ARG_SET( convert, data_slot );
2276 /* Store the offset to the first unabsorbed branch in
2277 jump[0], which is otherwise unused by the jump logic.
2278 We use this when dumping a trie and during optimisation. */
2280 trie->jump[0] = (U16)(nextbranch - convert);
2282 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2283 * and there is a bitmap
2284 * and the first "jump target" node we found leaves enough room
2285 * then convert the TRIE node into a TRIEC node, with the bitmap
2286 * embedded inline in the opcode - this is hypothetically faster.
2288 if ( !trie->states[trie->startstate].wordnum
2290 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2292 OP( convert ) = TRIEC;
2293 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2294 PerlMemShared_free(trie->bitmap);
2297 OP( convert ) = TRIE;
2299 /* store the type in the flags */
2300 convert->flags = nodetype;
2304 + regarglen[ OP( convert ) ];
2306 /* XXX We really should free up the resource in trie now,
2307 as we won't use them - (which resources?) dmq */
2309 /* needed for dumping*/
2310 DEBUG_r(if (optimize) {
2311 regnode *opt = convert;
2313 while ( ++opt < optimize) {
2314 Set_Node_Offset_Length(opt,0,0);
2317 Try to clean up some of the debris left after the
2320 while( optimize < jumper ) {
2321 mjd_nodelen += Node_Length((optimize));
2322 OP( optimize ) = OPTIMIZED;
2323 Set_Node_Offset_Length(optimize,0,0);
2326 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2328 } /* end node insert */
2329 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2331 /* Finish populating the prev field of the wordinfo array. Walk back
2332 * from each accept state until we find another accept state, and if
2333 * so, point the first word's .prev field at the second word. If the
2334 * second already has a .prev field set, stop now. This will be the
2335 * case either if we've already processed that word's accept state,
2336 * or that state had multiple words, and the overspill words were
2337 * already linked up earlier.
2344 for (word=1; word <= trie->wordcount; word++) {
2346 if (trie->wordinfo[word].prev)
2348 state = trie->wordinfo[word].accept;
2350 state = prev_states[state];
2353 prev = trie->states[state].wordnum;
2357 trie->wordinfo[word].prev = prev;
2359 Safefree(prev_states);
2363 /* and now dump out the compressed format */
2364 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2366 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2368 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2369 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2371 SvREFCNT_dec(revcharmap);
2375 : trie->startstate>1
2381 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2383 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2385 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2386 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2389 We find the fail state for each state in the trie, this state is the longest proper
2390 suffix of the current state's 'word' that is also a proper prefix of another word in our
2391 trie. State 1 represents the word '' and is thus the default fail state. This allows
2392 the DFA not to have to restart after its tried and failed a word at a given point, it
2393 simply continues as though it had been matching the other word in the first place.
2395 'abcdgu'=~/abcdefg|cdgu/
2396 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2397 fail, which would bring us to the state representing 'd' in the second word where we would
2398 try 'g' and succeed, proceeding to match 'cdgu'.
2400 /* add a fail transition */
2401 const U32 trie_offset = ARG(source);
2402 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2404 const U32 ucharcount = trie->uniquecharcount;
2405 const U32 numstates = trie->statecount;
2406 const U32 ubound = trie->lasttrans + ucharcount;
2410 U32 base = trie->states[ 1 ].trans.base;
2413 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2414 GET_RE_DEBUG_FLAGS_DECL;
2416 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2418 PERL_UNUSED_ARG(depth);
2422 ARG_SET( stclass, data_slot );
2423 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2424 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2425 aho->trie=trie_offset;
2426 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2427 Copy( trie->states, aho->states, numstates, reg_trie_state );
2428 Newxz( q, numstates, U32);
2429 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2432 /* initialize fail[0..1] to be 1 so that we always have
2433 a valid final fail state */
2434 fail[ 0 ] = fail[ 1 ] = 1;
2436 for ( charid = 0; charid < ucharcount ; charid++ ) {
2437 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2439 q[ q_write ] = newstate;
2440 /* set to point at the root */
2441 fail[ q[ q_write++ ] ]=1;
2444 while ( q_read < q_write) {
2445 const U32 cur = q[ q_read++ % numstates ];
2446 base = trie->states[ cur ].trans.base;
2448 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2449 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2451 U32 fail_state = cur;
2454 fail_state = fail[ fail_state ];
2455 fail_base = aho->states[ fail_state ].trans.base;
2456 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2458 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2459 fail[ ch_state ] = fail_state;
2460 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2462 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2464 q[ q_write++ % numstates] = ch_state;
2468 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2469 when we fail in state 1, this allows us to use the
2470 charclass scan to find a valid start char. This is based on the principle
2471 that theres a good chance the string being searched contains lots of stuff
2472 that cant be a start char.
2474 fail[ 0 ] = fail[ 1 ] = 0;
2475 DEBUG_TRIE_COMPILE_r({
2476 PerlIO_printf(Perl_debug_log,
2477 "%*sStclass Failtable (%"UVuf" states): 0",
2478 (int)(depth * 2), "", (UV)numstates
2480 for( q_read=1; q_read<numstates; q_read++ ) {
2481 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2483 PerlIO_printf(Perl_debug_log, "\n");
2486 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2491 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2492 * These need to be revisited when a newer toolchain becomes available.
2494 #if defined(__sparc64__) && defined(__GNUC__)
2495 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2496 # undef SPARC64_GCC_WORKAROUND
2497 # define SPARC64_GCC_WORKAROUND 1
2501 #define DEBUG_PEEP(str,scan,depth) \
2502 DEBUG_OPTIMISE_r({if (scan){ \
2503 SV * const mysv=sv_newmortal(); \
2504 regnode *Next = regnext(scan); \
2505 regprop(RExC_rx, mysv, scan); \
2506 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2507 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2508 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2512 /* The below joins as many adjacent EXACTish nodes as possible into a single
2513 * one, and looks for problematic sequences of characters whose folds vs.
2514 * non-folds have sufficiently different lengths, that the optimizer would be
2515 * fooled into rejecting legitimate matches of them, and the trie construction
2516 * code can't cope with them. The joining is only done if:
2517 * 1) there is room in the current conglomerated node to entirely contain the
2519 * 2) they are the exact same node type
2521 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2522 * these get optimized out
2524 * If there are problematic code sequences, *min_subtract is set to the delta
2525 * that the minimum size of the node can be less than its actual size. And,
2526 * the node type of the result is changed to reflect that it contains these
2529 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2530 * and contains LATIN SMALL LETTER SHARP S
2532 * This is as good a place as any to discuss the design of handling these
2533 * problematic sequences. It's been wrong in Perl for a very long time. There
2534 * are three code points in Unicode whose folded lengths differ so much from
2535 * the un-folded lengths that it causes problems for the optimizer and trie
2536 * construction. Why only these are problematic, and not others where lengths
2537 * also differ is something I (khw) do not understand. New versions of Unicode
2538 * might add more such code points. Hopefully the logic in fold_grind.t that
2539 * figures out what to test (in part by verifying that each size-combination
2540 * gets tested) will catch any that do come along, so they can be added to the
2541 * special handling below. The chances of new ones are actually rather small,
2542 * as most, if not all, of the world's scripts that have casefolding have
2543 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2544 * made to allow compatibility with pre-existing standards, and almost all of
2545 * those have already been dealt with. These would otherwise be the most
2546 * likely candidates for generating further tricky sequences. In other words,
2547 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2548 * with pre-existing standards, and there aren't many of those left.
2550 * The previous designs for dealing with these involved assigning a special
2551 * node for them. This approach doesn't work, as evidenced by this example:
2552 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2553 * Both these fold to "sss", but if the pattern is parsed to create a node of
2554 * that would match just the \xDF, it won't be able to handle the case where a
2555 * successful match would have to cross the node's boundary. The new approach
2556 * that hopefully generally solves the problem generates an EXACTFU_SS node
2559 * There are a number of components to the approach (a lot of work for just
2560 * three code points!):
2561 * 1) This routine examines each EXACTFish node that could contain the
2562 * problematic sequences. It returns in *min_subtract how much to
2563 * subtract from the the actual length of the string to get a real minimum
2564 * for one that could match it. This number is usually 0 except for the
2565 * problematic sequences. This delta is used by the caller to adjust the
2566 * min length of the match, and the delta between min and max, so that the
2567 * optimizer doesn't reject these possibilities based on size constraints.
2568 * 2) These sequences are not currently correctly handled by the trie code
2569 * either, so it changes the joined node type to ops that are not handled
2570 * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2571 * 3) This is sufficient for the two Greek sequences (described below), but
2572 * the one involving the Sharp s (\xDF) needs more. The node type
2573 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2574 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2575 * case where there is a possible fold length change. That means that a
2576 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2577 * itself with length changes, and so can be processed faster. regexec.c
2578 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2579 * is pre-folded by regcomp.c. This saves effort in regex matching.
2580 * However, probably mostly for historical reasons, the pre-folding isn't
2581 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2582 * nodes, as what they fold to isn't known until runtime.) The fold
2583 * possibilities for the non-UTF8 patterns are quite simple, except for
2584 * the sharp s. All the ones that don't involve a UTF-8 target string
2585 * are members of a fold-pair, and arrays are set up for all of them
2586 * that quickly find the other member of the pair. It might actually
2587 * be faster to pre-fold these, but it isn't currently done, except for
2588 * the sharp s. Code elsewhere in this file makes sure that it gets
2589 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2590 * issues described in the next item.
2591 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2592 * 'ss' or not is not knowable at compile time. It will match iff the
2593 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2594 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2595 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2596 * described in item 3). An assumption that the optimizer part of
2597 * regexec.c (probably unwittingly) makes is that a character in the
2598 * pattern corresponds to at most a single character in the target string.
2599 * (And I do mean character, and not byte here, unlike other parts of the
2600 * documentation that have never been updated to account for multibyte
2601 * Unicode.) This assumption is wrong only in this case, as all other
2602 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2603 * virtue of having this file pre-fold UTF-8 patterns. I'm
2604 * reluctant to try to change this assumption, so instead the code punts.
2605 * This routine examines EXACTF nodes for the sharp s, and returns a
2606 * boolean indicating whether or not the node is an EXACTF node that
2607 * contains a sharp s. When it is true, the caller sets a flag that later
2608 * causes the optimizer in this file to not set values for the floating
2609 * and fixed string lengths, and thus avoids the optimizer code in
2610 * regexec.c that makes the invalid assumption. Thus, there is no
2611 * optimization based on string lengths for EXACTF nodes that contain the
2612 * sharp s. This only happens for /id rules (which means the pattern
2616 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2617 if (PL_regkind[OP(scan)] == EXACT) \
2618 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2621 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2622 /* Merge several consecutive EXACTish nodes into one. */
2623 regnode *n = regnext(scan);
2625 regnode *next = scan + NODE_SZ_STR(scan);
2629 regnode *stop = scan;
2630 GET_RE_DEBUG_FLAGS_DECL;
2632 PERL_UNUSED_ARG(depth);
2635 PERL_ARGS_ASSERT_JOIN_EXACT;
2636 #ifndef EXPERIMENTAL_INPLACESCAN
2637 PERL_UNUSED_ARG(flags);
2638 PERL_UNUSED_ARG(val);
2640 DEBUG_PEEP("join",scan,depth);
2642 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2643 * EXACT ones that are mergeable to the current one. */
2645 && (PL_regkind[OP(n)] == NOTHING
2646 || (stringok && OP(n) == OP(scan)))
2648 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2651 if (OP(n) == TAIL || n > next)
2653 if (PL_regkind[OP(n)] == NOTHING) {
2654 DEBUG_PEEP("skip:",n,depth);
2655 NEXT_OFF(scan) += NEXT_OFF(n);
2656 next = n + NODE_STEP_REGNODE;
2663 else if (stringok) {
2664 const unsigned int oldl = STR_LEN(scan);
2665 regnode * const nnext = regnext(n);
2667 if (oldl + STR_LEN(n) > U8_MAX)
2670 DEBUG_PEEP("merg",n,depth);
2673 NEXT_OFF(scan) += NEXT_OFF(n);
2674 STR_LEN(scan) += STR_LEN(n);
2675 next = n + NODE_SZ_STR(n);
2676 /* Now we can overwrite *n : */
2677 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2685 #ifdef EXPERIMENTAL_INPLACESCAN
2686 if (flags && !NEXT_OFF(n)) {
2687 DEBUG_PEEP("atch", val, depth);
2688 if (reg_off_by_arg[OP(n)]) {
2689 ARG_SET(n, val - n);
2692 NEXT_OFF(n) = val - n;
2700 *has_exactf_sharp_s = FALSE;
2702 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2703 * can now analyze for sequences of problematic code points. (Prior to
2704 * this final joining, sequences could have been split over boundaries, and
2705 * hence missed). The sequences only happen in folding, hence for any
2706 * non-EXACT EXACTish node */
2707 if (OP(scan) != EXACT) {
2709 U8 * s0 = (U8*) STRING(scan);
2710 U8 * const s_end = s0 + STR_LEN(scan);
2712 /* The below is perhaps overboard, but this allows us to save a test
2713 * each time through the loop at the expense of a mask. This is
2714 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2715 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2716 * This uses an exclusive 'or' to find that bit and then inverts it to
2717 * form a mask, with just a single 0, in the bit position where 'S' and
2719 const U8 S_or_s_mask = ~ ('S' ^ 's');
2720 const U8 s_masked = 's' & S_or_s_mask;
2722 /* One pass is made over the node's string looking for all the
2723 * possibilities. to avoid some tests in the loop, there are two main
2724 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2728 /* There are two problematic Greek code points in Unicode
2731 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2732 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2738 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2739 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2741 * This means that in case-insensitive matching (or "loose
2742 * matching", as Unicode calls it), an EXACTF of length six (the
2743 * UTF-8 encoded byte length of the above casefolded versions) can
2744 * match a target string of length two (the byte length of UTF-8
2745 * encoded U+0390 or U+03B0). This would rather mess up the
2746 * minimum length computation. (there are other code points that
2747 * also fold to these two sequences, but the delta is smaller)
2749 * If these sequences are found, the minimum length is decreased by
2750 * four (six minus two).
2752 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2753 * LETTER SHARP S. We decrease the min length by 1 for each
2754 * occurrence of 'ss' found */
2756 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2757 # define U390_first_byte 0xb4
2758 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2759 # define U3B0_first_byte 0xb5
2760 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2762 # define U390_first_byte 0xce
2763 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2764 # define U3B0_first_byte 0xcf
2765 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2767 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2768 yields a net of 0 */
2769 /* Examine the string for one of the problematic sequences */
2771 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2772 * sequence we are looking for is 2 */
2776 /* Look for the first byte in each problematic sequence */
2778 /* We don't have to worry about other things that fold to
2779 * 's' (such as the long s, U+017F), as all above-latin1
2780 * code points have been pre-folded */
2784 /* Current character is an 's' or 'S'. If next one is
2785 * as well, we have the dreaded sequence */
2786 if (((*(s+1) & S_or_s_mask) == s_masked)
2787 /* These two node types don't have special handling
2789 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2792 OP(scan) = EXACTFU_SS;
2793 s++; /* No need to look at this character again */
2797 case U390_first_byte:
2798 if (s_end - s >= len
2800 /* The 1's are because are skipping comparing the
2802 && memEQ(s + 1, U390_tail, len - 1))
2804 goto greek_sequence;
2808 case U3B0_first_byte:
2809 if (! (s_end - s >= len
2810 && memEQ(s + 1, U3B0_tail, len - 1)))
2817 /* This can't currently be handled by trie's, so change
2818 * the node type to indicate this. If EXACTFA and
2819 * EXACTFL were ever to be handled by trie's, this
2820 * would have to be changed. If this node has already
2821 * been changed to EXACTFU_SS in this loop, leave it as
2822 * is. (I (khw) think it doesn't matter in regexec.c
2823 * for UTF patterns, but no need to change it */
2824 if (OP(scan) == EXACTFU) {
2825 OP(scan) = EXACTFU_NO_TRIE;
2827 s += 6; /* We already know what this sequence is. Skip
2833 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2835 /* Here, the pattern is not UTF-8. We need to look only for the
2836 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2837 * in the final position. Otherwise we can stop looking 1 byte
2838 * earlier because have to find both the first and second 's' */
2839 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2841 for (s = s0; s < upper; s++) {
2846 && ((*(s+1) & S_or_s_mask) == s_masked))
2850 /* EXACTF nodes need to know that the minimum
2851 * length changed so that a sharp s in the string
2852 * can match this ss in the pattern, but they
2853 * remain EXACTF nodes, as they are not trie'able,
2854 * so don't have to invent a new node type to
2855 * exclude them from the trie code */
2856 if (OP(scan) != EXACTF) {
2857 OP(scan) = EXACTFU_SS;
2862 case LATIN_SMALL_LETTER_SHARP_S:
2863 if (OP(scan) == EXACTF) {
2864 *has_exactf_sharp_s = TRUE;
2873 /* Allow dumping but overwriting the collection of skipped
2874 * ops and/or strings with fake optimized ops */
2875 n = scan + NODE_SZ_STR(scan);
2883 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2887 /* REx optimizer. Converts nodes into quicker variants "in place".
2888 Finds fixed substrings. */
2890 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2891 to the position after last scanned or to NULL. */
2893 #define INIT_AND_WITHP \
2894 assert(!and_withp); \
2895 Newx(and_withp,1,struct regnode_charclass_class); \
2896 SAVEFREEPV(and_withp)
2898 /* this is a chain of data about sub patterns we are processing that
2899 need to be handled separately/specially in study_chunk. Its so
2900 we can simulate recursion without losing state. */
2902 typedef struct scan_frame {
2903 regnode *last; /* last node to process in this frame */
2904 regnode *next; /* next node to process when last is reached */
2905 struct scan_frame *prev; /*previous frame*/
2906 I32 stop; /* what stopparen do we use */
2910 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2912 #define CASE_SYNST_FNC(nAmE) \
2914 if (flags & SCF_DO_STCLASS_AND) { \
2915 for (value = 0; value < 256; value++) \
2916 if (!is_ ## nAmE ## _cp(value)) \
2917 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2920 for (value = 0; value < 256; value++) \
2921 if (is_ ## nAmE ## _cp(value)) \
2922 ANYOF_BITMAP_SET(data->start_class, value); \
2926 if (flags & SCF_DO_STCLASS_AND) { \
2927 for (value = 0; value < 256; value++) \
2928 if (is_ ## nAmE ## _cp(value)) \
2929 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2932 for (value = 0; value < 256; value++) \
2933 if (!is_ ## nAmE ## _cp(value)) \
2934 ANYOF_BITMAP_SET(data->start_class, value); \
2941 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2942 I32 *minlenp, I32 *deltap,
2947 struct regnode_charclass_class *and_withp,
2948 U32 flags, U32 depth)
2949 /* scanp: Start here (read-write). */
2950 /* deltap: Write maxlen-minlen here. */
2951 /* last: Stop before this one. */
2952 /* data: string data about the pattern */
2953 /* stopparen: treat close N as END */
2954 /* recursed: which subroutines have we recursed into */
2955 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2958 I32 min = 0, pars = 0, code;
2959 regnode *scan = *scanp, *next;
2961 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2962 int is_inf_internal = 0; /* The studied chunk is infinite */
2963 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2964 scan_data_t data_fake;
2965 SV *re_trie_maxbuff = NULL;
2966 regnode *first_non_open = scan;
2967 I32 stopmin = I32_MAX;
2968 scan_frame *frame = NULL;
2969 GET_RE_DEBUG_FLAGS_DECL;
2971 PERL_ARGS_ASSERT_STUDY_CHUNK;
2974 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2978 while (first_non_open && OP(first_non_open) == OPEN)
2979 first_non_open=regnext(first_non_open);
2984 while ( scan && OP(scan) != END && scan < last ){
2985 UV min_subtract = 0; /* How much to subtract from the minimum node
2986 length to get a real minimum (because the
2987 folded version may be shorter) */
2988 bool has_exactf_sharp_s = FALSE;
2989 /* Peephole optimizer: */
2990 DEBUG_STUDYDATA("Peep:", data,depth);
2991 DEBUG_PEEP("Peep",scan,depth);
2993 /* Its not clear to khw or hv why this is done here, and not in the
2994 * clauses that deal with EXACT nodes. khw's guess is that it's
2995 * because of a previous design */
2996 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
2998 /* Follow the next-chain of the current node and optimize
2999 away all the NOTHINGs from it. */
3000 if (OP(scan) != CURLYX) {
3001 const int max = (reg_off_by_arg[OP(scan)]
3003 /* I32 may be smaller than U16 on CRAYs! */
3004 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3005 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3009 /* Skip NOTHING and LONGJMP. */
3010 while ((n = regnext(n))
3011 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3012 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3013 && off + noff < max)
3015 if (reg_off_by_arg[OP(scan)])
3018 NEXT_OFF(scan) = off;
3023 /* The principal pseudo-switch. Cannot be a switch, since we
3024 look into several different things. */
3025 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3026 || OP(scan) == IFTHEN) {
3027 next = regnext(scan);
3029 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3031 if (OP(next) == code || code == IFTHEN) {
3032 /* NOTE - There is similar code to this block below for handling
3033 TRIE nodes on a re-study. If you change stuff here check there
3035 I32 max1 = 0, min1 = I32_MAX, num = 0;
3036 struct regnode_charclass_class accum;
3037 regnode * const startbranch=scan;
3039 if (flags & SCF_DO_SUBSTR)
3040 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3041 if (flags & SCF_DO_STCLASS)
3042 cl_init_zero(pRExC_state, &accum);
3044 while (OP(scan) == code) {
3045 I32 deltanext, minnext, f = 0, fake;
3046 struct regnode_charclass_class this_class;
3049 data_fake.flags = 0;
3051 data_fake.whilem_c = data->whilem_c;
3052 data_fake.last_closep = data->last_closep;
3055 data_fake.last_closep = &fake;
3057 data_fake.pos_delta = delta;
3058 next = regnext(scan);
3059 scan = NEXTOPER(scan);
3061 scan = NEXTOPER(scan);
3062 if (flags & SCF_DO_STCLASS) {
3063 cl_init(pRExC_state, &this_class);
3064 data_fake.start_class = &this_class;
3065 f = SCF_DO_STCLASS_AND;
3067 if (flags & SCF_WHILEM_VISITED_POS)
3068 f |= SCF_WHILEM_VISITED_POS;
3070 /* we suppose the run is continuous, last=next...*/
3071 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3073 stopparen, recursed, NULL, f,depth+1);
3076 if (max1 < minnext + deltanext)
3077 max1 = minnext + deltanext;
3078 if (deltanext == I32_MAX)
3079 is_inf = is_inf_internal = 1;
3081 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3083 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3084 if ( stopmin > minnext)
3085 stopmin = min + min1;
3086 flags &= ~SCF_DO_SUBSTR;
3088 data->flags |= SCF_SEEN_ACCEPT;
3091 if (data_fake.flags & SF_HAS_EVAL)
3092 data->flags |= SF_HAS_EVAL;
3093 data->whilem_c = data_fake.whilem_c;
3095 if (flags & SCF_DO_STCLASS)
3096 cl_or(pRExC_state, &accum, &this_class);
3098 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3100 if (flags & SCF_DO_SUBSTR) {
3101 data->pos_min += min1;
3102 data->pos_delta += max1 - min1;
3103 if (max1 != min1 || is_inf)
3104 data->longest = &(data->longest_float);
3107 delta += max1 - min1;
3108 if (flags & SCF_DO_STCLASS_OR) {
3109 cl_or(pRExC_state, data->start_class, &accum);
3111 cl_and(data->start_class, and_withp);
3112 flags &= ~SCF_DO_STCLASS;
3115 else if (flags & SCF_DO_STCLASS_AND) {
3117 cl_and(data->start_class, &accum);
3118 flags &= ~SCF_DO_STCLASS;
3121 /* Switch to OR mode: cache the old value of
3122 * data->start_class */
3124 StructCopy(data->start_class, and_withp,
3125 struct regnode_charclass_class);
3126 flags &= ~SCF_DO_STCLASS_AND;
3127 StructCopy(&accum, data->start_class,
3128 struct regnode_charclass_class);
3129 flags |= SCF_DO_STCLASS_OR;
3130 data->start_class->flags |= ANYOF_EOS;
3134 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3137 Assuming this was/is a branch we are dealing with: 'scan' now
3138 points at the item that follows the branch sequence, whatever
3139 it is. We now start at the beginning of the sequence and look
3146 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3148 If we can find such a subsequence we need to turn the first
3149 element into a trie and then add the subsequent branch exact
3150 strings to the trie.
3154 1. patterns where the whole set of branches can be converted.
3156 2. patterns where only a subset can be converted.
3158 In case 1 we can replace the whole set with a single regop
3159 for the trie. In case 2 we need to keep the start and end
3162 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3163 becomes BRANCH TRIE; BRANCH X;
3165 There is an additional case, that being where there is a
3166 common prefix, which gets split out into an EXACT like node
3167 preceding the TRIE node.
3169 If x(1..n)==tail then we can do a simple trie, if not we make
3170 a "jump" trie, such that when we match the appropriate word
3171 we "jump" to the appropriate tail node. Essentially we turn
3172 a nested if into a case structure of sorts.
3177 if (!re_trie_maxbuff) {
3178 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3179 if (!SvIOK(re_trie_maxbuff))
3180 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3182 if ( SvIV(re_trie_maxbuff)>=0 ) {
3184 regnode *first = (regnode *)NULL;
3185 regnode *last = (regnode *)NULL;
3186 regnode *tail = scan;
3191 SV * const mysv = sv_newmortal(); /* for dumping */
3193 /* var tail is used because there may be a TAIL
3194 regop in the way. Ie, the exacts will point to the
3195 thing following the TAIL, but the last branch will
3196 point at the TAIL. So we advance tail. If we
3197 have nested (?:) we may have to move through several
3201 while ( OP( tail ) == TAIL ) {
3202 /* this is the TAIL generated by (?:) */
3203 tail = regnext( tail );
3208 regprop(RExC_rx, mysv, tail );
3209 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3210 (int)depth * 2 + 2, "",
3211 "Looking for TRIE'able sequences. Tail node is: ",
3212 SvPV_nolen_const( mysv )
3218 step through the branches, cur represents each
3219 branch, noper is the first thing to be matched
3220 as part of that branch and noper_next is the
3221 regnext() of that node. if noper is an EXACT
3222 and noper_next is the same as scan (our current
3223 position in the regex) then the EXACT branch is
3224 a possible optimization target. Once we have
3225 two or more consecutive such branches we can
3226 create a trie of the EXACT's contents and stich
3227 it in place. If the sequence represents all of
3228 the branches we eliminate the whole thing and
3229 replace it with a single TRIE. If it is a
3230 subsequence then we need to stitch it in. This
3231 means the first branch has to remain, and needs
3232 to be repointed at the item on the branch chain
3233 following the last branch optimized. This could
3234 be either a BRANCH, in which case the
3235 subsequence is internal, or it could be the
3236 item following the branch sequence in which
3237 case the subsequence is at the end.
3241 /* dont use tail as the end marker for this traverse */
3242 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3243 regnode * const noper = NEXTOPER( cur );
3244 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3245 regnode * const noper_next = regnext( noper );
3249 regprop(RExC_rx, mysv, cur);
3250 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3251 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3253 regprop(RExC_rx, mysv, noper);
3254 PerlIO_printf( Perl_debug_log, " -> %s",
3255 SvPV_nolen_const(mysv));
3258 regprop(RExC_rx, mysv, noper_next );
3259 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3260 SvPV_nolen_const(mysv));
3262 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3263 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3265 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3266 : PL_regkind[ OP( noper ) ] == EXACT )
3267 || OP(noper) == NOTHING )
3269 && noper_next == tail
3274 if ( !first || optype == NOTHING ) {
3275 if (!first) first = cur;
3276 optype = OP( noper );
3282 Currently the trie logic handles case insensitive matching properly only
3283 when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3286 If/when this is fixed the following define can be swapped
3287 in below to fully enable trie logic.
3289 #define TRIE_TYPE_IS_SAFE 1
3291 Note that join_exact() assumes that the other types of EXACTFish nodes are not
3292 used in tries, so that would have to be updated if this changed
3295 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3297 if ( last && TRIE_TYPE_IS_SAFE ) {
3298 make_trie( pRExC_state,
3299 startbranch, first, cur, tail, count,
3302 if ( PL_regkind[ OP( noper ) ] == EXACT
3304 && noper_next == tail
3309 optype = OP( noper );
3319 regprop(RExC_rx, mysv, cur);
3320 PerlIO_printf( Perl_debug_log,
3321 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3322 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3326 if ( last && TRIE_TYPE_IS_SAFE ) {
3327 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3328 #ifdef TRIE_STUDY_OPT
3329 if ( ((made == MADE_EXACT_TRIE &&
3330 startbranch == first)
3331 || ( first_non_open == first )) &&
3333 flags |= SCF_TRIE_RESTUDY;
3334 if ( startbranch == first
3337 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3347 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3348 scan = NEXTOPER(NEXTOPER(scan));
3349 } else /* single branch is optimized. */
3350 scan = NEXTOPER(scan);
3352 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3353 scan_frame *newframe = NULL;
3358 if (OP(scan) != SUSPEND) {
3359 /* set the pointer */
3360 if (OP(scan) == GOSUB) {
3362 RExC_recurse[ARG2L(scan)] = scan;
3363 start = RExC_open_parens[paren-1];
3364 end = RExC_close_parens[paren-1];
3367 start = RExC_rxi->program + 1;
3371 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3372 SAVEFREEPV(recursed);
3374 if (!PAREN_TEST(recursed,paren+1)) {
3375 PAREN_SET(recursed,paren+1);
3376 Newx(newframe,1,scan_frame);
3378 if (flags & SCF_DO_SUBSTR) {
3379 SCAN_COMMIT(pRExC_state,data,minlenp);
3380 data->longest = &(data->longest_float);
3382 is_inf = is_inf_internal = 1;
3383 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3384 cl_anything(pRExC_state, data->start_class);
3385 flags &= ~SCF_DO_STCLASS;
3388 Newx(newframe,1,scan_frame);
3391 end = regnext(scan);
3396 SAVEFREEPV(newframe);
3397 newframe->next = regnext(scan);
3398 newframe->last = last;
3399 newframe->stop = stopparen;
3400 newframe->prev = frame;
3410 else if (OP(scan) == EXACT) {
3411 I32 l = STR_LEN(scan);
3414 const U8 * const s = (U8*)STRING(scan);
3415 l = utf8_length(s, s + l);
3416 uc = utf8_to_uvchr(s, NULL);
3418 uc = *((U8*)STRING(scan));
3421 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3422 /* The code below prefers earlier match for fixed
3423 offset, later match for variable offset. */
3424 if (data->last_end == -1) { /* Update the start info. */
3425 data->last_start_min = data->pos_min;
3426 data->last_start_max = is_inf
3427 ? I32_MAX : data->pos_min + data->pos_delta;
3429 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3431 SvUTF8_on(data->last_found);
3433 SV * const sv = data->last_found;
3434 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3435 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436 if (mg && mg->mg_len >= 0)
3437 mg->mg_len += utf8_length((U8*)STRING(scan),
3438 (U8*)STRING(scan)+STR_LEN(scan));
3440 data->last_end = data->pos_min + l;
3441 data->pos_min += l; /* As in the first entry. */
3442 data->flags &= ~SF_BEFORE_EOL;
3444 if (flags & SCF_DO_STCLASS_AND) {
3445 /* Check whether it is compatible with what we know already! */
3449 /* If compatible, we or it in below. It is compatible if is
3450 * in the bitmp and either 1) its bit or its fold is set, or 2)
3451 * it's for a locale. Even if there isn't unicode semantics
3452 * here, at runtime there may be because of matching against a
3453 * utf8 string, so accept a possible false positive for
3454 * latin1-range folds */
3456 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3457 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3458 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3459 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3464 ANYOF_CLASS_ZERO(data->start_class);
3465 ANYOF_BITMAP_ZERO(data->start_class);
3467 ANYOF_BITMAP_SET(data->start_class, uc);
3468 else if (uc >= 0x100) {
3471 /* Some Unicode code points fold to the Latin1 range; as
3472 * XXX temporary code, instead of figuring out if this is
3473 * one, just assume it is and set all the start class bits
3474 * that could be some such above 255 code point's fold
3475 * which will generate fals positives. As the code
3476 * elsewhere that does compute the fold settles down, it
3477 * can be extracted out and re-used here */
3478 for (i = 0; i < 256; i++){
3479 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3480 ANYOF_BITMAP_SET(data->start_class, i);
3484 data->start_class->flags &= ~ANYOF_EOS;
3486 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3488 else if (flags & SCF_DO_STCLASS_OR) {
3489 /* false positive possible if the class is case-folded */
3491 ANYOF_BITMAP_SET(data->start_class, uc);
3493 data->start_class->flags |= ANYOF_UNICODE_ALL;
3494 data->start_class->flags &= ~ANYOF_EOS;
3495 cl_and(data->start_class, and_withp);
3497 flags &= ~SCF_DO_STCLASS;
3499 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3500 I32 l = STR_LEN(scan);
3501 UV uc = *((U8*)STRING(scan));
3503 /* Search for fixed substrings supports EXACT only. */
3504 if (flags & SCF_DO_SUBSTR) {
3506 SCAN_COMMIT(pRExC_state, data, minlenp);
3509 const U8 * const s = (U8 *)STRING(scan);
3510 l = utf8_length(s, s + l);
3511 uc = utf8_to_uvchr(s, NULL);
3513 else if (has_exactf_sharp_s) {
3514 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3516 min += l - min_subtract;
3520 delta += min_subtract;
3521 if (flags & SCF_DO_SUBSTR) {
3522 data->pos_min += l - min_subtract;
3523 if (data->pos_min < 0) {
3526 data->pos_delta += min_subtract;
3528 data->longest = &(data->longest_float);
3531 if (flags & SCF_DO_STCLASS_AND) {
3532 /* Check whether it is compatible with what we know already! */
3535 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3536 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3537 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3541 ANYOF_CLASS_ZERO(data->start_class);
3542 ANYOF_BITMAP_ZERO(data->start_class);
3544 ANYOF_BITMAP_SET(data->start_class, uc);
3545 data->start_class->flags &= ~ANYOF_EOS;
3546 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3547 if (OP(scan) == EXACTFL) {
3548 /* XXX This set is probably no longer necessary, and
3549 * probably wrong as LOCALE now is on in the initial
3551 data->start_class->flags |= ANYOF_LOCALE;
3555 /* Also set the other member of the fold pair. In case
3556 * that unicode semantics is called for at runtime, use
3557 * the full latin1 fold. (Can't do this for locale,
3558 * because not known until runtime) */
3559 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3561 /* All other (EXACTFL handled above) folds except under
3562 * /iaa that include s, S, and sharp_s also may include
3564 if (OP(scan) != EXACTFA) {
3565 if (uc == 's' || uc == 'S') {
3566 ANYOF_BITMAP_SET(data->start_class,
3567 LATIN_SMALL_LETTER_SHARP_S);
3569 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3570 ANYOF_BITMAP_SET(data->start_class, 's');
3571 ANYOF_BITMAP_SET(data->start_class, 'S');
3576 else if (uc >= 0x100) {
3578 for (i = 0; i < 256; i++){
3579 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3580 ANYOF_BITMAP_SET(data->start_class, i);
3585 else if (flags & SCF_DO_STCLASS_OR) {
3586 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3587 /* false positive possible if the class is case-folded.
3588 Assume that the locale settings are the same... */
3590 ANYOF_BITMAP_SET(data->start_class, uc);
3591 if (OP(scan) != EXACTFL) {
3593 /* And set the other member of the fold pair, but
3594 * can't do that in locale because not known until
3596 ANYOF_BITMAP_SET(data->start_class,
3597 PL_fold_latin1[uc]);
3599 /* All folds except under /iaa that include s, S,
3600 * and sharp_s also may include the others */
3601 if (OP(scan) != EXACTFA) {
3602 if (uc == 's' || uc == 'S') {
3603 ANYOF_BITMAP_SET(data->start_class,
3604 LATIN_SMALL_LETTER_SHARP_S);
3606 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3607 ANYOF_BITMAP_SET(data->start_class, 's');
3608 ANYOF_BITMAP_SET(data->start_class, 'S');
3613 data->start_class->flags &= ~ANYOF_EOS;
3615 cl_and(data->start_class, and_withp);
3617 flags &= ~SCF_DO_STCLASS;
3619 else if (REGNODE_VARIES(OP(scan))) {
3620 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3621 I32 f = flags, pos_before = 0;
3622 regnode * const oscan = scan;
3623 struct regnode_charclass_class this_class;
3624 struct regnode_charclass_class *oclass = NULL;
3625 I32 next_is_eval = 0;
3627 switch (PL_regkind[OP(scan)]) {
3628 case WHILEM: /* End of (?:...)* . */
3629 scan = NEXTOPER(scan);
3632 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3633 next = NEXTOPER(scan);
3634 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3636 maxcount = REG_INFTY;
3637 next = regnext(scan);
3638 scan = NEXTOPER(scan);
3642 if (flags & SCF_DO_SUBSTR)
3647 if (flags & SCF_DO_STCLASS) {
3649 maxcount = REG_INFTY;
3650 next = regnext(scan);
3651 scan = NEXTOPER(scan);
3654 is_inf = is_inf_internal = 1;
3655 scan = regnext(scan);
3656 if (flags & SCF_DO_SUBSTR) {
3657 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3658 data->longest = &(data->longest_float);
3660 goto optimize_curly_tail;
3662 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3663 && (scan->flags == stopparen))
3668 mincount = ARG1(scan);
3669 maxcount = ARG2(scan);
3671 next = regnext(scan);
3672 if (OP(scan) == CURLYX) {
3673 I32 lp = (data ? *(data->last_closep) : 0);
3674 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3676 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3677 next_is_eval = (OP(scan) == EVAL);
3679 if (flags & SCF_DO_SUBSTR) {
3680 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3681 pos_before = data->pos_min;
3685 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3687 data->flags |= SF_IS_INF;
3689 if (flags & SCF_DO_STCLASS) {
3690 cl_init(pRExC_state, &this_class);
3691 oclass = data->start_class;
3692 data->start_class = &this_class;
3693 f |= SCF_DO_STCLASS_AND;
3694 f &= ~SCF_DO_STCLASS_OR;
3696 /* Exclude from super-linear cache processing any {n,m}
3697 regops for which the combination of input pos and regex
3698 pos is not enough information to determine if a match
3701 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3702 regex pos at the \s*, the prospects for a match depend not
3703 only on the input position but also on how many (bar\s*)
3704 repeats into the {4,8} we are. */
3705 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3706 f &= ~SCF_WHILEM_VISITED_POS;
3708 /* This will finish on WHILEM, setting scan, or on NULL: */
3709 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3710 last, data, stopparen, recursed, NULL,
3712 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3714 if (flags & SCF_DO_STCLASS)
3715 data->start_class = oclass;
3716 if (mincount == 0 || minnext == 0) {
3717 if (flags & SCF_DO_STCLASS_OR) {
3718 cl_or(pRExC_state, data->start_class, &this_class);
3720 else if (flags & SCF_DO_STCLASS_AND) {
3721 /* Switch to OR mode: cache the old value of
3722 * data->start_class */
3724 StructCopy(data->start_class, and_withp,
3725 struct regnode_charclass_class);
3726 flags &= ~SCF_DO_STCLASS_AND;
3727 StructCopy(&this_class, data->start_class,
3728 struct regnode_charclass_class);
3729 flags |= SCF_DO_STCLASS_OR;
3730 data->start_class->flags |= ANYOF_EOS;
3732 } else { /* Non-zero len */
3733 if (flags & SCF_DO_STCLASS_OR) {
3734 cl_or(pRExC_state, data->start_class, &this_class);
3735 cl_and(data->start_class, and_withp);
3737 else if (flags & SCF_DO_STCLASS_AND)
3738 cl_and(data->start_class, &this_class);
3739 flags &= ~SCF_DO_STCLASS;
3741 if (!scan) /* It was not CURLYX, but CURLY. */
3743 if ( /* ? quantifier ok, except for (?{ ... }) */
3744 (next_is_eval || !(mincount == 0 && maxcount == 1))
3745 && (minnext == 0) && (deltanext == 0)
3746 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3747 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3749 ckWARNreg(RExC_parse,
3750 "Quantifier unexpected on zero-length expression");
3753 min += minnext * mincount;
3754 is_inf_internal |= ((maxcount == REG_INFTY
3755 && (minnext + deltanext) > 0)
3756 || deltanext == I32_MAX);
3757 is_inf |= is_inf_internal;
3758 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3760 /* Try powerful optimization CURLYX => CURLYN. */
3761 if ( OP(oscan) == CURLYX && data
3762 && data->flags & SF_IN_PAR
3763 && !(data->flags & SF_HAS_EVAL)
3764 && !deltanext && minnext == 1 ) {
3765 /* Try to optimize to CURLYN. */
3766 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3767 regnode * const nxt1 = nxt;
3774 if (!REGNODE_SIMPLE(OP(nxt))
3775 && !(PL_regkind[OP(nxt)] == EXACT
3776 && STR_LEN(nxt) == 1))
3782 if (OP(nxt) != CLOSE)
3784 if (RExC_open_parens) {
3785 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3786 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3788 /* Now we know that nxt2 is the only contents: */
3789 oscan->flags = (U8)ARG(nxt);
3791 OP(nxt1) = NOTHING; /* was OPEN. */
3794 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3795 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3796 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3797 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3798 OP(nxt + 1) = OPTIMIZED; /* was count. */
3799 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3804 /* Try optimization CURLYX => CURLYM. */
3805 if ( OP(oscan) == CURLYX && data
3806 && !(data->flags & SF_HAS_PAR)
3807 && !(data->flags & SF_HAS_EVAL)
3808 && !deltanext /* atom is fixed width */
3809 && minnext != 0 /* CURLYM can't handle zero width */
3811 /* XXXX How to optimize if data == 0? */
3812 /* Optimize to a simpler form. */
3813 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3817 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3818 && (OP(nxt2) != WHILEM))
3820 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3821 /* Need to optimize away parenths. */
3822 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3823 /* Set the parenth number. */
3824 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3826 oscan->flags = (U8)ARG(nxt);
3827 if (RExC_open_parens) {
3828 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3829 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3831 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3832 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3835 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3836 OP(nxt + 1) = OPTIMIZED; /* was count. */
3837 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3838 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3841 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3842 regnode *nnxt = regnext(nxt1);
3844 if (reg_off_by_arg[OP(nxt1)])
3845 ARG_SET(nxt1, nxt2 - nxt1);
3846 else if (nxt2 - nxt1 < U16_MAX)
3847 NEXT_OFF(nxt1) = nxt2 - nxt1;
3849 OP(nxt) = NOTHING; /* Cannot beautify */
3854 /* Optimize again: */
3855 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3856 NULL, stopparen, recursed, NULL, 0,depth+1);
3861 else if ((OP(oscan) == CURLYX)
3862 && (flags & SCF_WHILEM_VISITED_POS)
3863 /* See the comment on a similar expression above.
3864 However, this time it's not a subexpression
3865 we care about, but the expression itself. */
3866 && (maxcount == REG_INFTY)
3867 && data && ++data->whilem_c < 16) {
3868 /* This stays as CURLYX, we can put the count/of pair. */
3869 /* Find WHILEM (as in regexec.c) */
3870 regnode *nxt = oscan + NEXT_OFF(oscan);
3872 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3874 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3875 | (RExC_whilem_seen << 4)); /* On WHILEM */
3877 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3879 if (flags & SCF_DO_SUBSTR) {
3880 SV *last_str = NULL;
3881 int counted = mincount != 0;
3883 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3884 #if defined(SPARC64_GCC_WORKAROUND)
3887 const char *s = NULL;
3890 if (pos_before >= data->last_start_min)
3893 b = data->last_start_min;
3896 s = SvPV_const(data->last_found, l);
3897 old = b - data->last_start_min;
3900 I32 b = pos_before >= data->last_start_min
3901 ? pos_before : data->last_start_min;
3903 const char * const s = SvPV_const(data->last_found, l);
3904 I32 old = b - data->last_start_min;
3908 old = utf8_hop((U8*)s, old) - (U8*)s;
3910 /* Get the added string: */
3911 last_str = newSVpvn_utf8(s + old, l, UTF);
3912 if (deltanext == 0 && pos_before == b) {
3913 /* What was added is a constant string */
3915 SvGROW(last_str, (mincount * l) + 1);
3916 repeatcpy(SvPVX(last_str) + l,
3917 SvPVX_const(last_str), l, mincount - 1);
3918 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3919 /* Add additional parts. */
3920 SvCUR_set(data->last_found,
3921 SvCUR(data->last_found) - l);
3922 sv_catsv(data->last_found, last_str);
3924 SV * sv = data->last_found;
3926 SvUTF8(sv) && SvMAGICAL(sv) ?
3927 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3928 if (mg && mg->mg_len >= 0)
3929 mg->mg_len += CHR_SVLEN(last_str) - l;
3931 data->last_end += l * (mincount - 1);
3934 /* start offset must point into the last copy */
3935 data->last_start_min += minnext * (mincount - 1);
3936 data->last_start_max += is_inf ? I32_MAX
3937 : (maxcount - 1) * (minnext + data->pos_delta);
3940 /* It is counted once already... */
3941 data->pos_min += minnext * (mincount - counted);
3942 data->pos_delta += - counted * deltanext +
3943 (minnext + deltanext) * maxcount - minnext * mincount;
3944 if (mincount != maxcount) {
3945 /* Cannot extend fixed substrings found inside
3947 SCAN_COMMIT(pRExC_state,data,minlenp);
3948 if (mincount && last_str) {
3949 SV * const sv = data->last_found;
3950 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3951 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3955 sv_setsv(sv, last_str);
3956 data->last_end = data->pos_min;
3957 data->last_start_min =
3958 data->pos_min - CHR_SVLEN(last_str);
3959 data->last_start_max = is_inf
3961 : data->pos_min + data->pos_delta
3962 - CHR_SVLEN(last_str);
3964 data->longest = &(data->longest_float);
3966 SvREFCNT_dec(last_str);
3968 if (data && (fl & SF_HAS_EVAL))
3969 data->flags |= SF_HAS_EVAL;
3970 optimize_curly_tail:
3971 if (OP(oscan) != CURLYX) {
3972 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3974 NEXT_OFF(oscan) += NEXT_OFF(next);
3977 default: /* REF, ANYOFV, and CLUMP only? */
3978 if (flags & SCF_DO_SUBSTR) {
3979 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3980 data->longest = &(data->longest_float);
3982 is_inf = is_inf_internal = 1;
3983 if (flags & SCF_DO_STCLASS_OR)
3984 cl_anything(pRExC_state, data->start_class);
3985 flags &= ~SCF_DO_STCLASS;
3989 else if (OP(scan) == LNBREAK) {
3990 if (flags & SCF_DO_STCLASS) {
3992 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3993 if (flags & SCF_DO_STCLASS_AND) {
3994 for (value = 0; value < 256; value++)
3995 if (!is_VERTWS_cp(value))
3996 ANYOF_BITMAP_CLEAR(data->start_class, value);
3999 for (value = 0; value < 256; value++)
4000 if (is_VERTWS_cp(value))
4001 ANYOF_BITMAP_SET(data->start_class, value);
4003 if (flags & SCF_DO_STCLASS_OR)
4004 cl_and(data->start_class, and_withp);
4005 flags &= ~SCF_DO_STCLASS;
4009 if (flags & SCF_DO_SUBSTR) {
4010 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4012 data->pos_delta += 1;
4013 data->longest = &(data->longest_float);
4016 else if (REGNODE_SIMPLE(OP(scan))) {
4019 if (flags & SCF_DO_SUBSTR) {
4020 SCAN_COMMIT(pRExC_state,data,minlenp);
4024 if (flags & SCF_DO_STCLASS) {
4025 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4027 /* Some of the logic below assumes that switching
4028 locale on will only add false positives. */
4029 switch (PL_regkind[OP(scan)]) {
4033 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4034 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4035 cl_anything(pRExC_state, data->start_class);
4038 if (OP(scan) == SANY)
4040 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4041 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4042 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4043 cl_anything(pRExC_state, data->start_class);
4045 if (flags & SCF_DO_STCLASS_AND || !value)
4046 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4049 if (flags & SCF_DO_STCLASS_AND)
4050 cl_and(data->start_class,
4051 (struct regnode_charclass_class*)scan);
4053 cl_or(pRExC_state, data->start_class,
4054 (struct regnode_charclass_class*)scan);
4057 if (flags & SCF_DO_STCLASS_AND) {
4058 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4059 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4060 if (OP(scan) == ALNUMU) {
4061 for (value = 0; value < 256; value++) {
4062 if (!isWORDCHAR_L1(value)) {
4063 ANYOF_BITMAP_CLEAR(data->start_class, value);
4067 for (value = 0; value < 256; value++) {
4068 if (!isALNUM(value)) {
4069 ANYOF_BITMAP_CLEAR(data->start_class, value);
4076 if (data->start_class->flags & ANYOF_LOCALE)
4077 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4079 /* Even if under locale, set the bits for non-locale
4080 * in case it isn't a true locale-node. This will
4081 * create false positives if it truly is locale */
4082 if (OP(scan) == ALNUMU) {
4083 for (value = 0; value < 256; value++) {
4084 if (isWORDCHAR_L1(value)) {
4085 ANYOF_BITMAP_SET(data->start_class, value);
4089 for (value = 0; value < 256; value++) {
4090 if (isALNUM(value)) {
4091 ANYOF_BITMAP_SET(data->start_class, value);
4098 if (flags & SCF_DO_STCLASS_AND) {
4099 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4100 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4101 if (OP(scan) == NALNUMU) {
4102 for (value = 0; value < 256; value++) {
4103 if (isWORDCHAR_L1(value)) {
4104 ANYOF_BITMAP_CLEAR(data->start_class, value);
4108 for (value = 0; value < 256; value++) {
4109 if (isALNUM(value)) {
4110 ANYOF_BITMAP_CLEAR(data->start_class, value);
4117 if (data->start_class->flags & ANYOF_LOCALE)
4118 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4120 /* Even if under locale, set the bits for non-locale in
4121 * case it isn't a true locale-node. This will create
4122 * false positives if it truly is locale */
4123 if (OP(scan) == NALNUMU) {
4124 for (value = 0; value < 256; value++) {
4125 if (! isWORDCHAR_L1(value)) {
4126 ANYOF_BITMAP_SET(data->start_class, value);
4130 for (value = 0; value < 256; value++) {
4131 if (! isALNUM(value)) {
4132 ANYOF_BITMAP_SET(data->start_class, value);
4139 if (flags & SCF_DO_STCLASS_AND) {
4140 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4141 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4142 if (OP(scan) == SPACEU) {
4143 for (value = 0; value < 256; value++) {
4144 if (!isSPACE_L1(value)) {
4145 ANYOF_BITMAP_CLEAR(data->start_class, value);
4149 for (value = 0; value < 256; value++) {
4150 if (!isSPACE(value)) {
4151 ANYOF_BITMAP_CLEAR(data->start_class, value);
4158 if (data->start_class->flags & ANYOF_LOCALE) {
4159 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4161 if (OP(scan) == SPACEU) {
4162 for (value = 0; value < 256; value++) {
4163 if (isSPACE_L1(value)) {
4164 ANYOF_BITMAP_SET(data->start_class, value);
4168 for (value = 0; value < 256; value++) {
4169 if (isSPACE(value)) {
4170 ANYOF_BITMAP_SET(data->start_class, value);
4177 if (flags & SCF_DO_STCLASS_AND) {
4178 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4179 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4180 if (OP(scan) == NSPACEU) {
4181 for (value = 0; value < 256; value++) {
4182 if (isSPACE_L1(value)) {
4183 ANYOF_BITMAP_CLEAR(data->start_class, value);
4187 for (value = 0; value < 256; value++) {
4188 if (isSPACE(value)) {
4189 ANYOF_BITMAP_CLEAR(data->start_class, value);
4196 if (data->start_class->flags & ANYOF_LOCALE)
4197 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4198 if (OP(scan) == NSPACEU) {
4199 for (value = 0; value < 256; value++) {
4200 if (!isSPACE_L1(value)) {
4201 ANYOF_BITMAP_SET(data->start_class, value);
4206 for (value = 0; value < 256; value++) {
4207 if (!isSPACE(value)) {
4208 ANYOF_BITMAP_SET(data->start_class, value);
4215 if (flags & SCF_DO_STCLASS_AND) {
4216 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4217 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4218 for (value = 0; value < 256; value++)
4219 if (!isDIGIT(value))
4220 ANYOF_BITMAP_CLEAR(data->start_class, value);
4224 if (data->start_class->flags & ANYOF_LOCALE)
4225 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4226 for (value = 0; value < 256; value++)
4228 ANYOF_BITMAP_SET(data->start_class, value);
4232 if (flags & SCF_DO_STCLASS_AND) {
4233 if (!(data->start_class->flags & ANYOF_LOCALE))
4234 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4235 for (value = 0; value < 256; value++)
4237 ANYOF_BITMAP_CLEAR(data->start_class, value);
4240 if (data->start_class->flags & ANYOF_LOCALE)
4241 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4242 for (value = 0; value < 256; value++)
4243 if (!isDIGIT(value))
4244 ANYOF_BITMAP_SET(data->start_class, value);
4247 CASE_SYNST_FNC(VERTWS);
4248 CASE_SYNST_FNC(HORIZWS);
4251 if (flags & SCF_DO_STCLASS_OR)
4252 cl_and(data->start_class, and_withp);
4253 flags &= ~SCF_DO_STCLASS;
4256 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4257 data->flags |= (OP(scan) == MEOL
4261 else if ( PL_regkind[OP(scan)] == BRANCHJ
4262 /* Lookbehind, or need to calculate parens/evals/stclass: */
4263 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4264 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4265 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4266 || OP(scan) == UNLESSM )
4268 /* Negative Lookahead/lookbehind
4269 In this case we can't do fixed string optimisation.
4272 I32 deltanext, minnext, fake = 0;
4274 struct regnode_charclass_class intrnl;
4277 data_fake.flags = 0;
4279 data_fake.whilem_c = data->whilem_c;
4280 data_fake.last_closep = data->last_closep;
4283 data_fake.last_closep = &fake;
4284 data_fake.pos_delta = delta;
4285 if ( flags & SCF_DO_STCLASS && !scan->flags
4286 && OP(scan) == IFMATCH ) { /* Lookahead */
4287 cl_init(pRExC_state, &intrnl);
4288 data_fake.start_class = &intrnl;
4289 f |= SCF_DO_STCLASS_AND;
4291 if (flags & SCF_WHILEM_VISITED_POS)
4292 f |= SCF_WHILEM_VISITED_POS;
4293 next = regnext(scan);
4294 nscan = NEXTOPER(NEXTOPER(scan));
4295 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4296 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4299 FAIL("Variable length lookbehind not implemented");
4301 else if (minnext > (I32)U8_MAX) {
4302 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4304 scan->flags = (U8)minnext;
4307 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4309 if (data_fake.flags & SF_HAS_EVAL)
4310 data->flags |= SF_HAS_EVAL;
4311 data->whilem_c = data_fake.whilem_c;
4313 if (f & SCF_DO_STCLASS_AND) {
4314 if (flags & SCF_DO_STCLASS_OR) {
4315 /* OR before, AND after: ideally we would recurse with
4316 * data_fake to get the AND applied by study of the
4317 * remainder of the pattern, and then derecurse;
4318 * *** HACK *** for now just treat as "no information".
4319 * See [perl #56690].
4321 cl_init(pRExC_state, data->start_class);
4323 /* AND before and after: combine and continue */
4324 const int was = (data->start_class->flags & ANYOF_EOS);
4326 cl_and(data->start_class, &intrnl);
4328 data->start_class->flags |= ANYOF_EOS;
4332 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4334 /* Positive Lookahead/lookbehind
4335 In this case we can do fixed string optimisation,
4336 but we must be careful about it. Note in the case of
4337 lookbehind the positions will be offset by the minimum
4338 length of the pattern, something we won't know about
4339 until after the recurse.
4341 I32 deltanext, fake = 0;
4343 struct regnode_charclass_class intrnl;
4345 /* We use SAVEFREEPV so that when the full compile
4346 is finished perl will clean up the allocated
4347 minlens when it's all done. This way we don't
4348 have to worry about freeing them when we know
4349 they wont be used, which would be a pain.
4352 Newx( minnextp, 1, I32 );
4353 SAVEFREEPV(minnextp);
4356 StructCopy(data, &data_fake, scan_data_t);
4357 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4360 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4361 data_fake.last_found=newSVsv(data->last_found);
4365 data_fake.last_closep = &fake;
4366 data_fake.flags = 0;
4367 data_fake.pos_delta = delta;
4369 data_fake.flags |= SF_IS_INF;
4370 if ( flags & SCF_DO_STCLASS && !scan->flags
4371 && OP(scan) == IFMATCH ) { /* Lookahead */
4372 cl_init(pRExC_state, &intrnl);
4373 data_fake.start_class = &intrnl;
4374 f |= SCF_DO_STCLASS_AND;
4376 if (flags & SCF_WHILEM_VISITED_POS)
4377 f |= SCF_WHILEM_VISITED_POS;
4378 next = regnext(scan);
4379 nscan = NEXTOPER(NEXTOPER(scan));
4381 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4382 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4385 FAIL("Variable length lookbehind not implemented");
4387 else if (*minnextp > (I32)U8_MAX) {
4388 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4390 scan->flags = (U8)*minnextp;
4395 if (f & SCF_DO_STCLASS_AND) {
4396 const int was = (data->start_class->flags & ANYOF_EOS);
4398 cl_and(data->start_class, &intrnl);
4400 data->start_class->flags |= ANYOF_EOS;
4403 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4405 if (data_fake.flags & SF_HAS_EVAL)
4406 data->flags |= SF_HAS_EVAL;
4407 data->whilem_c = data_fake.whilem_c;
4408 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4409 if (RExC_rx->minlen<*minnextp)
4410 RExC_rx->minlen=*minnextp;
4411 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4412 SvREFCNT_dec(data_fake.last_found);
4414 if ( data_fake.minlen_fixed != minlenp )
4416 data->offset_fixed= data_fake.offset_fixed;
4417 data->minlen_fixed= data_fake.minlen_fixed;
4418 data->lookbehind_fixed+= scan->flags;
4420 if ( data_fake.minlen_float != minlenp )
4422 data->minlen_float= data_fake.minlen_float;
4423 data->offset_float_min=data_fake.offset_float_min;
4424 data->offset_float_max=data_fake.offset_float_max;
4425 data->lookbehind_float+= scan->flags;
4434 else if (OP(scan) == OPEN) {
4435 if (stopparen != (I32)ARG(scan))
4438 else if (OP(scan) == CLOSE) {
4439 if (stopparen == (I32)ARG(scan)) {
4442 if ((I32)ARG(scan) == is_par) {
4443 next = regnext(scan);
4445 if ( next && (OP(next) != WHILEM) && next < last)
4446 is_par = 0; /* Disable optimization */
4449 *(data->last_closep) = ARG(scan);
4451 else if (OP(scan) == EVAL) {
4453 data->flags |= SF_HAS_EVAL;
4455 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4456 if (flags & SCF_DO_SUBSTR) {
4457 SCAN_COMMIT(pRExC_state,data,minlenp);
4458 flags &= ~SCF_DO_SUBSTR;
4460 if (data && OP(scan)==ACCEPT) {
4461 data->flags |= SCF_SEEN_ACCEPT;
4466 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4468 if (flags & SCF_DO_SUBSTR) {
4469 SCAN_COMMIT(pRExC_state,data,minlenp);
4470 data->longest = &(data->longest_float);
4472 is_inf = is_inf_internal = 1;
4473 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4474 cl_anything(pRExC_state, data->start_class);
4475 flags &= ~SCF_DO_STCLASS;
4477 else if (OP(scan) == GPOS) {
4478 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4479 !(delta || is_inf || (data && data->pos_delta)))
4481 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4482 RExC_rx->extflags |= RXf_ANCH_GPOS;
4483 if (RExC_rx->gofs < (U32)min)
4484 RExC_rx->gofs = min;
4486 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4490 #ifdef TRIE_STUDY_OPT
4491 #ifdef FULL_TRIE_STUDY
4492 else if (PL_regkind[OP(scan)] == TRIE) {
4493 /* NOTE - There is similar code to this block above for handling
4494 BRANCH nodes on the initial study. If you change stuff here
4496 regnode *trie_node= scan;
4497 regnode *tail= regnext(scan);
4498 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4499 I32 max1 = 0, min1 = I32_MAX;
4500 struct regnode_charclass_class accum;
4502 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4503 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4504 if (flags & SCF_DO_STCLASS)
4505 cl_init_zero(pRExC_state, &accum);
4511 const regnode *nextbranch= NULL;
4514 for ( word=1 ; word <= trie->wordcount ; word++)
4516 I32 deltanext=0, minnext=0, f = 0, fake;
4517 struct regnode_charclass_class this_class;
4519 data_fake.flags = 0;
4521 data_fake.whilem_c = data->whilem_c;
4522 data_fake.last_closep = data->last_closep;
4525 data_fake.last_closep = &fake;
4526 data_fake.pos_delta = delta;
4527 if (flags & SCF_DO_STCLASS) {
4528 cl_init(pRExC_state, &this_class);
4529 data_fake.start_class = &this_class;
4530 f = SCF_DO_STCLASS_AND;
4532 if (flags & SCF_WHILEM_VISITED_POS)
4533 f |= SCF_WHILEM_VISITED_POS;
4535 if (trie->jump[word]) {
4537 nextbranch = trie_node + trie->jump[0];
4538 scan= trie_node + trie->jump[word];
4539 /* We go from the jump point to the branch that follows
4540 it. Note this means we need the vestigal unused branches
4541 even though they arent otherwise used.
4543 minnext = study_chunk(pRExC_state, &scan, minlenp,
4544 &deltanext, (regnode *)nextbranch, &data_fake,
4545 stopparen, recursed, NULL, f,depth+1);
4547 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4548 nextbranch= regnext((regnode*)nextbranch);
4550 if (min1 > (I32)(minnext + trie->minlen))
4551 min1 = minnext + trie->minlen;
4552 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4553 max1 = minnext + deltanext + trie->maxlen;
4554 if (deltanext == I32_MAX)
4555 is_inf = is_inf_internal = 1;
4557 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4559 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4560 if ( stopmin > min + min1)
4561 stopmin = min + min1;
4562 flags &= ~SCF_DO_SUBSTR;
4564 data->flags |= SCF_SEEN_ACCEPT;
4567 if (data_fake.flags & SF_HAS_EVAL)
4568 data->flags |= SF_HAS_EVAL;
4569 data->whilem_c = data_fake.whilem_c;
4571 if (flags & SCF_DO_STCLASS)
4572 cl_or(pRExC_state, &accum, &this_class);
4575 if (flags & SCF_DO_SUBSTR) {
4576 data->pos_min += min1;
4577 data->pos_delta += max1 - min1;
4578 if (max1 != min1 || is_inf)
4579 data->longest = &(data->longest_float);
4582 delta += max1 - min1;
4583 if (flags & SCF_DO_STCLASS_OR) {
4584 cl_or(pRExC_state, data->start_class, &accum);
4586 cl_and(data->start_class, and_withp);
4587 flags &= ~SCF_DO_STCLASS;
4590 else if (flags & SCF_DO_STCLASS_AND) {
4592 cl_and(data->start_class, &accum);
4593 flags &= ~SCF_DO_STCLASS;
4596 /* Switch to OR mode: cache the old value of
4597 * data->start_class */
4599 StructCopy(data->start_class, and_withp,
4600 struct regnode_charclass_class);
4601 flags &= ~SCF_DO_STCLASS_AND;
4602 StructCopy(&accum, data->start_class,
4603 struct regnode_charclass_class);
4604 flags |= SCF_DO_STCLASS_OR;
4605 data->start_class->flags |= ANYOF_EOS;
4612 else if (PL_regkind[OP(scan)] == TRIE) {
4613 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4616 min += trie->minlen;
4617 delta += (trie->maxlen - trie->minlen);
4618 flags &= ~SCF_DO_STCLASS; /* xxx */
4619 if (flags & SCF_DO_SUBSTR) {
4620 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4621 data->pos_min += trie->minlen;
4622 data->pos_delta += (trie->maxlen - trie->minlen);
4623 if (trie->maxlen != trie->minlen)
4624 data->longest = &(data->longest_float);
4626 if (trie->jump) /* no more substrings -- for now /grr*/
4627 flags &= ~SCF_DO_SUBSTR;
4629 #endif /* old or new */
4630 #endif /* TRIE_STUDY_OPT */
4632 /* Else: zero-length, ignore. */
4633 scan = regnext(scan);
4638 stopparen = frame->stop;
4639 frame = frame->prev;
4640 goto fake_study_recurse;
4645 DEBUG_STUDYDATA("pre-fin:",data,depth);
4648 *deltap = is_inf_internal ? I32_MAX : delta;
4649 if (flags & SCF_DO_SUBSTR && is_inf)
4650 data->pos_delta = I32_MAX - data->pos_min;
4651 if (is_par > (I32)U8_MAX)
4653 if (is_par && pars==1 && data) {
4654 data->flags |= SF_IN_PAR;
4655 data->flags &= ~SF_HAS_PAR;
4657 else if (pars && data) {
4658 data->flags |= SF_HAS_PAR;
4659 data->flags &= ~SF_IN_PAR;
4661 if (flags & SCF_DO_STCLASS_OR)
4662 cl_and(data->start_class, and_withp);
4663 if (flags & SCF_TRIE_RESTUDY)
4664 data->flags |= SCF_TRIE_RESTUDY;
4666 DEBUG_STUDYDATA("post-fin:",data,depth);
4668 return min < stopmin ? min : stopmin;
4672 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4674 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4676 PERL_ARGS_ASSERT_ADD_DATA;
4678 Renewc(RExC_rxi->data,
4679 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4680 char, struct reg_data);
4682 Renew(RExC_rxi->data->what, count + n, U8);
4684 Newx(RExC_rxi->data->what, n, U8);
4685 RExC_rxi->data->count = count + n;
4686 Copy(s, RExC_rxi->data->what + count, n, U8);
4690 /*XXX: todo make this not included in a non debugging perl */
4691 #ifndef PERL_IN_XSUB_RE
4693 Perl_reginitcolors(pTHX)
4696 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4698 char *t = savepv(s);
4702 t = strchr(t, '\t');
4708 PL_colors[i] = t = (char *)"";
4713 PL_colors[i++] = (char *)"";
4720 #ifdef TRIE_STUDY_OPT
4721 #define CHECK_RESTUDY_GOTO \
4723 (data.flags & SCF_TRIE_RESTUDY) \
4727 #define CHECK_RESTUDY_GOTO
4731 - pregcomp - compile a regular expression into internal code
4733 * We can't allocate space until we know how big the compiled form will be,
4734 * but we can't compile it (and thus know how big it is) until we've got a
4735 * place to put the code. So we cheat: we compile it twice, once with code
4736 * generation turned off and size counting turned on, and once "for real".
4737 * This also means that we don't allocate space until we are sure that the
4738 * thing really will compile successfully, and we never have to move the
4739 * code and thus invalidate pointers into it. (Note that it has to be in
4740 * one piece because free() must be able to free it all.) [NB: not true in perl]
4742 * Beware that the optimization-preparation code in here knows about some
4743 * of the structure of the compiled regexp. [I'll say.]
4748 #ifndef PERL_IN_XSUB_RE
4749 #define RE_ENGINE_PTR &reh_regexp_engine
4751 extern const struct regexp_engine my_reg_engine;
4752 #define RE_ENGINE_PTR &my_reg_engine
4755 #ifndef PERL_IN_XSUB_RE
4757 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4760 HV * const table = GvHV(PL_hintgv);
4762 PERL_ARGS_ASSERT_PREGCOMP;
4764 /* Dispatch a request to compile a regexp to correct
4767 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4768 GET_RE_DEBUG_FLAGS_DECL;
4769 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4770 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4772 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4775 return CALLREGCOMP_ENG(eng, pattern, flags);
4778 return Perl_re_compile(aTHX_ pattern, flags);
4783 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4788 register regexp_internal *ri;
4797 /* these are all flags - maybe they should be turned
4798 * into a single int with different bit masks */
4799 I32 sawlookahead = 0;
4802 bool used_setjump = FALSE;
4803 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4808 RExC_state_t RExC_state;
4809 RExC_state_t * const pRExC_state = &RExC_state;
4810 #ifdef TRIE_STUDY_OPT
4812 RExC_state_t copyRExC_state;
4814 GET_RE_DEBUG_FLAGS_DECL;
4816 PERL_ARGS_ASSERT_RE_COMPILE;
4818 DEBUG_r(if (!PL_colorset) reginitcolors());
4820 #ifndef PERL_IN_XSUB_RE
4821 /* Initialize these here instead of as-needed, as is quick and avoids
4822 * having to test them each time otherwise */
4823 if (! PL_AboveLatin1) {
4824 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4825 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4826 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4828 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4829 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4831 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4832 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4834 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4835 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4837 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4839 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4840 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4842 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4844 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4845 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4847 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4848 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4850 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4851 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4853 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4854 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4856 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4857 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4859 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4860 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4862 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4863 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4865 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4866 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4868 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4870 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4871 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4873 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4874 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4878 exp = SvPV(pattern, plen);
4880 if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4881 RExC_utf8 = RExC_orig_utf8 = 0;
4884 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4886 RExC_uni_semantics = 0;
4887 RExC_contains_locale = 0;
4889 /****************** LONG JUMP TARGET HERE***********************/
4890 /* Longjmp back to here if have to switch in midstream to utf8 */
4891 if (! RExC_orig_utf8) {
4892 JMPENV_PUSH(jump_ret);
4893 used_setjump = TRUE;
4896 if (jump_ret == 0) { /* First time through */
4900 SV *dsv= sv_newmortal();
4901 RE_PV_QUOTED_DECL(s, RExC_utf8,
4902 dsv, exp, plen, 60);
4903 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4904 PL_colors[4],PL_colors[5],s);
4907 else { /* longjumped back */
4910 /* If the cause for the longjmp was other than changing to utf8, pop
4911 * our own setjmp, and longjmp to the correct handler */
4912 if (jump_ret != UTF8_LONGJMP) {
4914 JMPENV_JUMP(jump_ret);
4919 /* It's possible to write a regexp in ascii that represents Unicode
4920 codepoints outside of the byte range, such as via \x{100}. If we
4921 detect such a sequence we have to convert the entire pattern to utf8
4922 and then recompile, as our sizing calculation will have been based
4923 on 1 byte == 1 character, but we will need to use utf8 to encode
4924 at least some part of the pattern, and therefore must convert the whole
4927 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4928 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4929 exp = (char*)Perl_bytes_to_utf8(aTHX_
4930 (U8*)SvPV_nomg(pattern, plen),
4933 RExC_orig_utf8 = RExC_utf8 = 1;
4937 #ifdef TRIE_STUDY_OPT
4941 pm_flags = orig_pm_flags;
4943 if (initial_charset == REGEX_LOCALE_CHARSET) {
4944 RExC_contains_locale = 1;
4946 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4948 /* Set to use unicode semantics if the pattern is in utf8 and has the
4949 * 'depends' charset specified, as it means unicode when utf8 */
4950 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4954 RExC_flags = pm_flags;
4958 RExC_in_lookbehind = 0;
4959 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4960 RExC_seen_evals = 0;
4962 RExC_override_recoding = 0;
4964 /* First pass: determine size, legality. */
4972 RExC_emit = &PL_regdummy;
4973 RExC_whilem_seen = 0;
4974 RExC_open_parens = NULL;
4975 RExC_close_parens = NULL;
4977 RExC_paren_names = NULL;
4979 RExC_paren_name_list = NULL;
4981 RExC_recurse = NULL;
4982 RExC_recurse_count = 0;
4984 #if 0 /* REGC() is (currently) a NOP at the first pass.
4985 * Clever compilers notice this and complain. --jhi */
4986 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4989 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4991 RExC_lastparse=NULL;
4993 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4994 RExC_precomp = NULL;
4998 /* Here, finished first pass. Get rid of any added setjmp */
5004 PerlIO_printf(Perl_debug_log,
5005 "Required size %"IVdf" nodes\n"
5006 "Starting second pass (creation)\n",
5009 RExC_lastparse=NULL;
5012 /* The first pass could have found things that force Unicode semantics */
5013 if ((RExC_utf8 || RExC_uni_semantics)
5014 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5016 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5019 /* Small enough for pointer-storage convention?
5020 If extralen==0, this means that we will not need long jumps. */
5021 if (RExC_size >= 0x10000L && RExC_extralen)
5022 RExC_size += RExC_extralen;
5025 if (RExC_whilem_seen > 15)
5026 RExC_whilem_seen = 15;
5028 /* Allocate space and zero-initialize. Note, the two step process
5029 of zeroing when in debug mode, thus anything assigned has to
5030 happen after that */
5031 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5032 r = (struct regexp*)SvANY(rx);
5033 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5034 char, regexp_internal);
5035 if ( r == NULL || ri == NULL )
5036 FAIL("Regexp out of space");
5038 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5039 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5041 /* bulk initialize base fields with 0. */
5042 Zero(ri, sizeof(regexp_internal), char);
5045 /* non-zero initialization begins here */
5047 r->engine= RE_ENGINE_PTR;
5048 r->extflags = pm_flags;
5050 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5051 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5053 /* The caret is output if there are any defaults: if not all the STD
5054 * flags are set, or if no character set specifier is needed */
5056 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5058 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5059 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5060 >> RXf_PMf_STD_PMMOD_SHIFT);
5061 const char *fptr = STD_PAT_MODS; /*"msix"*/
5063 /* Allocate for the worst case, which is all the std flags are turned
5064 * on. If more precision is desired, we could do a population count of
5065 * the flags set. This could be done with a small lookup table, or by
5066 * shifting, masking and adding, or even, when available, assembly
5067 * language for a machine-language population count.
5068 * We never output a minus, as all those are defaults, so are
5069 * covered by the caret */
5070 const STRLEN wraplen = plen + has_p + has_runon
5071 + has_default /* If needs a caret */
5073 /* If needs a character set specifier */
5074 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5075 + (sizeof(STD_PAT_MODS) - 1)
5076 + (sizeof("(?:)") - 1);
5078 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5080 SvFLAGS(rx) |= SvUTF8(pattern);
5083 /* If a default, cover it using the caret */
5085 *p++= DEFAULT_PAT_MOD;
5089 const char* const name = get_regex_charset_name(r->extflags, &len);
5090 Copy(name, p, len, char);
5094 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5097 while((ch = *fptr++)) {
5105 Copy(RExC_precomp, p, plen, char);
5106 assert ((RX_WRAPPED(rx) - p) < 16);
5107 r->pre_prefix = p - RX_WRAPPED(rx);
5113 SvCUR_set(rx, p - SvPVX_const(rx));
5117 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5119 if (RExC_seen & REG_SEEN_RECURSE) {
5120 Newxz(RExC_open_parens, RExC_npar,regnode *);
5121 SAVEFREEPV(RExC_open_parens);
5122 Newxz(RExC_close_parens,RExC_npar,regnode *);
5123 SAVEFREEPV(RExC_close_parens);
5126 /* Useful during FAIL. */
5127 #ifdef RE_TRACK_PATTERN_OFFSETS
5128 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5129 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5130 "%s %"UVuf" bytes for offset annotations.\n",
5131 ri->u.offsets ? "Got" : "Couldn't get",
5132 (UV)((2*RExC_size+1) * sizeof(U32))));
5134 SetProgLen(ri,RExC_size);
5138 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5140 /* Second pass: emit code. */
5141 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
5146 RExC_emit_start = ri->program;
5147 RExC_emit = ri->program;
5148 RExC_emit_bound = ri->program + RExC_size + 1;
5150 /* Store the count of eval-groups for security checks: */
5151 RExC_rx->seen_evals = RExC_seen_evals;
5152 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5153 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5157 /* XXXX To minimize changes to RE engine we always allocate
5158 3-units-long substrs field. */
5159 Newx(r->substrs, 1, struct reg_substr_data);
5160 if (RExC_recurse_count) {
5161 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5162 SAVEFREEPV(RExC_recurse);
5166 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5167 Zero(r->substrs, 1, struct reg_substr_data);
5169 #ifdef TRIE_STUDY_OPT
5171 StructCopy(&zero_scan_data, &data, scan_data_t);
5172 copyRExC_state = RExC_state;
5175 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5177 RExC_state = copyRExC_state;
5178 if (seen & REG_TOP_LEVEL_BRANCHES)
5179 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5181 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5182 if (data.last_found) {
5183 SvREFCNT_dec(data.longest_fixed);
5184 SvREFCNT_dec(data.longest_float);
5185 SvREFCNT_dec(data.last_found);
5187 StructCopy(&zero_scan_data, &data, scan_data_t);
5190 StructCopy(&zero_scan_data, &data, scan_data_t);
5193 /* Dig out information for optimizations. */
5194 r->extflags = RExC_flags; /* was pm_op */
5195 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5198 SvUTF8_on(rx); /* Unicode in it? */
5199 ri->regstclass = NULL;
5200 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5201 r->intflags |= PREGf_NAUGHTY;
5202 scan = ri->program + 1; /* First BRANCH. */
5204 /* testing for BRANCH here tells us whether there is "must appear"
5205 data in the pattern. If there is then we can use it for optimisations */
5206 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5208 STRLEN longest_float_length, longest_fixed_length;
5209 struct regnode_charclass_class ch_class; /* pointed to by data */
5211 I32 last_close = 0; /* pointed to by data */
5212 regnode *first= scan;
5213 regnode *first_next= regnext(first);
5215 * Skip introductions and multiplicators >= 1
5216 * so that we can extract the 'meat' of the pattern that must
5217 * match in the large if() sequence following.
5218 * NOTE that EXACT is NOT covered here, as it is normally
5219 * picked up by the optimiser separately.
5221 * This is unfortunate as the optimiser isnt handling lookahead
5222 * properly currently.
5225 while ((OP(first) == OPEN && (sawopen = 1)) ||
5226 /* An OR of *one* alternative - should not happen now. */
5227 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5228 /* for now we can't handle lookbehind IFMATCH*/
5229 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5230 (OP(first) == PLUS) ||
5231 (OP(first) == MINMOD) ||
5232 /* An {n,m} with n>0 */
5233 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5234 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5237 * the only op that could be a regnode is PLUS, all the rest
5238 * will be regnode_1 or regnode_2.
5241 if (OP(first) == PLUS)
5244 first += regarglen[OP(first)];
5246 first = NEXTOPER(first);
5247 first_next= regnext(first);
5250 /* Starting-point info. */
5252 DEBUG_PEEP("first:",first,0);
5253 /* Ignore EXACT as we deal with it later. */
5254 if (PL_regkind[OP(first)] == EXACT) {
5255 if (OP(first) == EXACT)
5256 NOOP; /* Empty, get anchored substr later. */
5258 ri->regstclass = first;
5261 else if (PL_regkind[OP(first)] == TRIE &&
5262 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5265 /* this can happen only on restudy */
5266 if ( OP(first) == TRIE ) {
5267 struct regnode_1 *trieop = (struct regnode_1 *)
5268 PerlMemShared_calloc(1, sizeof(struct regnode_1));
5269 StructCopy(first,trieop,struct regnode_1);
5270 trie_op=(regnode *)trieop;
5272 struct regnode_charclass *trieop = (struct regnode_charclass *)
5273 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5274 StructCopy(first,trieop,struct regnode_charclass);
5275 trie_op=(regnode *)trieop;
5278 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5279 ri->regstclass = trie_op;
5282 else if (REGNODE_SIMPLE(OP(first)))
5283 ri->regstclass = first;
5284 else if (PL_regkind[OP(first)] == BOUND ||
5285 PL_regkind[OP(first)] == NBOUND)
5286 ri->regstclass = first;
5287 else if (PL_regkind[OP(first)] == BOL) {
5288 r->extflags |= (OP(first) == MBOL
5290 : (OP(first) == SBOL
5293 first = NEXTOPER(first);
5296 else if (OP(first) == GPOS) {
5297 r->extflags |= RXf_ANCH_GPOS;
5298 first = NEXTOPER(first);
5301 else if ((!sawopen || !RExC_sawback) &&
5302 (OP(first) == STAR &&
5303 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5304 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5306 /* turn .* into ^.* with an implied $*=1 */
5308 (OP(NEXTOPER(first)) == REG_ANY)
5311 r->extflags |= type;
5312 r->intflags |= PREGf_IMPLICIT;
5313 first = NEXTOPER(first);
5316 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5317 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5318 /* x+ must match at the 1st pos of run of x's */
5319 r->intflags |= PREGf_SKIP;
5321 /* Scan is after the zeroth branch, first is atomic matcher. */
5322 #ifdef TRIE_STUDY_OPT
5325 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5326 (IV)(first - scan + 1))
5330 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5331 (IV)(first - scan + 1))
5337 * If there's something expensive in the r.e., find the
5338 * longest literal string that must appear and make it the
5339 * regmust. Resolve ties in favor of later strings, since
5340 * the regstart check works with the beginning of the r.e.
5341 * and avoiding duplication strengthens checking. Not a
5342 * strong reason, but sufficient in the absence of others.
5343 * [Now we resolve ties in favor of the earlier string if
5344 * it happens that c_offset_min has been invalidated, since the
5345 * earlier string may buy us something the later one won't.]
5348 data.longest_fixed = newSVpvs("");
5349 data.longest_float = newSVpvs("");
5350 data.last_found = newSVpvs("");
5351 data.longest = &(data.longest_fixed);
5353 if (!ri->regstclass) {
5354 cl_init(pRExC_state, &ch_class);
5355 data.start_class = &ch_class;
5356 stclass_flag = SCF_DO_STCLASS_AND;
5357 } else /* XXXX Check for BOUND? */
5359 data.last_closep = &last_close;
5361 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5362 &data, -1, NULL, NULL,
5363 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5369 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5370 && data.last_start_min == 0 && data.last_end > 0
5371 && !RExC_seen_zerolen
5372 && !(RExC_seen & REG_SEEN_VERBARG)
5373 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5374 r->extflags |= RXf_CHECK_ALL;
5375 scan_commit(pRExC_state, &data,&minlen,0);
5376 SvREFCNT_dec(data.last_found);
5378 /* Note that code very similar to this but for anchored string
5379 follows immediately below, changes may need to be made to both.
5382 longest_float_length = CHR_SVLEN(data.longest_float);
5383 if (longest_float_length
5384 || (data.flags & SF_FL_BEFORE_EOL
5385 && (!(data.flags & SF_FL_BEFORE_MEOL)
5386 || (RExC_flags & RXf_PMf_MULTILINE))))
5390 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5391 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5392 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5393 && data.offset_fixed == data.offset_float_min
5394 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5395 goto remove_float; /* As in (a)+. */
5397 /* copy the information about the longest float from the reg_scan_data
5398 over to the program. */
5399 if (SvUTF8(data.longest_float)) {
5400 r->float_utf8 = data.longest_float;
5401 r->float_substr = NULL;
5403 r->float_substr = data.longest_float;
5404 r->float_utf8 = NULL;
5406 /* float_end_shift is how many chars that must be matched that
5407 follow this item. We calculate it ahead of time as once the
5408 lookbehind offset is added in we lose the ability to correctly
5410 ml = data.minlen_float ? *(data.minlen_float)
5411 : (I32)longest_float_length;
5412 r->float_end_shift = ml - data.offset_float_min
5413 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5414 + data.lookbehind_float;
5415 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5416 r->float_max_offset = data.offset_float_max;
5417 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5418 r->float_max_offset -= data.lookbehind_float;
5420 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5421 && (!(data.flags & SF_FL_BEFORE_MEOL)
5422 || (RExC_flags & RXf_PMf_MULTILINE)));
5423 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5427 r->float_substr = r->float_utf8 = NULL;
5428 SvREFCNT_dec(data.longest_float);
5429 longest_float_length = 0;
5432 /* Note that code very similar to this but for floating string
5433 is immediately above, changes may need to be made to both.
5436 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5438 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5439 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5440 && (longest_fixed_length
5441 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5442 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5443 || (RExC_flags & RXf_PMf_MULTILINE)))) )
5447 /* copy the information about the longest fixed
5448 from the reg_scan_data over to the program. */
5449 if (SvUTF8(data.longest_fixed)) {
5450 r->anchored_utf8 = data.longest_fixed;
5451 r->anchored_substr = NULL;
5453 r->anchored_substr = data.longest_fixed;
5454 r->anchored_utf8 = NULL;
5456 /* fixed_end_shift is how many chars that must be matched that
5457 follow this item. We calculate it ahead of time as once the
5458 lookbehind offset is added in we lose the ability to correctly
5460 ml = data.minlen_fixed ? *(data.minlen_fixed)
5461 : (I32)longest_fixed_length;
5462 r->anchored_end_shift = ml - data.offset_fixed
5463 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5464 + data.lookbehind_fixed;
5465 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5467 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5468 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5469 || (RExC_flags & RXf_PMf_MULTILINE)));
5470 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5473 r->anchored_substr = r->anchored_utf8 = NULL;
5474 SvREFCNT_dec(data.longest_fixed);
5475 longest_fixed_length = 0;
5478 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5479 ri->regstclass = NULL;
5481 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5483 && !(data.start_class->flags & ANYOF_EOS)
5484 && !cl_is_anything(data.start_class))
5486 const U32 n = add_data(pRExC_state, 1, "f");
5487 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5489 Newx(RExC_rxi->data->data[n], 1,
5490 struct regnode_charclass_class);
5491 StructCopy(data.start_class,
5492 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5493 struct regnode_charclass_class);
5494 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5495 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5496 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5497 regprop(r, sv, (regnode*)data.start_class);
5498 PerlIO_printf(Perl_debug_log,
5499 "synthetic stclass \"%s\".\n",
5500 SvPVX_const(sv));});
5503 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5504 if (longest_fixed_length > longest_float_length) {
5505 r->check_end_shift = r->anchored_end_shift;
5506 r->check_substr = r->anchored_substr;
5507 r->check_utf8 = r->anchored_utf8;
5508 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5509 if (r->extflags & RXf_ANCH_SINGLE)
5510 r->extflags |= RXf_NOSCAN;
5513 r->check_end_shift = r->float_end_shift;
5514 r->check_substr = r->float_substr;
5515 r->check_utf8 = r->float_utf8;
5516 r->check_offset_min = r->float_min_offset;
5517 r->check_offset_max = r->float_max_offset;
5519 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5520 This should be changed ASAP! */
5521 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5522 r->extflags |= RXf_USE_INTUIT;
5523 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5524 r->extflags |= RXf_INTUIT_TAIL;
5526 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5527 if ( (STRLEN)minlen < longest_float_length )
5528 minlen= longest_float_length;
5529 if ( (STRLEN)minlen < longest_fixed_length )
5530 minlen= longest_fixed_length;
5534 /* Several toplevels. Best we can is to set minlen. */
5536 struct regnode_charclass_class ch_class;
5539 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5541 scan = ri->program + 1;
5542 cl_init(pRExC_state, &ch_class);
5543 data.start_class = &ch_class;
5544 data.last_closep = &last_close;
5547 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5548 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5552 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5553 = r->float_substr = r->float_utf8 = NULL;
5555 if (!(data.start_class->flags & ANYOF_EOS)
5556 && !cl_is_anything(data.start_class))
5558 const U32 n = add_data(pRExC_state, 1, "f");
5559 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5561 Newx(RExC_rxi->data->data[n], 1,
5562 struct regnode_charclass_class);
5563 StructCopy(data.start_class,
5564 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5565 struct regnode_charclass_class);
5566 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5567 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5568 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5569 regprop(r, sv, (regnode*)data.start_class);
5570 PerlIO_printf(Perl_debug_log,
5571 "synthetic stclass \"%s\".\n",
5572 SvPVX_const(sv));});
5576 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5577 the "real" pattern. */
5579 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5580 (IV)minlen, (IV)r->minlen);
5582 r->minlenret = minlen;
5583 if (r->minlen < minlen)
5586 if (RExC_seen & REG_SEEN_GPOS)
5587 r->extflags |= RXf_GPOS_SEEN;
5588 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5589 r->extflags |= RXf_LOOKBEHIND_SEEN;
5590 if (RExC_seen & REG_SEEN_EVAL)
5591 r->extflags |= RXf_EVAL_SEEN;
5592 if (RExC_seen & REG_SEEN_CANY)
5593 r->extflags |= RXf_CANY_SEEN;
5594 if (RExC_seen & REG_SEEN_VERBARG)
5595 r->intflags |= PREGf_VERBARG_SEEN;
5596 if (RExC_seen & REG_SEEN_CUTGROUP)
5597 r->intflags |= PREGf_CUTGROUP_SEEN;
5598 if (RExC_paren_names)
5599 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5601 RXp_PAREN_NAMES(r) = NULL;
5603 #ifdef STUPID_PATTERN_CHECKS
5604 if (RX_PRELEN(rx) == 0)
5605 r->extflags |= RXf_NULL;
5606 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5607 /* XXX: this should happen BEFORE we compile */
5608 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5609 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5610 r->extflags |= RXf_WHITE;
5611 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5612 r->extflags |= RXf_START_ONLY;
5614 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5615 /* XXX: this should happen BEFORE we compile */
5616 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5618 regnode *first = ri->program + 1;
5621 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5622 r->extflags |= RXf_NULL;
5623 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5624 r->extflags |= RXf_START_ONLY;
5625 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5626 && OP(regnext(first)) == END)
5627 r->extflags |= RXf_WHITE;
5631 if (RExC_paren_names) {
5632 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5633 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5636 ri->name_list_idx = 0;
5638 if (RExC_recurse_count) {
5639 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5640 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5641 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5644 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5645 /* assume we don't need to swap parens around before we match */
5648 PerlIO_printf(Perl_debug_log,"Final program:\n");
5651 #ifdef RE_TRACK_PATTERN_OFFSETS
5652 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5653 const U32 len = ri->u.offsets[0];
5655 GET_RE_DEBUG_FLAGS_DECL;
5656 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5657 for (i = 1; i <= len; i++) {
5658 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5659 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5660 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5662 PerlIO_printf(Perl_debug_log, "\n");
5668 #undef RE_ENGINE_PTR
5672 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5675 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5677 PERL_UNUSED_ARG(value);
5679 if (flags & RXapif_FETCH) {
5680 return reg_named_buff_fetch(rx, key, flags);
5681 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5682 Perl_croak_no_modify(aTHX);
5684 } else if (flags & RXapif_EXISTS) {
5685 return reg_named_buff_exists(rx, key, flags)
5688 } else if (flags & RXapif_REGNAMES) {
5689 return reg_named_buff_all(rx, flags);
5690 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5691 return reg_named_buff_scalar(rx, flags);
5693 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5699 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5702 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5703 PERL_UNUSED_ARG(lastkey);
5705 if (flags & RXapif_FIRSTKEY)
5706 return reg_named_buff_firstkey(rx, flags);
5707 else if (flags & RXapif_NEXTKEY)
5708 return reg_named_buff_nextkey(rx, flags);
5710 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5716 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5719 AV *retarray = NULL;
5721 struct regexp *const rx = (struct regexp *)SvANY(r);
5723 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5725 if (flags & RXapif_ALL)
5728 if (rx && RXp_PAREN_NAMES(rx)) {
5729 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5732 SV* sv_dat=HeVAL(he_str);
5733 I32 *nums=(I32*)SvPVX(sv_dat);
5734 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5735 if ((I32)(rx->nparens) >= nums[i]
5736 && rx->offs[nums[i]].start != -1
5737 && rx->offs[nums[i]].end != -1)
5740 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5745 ret = newSVsv(&PL_sv_undef);
5748 av_push(retarray, ret);
5751 return newRV_noinc(MUTABLE_SV(retarray));
5758 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5761 struct regexp *const rx = (struct regexp *)SvANY(r);
5763 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5765 if (rx && RXp_PAREN_NAMES(rx)) {
5766 if (flags & RXapif_ALL) {
5767 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5769 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5783 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5785 struct regexp *const rx = (struct regexp *)SvANY(r);
5787 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5789 if ( rx && RXp_PAREN_NAMES(rx) ) {
5790 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5792 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5799 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5801 struct regexp *const rx = (struct regexp *)SvANY(r);
5802 GET_RE_DEBUG_FLAGS_DECL;
5804 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5806 if (rx && RXp_PAREN_NAMES(rx)) {
5807 HV *hv = RXp_PAREN_NAMES(rx);
5809 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5812 SV* sv_dat = HeVAL(temphe);
5813 I32 *nums = (I32*)SvPVX(sv_dat);
5814 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5815 if ((I32)(rx->lastparen) >= nums[i] &&
5816 rx->offs[nums[i]].start != -1 &&
5817 rx->offs[nums[i]].end != -1)
5823 if (parno || flags & RXapif_ALL) {
5824 return newSVhek(HeKEY_hek(temphe));
5832 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5837 struct regexp *const rx = (struct regexp *)SvANY(r);
5839 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5841 if (rx && RXp_PAREN_NAMES(rx)) {
5842 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5843 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5844 } else if (flags & RXapif_ONE) {
5845 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5846 av = MUTABLE_AV(SvRV(ret));
5847 length = av_len(av);
5849 return newSViv(length + 1);
5851 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5855 return &PL_sv_undef;
5859 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5861 struct regexp *const rx = (struct regexp *)SvANY(r);
5864 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5866 if (rx && RXp_PAREN_NAMES(rx)) {
5867 HV *hv= RXp_PAREN_NAMES(rx);
5869 (void)hv_iterinit(hv);
5870 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5873 SV* sv_dat = HeVAL(temphe);
5874 I32 *nums = (I32*)SvPVX(sv_dat);
5875 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5876 if ((I32)(rx->lastparen) >= nums[i] &&
5877 rx->offs[nums[i]].start != -1 &&
5878 rx->offs[nums[i]].end != -1)
5884 if (parno || flags & RXapif_ALL) {
5885 av_push(av, newSVhek(HeKEY_hek(temphe)));
5890 return newRV_noinc(MUTABLE_SV(av));
5894 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5897 struct regexp *const rx = (struct regexp *)SvANY(r);
5902 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5905 sv_setsv(sv,&PL_sv_undef);
5909 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5911 i = rx->offs[0].start;
5915 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5917 s = rx->subbeg + rx->offs[0].end;
5918 i = rx->sublen - rx->offs[0].end;
5921 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5922 (s1 = rx->offs[paren].start) != -1 &&
5923 (t1 = rx->offs[paren].end) != -1)
5927 s = rx->subbeg + s1;
5929 sv_setsv(sv,&PL_sv_undef);
5932 assert(rx->sublen >= (s - rx->subbeg) + i );
5934 const int oldtainted = PL_tainted;
5936 sv_setpvn(sv, s, i);
5937 PL_tainted = oldtainted;
5938 if ( (rx->extflags & RXf_CANY_SEEN)
5939 ? (RXp_MATCH_UTF8(rx)
5940 && (!i || is_utf8_string((U8*)s, i)))
5941 : (RXp_MATCH_UTF8(rx)) )
5948 if (RXp_MATCH_TAINTED(rx)) {
5949 if (SvTYPE(sv) >= SVt_PVMG) {
5950 MAGIC* const mg = SvMAGIC(sv);
5953 SvMAGIC_set(sv, mg->mg_moremagic);
5955 if ((mgt = SvMAGIC(sv))) {
5956 mg->mg_moremagic = mgt;
5957 SvMAGIC_set(sv, mg);
5967 sv_setsv(sv,&PL_sv_undef);
5973 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5974 SV const * const value)
5976 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5978 PERL_UNUSED_ARG(rx);
5979 PERL_UNUSED_ARG(paren);
5980 PERL_UNUSED_ARG(value);
5983 Perl_croak_no_modify(aTHX);
5987 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5990 struct regexp *const rx = (struct regexp *)SvANY(r);
5994 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5996 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5998 /* $` / ${^PREMATCH} */
5999 case RX_BUFF_IDX_PREMATCH:
6000 if (rx->offs[0].start != -1) {
6001 i = rx->offs[0].start;
6009 /* $' / ${^POSTMATCH} */
6010 case RX_BUFF_IDX_POSTMATCH:
6011 if (rx->offs[0].end != -1) {
6012 i = rx->sublen - rx->offs[0].end;
6014 s1 = rx->offs[0].end;
6020 /* $& / ${^MATCH}, $1, $2, ... */
6022 if (paren <= (I32)rx->nparens &&
6023 (s1 = rx->offs[paren].start) != -1 &&
6024 (t1 = rx->offs[paren].end) != -1)
6029 if (ckWARN(WARN_UNINITIALIZED))
6030 report_uninit((const SV *)sv);
6035 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6036 const char * const s = rx->subbeg + s1;
6041 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6048 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6050 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6051 PERL_UNUSED_ARG(rx);
6055 return newSVpvs("Regexp");
6058 /* Scans the name of a named buffer from the pattern.
6059 * If flags is REG_RSN_RETURN_NULL returns null.
6060 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6061 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6062 * to the parsed name as looked up in the RExC_paren_names hash.
6063 * If there is an error throws a vFAIL().. type exception.
6066 #define REG_RSN_RETURN_NULL 0
6067 #define REG_RSN_RETURN_NAME 1
6068 #define REG_RSN_RETURN_DATA 2
6071 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6073 char *name_start = RExC_parse;
6075 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6077 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6078 /* skip IDFIRST by using do...while */
6081 RExC_parse += UTF8SKIP(RExC_parse);
6082 } while (isALNUM_utf8((U8*)RExC_parse));
6086 } while (isALNUM(*RExC_parse));
6091 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6092 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6093 if ( flags == REG_RSN_RETURN_NAME)
6095 else if (flags==REG_RSN_RETURN_DATA) {
6098 if ( ! sv_name ) /* should not happen*/
6099 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6100 if (RExC_paren_names)
6101 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6103 sv_dat = HeVAL(he_str);
6105 vFAIL("Reference to nonexistent named group");
6109 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6110 (unsigned long) flags);
6117 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6118 int rem=(int)(RExC_end - RExC_parse); \
6127 if (RExC_lastparse!=RExC_parse) \
6128 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6131 iscut ? "..." : "<" \
6134 PerlIO_printf(Perl_debug_log,"%16s",""); \
6137 num = RExC_size + 1; \
6139 num=REG_NODE_NUM(RExC_emit); \
6140 if (RExC_lastnum!=num) \
6141 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6143 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6144 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6145 (int)((depth*2)), "", \
6149 RExC_lastparse=RExC_parse; \
6154 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6155 DEBUG_PARSE_MSG((funcname)); \
6156 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6158 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6159 DEBUG_PARSE_MSG((funcname)); \
6160 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6163 /* This section of code defines the inversion list object and its methods. The
6164 * interfaces are highly subject to change, so as much as possible is static to
6165 * this file. An inversion list is here implemented as a malloc'd C UV array
6166 * with some added info that is placed as UVs at the beginning in a header
6167 * portion. An inversion list for Unicode is an array of code points, sorted
6168 * by ordinal number. The zeroth element is the first code point in the list.
6169 * The 1th element is the first element beyond that not in the list. In other
6170 * words, the first range is
6171 * invlist[0]..(invlist[1]-1)
6172 * The other ranges follow. Thus every element whose index is divisible by two
6173 * marks the beginning of a range that is in the list, and every element not
6174 * divisible by two marks the beginning of a range not in the list. A single
6175 * element inversion list that contains the single code point N generally
6176 * consists of two elements
6179 * (The exception is when N is the highest representable value on the
6180 * machine, in which case the list containing just it would be a single
6181 * element, itself. By extension, if the last range in the list extends to
6182 * infinity, then the first element of that range will be in the inversion list
6183 * at a position that is divisible by two, and is the final element in the
6185 * Taking the complement (inverting) an inversion list is quite simple, if the
6186 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6187 * This implementation reserves an element at the beginning of each inversion list
6188 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6189 * beginning of the list is either that element if 0, or the next one if 1.
6191 * More about inversion lists can be found in "Unicode Demystified"
6192 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6193 * More will be coming when functionality is added later.
6195 * The inversion list data structure is currently implemented as an SV pointing
6196 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6197 * array of UV whose memory management is automatically handled by the existing
6198 * facilities for SV's.
6200 * Some of the methods should always be private to the implementation, and some
6201 * should eventually be made public */
6203 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6204 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6206 /* This is a combination of a version and data structure type, so that one
6207 * being passed in can be validated to be an inversion list of the correct
6208 * vintage. When the structure of the header is changed, a new random number
6209 * in the range 2**31-1 should be generated and the new() method changed to
6210 * insert that at this location. Then, if an auxiliary program doesn't change
6211 * correspondingly, it will be discovered immediately */
6212 #define INVLIST_VERSION_ID_OFFSET 2
6213 #define INVLIST_VERSION_ID 1064334010
6215 /* For safety, when adding new elements, remember to #undef them at the end of
6216 * the inversion list code section */
6218 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
6219 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6220 * contains the code point U+00000, and begins here. If 1, the inversion list
6221 * doesn't contain U+0000, and it begins at the next UV in the array.
6222 * Inverting an inversion list consists of adding or removing the 0 at the
6223 * beginning of it. By reserving a space for that 0, inversion can be made
6226 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6228 /* Internally things are UVs */
6229 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6230 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6232 #define INVLIST_INITIAL_LEN 10
6234 PERL_STATIC_INLINE UV*
6235 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6237 /* Returns a pointer to the first element in the inversion list's array.
6238 * This is called upon initialization of an inversion list. Where the
6239 * array begins depends on whether the list has the code point U+0000
6240 * in it or not. The other parameter tells it whether the code that
6241 * follows this call is about to put a 0 in the inversion list or not.
6242 * The first element is either the element with 0, if 0, or the next one,
6245 UV* zero = get_invlist_zero_addr(invlist);
6247 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6250 assert(! *get_invlist_len_addr(invlist));
6252 /* 1^1 = 0; 1^0 = 1 */
6253 *zero = 1 ^ will_have_0;
6254 return zero + *zero;
6257 PERL_STATIC_INLINE UV*
6258 S_invlist_array(pTHX_ SV* const invlist)
6260 /* Returns the pointer to the inversion list's array. Every time the
6261 * length changes, this needs to be called in case malloc or realloc moved
6264 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6266 /* Must not be empty. If these fail, you probably didn't check for <len>
6267 * being non-zero before trying to get the array */
6268 assert(*get_invlist_len_addr(invlist));
6269 assert(*get_invlist_zero_addr(invlist) == 0
6270 || *get_invlist_zero_addr(invlist) == 1);
6272 /* The array begins either at the element reserved for zero if the
6273 * list contains 0 (that element will be set to 0), or otherwise the next
6274 * element (in which case the reserved element will be set to 1). */
6275 return (UV *) (get_invlist_zero_addr(invlist)
6276 + *get_invlist_zero_addr(invlist));
6279 PERL_STATIC_INLINE UV*
6280 S_get_invlist_len_addr(pTHX_ SV* invlist)
6282 /* Return the address of the UV that contains the current number
6283 * of used elements in the inversion list */
6285 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6287 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6290 PERL_STATIC_INLINE UV
6291 S_invlist_len(pTHX_ SV* const invlist)
6293 /* Returns the current number of elements stored in the inversion list's
6296 PERL_ARGS_ASSERT_INVLIST_LEN;
6298 return *get_invlist_len_addr(invlist);
6301 PERL_STATIC_INLINE void
6302 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6304 /* Sets the current number of elements stored in the inversion list */
6306 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6308 *get_invlist_len_addr(invlist) = len;
6310 assert(len <= SvLEN(invlist));
6312 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6313 /* If the list contains U+0000, that element is part of the header,
6314 * and should not be counted as part of the array. It will contain
6315 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6317 * SvCUR_set(invlist,
6318 * TO_INTERNAL_SIZE(len
6319 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6320 * But, this is only valid if len is not 0. The consequences of not doing
6321 * this is that the memory allocation code may think that 1 more UV is
6322 * being used than actually is, and so might do an unnecessary grow. That
6323 * seems worth not bothering to make this the precise amount.
6325 * Note that when inverting, SvCUR shouldn't change */
6328 PERL_STATIC_INLINE UV
6329 S_invlist_max(pTHX_ SV* const invlist)
6331 /* Returns the maximum number of elements storable in the inversion list's
6332 * array, without having to realloc() */
6334 PERL_ARGS_ASSERT_INVLIST_MAX;
6336 return FROM_INTERNAL_SIZE(SvLEN(invlist));
6339 PERL_STATIC_INLINE UV*
6340 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6342 /* Return the address of the UV that is reserved to hold 0 if the inversion
6343 * list contains 0. This has to be the last element of the heading, as the
6344 * list proper starts with either it if 0, or the next element if not.
6345 * (But we force it to contain either 0 or 1) */
6347 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6349 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6352 #ifndef PERL_IN_XSUB_RE
6354 Perl__new_invlist(pTHX_ IV initial_size)
6357 /* Return a pointer to a newly constructed inversion list, with enough
6358 * space to store 'initial_size' elements. If that number is negative, a
6359 * system default is used instead */
6363 if (initial_size < 0) {
6364 initial_size = INVLIST_INITIAL_LEN;
6367 /* Allocate the initial space */
6368 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6369 invlist_set_len(new_list, 0);
6371 /* Force iterinit() to be used to get iteration to work */
6372 *get_invlist_iter_addr(new_list) = UV_MAX;
6374 /* This should force a segfault if a method doesn't initialize this
6376 *get_invlist_zero_addr(new_list) = UV_MAX;
6378 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6379 #if HEADER_LENGTH != 4
6380 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6388 S__new_invlist_C_array(pTHX_ UV* list)
6390 /* Return a pointer to a newly constructed inversion list, initialized to
6391 * point to <list>, which has to be in the exact correct inversion list
6392 * form, including internal fields. Thus this is a dangerous routine that
6393 * should not be used in the wrong hands */
6395 SV* invlist = newSV_type(SVt_PV);
6397 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6399 SvPV_set(invlist, (char *) list);
6400 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6401 shouldn't touch it */
6402 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6404 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6405 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6412 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6414 /* Grow the maximum size of an inversion list */
6416 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6418 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6421 PERL_STATIC_INLINE void
6422 S_invlist_trim(pTHX_ SV* const invlist)
6424 PERL_ARGS_ASSERT_INVLIST_TRIM;
6426 /* Change the length of the inversion list to how many entries it currently
6429 SvPV_shrink_to_cur((SV *) invlist);
6432 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6434 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6435 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6437 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6439 #ifndef PERL_IN_XSUB_RE
6441 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6443 /* Subject to change or removal. Append the range from 'start' to 'end' at
6444 * the end of the inversion list. The range must be above any existing
6448 UV max = invlist_max(invlist);
6449 UV len = invlist_len(invlist);
6451 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6453 if (len == 0) { /* Empty lists must be initialized */
6454 array = _invlist_array_init(invlist, start == 0);
6457 /* Here, the existing list is non-empty. The current max entry in the
6458 * list is generally the first value not in the set, except when the
6459 * set extends to the end of permissible values, in which case it is
6460 * the first entry in that final set, and so this call is an attempt to
6461 * append out-of-order */
6463 UV final_element = len - 1;
6464 array = invlist_array(invlist);
6465 if (array[final_element] > start
6466 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6468 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6469 array[final_element], start,
6470 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6473 /* Here, it is a legal append. If the new range begins with the first
6474 * value not in the set, it is extending the set, so the new first
6475 * value not in the set is one greater than the newly extended range.
6477 if (array[final_element] == start) {
6478 if (end != UV_MAX) {
6479 array[final_element] = end + 1;
6482 /* But if the end is the maximum representable on the machine,
6483 * just let the range that this would extend to have no end */
6484 invlist_set_len(invlist, len - 1);
6490 /* Here the new range doesn't extend any existing set. Add it */
6492 len += 2; /* Includes an element each for the start and end of range */
6494 /* If overflows the existing space, extend, which may cause the array to be
6497 invlist_extend(invlist, len);
6498 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
6499 failure in invlist_array() */
6500 array = invlist_array(invlist);
6503 invlist_set_len(invlist, len);
6506 /* The next item on the list starts the range, the one after that is
6507 * one past the new range. */
6508 array[len - 2] = start;
6509 if (end != UV_MAX) {
6510 array[len - 1] = end + 1;
6513 /* But if the end is the maximum representable on the machine, just let
6514 * the range have no end */
6515 invlist_set_len(invlist, len - 1);
6520 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6522 /* Searches the inversion list for the entry that contains the input code
6523 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
6524 * return value is the index into the list's array of the range that
6528 IV high = invlist_len(invlist);
6529 const UV * const array = invlist_array(invlist);
6531 PERL_ARGS_ASSERT_INVLIST_SEARCH;
6533 /* If list is empty or the code point is before the first element, return
6535 if (high == 0 || cp < array[0]) {
6539 /* Binary search. What we are looking for is <i> such that
6540 * array[i] <= cp < array[i+1]
6541 * The loop below converges on the i+1. */
6542 while (low < high) {
6543 IV mid = (low + high) / 2;
6544 if (array[mid] <= cp) {
6547 /* We could do this extra test to exit the loop early.
6548 if (cp < array[low]) {
6553 else { /* cp < array[mid] */
6562 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6564 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6565 * but is used when the swash has an inversion list. This makes this much
6566 * faster, as it uses a binary search instead of a linear one. This is
6567 * intimately tied to that function, and perhaps should be in utf8.c,
6568 * except it is intimately tied to inversion lists as well. It assumes
6569 * that <swatch> is all 0's on input */
6572 const IV len = invlist_len(invlist);
6576 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6578 if (len == 0) { /* Empty inversion list */
6582 array = invlist_array(invlist);
6584 /* Find which element it is */
6585 i = invlist_search(invlist, start);
6587 /* We populate from <start> to <end> */
6588 while (current < end) {
6591 /* The inversion list gives the results for every possible code point
6592 * after the first one in the list. Only those ranges whose index is
6593 * even are ones that the inversion list matches. For the odd ones,
6594 * and if the initial code point is not in the list, we have to skip
6595 * forward to the next element */
6596 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6598 if (i >= len) { /* Finished if beyond the end of the array */
6602 if (current >= end) { /* Finished if beyond the end of what we
6607 assert(current >= start);
6609 /* The current range ends one below the next one, except don't go past
6612 upper = (i < len && array[i] < end) ? array[i] : end;
6614 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
6615 * for each code point in it */
6616 for (; current < upper; current++) {
6617 const STRLEN offset = (STRLEN)(current - start);
6618 swatch[offset >> 3] |= 1 << (offset & 7);
6621 /* Quit if at the end of the list */
6624 /* But first, have to deal with the highest possible code point on
6625 * the platform. The previous code assumes that <end> is one
6626 * beyond where we want to populate, but that is impossible at the
6627 * platform's infinity, so have to handle it specially */
6628 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6630 const STRLEN offset = (STRLEN)(end - start);
6631 swatch[offset >> 3] |= 1 << (offset & 7);
6636 /* Advance to the next range, which will be for code points not in the
6646 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6648 /* Take the union of two inversion lists and point <output> to it. *output
6649 * should be defined upon input, and if it points to one of the two lists,
6650 * the reference count to that list will be decremented. The first list,
6651 * <a>, may be NULL, in which case a copy of the second list is returned.
6652 * If <complement_b> is TRUE, the union is taken of the complement
6653 * (inversion) of <b> instead of b itself.
6655 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6656 * Richard Gillam, published by Addison-Wesley, and explained at some
6657 * length there. The preface says to incorporate its examples into your
6658 * code at your own risk.
6660 * The algorithm is like a merge sort.
6662 * XXX A potential performance improvement is to keep track as we go along
6663 * if only one of the inputs contributes to the result, meaning the other
6664 * is a subset of that one. In that case, we can skip the final copy and
6665 * return the larger of the input lists, but then outside code might need
6666 * to keep track of whether to free the input list or not */
6668 UV* array_a; /* a's array */
6670 UV len_a; /* length of a's array */
6673 SV* u; /* the resulting union */
6677 UV i_a = 0; /* current index into a's array */
6681 /* running count, as explained in the algorithm source book; items are
6682 * stopped accumulating and are output when the count changes to/from 0.
6683 * The count is incremented when we start a range that's in the set, and
6684 * decremented when we start a range that's not in the set. So its range
6685 * is 0 to 2. Only when the count is zero is something not in the set.
6689 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6692 /* If either one is empty, the union is the other one */
6693 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6700 *output = invlist_clone(b);
6702 _invlist_invert(*output);
6704 } /* else *output already = b; */
6707 else if ((len_b = invlist_len(b)) == 0) {
6712 /* The complement of an empty list is a list that has everything in it,
6713 * so the union with <a> includes everything too */
6718 *output = _new_invlist(1);
6719 _append_range_to_invlist(*output, 0, UV_MAX);
6721 else if (*output != a) {
6722 *output = invlist_clone(a);
6724 /* else *output already = a; */
6728 /* Here both lists exist and are non-empty */
6729 array_a = invlist_array(a);
6730 array_b = invlist_array(b);
6732 /* If are to take the union of 'a' with the complement of b, set it
6733 * up so are looking at b's complement. */
6736 /* To complement, we invert: if the first element is 0, remove it. To
6737 * do this, we just pretend the array starts one later, and clear the
6738 * flag as we don't have to do anything else later */
6739 if (array_b[0] == 0) {
6742 complement_b = FALSE;
6746 /* But if the first element is not zero, we unshift a 0 before the
6747 * array. The data structure reserves a space for that 0 (which
6748 * should be a '1' right now), so physical shifting is unneeded,
6749 * but temporarily change that element to 0. Before exiting the
6750 * routine, we must restore the element to '1' */
6757 /* Size the union for the worst case: that the sets are completely
6759 u = _new_invlist(len_a + len_b);
6761 /* Will contain U+0000 if either component does */
6762 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6763 || (len_b > 0 && array_b[0] == 0));
6765 /* Go through each list item by item, stopping when exhausted one of
6767 while (i_a < len_a && i_b < len_b) {
6768 UV cp; /* The element to potentially add to the union's array */
6769 bool cp_in_set; /* is it in the the input list's set or not */
6771 /* We need to take one or the other of the two inputs for the union.
6772 * Since we are merging two sorted lists, we take the smaller of the
6773 * next items. In case of a tie, we take the one that is in its set
6774 * first. If we took one not in the set first, it would decrement the
6775 * count, possibly to 0 which would cause it to be output as ending the
6776 * range, and the next time through we would take the same number, and
6777 * output it again as beginning the next range. By doing it the
6778 * opposite way, there is no possibility that the count will be
6779 * momentarily decremented to 0, and thus the two adjoining ranges will
6780 * be seamlessly merged. (In a tie and both are in the set or both not
6781 * in the set, it doesn't matter which we take first.) */
6782 if (array_a[i_a] < array_b[i_b]
6783 || (array_a[i_a] == array_b[i_b]
6784 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6786 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6790 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6794 /* Here, have chosen which of the two inputs to look at. Only output
6795 * if the running count changes to/from 0, which marks the
6796 * beginning/end of a range in that's in the set */
6799 array_u[i_u++] = cp;
6806 array_u[i_u++] = cp;
6811 /* Here, we are finished going through at least one of the lists, which
6812 * means there is something remaining in at most one. We check if the list
6813 * that hasn't been exhausted is positioned such that we are in the middle
6814 * of a range in its set or not. (i_a and i_b point to the element beyond
6815 * the one we care about.) If in the set, we decrement 'count'; if 0, there
6816 * is potentially more to output.
6817 * There are four cases:
6818 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6819 * in the union is entirely from the non-exhausted set.
6820 * 2) Both were in their sets, count is 2. Nothing further should
6821 * be output, as everything that remains will be in the exhausted
6822 * list's set, hence in the union; decrementing to 1 but not 0 insures
6824 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6825 * Nothing further should be output because the union includes
6826 * everything from the exhausted set. Not decrementing ensures that.
6827 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6828 * decrementing to 0 insures that we look at the remainder of the
6829 * non-exhausted set */
6830 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6831 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6836 /* The final length is what we've output so far, plus what else is about to
6837 * be output. (If 'count' is non-zero, then the input list we exhausted
6838 * has everything remaining up to the machine's limit in its set, and hence
6839 * in the union, so there will be no further output. */
6842 /* At most one of the subexpressions will be non-zero */
6843 len_u += (len_a - i_a) + (len_b - i_b);
6846 /* Set result to final length, which can change the pointer to array_u, so
6848 if (len_u != invlist_len(u)) {
6849 invlist_set_len(u, len_u);
6851 array_u = invlist_array(u);
6854 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6855 * the other) ended with everything above it not in its set. That means
6856 * that the remaining part of the union is precisely the same as the
6857 * non-exhausted list, so can just copy it unchanged. (If both list were
6858 * exhausted at the same time, then the operations below will be both 0.)
6861 IV copy_count; /* At most one will have a non-zero copy count */
6862 if ((copy_count = len_a - i_a) > 0) {
6863 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6865 else if ((copy_count = len_b - i_b) > 0) {
6866 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6870 /* We may be removing a reference to one of the inputs */
6871 if (a == *output || b == *output) {
6872 SvREFCNT_dec(*output);
6875 /* If we've changed b, restore it */
6885 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6887 /* Take the intersection of two inversion lists and point <i> to it. *i
6888 * should be defined upon input, and if it points to one of the two lists,
6889 * the reference count to that list will be decremented.
6890 * If <complement_b> is TRUE, the result will be the intersection of <a>
6891 * and the complement (or inversion) of <b> instead of <b> directly.
6893 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6894 * Richard Gillam, published by Addison-Wesley, and explained at some
6895 * length there. The preface says to incorporate its examples into your
6896 * code at your own risk. In fact, it had bugs
6898 * The algorithm is like a merge sort, and is essentially the same as the
6902 UV* array_a; /* a's array */
6904 UV len_a; /* length of a's array */
6907 SV* r; /* the resulting intersection */
6911 UV i_a = 0; /* current index into a's array */
6915 /* running count, as explained in the algorithm source book; items are
6916 * stopped accumulating and are output when the count changes to/from 2.
6917 * The count is incremented when we start a range that's in the set, and
6918 * decremented when we start a range that's not in the set. So its range
6919 * is 0 to 2. Only when the count is 2 is something in the intersection.
6923 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
6926 /* Special case if either one is empty */
6927 len_a = invlist_len(a);
6928 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6930 if (len_a != 0 && complement_b) {
6932 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
6933 * be empty. Here, also we are using 'b's complement, which hence
6934 * must be every possible code point. Thus the intersection is
6937 *i = invlist_clone(a);
6943 /* else *i is already 'a' */
6947 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
6948 * intersection must be empty */
6955 *i = _new_invlist(0);
6959 /* Here both lists exist and are non-empty */
6960 array_a = invlist_array(a);
6961 array_b = invlist_array(b);
6963 /* If are to take the intersection of 'a' with the complement of b, set it
6964 * up so are looking at b's complement. */
6967 /* To complement, we invert: if the first element is 0, remove it. To
6968 * do this, we just pretend the array starts one later, and clear the
6969 * flag as we don't have to do anything else later */
6970 if (array_b[0] == 0) {
6973 complement_b = FALSE;
6977 /* But if the first element is not zero, we unshift a 0 before the
6978 * array. The data structure reserves a space for that 0 (which
6979 * should be a '1' right now), so physical shifting is unneeded,
6980 * but temporarily change that element to 0. Before exiting the
6981 * routine, we must restore the element to '1' */
6988 /* Size the intersection for the worst case: that the intersection ends up
6989 * fragmenting everything to be completely disjoint */
6990 r= _new_invlist(len_a + len_b);
6992 /* Will contain U+0000 iff both components do */
6993 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6994 && len_b > 0 && array_b[0] == 0);
6996 /* Go through each list item by item, stopping when exhausted one of
6998 while (i_a < len_a && i_b < len_b) {
6999 UV cp; /* The element to potentially add to the intersection's
7001 bool cp_in_set; /* Is it in the input list's set or not */
7003 /* We need to take one or the other of the two inputs for the
7004 * intersection. Since we are merging two sorted lists, we take the
7005 * smaller of the next items. In case of a tie, we take the one that
7006 * is not in its set first (a difference from the union algorithm). If
7007 * we took one in the set first, it would increment the count, possibly
7008 * to 2 which would cause it to be output as starting a range in the
7009 * intersection, and the next time through we would take that same
7010 * number, and output it again as ending the set. By doing it the
7011 * opposite of this, there is no possibility that the count will be
7012 * momentarily incremented to 2. (In a tie and both are in the set or
7013 * both not in the set, it doesn't matter which we take first.) */
7014 if (array_a[i_a] < array_b[i_b]
7015 || (array_a[i_a] == array_b[i_b]
7016 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7018 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7022 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7026 /* Here, have chosen which of the two inputs to look at. Only output
7027 * if the running count changes to/from 2, which marks the
7028 * beginning/end of a range that's in the intersection */
7032 array_r[i_r++] = cp;
7037 array_r[i_r++] = cp;
7043 /* Here, we are finished going through at least one of the lists, which
7044 * means there is something remaining in at most one. We check if the list
7045 * that has been exhausted is positioned such that we are in the middle
7046 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7047 * the ones we care about.) There are four cases:
7048 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7049 * nothing left in the intersection.
7050 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7051 * above 2. What should be output is exactly that which is in the
7052 * non-exhausted set, as everything it has is also in the intersection
7053 * set, and everything it doesn't have can't be in the intersection
7054 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7055 * gets incremented to 2. Like the previous case, the intersection is
7056 * everything that remains in the non-exhausted set.
7057 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7058 * remains 1. And the intersection has nothing more. */
7059 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7060 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7065 /* The final length is what we've output so far plus what else is in the
7066 * intersection. At most one of the subexpressions below will be non-zero */
7069 len_r += (len_a - i_a) + (len_b - i_b);
7072 /* Set result to final length, which can change the pointer to array_r, so
7074 if (len_r != invlist_len(r)) {
7075 invlist_set_len(r, len_r);
7077 array_r = invlist_array(r);
7080 /* Finish outputting any remaining */
7081 if (count >= 2) { /* At most one will have a non-zero copy count */
7083 if ((copy_count = len_a - i_a) > 0) {
7084 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7086 else if ((copy_count = len_b - i_b) > 0) {
7087 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7091 /* We may be removing a reference to one of the inputs */
7092 if (a == *i || b == *i) {
7096 /* If we've changed b, restore it */
7108 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7110 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7111 * set. A pointer to the inversion list is returned. This may actually be
7112 * a new list, in which case the passed in one has been destroyed. The
7113 * passed in inversion list can be NULL, in which case a new one is created
7114 * with just the one range in it */
7119 if (invlist == NULL) {
7120 invlist = _new_invlist(2);
7124 len = invlist_len(invlist);
7127 /* If comes after the final entry, can just append it to the end */
7129 || start >= invlist_array(invlist)
7130 [invlist_len(invlist) - 1])
7132 _append_range_to_invlist(invlist, start, end);
7136 /* Here, can't just append things, create and return a new inversion list
7137 * which is the union of this range and the existing inversion list */
7138 range_invlist = _new_invlist(2);
7139 _append_range_to_invlist(range_invlist, start, end);
7141 _invlist_union(invlist, range_invlist, &invlist);
7143 /* The temporary can be freed */
7144 SvREFCNT_dec(range_invlist);
7149 PERL_STATIC_INLINE SV*
7150 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7151 return add_range_to_invlist(invlist, cp, cp);
7154 #ifndef PERL_IN_XSUB_RE
7156 Perl__invlist_invert(pTHX_ SV* const invlist)
7158 /* Complement the input inversion list. This adds a 0 if the list didn't
7159 * have a zero; removes it otherwise. As described above, the data
7160 * structure is set up so that this is very efficient */
7162 UV* len_pos = get_invlist_len_addr(invlist);
7164 PERL_ARGS_ASSERT__INVLIST_INVERT;
7166 /* The inverse of matching nothing is matching everything */
7167 if (*len_pos == 0) {
7168 _append_range_to_invlist(invlist, 0, UV_MAX);
7172 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7173 * zero element was a 0, so it is being removed, so the length decrements
7174 * by 1; and vice-versa. SvCUR is unaffected */
7175 if (*get_invlist_zero_addr(invlist) ^= 1) {
7184 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7186 /* Complement the input inversion list (which must be a Unicode property,
7187 * all of which don't match above the Unicode maximum code point.) And
7188 * Perl has chosen to not have the inversion match above that either. This
7189 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7195 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7197 _invlist_invert(invlist);
7199 len = invlist_len(invlist);
7201 if (len != 0) { /* If empty do nothing */
7202 array = invlist_array(invlist);
7203 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7204 /* Add 0x110000. First, grow if necessary */
7206 if (invlist_max(invlist) < len) {
7207 invlist_extend(invlist, len);
7208 array = invlist_array(invlist);
7210 invlist_set_len(invlist, len);
7211 array[len - 1] = PERL_UNICODE_MAX + 1;
7213 else { /* Remove the 0x110000 */
7214 invlist_set_len(invlist, len - 1);
7222 PERL_STATIC_INLINE SV*
7223 S_invlist_clone(pTHX_ SV* const invlist)
7226 /* Return a new inversion list that is a copy of the input one, which is
7229 /* Need to allocate extra space to accommodate Perl's addition of a
7230 * trailing NUL to SvPV's, since it thinks they are always strings */
7231 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7232 STRLEN length = SvCUR(invlist);
7234 PERL_ARGS_ASSERT_INVLIST_CLONE;
7236 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7237 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7242 PERL_STATIC_INLINE UV*
7243 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7245 /* Return the address of the UV that contains the current iteration
7248 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7250 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7253 PERL_STATIC_INLINE UV*
7254 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7256 /* Return the address of the UV that contains the version id. */
7258 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7260 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7263 PERL_STATIC_INLINE void
7264 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7266 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7268 *get_invlist_iter_addr(invlist) = 0;
7272 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7274 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7275 * This call sets in <*start> and <*end>, the next range in <invlist>.
7276 * Returns <TRUE> if successful and the next call will return the next
7277 * range; <FALSE> if was already at the end of the list. If the latter,
7278 * <*start> and <*end> are unchanged, and the next call to this function
7279 * will start over at the beginning of the list */
7281 UV* pos = get_invlist_iter_addr(invlist);
7282 UV len = invlist_len(invlist);
7285 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7288 *pos = UV_MAX; /* Force iternit() to be required next time */
7292 array = invlist_array(invlist);
7294 *start = array[(*pos)++];
7300 *end = array[(*pos)++] - 1;
7306 #ifndef PERL_IN_XSUB_RE
7308 Perl__invlist_contents(pTHX_ SV* const invlist)
7310 /* Get the contents of an inversion list into a string SV so that they can
7311 * be printed out. It uses the format traditionally done for debug tracing
7315 SV* output = newSVpvs("\n");
7317 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7319 invlist_iterinit(invlist);
7320 while (invlist_iternext(invlist, &start, &end)) {
7321 if (end == UV_MAX) {
7322 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7324 else if (end != start) {
7325 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7329 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7339 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7341 /* Dumps out the ranges in an inversion list. The string 'header'
7342 * if present is output on a line before the first range */
7346 if (header && strlen(header)) {
7347 PerlIO_printf(Perl_debug_log, "%s\n", header);
7349 invlist_iterinit(invlist);
7350 while (invlist_iternext(invlist, &start, &end)) {
7351 if (end == UV_MAX) {
7352 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7355 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7361 #undef HEADER_LENGTH
7362 #undef INVLIST_INITIAL_LENGTH
7363 #undef TO_INTERNAL_SIZE
7364 #undef FROM_INTERNAL_SIZE
7365 #undef INVLIST_LEN_OFFSET
7366 #undef INVLIST_ZERO_OFFSET
7367 #undef INVLIST_ITER_OFFSET
7368 #undef INVLIST_VERSION_ID
7370 /* End of inversion list object */
7373 - reg - regular expression, i.e. main body or parenthesized thing
7375 * Caller must absorb opening parenthesis.
7377 * Combining parenthesis handling with the base level of regular expression
7378 * is a trifle forced, but the need to tie the tails of the branches to what
7379 * follows makes it hard to avoid.
7381 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7383 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7385 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7389 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7390 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7393 register regnode *ret; /* Will be the head of the group. */
7394 register regnode *br;
7395 register regnode *lastbr;
7396 register regnode *ender = NULL;
7397 register I32 parno = 0;
7399 U32 oregflags = RExC_flags;
7400 bool have_branch = 0;
7402 I32 freeze_paren = 0;
7403 I32 after_freeze = 0;
7405 /* for (?g), (?gc), and (?o) warnings; warning
7406 about (?c) will warn about (?g) -- japhy */
7408 #define WASTED_O 0x01
7409 #define WASTED_G 0x02
7410 #define WASTED_C 0x04
7411 #define WASTED_GC (0x02|0x04)
7412 I32 wastedflags = 0x00;
7414 char * parse_start = RExC_parse; /* MJD */
7415 char * const oregcomp_parse = RExC_parse;
7417 GET_RE_DEBUG_FLAGS_DECL;
7419 PERL_ARGS_ASSERT_REG;
7420 DEBUG_PARSE("reg ");
7422 *flagp = 0; /* Tentatively. */
7425 /* Make an OPEN node, if parenthesized. */
7427 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7428 char *start_verb = RExC_parse;
7429 STRLEN verb_len = 0;
7430 char *start_arg = NULL;
7431 unsigned char op = 0;
7433 int internal_argval = 0; /* internal_argval is only useful if !argok */
7434 while ( *RExC_parse && *RExC_parse != ')' ) {
7435 if ( *RExC_parse == ':' ) {
7436 start_arg = RExC_parse + 1;
7442 verb_len = RExC_parse - start_verb;
7445 while ( *RExC_parse && *RExC_parse != ')' )
7447 if ( *RExC_parse != ')' )
7448 vFAIL("Unterminated verb pattern argument");
7449 if ( RExC_parse == start_arg )
7452 if ( *RExC_parse != ')' )
7453 vFAIL("Unterminated verb pattern");
7456 switch ( *start_verb ) {
7457 case 'A': /* (*ACCEPT) */
7458 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7460 internal_argval = RExC_nestroot;
7463 case 'C': /* (*COMMIT) */
7464 if ( memEQs(start_verb,verb_len,"COMMIT") )
7467 case 'F': /* (*FAIL) */
7468 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7473 case ':': /* (*:NAME) */
7474 case 'M': /* (*MARK:NAME) */
7475 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7480 case 'P': /* (*PRUNE) */
7481 if ( memEQs(start_verb,verb_len,"PRUNE") )
7484 case 'S': /* (*SKIP) */
7485 if ( memEQs(start_verb,verb_len,"SKIP") )
7488 case 'T': /* (*THEN) */
7489 /* [19:06] <TimToady> :: is then */
7490 if ( memEQs(start_verb,verb_len,"THEN") ) {
7492 RExC_seen |= REG_SEEN_CUTGROUP;
7498 vFAIL3("Unknown verb pattern '%.*s'",
7499 verb_len, start_verb);
7502 if ( start_arg && internal_argval ) {
7503 vFAIL3("Verb pattern '%.*s' may not have an argument",
7504 verb_len, start_verb);
7505 } else if ( argok < 0 && !start_arg ) {
7506 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7507 verb_len, start_verb);
7509 ret = reganode(pRExC_state, op, internal_argval);
7510 if ( ! internal_argval && ! SIZE_ONLY ) {
7512 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7513 ARG(ret) = add_data( pRExC_state, 1, "S" );
7514 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7521 if (!internal_argval)
7522 RExC_seen |= REG_SEEN_VERBARG;
7523 } else if ( start_arg ) {
7524 vFAIL3("Verb pattern '%.*s' may not have an argument",
7525 verb_len, start_verb);
7527 ret = reg_node(pRExC_state, op);
7529 nextchar(pRExC_state);
7532 if (*RExC_parse == '?') { /* (?...) */
7533 bool is_logical = 0;
7534 const char * const seqstart = RExC_parse;
7535 bool has_use_defaults = FALSE;
7538 paren = *RExC_parse++;
7539 ret = NULL; /* For look-ahead/behind. */
7542 case 'P': /* (?P...) variants for those used to PCRE/Python */
7543 paren = *RExC_parse++;
7544 if ( paren == '<') /* (?P<...>) named capture */
7546 else if (paren == '>') { /* (?P>name) named recursion */
7547 goto named_recursion;
7549 else if (paren == '=') { /* (?P=...) named backref */
7550 /* this pretty much dupes the code for \k<NAME> in regatom(), if
7551 you change this make sure you change that */
7552 char* name_start = RExC_parse;
7554 SV *sv_dat = reg_scan_name(pRExC_state,
7555 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7556 if (RExC_parse == name_start || *RExC_parse != ')')
7557 vFAIL2("Sequence %.3s... not terminated",parse_start);
7560 num = add_data( pRExC_state, 1, "S" );
7561 RExC_rxi->data->data[num]=(void*)sv_dat;
7562 SvREFCNT_inc_simple_void(sv_dat);
7565 ret = reganode(pRExC_state,
7568 : (MORE_ASCII_RESTRICTED)
7570 : (AT_LEAST_UNI_SEMANTICS)
7578 Set_Node_Offset(ret, parse_start+1);
7579 Set_Node_Cur_Length(ret); /* MJD */
7581 nextchar(pRExC_state);
7585 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7587 case '<': /* (?<...) */
7588 if (*RExC_parse == '!')
7590 else if (*RExC_parse != '=')
7596 case '\'': /* (?'...') */
7597 name_start= RExC_parse;
7598 svname = reg_scan_name(pRExC_state,
7599 SIZE_ONLY ? /* reverse test from the others */
7600 REG_RSN_RETURN_NAME :
7601 REG_RSN_RETURN_NULL);
7602 if (RExC_parse == name_start) {
7604 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7607 if (*RExC_parse != paren)
7608 vFAIL2("Sequence (?%c... not terminated",
7609 paren=='>' ? '<' : paren);
7613 if (!svname) /* shouldn't happen */
7615 "panic: reg_scan_name returned NULL");
7616 if (!RExC_paren_names) {
7617 RExC_paren_names= newHV();
7618 sv_2mortal(MUTABLE_SV(RExC_paren_names));
7620 RExC_paren_name_list= newAV();
7621 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7624 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7626 sv_dat = HeVAL(he_str);
7628 /* croak baby croak */
7630 "panic: paren_name hash element allocation failed");
7631 } else if ( SvPOK(sv_dat) ) {
7632 /* (?|...) can mean we have dupes so scan to check
7633 its already been stored. Maybe a flag indicating
7634 we are inside such a construct would be useful,
7635 but the arrays are likely to be quite small, so
7636 for now we punt -- dmq */
7637 IV count = SvIV(sv_dat);
7638 I32 *pv = (I32*)SvPVX(sv_dat);
7640 for ( i = 0 ; i < count ; i++ ) {
7641 if ( pv[i] == RExC_npar ) {
7647 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7648 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7649 pv[count] = RExC_npar;
7650 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7653 (void)SvUPGRADE(sv_dat,SVt_PVNV);
7654 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7656 SvIV_set(sv_dat, 1);
7659 /* Yes this does cause a memory leak in debugging Perls */
7660 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7661 SvREFCNT_dec(svname);
7664 /*sv_dump(sv_dat);*/
7666 nextchar(pRExC_state);
7668 goto capturing_parens;
7670 RExC_seen |= REG_SEEN_LOOKBEHIND;
7671 RExC_in_lookbehind++;
7673 case '=': /* (?=...) */
7674 RExC_seen_zerolen++;
7676 case '!': /* (?!...) */
7677 RExC_seen_zerolen++;
7678 if (*RExC_parse == ')') {
7679 ret=reg_node(pRExC_state, OPFAIL);
7680 nextchar(pRExC_state);
7684 case '|': /* (?|...) */
7685 /* branch reset, behave like a (?:...) except that
7686 buffers in alternations share the same numbers */
7688 after_freeze = freeze_paren = RExC_npar;
7690 case ':': /* (?:...) */
7691 case '>': /* (?>...) */
7693 case '$': /* (?$...) */
7694 case '@': /* (?@...) */
7695 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7697 case '#': /* (?#...) */
7698 while (*RExC_parse && *RExC_parse != ')')
7700 if (*RExC_parse != ')')
7701 FAIL("Sequence (?#... not terminated");
7702 nextchar(pRExC_state);
7705 case '0' : /* (?0) */
7706 case 'R' : /* (?R) */
7707 if (*RExC_parse != ')')
7708 FAIL("Sequence (?R) not terminated");
7709 ret = reg_node(pRExC_state, GOSTART);
7710 *flagp |= POSTPONED;
7711 nextchar(pRExC_state);
7714 { /* named and numeric backreferences */
7716 case '&': /* (?&NAME) */
7717 parse_start = RExC_parse - 1;
7720 SV *sv_dat = reg_scan_name(pRExC_state,
7721 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7722 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7724 goto gen_recurse_regop;
7727 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7729 vFAIL("Illegal pattern");
7731 goto parse_recursion;
7733 case '-': /* (?-1) */
7734 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7735 RExC_parse--; /* rewind to let it be handled later */
7739 case '1': case '2': case '3': case '4': /* (?1) */
7740 case '5': case '6': case '7': case '8': case '9':
7743 num = atoi(RExC_parse);
7744 parse_start = RExC_parse - 1; /* MJD */
7745 if (*RExC_parse == '-')
7747 while (isDIGIT(*RExC_parse))
7749 if (*RExC_parse!=')')
7750 vFAIL("Expecting close bracket");
7753 if ( paren == '-' ) {
7755 Diagram of capture buffer numbering.
7756 Top line is the normal capture buffer numbers
7757 Bottom line is the negative indexing as from
7761 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7765 num = RExC_npar + num;
7768 vFAIL("Reference to nonexistent group");
7770 } else if ( paren == '+' ) {
7771 num = RExC_npar + num - 1;
7774 ret = reganode(pRExC_state, GOSUB, num);
7776 if (num > (I32)RExC_rx->nparens) {
7778 vFAIL("Reference to nonexistent group");
7780 ARG2L_SET( ret, RExC_recurse_count++);
7782 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7783 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7787 RExC_seen |= REG_SEEN_RECURSE;
7788 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7789 Set_Node_Offset(ret, parse_start); /* MJD */
7791 *flagp |= POSTPONED;
7792 nextchar(pRExC_state);
7794 } /* named and numeric backreferences */
7797 case '?': /* (??...) */
7799 if (*RExC_parse != '{') {
7801 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7804 *flagp |= POSTPONED;
7805 paren = *RExC_parse++;
7807 case '{': /* (?{...}) */
7812 char *s = RExC_parse;
7814 RExC_seen_zerolen++;
7815 RExC_seen |= REG_SEEN_EVAL;
7816 while (count && (c = *RExC_parse)) {
7827 if (*RExC_parse != ')') {
7829 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7833 OP_4tree *sop, *rop;
7834 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7837 Perl_save_re_context(aTHX);
7838 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7839 sop->op_private |= OPpREFCOUNTED;
7840 /* re_dup will OpREFCNT_inc */
7841 OpREFCNT_set(sop, 1);
7844 n = add_data(pRExC_state, 3, "nop");
7845 RExC_rxi->data->data[n] = (void*)rop;
7846 RExC_rxi->data->data[n+1] = (void*)sop;
7847 RExC_rxi->data->data[n+2] = (void*)pad;
7850 else { /* First pass */
7851 if (PL_reginterp_cnt < ++RExC_seen_evals
7853 /* No compiled RE interpolated, has runtime
7854 components ===> unsafe. */
7855 FAIL("Eval-group not allowed at runtime, use re 'eval'");
7856 if (PL_tainting && PL_tainted)
7857 FAIL("Eval-group in insecure regular expression");
7858 #if PERL_VERSION > 8
7859 if (IN_PERL_COMPILETIME)
7864 nextchar(pRExC_state);
7866 ret = reg_node(pRExC_state, LOGICAL);
7869 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7870 /* deal with the length of this later - MJD */
7873 ret = reganode(pRExC_state, EVAL, n);
7874 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7875 Set_Node_Offset(ret, parse_start);
7878 case '(': /* (?(?{...})...) and (?(?=...)...) */
7881 if (RExC_parse[0] == '?') { /* (?(?...)) */
7882 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7883 || RExC_parse[1] == '<'
7884 || RExC_parse[1] == '{') { /* Lookahead or eval. */
7887 ret = reg_node(pRExC_state, LOGICAL);
7890 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7894 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
7895 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7897 char ch = RExC_parse[0] == '<' ? '>' : '\'';
7898 char *name_start= RExC_parse++;
7900 SV *sv_dat=reg_scan_name(pRExC_state,
7901 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7902 if (RExC_parse == name_start || *RExC_parse != ch)
7903 vFAIL2("Sequence (?(%c... not terminated",
7904 (ch == '>' ? '<' : ch));
7907 num = add_data( pRExC_state, 1, "S" );
7908 RExC_rxi->data->data[num]=(void*)sv_dat;
7909 SvREFCNT_inc_simple_void(sv_dat);
7911 ret = reganode(pRExC_state,NGROUPP,num);
7912 goto insert_if_check_paren;
7914 else if (RExC_parse[0] == 'D' &&
7915 RExC_parse[1] == 'E' &&
7916 RExC_parse[2] == 'F' &&
7917 RExC_parse[3] == 'I' &&
7918 RExC_parse[4] == 'N' &&
7919 RExC_parse[5] == 'E')
7921 ret = reganode(pRExC_state,DEFINEP,0);
7924 goto insert_if_check_paren;
7926 else if (RExC_parse[0] == 'R') {
7929 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7930 parno = atoi(RExC_parse++);
7931 while (isDIGIT(*RExC_parse))
7933 } else if (RExC_parse[0] == '&') {
7936 sv_dat = reg_scan_name(pRExC_state,
7937 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7938 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7940 ret = reganode(pRExC_state,INSUBP,parno);
7941 goto insert_if_check_paren;
7943 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7946 parno = atoi(RExC_parse++);
7948 while (isDIGIT(*RExC_parse))
7950 ret = reganode(pRExC_state, GROUPP, parno);
7952 insert_if_check_paren:
7953 if ((c = *nextchar(pRExC_state)) != ')')
7954 vFAIL("Switch condition not recognized");
7956 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7957 br = regbranch(pRExC_state, &flags, 1,depth+1);
7959 br = reganode(pRExC_state, LONGJMP, 0);
7961 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7962 c = *nextchar(pRExC_state);
7967 vFAIL("(?(DEFINE)....) does not allow branches");
7968 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7969 regbranch(pRExC_state, &flags, 1,depth+1);
7970 REGTAIL(pRExC_state, ret, lastbr);
7973 c = *nextchar(pRExC_state);
7978 vFAIL("Switch (?(condition)... contains too many branches");
7979 ender = reg_node(pRExC_state, TAIL);
7980 REGTAIL(pRExC_state, br, ender);
7982 REGTAIL(pRExC_state, lastbr, ender);
7983 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7986 REGTAIL(pRExC_state, ret, ender);
7987 RExC_size++; /* XXX WHY do we need this?!!
7988 For large programs it seems to be required
7989 but I can't figure out why. -- dmq*/
7993 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7997 RExC_parse--; /* for vFAIL to print correctly */
7998 vFAIL("Sequence (? incomplete");
8000 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8002 has_use_defaults = TRUE;
8003 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8004 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8005 ? REGEX_UNICODE_CHARSET
8006 : REGEX_DEPENDS_CHARSET);
8010 parse_flags: /* (?i) */
8012 U32 posflags = 0, negflags = 0;
8013 U32 *flagsp = &posflags;
8014 char has_charset_modifier = '\0';
8015 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
8016 ? REGEX_UNICODE_CHARSET
8017 : REGEX_DEPENDS_CHARSET;
8019 while (*RExC_parse) {
8020 /* && strchr("iogcmsx", *RExC_parse) */
8021 /* (?g), (?gc) and (?o) are useless here
8022 and must be globally applied -- japhy */
8023 switch (*RExC_parse) {
8024 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8025 case LOCALE_PAT_MOD:
8026 if (has_charset_modifier) {
8027 goto excess_modifier;
8029 else if (flagsp == &negflags) {
8032 cs = REGEX_LOCALE_CHARSET;
8033 has_charset_modifier = LOCALE_PAT_MOD;
8034 RExC_contains_locale = 1;
8036 case UNICODE_PAT_MOD:
8037 if (has_charset_modifier) {
8038 goto excess_modifier;
8040 else if (flagsp == &negflags) {
8043 cs = REGEX_UNICODE_CHARSET;
8044 has_charset_modifier = UNICODE_PAT_MOD;
8046 case ASCII_RESTRICT_PAT_MOD:
8047 if (flagsp == &negflags) {
8050 if (has_charset_modifier) {
8051 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8052 goto excess_modifier;
8054 /* Doubled modifier implies more restricted */
8055 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8058 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8060 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8062 case DEPENDS_PAT_MOD:
8063 if (has_use_defaults) {
8064 goto fail_modifiers;
8066 else if (flagsp == &negflags) {
8069 else if (has_charset_modifier) {
8070 goto excess_modifier;
8073 /* The dual charset means unicode semantics if the
8074 * pattern (or target, not known until runtime) are
8075 * utf8, or something in the pattern indicates unicode
8077 cs = (RExC_utf8 || RExC_uni_semantics)
8078 ? REGEX_UNICODE_CHARSET
8079 : REGEX_DEPENDS_CHARSET;
8080 has_charset_modifier = DEPENDS_PAT_MOD;
8084 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8085 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8087 else if (has_charset_modifier == *(RExC_parse - 1)) {
8088 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8091 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8096 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8098 case ONCE_PAT_MOD: /* 'o' */
8099 case GLOBAL_PAT_MOD: /* 'g' */
8100 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8101 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8102 if (! (wastedflags & wflagbit) ) {
8103 wastedflags |= wflagbit;
8106 "Useless (%s%c) - %suse /%c modifier",
8107 flagsp == &negflags ? "?-" : "?",
8109 flagsp == &negflags ? "don't " : "",
8116 case CONTINUE_PAT_MOD: /* 'c' */
8117 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8118 if (! (wastedflags & WASTED_C) ) {
8119 wastedflags |= WASTED_GC;
8122 "Useless (%sc) - %suse /gc modifier",
8123 flagsp == &negflags ? "?-" : "?",
8124 flagsp == &negflags ? "don't " : ""
8129 case KEEPCOPY_PAT_MOD: /* 'p' */
8130 if (flagsp == &negflags) {
8132 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8134 *flagsp |= RXf_PMf_KEEPCOPY;
8138 /* A flag is a default iff it is following a minus, so
8139 * if there is a minus, it means will be trying to
8140 * re-specify a default which is an error */
8141 if (has_use_defaults || flagsp == &negflags) {
8144 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8148 wastedflags = 0; /* reset so (?g-c) warns twice */
8154 RExC_flags |= posflags;
8155 RExC_flags &= ~negflags;
8156 set_regex_charset(&RExC_flags, cs);
8158 oregflags |= posflags;
8159 oregflags &= ~negflags;
8160 set_regex_charset(&oregflags, cs);
8162 nextchar(pRExC_state);
8173 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8178 }} /* one for the default block, one for the switch */
8185 ret = reganode(pRExC_state, OPEN, parno);
8188 RExC_nestroot = parno;
8189 if (RExC_seen & REG_SEEN_RECURSE
8190 && !RExC_open_parens[parno-1])
8192 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8193 "Setting open paren #%"IVdf" to %d\n",
8194 (IV)parno, REG_NODE_NUM(ret)));
8195 RExC_open_parens[parno-1]= ret;
8198 Set_Node_Length(ret, 1); /* MJD */
8199 Set_Node_Offset(ret, RExC_parse); /* MJD */
8207 /* Pick up the branches, linking them together. */
8208 parse_start = RExC_parse; /* MJD */
8209 br = regbranch(pRExC_state, &flags, 1,depth+1);
8211 /* branch_len = (paren != 0); */
8215 if (*RExC_parse == '|') {
8216 if (!SIZE_ONLY && RExC_extralen) {
8217 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8220 reginsert(pRExC_state, BRANCH, br, depth+1);
8221 Set_Node_Length(br, paren != 0);
8222 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8226 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
8228 else if (paren == ':') {
8229 *flagp |= flags&SIMPLE;
8231 if (is_open) { /* Starts with OPEN. */
8232 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
8234 else if (paren != '?') /* Not Conditional */
8236 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8238 while (*RExC_parse == '|') {
8239 if (!SIZE_ONLY && RExC_extralen) {
8240 ender = reganode(pRExC_state, LONGJMP,0);
8241 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8244 RExC_extralen += 2; /* Account for LONGJMP. */
8245 nextchar(pRExC_state);
8247 if (RExC_npar > after_freeze)
8248 after_freeze = RExC_npar;
8249 RExC_npar = freeze_paren;
8251 br = regbranch(pRExC_state, &flags, 0, depth+1);
8255 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
8257 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8260 if (have_branch || paren != ':') {
8261 /* Make a closing node, and hook it on the end. */
8264 ender = reg_node(pRExC_state, TAIL);
8267 ender = reganode(pRExC_state, CLOSE, parno);
8268 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8269 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8270 "Setting close paren #%"IVdf" to %d\n",
8271 (IV)parno, REG_NODE_NUM(ender)));
8272 RExC_close_parens[parno-1]= ender;
8273 if (RExC_nestroot == parno)
8276 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8277 Set_Node_Length(ender,1); /* MJD */
8283 *flagp &= ~HASWIDTH;
8286 ender = reg_node(pRExC_state, SUCCEED);
8289 ender = reg_node(pRExC_state, END);
8291 assert(!RExC_opend); /* there can only be one! */
8296 REGTAIL(pRExC_state, lastbr, ender);
8298 if (have_branch && !SIZE_ONLY) {
8300 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8302 /* Hook the tails of the branches to the closing node. */
8303 for (br = ret; br; br = regnext(br)) {
8304 const U8 op = PL_regkind[OP(br)];
8306 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8308 else if (op == BRANCHJ) {
8309 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8317 static const char parens[] = "=!<,>";
8319 if (paren && (p = strchr(parens, paren))) {
8320 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8321 int flag = (p - parens) > 1;
8324 node = SUSPEND, flag = 0;
8325 reginsert(pRExC_state, node,ret, depth+1);
8326 Set_Node_Cur_Length(ret);
8327 Set_Node_Offset(ret, parse_start + 1);
8329 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8333 /* Check for proper termination. */
8335 RExC_flags = oregflags;
8336 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8337 RExC_parse = oregcomp_parse;
8338 vFAIL("Unmatched (");
8341 else if (!paren && RExC_parse < RExC_end) {
8342 if (*RExC_parse == ')') {
8344 vFAIL("Unmatched )");
8347 FAIL("Junk on end of regexp"); /* "Can't happen". */
8351 if (RExC_in_lookbehind) {
8352 RExC_in_lookbehind--;
8354 if (after_freeze > RExC_npar)
8355 RExC_npar = after_freeze;
8360 - regbranch - one alternative of an | operator
8362 * Implements the concatenation operator.
8365 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8368 register regnode *ret;
8369 register regnode *chain = NULL;
8370 register regnode *latest;
8371 I32 flags = 0, c = 0;
8372 GET_RE_DEBUG_FLAGS_DECL;
8374 PERL_ARGS_ASSERT_REGBRANCH;
8376 DEBUG_PARSE("brnc");
8381 if (!SIZE_ONLY && RExC_extralen)
8382 ret = reganode(pRExC_state, BRANCHJ,0);
8384 ret = reg_node(pRExC_state, BRANCH);
8385 Set_Node_Length(ret, 1);
8389 if (!first && SIZE_ONLY)
8390 RExC_extralen += 1; /* BRANCHJ */
8392 *flagp = WORST; /* Tentatively. */
8395 nextchar(pRExC_state);
8396 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8398 latest = regpiece(pRExC_state, &flags,depth+1);
8399 if (latest == NULL) {
8400 if (flags & TRYAGAIN)
8404 else if (ret == NULL)
8406 *flagp |= flags&(HASWIDTH|POSTPONED);
8407 if (chain == NULL) /* First piece. */
8408 *flagp |= flags&SPSTART;
8411 REGTAIL(pRExC_state, chain, latest);
8416 if (chain == NULL) { /* Loop ran zero times. */
8417 chain = reg_node(pRExC_state, NOTHING);
8422 *flagp |= flags&SIMPLE;
8429 - regpiece - something followed by possible [*+?]
8431 * Note that the branching code sequences used for ? and the general cases
8432 * of * and + are somewhat optimized: they use the same NOTHING node as
8433 * both the endmarker for their branch list and the body of the last branch.
8434 * It might seem that this node could be dispensed with entirely, but the
8435 * endmarker role is not redundant.
8438 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8441 register regnode *ret;
8443 register char *next;
8445 const char * const origparse = RExC_parse;
8447 I32 max = REG_INFTY;
8448 #ifdef RE_TRACK_PATTERN_OFFSETS
8451 const char *maxpos = NULL;
8452 GET_RE_DEBUG_FLAGS_DECL;
8454 PERL_ARGS_ASSERT_REGPIECE;
8456 DEBUG_PARSE("piec");
8458 ret = regatom(pRExC_state, &flags,depth+1);
8460 if (flags & TRYAGAIN)
8467 if (op == '{' && regcurly(RExC_parse)) {
8469 #ifdef RE_TRACK_PATTERN_OFFSETS
8470 parse_start = RExC_parse; /* MJD */
8472 next = RExC_parse + 1;
8473 while (isDIGIT(*next) || *next == ',') {
8482 if (*next == '}') { /* got one */
8486 min = atoi(RExC_parse);
8490 maxpos = RExC_parse;
8492 if (!max && *maxpos != '0')
8493 max = REG_INFTY; /* meaning "infinity" */
8494 else if (max >= REG_INFTY)
8495 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8497 nextchar(pRExC_state);
8500 if ((flags&SIMPLE)) {
8501 RExC_naughty += 2 + RExC_naughty / 2;
8502 reginsert(pRExC_state, CURLY, ret, depth+1);
8503 Set_Node_Offset(ret, parse_start+1); /* MJD */
8504 Set_Node_Cur_Length(ret);
8507 regnode * const w = reg_node(pRExC_state, WHILEM);
8510 REGTAIL(pRExC_state, ret, w);
8511 if (!SIZE_ONLY && RExC_extralen) {
8512 reginsert(pRExC_state, LONGJMP,ret, depth+1);
8513 reginsert(pRExC_state, NOTHING,ret, depth+1);
8514 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
8516 reginsert(pRExC_state, CURLYX,ret, depth+1);
8518 Set_Node_Offset(ret, parse_start+1);
8519 Set_Node_Length(ret,
8520 op == '{' ? (RExC_parse - parse_start) : 1);
8522 if (!SIZE_ONLY && RExC_extralen)
8523 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
8524 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8526 RExC_whilem_seen++, RExC_extralen += 3;
8527 RExC_naughty += 4 + RExC_naughty; /* compound interest */
8536 vFAIL("Can't do {n,m} with n > m");
8538 ARG1_SET(ret, (U16)min);
8539 ARG2_SET(ret, (U16)max);
8551 #if 0 /* Now runtime fix should be reliable. */
8553 /* if this is reinstated, don't forget to put this back into perldiag:
8555 =item Regexp *+ operand could be empty at {#} in regex m/%s/
8557 (F) The part of the regexp subject to either the * or + quantifier
8558 could match an empty string. The {#} shows in the regular
8559 expression about where the problem was discovered.
8563 if (!(flags&HASWIDTH) && op != '?')
8564 vFAIL("Regexp *+ operand could be empty");
8567 #ifdef RE_TRACK_PATTERN_OFFSETS
8568 parse_start = RExC_parse;
8570 nextchar(pRExC_state);
8572 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8574 if (op == '*' && (flags&SIMPLE)) {
8575 reginsert(pRExC_state, STAR, ret, depth+1);
8579 else if (op == '*') {
8583 else if (op == '+' && (flags&SIMPLE)) {
8584 reginsert(pRExC_state, PLUS, ret, depth+1);
8588 else if (op == '+') {
8592 else if (op == '?') {
8597 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8598 ckWARN3reg(RExC_parse,
8599 "%.*s matches null string many times",
8600 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8604 if (RExC_parse < RExC_end && *RExC_parse == '?') {
8605 nextchar(pRExC_state);
8606 reginsert(pRExC_state, MINMOD, ret, depth+1);
8607 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8609 #ifndef REG_ALLOW_MINMOD_SUSPEND
8612 if (RExC_parse < RExC_end && *RExC_parse == '+') {
8614 nextchar(pRExC_state);
8615 ender = reg_node(pRExC_state, SUCCEED);
8616 REGTAIL(pRExC_state, ret, ender);
8617 reginsert(pRExC_state, SUSPEND, ret, depth+1);
8619 ender = reg_node(pRExC_state, TAIL);
8620 REGTAIL(pRExC_state, ret, ender);
8624 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8626 vFAIL("Nested quantifiers");
8633 /* reg_namedseq(pRExC_state,UVp, UV depth)
8635 This is expected to be called by a parser routine that has
8636 recognized '\N' and needs to handle the rest. RExC_parse is
8637 expected to point at the first char following the N at the time
8640 The \N may be inside (indicated by valuep not being NULL) or outside a
8643 \N may begin either a named sequence, or if outside a character class, mean
8644 to match a non-newline. For non single-quoted regexes, the tokenizer has
8645 attempted to decide which, and in the case of a named sequence converted it
8646 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8647 where c1... are the characters in the sequence. For single-quoted regexes,
8648 the tokenizer passes the \N sequence through unchanged; this code will not
8649 attempt to determine this nor expand those. The net effect is that if the
8650 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8651 signals that this \N occurrence means to match a non-newline.
8653 Only the \N{U+...} form should occur in a character class, for the same
8654 reason that '.' inside a character class means to just match a period: it
8655 just doesn't make sense.
8657 If valuep is non-null then it is assumed that we are parsing inside
8658 of a charclass definition and the first codepoint in the resolved
8659 string is returned via *valuep and the routine will return NULL.
8660 In this mode if a multichar string is returned from the charnames
8661 handler, a warning will be issued, and only the first char in the
8662 sequence will be examined. If the string returned is zero length
8663 then the value of *valuep is undefined and NON-NULL will
8664 be returned to indicate failure. (This will NOT be a valid pointer
8667 If valuep is null then it is assumed that we are parsing normal text and a
8668 new EXACT node is inserted into the program containing the resolved string,
8669 and a pointer to the new node is returned. But if the string is zero length
8670 a NOTHING node is emitted instead.
8672 On success RExC_parse is set to the char following the endbrace.
8673 Parsing failures will generate a fatal error via vFAIL(...)
8676 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8678 char * endbrace; /* '}' following the name */
8679 regnode *ret = NULL;
8682 GET_RE_DEBUG_FLAGS_DECL;
8684 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8688 /* The [^\n] meaning of \N ignores spaces and comments under the /x
8689 * modifier. The other meaning does not */
8690 p = (RExC_flags & RXf_PMf_EXTENDED)
8691 ? regwhite( pRExC_state, RExC_parse )
8694 /* Disambiguate between \N meaning a named character versus \N meaning
8695 * [^\n]. The former is assumed when it can't be the latter. */
8696 if (*p != '{' || regcurly(p)) {
8699 /* no bare \N in a charclass */
8700 vFAIL("\\N in a character class must be a named character: \\N{...}");
8702 nextchar(pRExC_state);
8703 ret = reg_node(pRExC_state, REG_ANY);
8704 *flagp |= HASWIDTH|SIMPLE;
8707 Set_Node_Length(ret, 1); /* MJD */
8711 /* Here, we have decided it should be a named sequence */
8713 /* The test above made sure that the next real character is a '{', but
8714 * under the /x modifier, it could be separated by space (or a comment and
8715 * \n) and this is not allowed (for consistency with \x{...} and the
8716 * tokenizer handling of \N{NAME}). */
8717 if (*RExC_parse != '{') {
8718 vFAIL("Missing braces on \\N{}");
8721 RExC_parse++; /* Skip past the '{' */
8723 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8724 || ! (endbrace == RExC_parse /* nothing between the {} */
8725 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
8726 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8728 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
8729 vFAIL("\\N{NAME} must be resolved by the lexer");
8732 if (endbrace == RExC_parse) { /* empty: \N{} */
8734 RExC_parse = endbrace + 1;
8735 return reg_node(pRExC_state,NOTHING);
8739 ckWARNreg(RExC_parse,
8740 "Ignoring zero length \\N{} in character class"
8742 RExC_parse = endbrace + 1;
8745 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8748 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
8749 RExC_parse += 2; /* Skip past the 'U+' */
8751 if (valuep) { /* In a bracketed char class */
8752 /* We only pay attention to the first char of
8753 multichar strings being returned. I kinda wonder
8754 if this makes sense as it does change the behaviour
8755 from earlier versions, OTOH that behaviour was broken
8756 as well. XXX Solution is to recharacterize as
8757 [rest-of-class]|multi1|multi2... */
8759 STRLEN length_of_hex;
8760 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8761 | PERL_SCAN_DISALLOW_PREFIX
8762 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8764 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8765 if (endchar < endbrace) {
8766 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8769 length_of_hex = (STRLEN)(endchar - RExC_parse);
8770 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8772 /* The tokenizer should have guaranteed validity, but it's possible to
8773 * bypass it by using single quoting, so check */
8774 if (length_of_hex == 0
8775 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8777 RExC_parse += length_of_hex; /* Includes all the valid */
8778 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
8779 ? UTF8SKIP(RExC_parse)
8781 /* Guard against malformed utf8 */
8782 if (RExC_parse >= endchar) RExC_parse = endchar;
8783 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8786 RExC_parse = endbrace + 1;
8787 if (endchar == endbrace) return NULL;
8789 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
8791 else { /* Not a char class */
8793 /* What is done here is to convert this to a sub-pattern of the form
8794 * (?:\x{char1}\x{char2}...)
8795 * and then call reg recursively. That way, it retains its atomicness,
8796 * while not having to worry about special handling that some code
8797 * points may have. toke.c has converted the original Unicode values
8798 * to native, so that we can just pass on the hex values unchanged. We
8799 * do have to set a flag to keep recoding from happening in the
8802 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8804 char *endchar; /* Points to '.' or '}' ending cur char in the input
8806 char *orig_end = RExC_end;
8808 while (RExC_parse < endbrace) {
8810 /* Code points are separated by dots. If none, there is only one
8811 * code point, and is terminated by the brace */
8812 endchar = RExC_parse + strcspn(RExC_parse, ".}");
8814 /* Convert to notation the rest of the code understands */
8815 sv_catpv(substitute_parse, "\\x{");
8816 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8817 sv_catpv(substitute_parse, "}");
8819 /* Point to the beginning of the next character in the sequence. */
8820 RExC_parse = endchar + 1;
8822 sv_catpv(substitute_parse, ")");
8824 RExC_parse = SvPV(substitute_parse, len);
8826 /* Don't allow empty number */
8828 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8830 RExC_end = RExC_parse + len;
8832 /* The values are Unicode, and therefore not subject to recoding */
8833 RExC_override_recoding = 1;
8835 ret = reg(pRExC_state, 1, flagp, depth+1);
8837 RExC_parse = endbrace;
8838 RExC_end = orig_end;
8839 RExC_override_recoding = 0;
8841 nextchar(pRExC_state);
8851 * It returns the code point in utf8 for the value in *encp.
8852 * value: a code value in the source encoding
8853 * encp: a pointer to an Encode object
8855 * If the result from Encode is not a single character,
8856 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8859 S_reg_recode(pTHX_ const char value, SV **encp)
8862 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8863 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8864 const STRLEN newlen = SvCUR(sv);
8865 UV uv = UNICODE_REPLACEMENT;
8867 PERL_ARGS_ASSERT_REG_RECODE;
8871 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8874 if (!newlen || numlen != newlen) {
8875 uv = UNICODE_REPLACEMENT;
8883 - regatom - the lowest level
8885 Try to identify anything special at the start of the pattern. If there
8886 is, then handle it as required. This may involve generating a single regop,
8887 such as for an assertion; or it may involve recursing, such as to
8888 handle a () structure.
8890 If the string doesn't start with something special then we gobble up
8891 as much literal text as we can.
8893 Once we have been able to handle whatever type of thing started the
8894 sequence, we return.
8896 Note: we have to be careful with escapes, as they can be both literal
8897 and special, and in the case of \10 and friends can either, depending
8898 on context. Specifically there are two separate switches for handling
8899 escape sequences, with the one for handling literal escapes requiring
8900 a dummy entry for all of the special escapes that are actually handled
8905 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8908 register regnode *ret = NULL;
8910 char *parse_start = RExC_parse;
8912 GET_RE_DEBUG_FLAGS_DECL;
8913 DEBUG_PARSE("atom");
8914 *flagp = WORST; /* Tentatively. */
8916 PERL_ARGS_ASSERT_REGATOM;
8919 switch ((U8)*RExC_parse) {
8921 RExC_seen_zerolen++;
8922 nextchar(pRExC_state);
8923 if (RExC_flags & RXf_PMf_MULTILINE)
8924 ret = reg_node(pRExC_state, MBOL);
8925 else if (RExC_flags & RXf_PMf_SINGLELINE)
8926 ret = reg_node(pRExC_state, SBOL);
8928 ret = reg_node(pRExC_state, BOL);
8929 Set_Node_Length(ret, 1); /* MJD */
8932 nextchar(pRExC_state);
8934 RExC_seen_zerolen++;
8935 if (RExC_flags & RXf_PMf_MULTILINE)
8936 ret = reg_node(pRExC_state, MEOL);
8937 else if (RExC_flags & RXf_PMf_SINGLELINE)
8938 ret = reg_node(pRExC_state, SEOL);
8940 ret = reg_node(pRExC_state, EOL);
8941 Set_Node_Length(ret, 1); /* MJD */
8944 nextchar(pRExC_state);
8945 if (RExC_flags & RXf_PMf_SINGLELINE)
8946 ret = reg_node(pRExC_state, SANY);
8948 ret = reg_node(pRExC_state, REG_ANY);
8949 *flagp |= HASWIDTH|SIMPLE;
8951 Set_Node_Length(ret, 1); /* MJD */
8955 char * const oregcomp_parse = ++RExC_parse;
8956 ret = regclass(pRExC_state,depth+1);
8957 if (*RExC_parse != ']') {
8958 RExC_parse = oregcomp_parse;
8959 vFAIL("Unmatched [");
8961 nextchar(pRExC_state);
8962 *flagp |= HASWIDTH|SIMPLE;
8963 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8967 nextchar(pRExC_state);
8968 ret = reg(pRExC_state, 1, &flags,depth+1);
8970 if (flags & TRYAGAIN) {
8971 if (RExC_parse == RExC_end) {
8972 /* Make parent create an empty node if needed. */
8980 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8984 if (flags & TRYAGAIN) {
8988 vFAIL("Internal urp");
8989 /* Supposed to be caught earlier. */
8992 if (!regcurly(RExC_parse)) {
9001 vFAIL("Quantifier follows nothing");
9006 This switch handles escape sequences that resolve to some kind
9007 of special regop and not to literal text. Escape sequnces that
9008 resolve to literal text are handled below in the switch marked
9011 Every entry in this switch *must* have a corresponding entry
9012 in the literal escape switch. However, the opposite is not
9013 required, as the default for this switch is to jump to the
9014 literal text handling code.
9016 switch ((U8)*++RExC_parse) {
9017 /* Special Escapes */
9019 RExC_seen_zerolen++;
9020 ret = reg_node(pRExC_state, SBOL);
9022 goto finish_meta_pat;
9024 ret = reg_node(pRExC_state, GPOS);
9025 RExC_seen |= REG_SEEN_GPOS;
9027 goto finish_meta_pat;
9029 RExC_seen_zerolen++;
9030 ret = reg_node(pRExC_state, KEEPS);
9032 /* XXX:dmq : disabling in-place substitution seems to
9033 * be necessary here to avoid cases of memory corruption, as
9034 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9036 RExC_seen |= REG_SEEN_LOOKBEHIND;
9037 goto finish_meta_pat;
9039 ret = reg_node(pRExC_state, SEOL);
9041 RExC_seen_zerolen++; /* Do not optimize RE away */
9042 goto finish_meta_pat;
9044 ret = reg_node(pRExC_state, EOS);
9046 RExC_seen_zerolen++; /* Do not optimize RE away */
9047 goto finish_meta_pat;
9049 ret = reg_node(pRExC_state, CANY);
9050 RExC_seen |= REG_SEEN_CANY;
9051 *flagp |= HASWIDTH|SIMPLE;
9052 goto finish_meta_pat;
9054 ret = reg_node(pRExC_state, CLUMP);
9056 goto finish_meta_pat;
9058 switch (get_regex_charset(RExC_flags)) {
9059 case REGEX_LOCALE_CHARSET:
9062 case REGEX_UNICODE_CHARSET:
9065 case REGEX_ASCII_RESTRICTED_CHARSET:
9066 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9069 case REGEX_DEPENDS_CHARSET:
9075 ret = reg_node(pRExC_state, op);
9076 *flagp |= HASWIDTH|SIMPLE;
9077 goto finish_meta_pat;
9079 switch (get_regex_charset(RExC_flags)) {
9080 case REGEX_LOCALE_CHARSET:
9083 case REGEX_UNICODE_CHARSET:
9086 case REGEX_ASCII_RESTRICTED_CHARSET:
9087 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9090 case REGEX_DEPENDS_CHARSET:
9096 ret = reg_node(pRExC_state, op);
9097 *flagp |= HASWIDTH|SIMPLE;
9098 goto finish_meta_pat;
9100 RExC_seen_zerolen++;
9101 RExC_seen |= REG_SEEN_LOOKBEHIND;
9102 switch (get_regex_charset(RExC_flags)) {
9103 case REGEX_LOCALE_CHARSET:
9106 case REGEX_UNICODE_CHARSET:
9109 case REGEX_ASCII_RESTRICTED_CHARSET:
9110 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9113 case REGEX_DEPENDS_CHARSET:
9119 ret = reg_node(pRExC_state, op);
9120 FLAGS(ret) = get_regex_charset(RExC_flags);
9122 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9123 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9125 goto finish_meta_pat;
9127 RExC_seen_zerolen++;
9128 RExC_seen |= REG_SEEN_LOOKBEHIND;
9129 switch (get_regex_charset(RExC_flags)) {
9130 case REGEX_LOCALE_CHARSET:
9133 case REGEX_UNICODE_CHARSET:
9136 case REGEX_ASCII_RESTRICTED_CHARSET:
9137 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9140 case REGEX_DEPENDS_CHARSET:
9146 ret = reg_node(pRExC_state, op);
9147 FLAGS(ret) = get_regex_charset(RExC_flags);
9149 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9150 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9152 goto finish_meta_pat;
9154 switch (get_regex_charset(RExC_flags)) {
9155 case REGEX_LOCALE_CHARSET:
9158 case REGEX_UNICODE_CHARSET:
9161 case REGEX_ASCII_RESTRICTED_CHARSET:
9162 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9165 case REGEX_DEPENDS_CHARSET:
9171 ret = reg_node(pRExC_state, op);
9172 *flagp |= HASWIDTH|SIMPLE;
9173 goto finish_meta_pat;
9175 switch (get_regex_charset(RExC_flags)) {
9176 case REGEX_LOCALE_CHARSET:
9179 case REGEX_UNICODE_CHARSET:
9182 case REGEX_ASCII_RESTRICTED_CHARSET:
9183 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9186 case REGEX_DEPENDS_CHARSET:
9192 ret = reg_node(pRExC_state, op);
9193 *flagp |= HASWIDTH|SIMPLE;
9194 goto finish_meta_pat;
9196 switch (get_regex_charset(RExC_flags)) {
9197 case REGEX_LOCALE_CHARSET:
9200 case REGEX_ASCII_RESTRICTED_CHARSET:
9201 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9204 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9205 case REGEX_UNICODE_CHARSET:
9211 ret = reg_node(pRExC_state, op);
9212 *flagp |= HASWIDTH|SIMPLE;
9213 goto finish_meta_pat;
9215 switch (get_regex_charset(RExC_flags)) {
9216 case REGEX_LOCALE_CHARSET:
9219 case REGEX_ASCII_RESTRICTED_CHARSET:
9220 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9223 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9224 case REGEX_UNICODE_CHARSET:
9230 ret = reg_node(pRExC_state, op);
9231 *flagp |= HASWIDTH|SIMPLE;
9232 goto finish_meta_pat;
9234 ret = reg_node(pRExC_state, LNBREAK);
9235 *flagp |= HASWIDTH|SIMPLE;
9236 goto finish_meta_pat;
9238 ret = reg_node(pRExC_state, HORIZWS);
9239 *flagp |= HASWIDTH|SIMPLE;
9240 goto finish_meta_pat;
9242 ret = reg_node(pRExC_state, NHORIZWS);
9243 *flagp |= HASWIDTH|SIMPLE;
9244 goto finish_meta_pat;
9246 ret = reg_node(pRExC_state, VERTWS);
9247 *flagp |= HASWIDTH|SIMPLE;
9248 goto finish_meta_pat;
9250 ret = reg_node(pRExC_state, NVERTWS);
9251 *flagp |= HASWIDTH|SIMPLE;
9253 nextchar(pRExC_state);
9254 Set_Node_Length(ret, 2); /* MJD */
9259 char* const oldregxend = RExC_end;
9261 char* parse_start = RExC_parse - 2;
9264 if (RExC_parse[1] == '{') {
9265 /* a lovely hack--pretend we saw [\pX] instead */
9266 RExC_end = strchr(RExC_parse, '}');
9268 const U8 c = (U8)*RExC_parse;
9270 RExC_end = oldregxend;
9271 vFAIL2("Missing right brace on \\%c{}", c);
9276 RExC_end = RExC_parse + 2;
9277 if (RExC_end > oldregxend)
9278 RExC_end = oldregxend;
9282 ret = regclass(pRExC_state,depth+1);
9284 RExC_end = oldregxend;
9287 Set_Node_Offset(ret, parse_start + 2);
9288 Set_Node_Cur_Length(ret);
9289 nextchar(pRExC_state);
9290 *flagp |= HASWIDTH|SIMPLE;
9294 /* Handle \N and \N{NAME} here and not below because it can be
9295 multicharacter. join_exact() will join them up later on.
9296 Also this makes sure that things like /\N{BLAH}+/ and
9297 \N{BLAH} being multi char Just Happen. dmq*/
9299 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9301 case 'k': /* Handle \k<NAME> and \k'NAME' */
9304 char ch= RExC_parse[1];
9305 if (ch != '<' && ch != '\'' && ch != '{') {
9307 vFAIL2("Sequence %.2s... not terminated",parse_start);
9309 /* this pretty much dupes the code for (?P=...) in reg(), if
9310 you change this make sure you change that */
9311 char* name_start = (RExC_parse += 2);
9313 SV *sv_dat = reg_scan_name(pRExC_state,
9314 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9315 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9316 if (RExC_parse == name_start || *RExC_parse != ch)
9317 vFAIL2("Sequence %.3s... not terminated",parse_start);
9320 num = add_data( pRExC_state, 1, "S" );
9321 RExC_rxi->data->data[num]=(void*)sv_dat;
9322 SvREFCNT_inc_simple_void(sv_dat);
9326 ret = reganode(pRExC_state,
9329 : (MORE_ASCII_RESTRICTED)
9331 : (AT_LEAST_UNI_SEMANTICS)
9339 /* override incorrect value set in reganode MJD */
9340 Set_Node_Offset(ret, parse_start+1);
9341 Set_Node_Cur_Length(ret); /* MJD */
9342 nextchar(pRExC_state);
9348 case '1': case '2': case '3': case '4':
9349 case '5': case '6': case '7': case '8': case '9':
9352 bool isg = *RExC_parse == 'g';
9357 if (*RExC_parse == '{') {
9361 if (*RExC_parse == '-') {
9365 if (hasbrace && !isDIGIT(*RExC_parse)) {
9366 if (isrel) RExC_parse--;
9368 goto parse_named_seq;
9370 num = atoi(RExC_parse);
9371 if (isg && num == 0)
9372 vFAIL("Reference to invalid group 0");
9374 num = RExC_npar - num;
9376 vFAIL("Reference to nonexistent or unclosed group");
9378 if (!isg && num > 9 && num >= RExC_npar)
9381 char * const parse_start = RExC_parse - 1; /* MJD */
9382 while (isDIGIT(*RExC_parse))
9384 if (parse_start == RExC_parse - 1)
9385 vFAIL("Unterminated \\g... pattern");
9387 if (*RExC_parse != '}')
9388 vFAIL("Unterminated \\g{...} pattern");
9392 if (num > (I32)RExC_rx->nparens)
9393 vFAIL("Reference to nonexistent group");
9396 ret = reganode(pRExC_state,
9399 : (MORE_ASCII_RESTRICTED)
9401 : (AT_LEAST_UNI_SEMANTICS)
9409 /* override incorrect value set in reganode MJD */
9410 Set_Node_Offset(ret, parse_start+1);
9411 Set_Node_Cur_Length(ret); /* MJD */
9413 nextchar(pRExC_state);
9418 if (RExC_parse >= RExC_end)
9419 FAIL("Trailing \\");
9422 /* Do not generate "unrecognized" warnings here, we fall
9423 back into the quick-grab loop below */
9430 if (RExC_flags & RXf_PMf_EXTENDED) {
9431 if ( reg_skipcomment( pRExC_state ) )
9438 parse_start = RExC_parse - 1;
9443 register STRLEN len;
9448 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9451 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
9452 * it is folded to 'ss' even if not utf8 */
9453 bool is_exactfu_sharp_s;
9456 node_type = ((! FOLD) ? EXACT
9459 : (MORE_ASCII_RESTRICTED)
9461 : (AT_LEAST_UNI_SEMANTICS)
9464 ret = reg_node(pRExC_state, node_type);
9467 /* XXX The node can hold up to 255 bytes, yet this only goes to
9468 * 127. I (khw) do not know why. Keeping it somewhat less than
9469 * 255 allows us to not have to worry about overflow due to
9470 * converting to utf8 and fold expansion, but that value is
9471 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
9472 * split up by this limit into a single one using the real max of
9473 * 255. Even at 127, this breaks under rare circumstances. If
9474 * folding, we do not want to split a node at a character that is a
9475 * non-final in a multi-char fold, as an input string could just
9476 * happen to want to match across the node boundary. The join
9477 * would solve that problem if the join actually happens. But a
9478 * series of more than two nodes in a row each of 127 would cause
9479 * the first join to succeed to get to 254, but then there wouldn't
9480 * be room for the next one, which could at be one of those split
9481 * multi-char folds. I don't know of any fool-proof solution. One
9482 * could back off to end with only a code point that isn't such a
9483 * non-final, but it is possible for there not to be any in the
9485 for (len = 0, p = RExC_parse - 1;
9486 len < 127 && p < RExC_end;
9489 char * const oldp = p;
9491 if (RExC_flags & RXf_PMf_EXTENDED)
9492 p = regwhite( pRExC_state, p );
9503 /* Literal Escapes Switch
9505 This switch is meant to handle escape sequences that
9506 resolve to a literal character.
9508 Every escape sequence that represents something
9509 else, like an assertion or a char class, is handled
9510 in the switch marked 'Special Escapes' above in this
9511 routine, but also has an entry here as anything that
9512 isn't explicitly mentioned here will be treated as
9513 an unescaped equivalent literal.
9517 /* These are all the special escapes. */
9518 case 'A': /* Start assertion */
9519 case 'b': case 'B': /* Word-boundary assertion*/
9520 case 'C': /* Single char !DANGEROUS! */
9521 case 'd': case 'D': /* digit class */
9522 case 'g': case 'G': /* generic-backref, pos assertion */
9523 case 'h': case 'H': /* HORIZWS */
9524 case 'k': case 'K': /* named backref, keep marker */
9525 case 'N': /* named char sequence */
9526 case 'p': case 'P': /* Unicode property */
9527 case 'R': /* LNBREAK */
9528 case 's': case 'S': /* space class */
9529 case 'v': case 'V': /* VERTWS */
9530 case 'w': case 'W': /* word class */
9531 case 'X': /* eXtended Unicode "combining character sequence" */
9532 case 'z': case 'Z': /* End of line/string assertion */
9536 /* Anything after here is an escape that resolves to a
9537 literal. (Except digits, which may or may not)
9556 ender = ASCII_TO_NATIVE('\033');
9560 ender = ASCII_TO_NATIVE('\007');
9565 STRLEN brace_len = len;
9567 const char* error_msg;
9569 bool valid = grok_bslash_o(p,
9576 RExC_parse = p; /* going to die anyway; point
9577 to exact spot of failure */
9584 if (PL_encoding && ender < 0x100) {
9585 goto recode_encoding;
9594 char* const e = strchr(p, '}');
9598 vFAIL("Missing right brace on \\x{}");
9601 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9602 | PERL_SCAN_DISALLOW_PREFIX;
9603 STRLEN numlen = e - p - 1;
9604 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9611 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9613 ender = grok_hex(p, &numlen, &flags, NULL);
9616 if (PL_encoding && ender < 0x100)
9617 goto recode_encoding;
9621 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9623 case '0': case '1': case '2': case '3':case '4':
9624 case '5': case '6': case '7': case '8':case '9':
9626 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9628 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9630 ender = grok_oct(p, &numlen, &flags, NULL);
9640 if (PL_encoding && ender < 0x100)
9641 goto recode_encoding;
9644 if (! RExC_override_recoding) {
9645 SV* enc = PL_encoding;
9646 ender = reg_recode((const char)(U8)ender, &enc);
9647 if (!enc && SIZE_ONLY)
9648 ckWARNreg(p, "Invalid escape in the specified encoding");
9654 FAIL("Trailing \\");
9657 if (!SIZE_ONLY&& isALPHA(*p)) {
9658 /* Include any { following the alpha to emphasize
9659 * that it could be part of an escape at some point
9661 int len = (*(p + 1) == '{') ? 2 : 1;
9662 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9664 goto normal_default;
9669 if (UTF8_IS_START(*p) && UTF) {
9671 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9672 &numlen, UTF8_ALLOW_DEFAULT);
9678 } /* End of switch on the literal */
9680 is_exactfu_sharp_s = (node_type == EXACTFU
9681 && ender == LATIN_SMALL_LETTER_SHARP_S);
9682 if ( RExC_flags & RXf_PMf_EXTENDED)
9683 p = regwhite( pRExC_state, p );
9684 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9685 /* Prime the casefolded buffer. Locale rules, which apply
9686 * only to code points < 256, aren't known until execution,
9687 * so for them, just output the original character using
9688 * utf8. If we start to fold non-UTF patterns, be sure to
9689 * update join_exact() */
9690 if (LOC && ender < 256) {
9691 if (UNI_IS_INVARIANT(ender)) {
9692 *tmpbuf = (U8) ender;
9695 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9696 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9700 else if (isASCII(ender)) { /* Note: Here can't also be LOC
9702 ender = toLOWER(ender);
9703 *tmpbuf = (U8) ender;
9706 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9708 /* Locale and /aa require more selectivity about the
9709 * fold, so are handled below. Otherwise, here, just
9711 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9714 /* Under locale rules or /aa we are not to mix,
9715 * respectively, ords < 256 or ASCII with non-. So
9716 * reject folds that mix them, using only the
9717 * non-folded code point. So do the fold to a
9718 * temporary, and inspect each character in it. */
9719 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9721 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9722 U8* e = s + foldlen;
9723 bool fold_ok = TRUE;
9727 || (LOC && (UTF8_IS_INVARIANT(*s)
9728 || UTF8_IS_DOWNGRADEABLE_START(*s))))
9736 Copy(trialbuf, tmpbuf, foldlen, U8);
9740 uvuni_to_utf8(tmpbuf, ender);
9741 foldlen = UNISKIP(ender);
9745 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9748 else if (UTF || is_exactfu_sharp_s) {
9750 /* Emit all the Unicode characters. */
9752 for (foldbuf = tmpbuf;
9754 foldlen -= numlen) {
9755 ender = utf8_to_uvchr(foldbuf, &numlen);
9757 const STRLEN unilen = reguni(pRExC_state, ender, s);
9760 /* In EBCDIC the numlen
9761 * and unilen can differ. */
9763 if (numlen >= foldlen)
9767 break; /* "Can't happen." */
9771 const STRLEN unilen = reguni(pRExC_state, ender, s);
9780 REGC((char)ender, s++);
9784 if (UTF || is_exactfu_sharp_s) {
9786 /* Emit all the Unicode characters. */
9788 for (foldbuf = tmpbuf;
9790 foldlen -= numlen) {
9791 ender = utf8_to_uvchr(foldbuf, &numlen);
9793 const STRLEN unilen = reguni(pRExC_state, ender, s);
9796 /* In EBCDIC the numlen
9797 * and unilen can differ. */
9799 if (numlen >= foldlen)
9807 const STRLEN unilen = reguni(pRExC_state, ender, s);
9816 REGC((char)ender, s++);
9819 loopdone: /* Jumped to when encounters something that shouldn't be in
9822 Set_Node_Cur_Length(ret); /* MJD */
9823 nextchar(pRExC_state);
9825 /* len is STRLEN which is unsigned, need to copy to signed */
9828 vFAIL("Internal disaster");
9832 if (len == 1 && UNI_IS_INVARIANT(ender))
9836 RExC_size += STR_SZ(len);
9839 RExC_emit += STR_SZ(len);
9847 /* Jumped to when an unrecognized character set is encountered */
9849 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9854 S_regwhite( RExC_state_t *pRExC_state, char *p )
9856 const char *e = RExC_end;
9858 PERL_ARGS_ASSERT_REGWHITE;
9863 else if (*p == '#') {
9872 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9880 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9881 Character classes ([:foo:]) can also be negated ([:^foo:]).
9882 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9883 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9884 but trigger failures because they are currently unimplemented. */
9886 #define POSIXCC_DONE(c) ((c) == ':')
9887 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9888 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9891 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9894 I32 namedclass = OOB_NAMEDCLASS;
9896 PERL_ARGS_ASSERT_REGPPOSIXCC;
9898 if (value == '[' && RExC_parse + 1 < RExC_end &&
9899 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9900 POSIXCC(UCHARAT(RExC_parse))) {
9901 const char c = UCHARAT(RExC_parse);
9902 char* const s = RExC_parse++;
9904 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9906 if (RExC_parse == RExC_end)
9907 /* Grandfather lone [:, [=, [. */
9910 const char* const t = RExC_parse++; /* skip over the c */
9913 if (UCHARAT(RExC_parse) == ']') {
9914 const char *posixcc = s + 1;
9915 RExC_parse++; /* skip over the ending ] */
9918 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9919 const I32 skip = t - posixcc;
9921 /* Initially switch on the length of the name. */
9924 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9925 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9928 /* Names all of length 5. */
9929 /* alnum alpha ascii blank cntrl digit graph lower
9930 print punct space upper */
9931 /* Offset 4 gives the best switch position. */
9932 switch (posixcc[4]) {
9934 if (memEQ(posixcc, "alph", 4)) /* alpha */
9935 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9938 if (memEQ(posixcc, "spac", 4)) /* space */
9939 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9942 if (memEQ(posixcc, "grap", 4)) /* graph */
9943 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9946 if (memEQ(posixcc, "asci", 4)) /* ascii */
9947 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9950 if (memEQ(posixcc, "blan", 4)) /* blank */
9951 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9954 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9955 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9958 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9959 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9962 if (memEQ(posixcc, "lowe", 4)) /* lower */
9963 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9964 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9965 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9968 if (memEQ(posixcc, "digi", 4)) /* digit */
9969 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9970 else if (memEQ(posixcc, "prin", 4)) /* print */
9971 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9972 else if (memEQ(posixcc, "punc", 4)) /* punct */
9973 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9978 if (memEQ(posixcc, "xdigit", 6))
9979 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9983 if (namedclass == OOB_NAMEDCLASS)
9984 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9986 assert (posixcc[skip] == ':');
9987 assert (posixcc[skip+1] == ']');
9988 } else if (!SIZE_ONLY) {
9989 /* [[=foo=]] and [[.foo.]] are still future. */
9991 /* adjust RExC_parse so the warning shows after
9993 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9995 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9998 /* Maternal grandfather:
9999 * "[:" ending in ":" but not in ":]" */
10009 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10013 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10015 if (POSIXCC(UCHARAT(RExC_parse))) {
10016 const char *s = RExC_parse;
10017 const char c = *s++;
10019 while (isALNUM(*s))
10021 if (*s && c == *s && s[1] == ']') {
10023 "POSIX syntax [%c %c] belongs inside character classes",
10026 /* [[=foo=]] and [[.foo.]] are still future. */
10027 if (POSIXCC_NOTYET(c)) {
10028 /* adjust RExC_parse so the error shows after
10029 the class closes */
10030 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10032 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10038 /* Generate the code to add a full posix character <class> to the bracketed
10039 * character class given by <node>. (<node> is needed only under locale rules)
10040 * destlist is the inversion list for non-locale rules that this class is
10042 * sourcelist is the ASCII-range inversion list to add under /a rules
10043 * Xsourcelist is the full Unicode range list to use otherwise. */
10044 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10046 SV* scratch_list = NULL; \
10048 /* Set this class in the node for runtime matching */ \
10049 ANYOF_CLASS_SET(node, class); \
10051 /* For above Latin1 code points, we use the full Unicode range */ \
10052 _invlist_intersection(PL_AboveLatin1, \
10055 /* And set the output to it, adding instead if there already is an \
10056 * output. Checking if <destlist> is NULL first saves an extra \
10057 * clone. Its reference count will be decremented at the next \
10058 * union, etc, or if this is the only instance, at the end of the \
10060 if (! destlist) { \
10061 destlist = scratch_list; \
10064 _invlist_union(destlist, scratch_list, &destlist); \
10065 SvREFCNT_dec(scratch_list); \
10069 /* For non-locale, just add it to any existing list */ \
10070 _invlist_union(destlist, \
10071 (AT_LEAST_ASCII_RESTRICTED) \
10077 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10079 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10081 SV* scratch_list = NULL; \
10082 ANYOF_CLASS_SET(node, class); \
10083 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10084 if (! destlist) { \
10085 destlist = scratch_list; \
10088 _invlist_union(destlist, scratch_list, &destlist); \
10089 SvREFCNT_dec(scratch_list); \
10093 _invlist_union_complement_2nd(destlist, \
10094 (AT_LEAST_ASCII_RESTRICTED) \
10098 /* Under /d, everything in the upper half of the Latin1 range \
10099 * matches this complement */ \
10100 if (DEPENDS_SEMANTICS) { \
10101 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10105 /* Generate the code to add a posix character <class> to the bracketed
10106 * character class given by <node>. (<node> is needed only under locale rules)
10107 * destlist is the inversion list for non-locale rules that this class is
10109 * sourcelist is the ASCII-range inversion list to add under /a rules
10110 * l1_sourcelist is the Latin1 range list to use otherwise.
10111 * Xpropertyname is the name to add to <run_time_list> of the property to
10112 * specify the code points above Latin1 that will have to be
10113 * determined at run-time
10114 * run_time_list is a SV* that contains text names of properties that are to
10115 * be computed at run time. This concatenates <Xpropertyname>
10116 * to it, apppropriately
10117 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10119 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10120 l1_sourcelist, Xpropertyname, run_time_list) \
10121 /* If not /a matching, there are going to be code points we will have \
10122 * to defer to runtime to look-up */ \
10123 if (! AT_LEAST_ASCII_RESTRICTED) { \
10124 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10127 ANYOF_CLASS_SET(node, class); \
10130 _invlist_union(destlist, \
10131 (AT_LEAST_ASCII_RESTRICTED) \
10137 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10138 * this and DO_N_POSIX */
10139 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10140 l1_sourcelist, Xpropertyname, run_time_list) \
10141 if (AT_LEAST_ASCII_RESTRICTED) { \
10142 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10145 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10147 ANYOF_CLASS_SET(node, namedclass); \
10150 SV* scratch_list = NULL; \
10151 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10152 if (! destlist) { \
10153 destlist = scratch_list; \
10156 _invlist_union(destlist, scratch_list, &destlist); \
10157 SvREFCNT_dec(scratch_list); \
10159 if (DEPENDS_SEMANTICS) { \
10160 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10166 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10169 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10170 * Locale folding is done at run-time, so this function should not be
10171 * called for nodes that are for locales.
10173 * This function sets the bit corresponding to the fold of the input
10174 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10177 * It also knows about the characters that are in the bitmap that have
10178 * folds that are matchable only outside it, and sets the appropriate lists
10181 * It returns the number of bits that actually changed from 0 to 1 */
10186 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10188 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10191 /* It assumes the bit for 'value' has already been set */
10192 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10193 ANYOF_BITMAP_SET(node, fold);
10196 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10197 /* Certain Latin1 characters have matches outside the bitmap. To get
10198 * here, 'value' is one of those characters. None of these matches is
10199 * valid for ASCII characters under /aa, which have been excluded by
10200 * the 'if' above. The matches fall into three categories:
10201 * 1) They are singly folded-to or -from an above 255 character, as
10202 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10204 * 2) They are part of a multi-char fold with another character in the
10205 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10206 * 3) They are part of a multi-char fold with a character not in the
10207 * bitmap, such as various ligatures.
10208 * We aren't dealing fully with multi-char folds, except we do deal
10209 * with the pattern containing a character that has a multi-char fold
10210 * (not so much the inverse).
10211 * For types 1) and 3), the matches only happen when the target string
10212 * is utf8; that's not true for 2), and we set a flag for it.
10214 * The code below adds to the passed in inversion list the single fold
10215 * closures for 'value'. The values are hard-coded here so that an
10216 * innocent-looking character class, like /[ks]/i won't have to go out
10217 * to disk to find the possible matches. XXX It would be better to
10218 * generate these via regen, in case a new version of the Unicode
10219 * standard adds new mappings, though that is not really likely. */
10224 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10228 /* LATIN SMALL LETTER LONG S */
10229 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10232 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10233 GREEK_SMALL_LETTER_MU);
10234 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10235 GREEK_CAPITAL_LETTER_MU);
10237 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10238 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10239 /* ANGSTROM SIGN */
10240 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10241 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10242 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10243 PL_fold_latin1[value]);
10246 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10247 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10248 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10250 case LATIN_SMALL_LETTER_SHARP_S:
10251 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10252 LATIN_CAPITAL_LETTER_SHARP_S);
10254 /* Under /a, /d, and /u, this can match the two chars "ss" */
10255 if (! MORE_ASCII_RESTRICTED) {
10256 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10258 /* And under /u or /a, it can match even if the target is
10260 if (AT_LEAST_UNI_SEMANTICS) {
10261 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10265 case 'F': case 'f':
10266 case 'I': case 'i':
10267 case 'L': case 'l':
10268 case 'T': case 't':
10269 case 'A': case 'a':
10270 case 'H': case 'h':
10271 case 'J': case 'j':
10272 case 'N': case 'n':
10273 case 'W': case 'w':
10274 case 'Y': case 'y':
10275 /* These all are targets of multi-character folds from code
10276 * points that require UTF8 to express, so they can't match
10277 * unless the target string is in UTF-8, so no action here is
10278 * necessary, as regexec.c properly handles the general case
10279 * for UTF-8 matching */
10282 /* Use deprecated warning to increase the chances of this
10284 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10288 else if (DEPENDS_SEMANTICS
10289 && ! isASCII(value)
10290 && PL_fold_latin1[value] != value)
10292 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10293 * folds only when the target string is in UTF-8. We add the fold
10294 * here to the list of things to match outside the bitmap, which
10295 * won't be looked at unless it is UTF8 (or else if something else
10296 * says to look even if not utf8, but those things better not happen
10297 * under DEPENDS semantics. */
10298 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10305 PERL_STATIC_INLINE U8
10306 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10308 /* This inline function sets a bit in the bitmap if not already set, and if
10309 * appropriate, its fold, returning the number of bits that actually
10310 * changed from 0 to 1 */
10314 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10316 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10320 ANYOF_BITMAP_SET(node, value);
10323 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
10324 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10331 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10333 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10334 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10335 * the multi-character folds of characters in the node */
10338 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10340 if (! *alternate_ptr) {
10341 *alternate_ptr = newAV();
10343 sv = newSVpvn_utf8((char*)string, len, TRUE);
10344 av_push(*alternate_ptr, sv);
10349 parse a class specification and produce either an ANYOF node that
10350 matches the pattern or perhaps will be optimized into an EXACTish node
10351 instead. The node contains a bit map for the first 256 characters, with the
10352 corresponding bit set if that character is in the list. For characters
10353 above 255, a range list is used */
10356 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10359 register UV nextvalue;
10360 register IV prevvalue = OOB_UNICODE;
10361 register IV range = 0;
10362 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10363 register regnode *ret;
10366 char *rangebegin = NULL;
10367 bool need_class = 0;
10368 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
10370 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10371 than just initialized. */
10372 SV* properties = NULL; /* Code points that match \p{} \P{} */
10373 UV element_count = 0; /* Number of distinct elements in the class.
10374 Optimizations may be possible if this is tiny */
10377 /* Unicode properties are stored in a swash; this holds the current one
10378 * being parsed. If this swash is the only above-latin1 component of the
10379 * character class, an optimization is to pass it directly on to the
10380 * execution engine. Otherwise, it is set to NULL to indicate that there
10381 * are other things in the class that have to be dealt with at execution
10383 SV* swash = NULL; /* Code points that match \p{} \P{} */
10385 /* Set if a component of this character class is user-defined; just passed
10386 * on to the engine */
10387 UV has_user_defined_property = 0;
10389 /* code points this node matches that can't be stored in the bitmap */
10390 SV* nonbitmap = NULL;
10392 /* The items that are to match that aren't stored in the bitmap, but are a
10393 * result of things that are stored there. This is the fold closure of
10394 * such a character, either because it has DEPENDS semantics and shouldn't
10395 * be matched unless the target string is utf8, or is a code point that is
10396 * too large for the bit map, as for example, the fold of the MICRO SIGN is
10397 * above 255. This all is solely for performance reasons. By having this
10398 * code know the outside-the-bitmap folds that the bitmapped characters are
10399 * involved with, we don't have to go out to disk to find the list of
10400 * matches, unless the character class includes code points that aren't
10401 * storable in the bit map. That means that a character class with an 's'
10402 * in it, for example, doesn't need to go out to disk to find everything
10403 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
10404 * empty unless there is something whose fold we don't know about, and will
10405 * have to go out to the disk to find. */
10406 SV* l1_fold_invlist = NULL;
10408 /* List of multi-character folds that are matched by this node */
10409 AV* unicode_alternate = NULL;
10411 UV literal_endpoint = 0;
10413 UV stored = 0; /* how many chars stored in the bitmap */
10415 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10416 case we need to change the emitted regop to an EXACT. */
10417 const char * orig_parse = RExC_parse;
10418 GET_RE_DEBUG_FLAGS_DECL;
10420 PERL_ARGS_ASSERT_REGCLASS;
10422 PERL_UNUSED_ARG(depth);
10425 DEBUG_PARSE("clas");
10427 /* Assume we are going to generate an ANYOF node. */
10428 ret = reganode(pRExC_state, ANYOF, 0);
10432 ANYOF_FLAGS(ret) = 0;
10435 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
10439 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10441 /* We have decided to not allow multi-char folds in inverted character
10442 * classes, due to the confusion that can happen, especially with
10443 * classes that are designed for a non-Unicode world: You have the
10444 * peculiar case that:
10445 "s s" =~ /^[^\xDF]+$/i => Y
10446 "ss" =~ /^[^\xDF]+$/i => N
10448 * See [perl #89750] */
10449 allow_full_fold = FALSE;
10453 RExC_size += ANYOF_SKIP;
10454 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10457 RExC_emit += ANYOF_SKIP;
10459 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10461 ANYOF_BITMAP_ZERO(ret);
10462 listsv = newSVpvs("# comment\n");
10463 initial_listsv_len = SvCUR(listsv);
10466 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10468 if (!SIZE_ONLY && POSIXCC(nextvalue))
10469 checkposixcc(pRExC_state);
10471 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10472 if (UCHARAT(RExC_parse) == ']')
10473 goto charclassloop;
10476 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10480 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10483 rangebegin = RExC_parse;
10487 value = utf8n_to_uvchr((U8*)RExC_parse,
10488 RExC_end - RExC_parse,
10489 &numlen, UTF8_ALLOW_DEFAULT);
10490 RExC_parse += numlen;
10493 value = UCHARAT(RExC_parse++);
10495 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10496 if (value == '[' && POSIXCC(nextvalue))
10497 namedclass = regpposixcc(pRExC_state, value);
10498 else if (value == '\\') {
10500 value = utf8n_to_uvchr((U8*)RExC_parse,
10501 RExC_end - RExC_parse,
10502 &numlen, UTF8_ALLOW_DEFAULT);
10503 RExC_parse += numlen;
10506 value = UCHARAT(RExC_parse++);
10507 /* Some compilers cannot handle switching on 64-bit integer
10508 * values, therefore value cannot be an UV. Yes, this will
10509 * be a problem later if we want switch on Unicode.
10510 * A similar issue a little bit later when switching on
10511 * namedclass. --jhi */
10512 switch ((I32)value) {
10513 case 'w': namedclass = ANYOF_ALNUM; break;
10514 case 'W': namedclass = ANYOF_NALNUM; break;
10515 case 's': namedclass = ANYOF_SPACE; break;
10516 case 'S': namedclass = ANYOF_NSPACE; break;
10517 case 'd': namedclass = ANYOF_DIGIT; break;
10518 case 'D': namedclass = ANYOF_NDIGIT; break;
10519 case 'v': namedclass = ANYOF_VERTWS; break;
10520 case 'V': namedclass = ANYOF_NVERTWS; break;
10521 case 'h': namedclass = ANYOF_HORIZWS; break;
10522 case 'H': namedclass = ANYOF_NHORIZWS; break;
10523 case 'N': /* Handle \N{NAME} in class */
10525 /* We only pay attention to the first char of
10526 multichar strings being returned. I kinda wonder
10527 if this makes sense as it does change the behaviour
10528 from earlier versions, OTOH that behaviour was broken
10530 UV v; /* value is register so we cant & it /grrr */
10531 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10541 if (RExC_parse >= RExC_end)
10542 vFAIL2("Empty \\%c{}", (U8)value);
10543 if (*RExC_parse == '{') {
10544 const U8 c = (U8)value;
10545 e = strchr(RExC_parse++, '}');
10547 vFAIL2("Missing right brace on \\%c{}", c);
10548 while (isSPACE(UCHARAT(RExC_parse)))
10550 if (e == RExC_parse)
10551 vFAIL2("Empty \\%c{}", c);
10552 n = e - RExC_parse;
10553 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10564 if (UCHARAT(RExC_parse) == '^') {
10567 value = value == 'p' ? 'P' : 'p'; /* toggle */
10568 while (isSPACE(UCHARAT(RExC_parse))) {
10573 /* Try to get the definition of the property into
10574 * <invlist>. If /i is in effect, the effective property
10575 * will have its name be <__NAME_i>. The design is
10576 * discussed in commit
10577 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10578 Newx(name, n + sizeof("_i__\n"), char);
10580 sprintf(name, "%s%.*s%s\n",
10581 (FOLD) ? "__" : "",
10587 /* Look up the property name, and get its swash and
10588 * inversion list, if the property is found */
10590 SvREFCNT_dec(swash);
10592 swash = _core_swash_init("utf8", name, &PL_sv_undef,
10595 TRUE, /* this routine will handle
10596 undefined properties */
10597 NULL, FALSE /* No inversion list */
10601 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10603 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10605 || ! (invlist = *invlistsvp))
10608 SvREFCNT_dec(swash);
10612 /* Here didn't find it. It could be a user-defined
10613 * property that will be available at run-time. Add it
10614 * to the list to look up then */
10615 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10616 (value == 'p' ? '+' : '!'),
10618 has_user_defined_property = 1;
10620 /* We don't know yet, so have to assume that the
10621 * property could match something in the Latin1 range,
10622 * hence something that isn't utf8 */
10623 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10627 /* Here, did get the swash and its inversion list. If
10628 * the swash is from a user-defined property, then this
10629 * whole character class should be regarded as such */
10630 SV** user_defined_svp =
10631 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10632 "USER_DEFINED", FALSE);
10633 if (user_defined_svp) {
10634 has_user_defined_property
10635 |= SvUV(*user_defined_svp);
10638 /* Invert if asking for the complement */
10639 if (value == 'P') {
10640 _invlist_union_complement_2nd(properties, invlist, &properties);
10642 /* The swash can't be used as-is, because we've
10643 * inverted things; delay removing it to here after
10644 * have copied its invlist above */
10645 SvREFCNT_dec(swash);
10649 _invlist_union(properties, invlist, &properties);
10654 RExC_parse = e + 1;
10655 namedclass = ANYOF_MAX; /* no official name, but it's named */
10657 /* \p means they want Unicode semantics */
10658 RExC_uni_semantics = 1;
10661 case 'n': value = '\n'; break;
10662 case 'r': value = '\r'; break;
10663 case 't': value = '\t'; break;
10664 case 'f': value = '\f'; break;
10665 case 'b': value = '\b'; break;
10666 case 'e': value = ASCII_TO_NATIVE('\033');break;
10667 case 'a': value = ASCII_TO_NATIVE('\007');break;
10669 RExC_parse--; /* function expects to be pointed at the 'o' */
10671 const char* error_msg;
10672 bool valid = grok_bslash_o(RExC_parse,
10677 RExC_parse += numlen;
10682 if (PL_encoding && value < 0x100) {
10683 goto recode_encoding;
10687 if (*RExC_parse == '{') {
10688 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10689 | PERL_SCAN_DISALLOW_PREFIX;
10690 char * const e = strchr(RExC_parse++, '}');
10692 vFAIL("Missing right brace on \\x{}");
10694 numlen = e - RExC_parse;
10695 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10696 RExC_parse = e + 1;
10699 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10701 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10702 RExC_parse += numlen;
10704 if (PL_encoding && value < 0x100)
10705 goto recode_encoding;
10708 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10710 case '0': case '1': case '2': case '3': case '4':
10711 case '5': case '6': case '7':
10713 /* Take 1-3 octal digits */
10714 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10716 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10717 RExC_parse += numlen;
10718 if (PL_encoding && value < 0x100)
10719 goto recode_encoding;
10723 if (! RExC_override_recoding) {
10724 SV* enc = PL_encoding;
10725 value = reg_recode((const char)(U8)value, &enc);
10726 if (!enc && SIZE_ONLY)
10727 ckWARNreg(RExC_parse,
10728 "Invalid escape in the specified encoding");
10732 /* Allow \_ to not give an error */
10733 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10734 ckWARN2reg(RExC_parse,
10735 "Unrecognized escape \\%c in character class passed through",
10740 } /* end of \blah */
10743 literal_endpoint++;
10746 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10748 /* What matches in a locale is not known until runtime, so need to
10749 * (one time per class) allocate extra space to pass to regexec.
10750 * The space will contain a bit for each named class that is to be
10751 * matched against. This isn't needed for \p{} and pseudo-classes,
10752 * as they are not affected by locale, and hence are dealt with
10754 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10757 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10760 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10761 ANYOF_CLASS_ZERO(ret);
10763 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10766 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10767 * literal, as is the character that began the false range, i.e.
10768 * the 'a' in the examples */
10772 RExC_parse >= rangebegin ?
10773 RExC_parse - rangebegin : 0;
10774 ckWARN4reg(RExC_parse,
10775 "False [] range \"%*.*s\"",
10779 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10780 if (prevvalue < 256) {
10782 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10785 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10789 range = 0; /* this was not a true range */
10794 /* Possible truncation here but in some 64-bit environments
10795 * the compiler gets heartburn about switch on 64-bit values.
10796 * A similar issue a little earlier when switching on value.
10798 switch ((I32)namedclass) {
10800 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10801 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10802 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10804 case ANYOF_NALNUMC:
10805 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10806 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10809 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10810 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10813 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10814 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10818 ANYOF_CLASS_SET(ret, namedclass);
10821 _invlist_union(properties, PL_ASCII, &properties);
10826 ANYOF_CLASS_SET(ret, namedclass);
10829 _invlist_union_complement_2nd(properties,
10830 PL_ASCII, &properties);
10831 if (DEPENDS_SEMANTICS) {
10832 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10837 DO_POSIX(ret, namedclass, properties,
10838 PL_PosixBlank, PL_XPosixBlank);
10841 DO_N_POSIX(ret, namedclass, properties,
10842 PL_PosixBlank, PL_XPosixBlank);
10845 DO_POSIX(ret, namedclass, properties,
10846 PL_PosixCntrl, PL_XPosixCntrl);
10849 DO_N_POSIX(ret, namedclass, properties,
10850 PL_PosixCntrl, PL_XPosixCntrl);
10853 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10854 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10857 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10858 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10861 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10862 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10865 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10866 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10868 case ANYOF_HORIZWS:
10869 /* For these, we use the nonbitmap, as /d doesn't make a
10870 * difference in what these match. There would be problems
10871 * if these characters had folds other than themselves, as
10872 * nonbitmap is subject to folding. It turns out that \h
10873 * is just a synonym for XPosixBlank */
10874 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10876 case ANYOF_NHORIZWS:
10877 _invlist_union_complement_2nd(nonbitmap,
10878 PL_XPosixBlank, &nonbitmap);
10882 { /* These require special handling, as they differ under
10883 folding, matching Cased there (which in the ASCII range
10884 is the same as Alpha */
10890 if (FOLD && ! LOC) {
10891 ascii_source = PL_PosixAlpha;
10892 l1_source = PL_L1Cased;
10896 ascii_source = PL_PosixLower;
10897 l1_source = PL_L1PosixLower;
10898 Xname = "XPosixLower";
10900 if (namedclass == ANYOF_LOWER) {
10901 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10902 ascii_source, l1_source, Xname, listsv);
10905 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10906 properties, ascii_source, l1_source, Xname, listsv);
10911 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10912 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10915 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10916 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10919 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10920 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10923 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10924 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10927 DO_POSIX(ret, namedclass, properties,
10928 PL_PosixSpace, PL_XPosixSpace);
10930 case ANYOF_NPSXSPC:
10931 DO_N_POSIX(ret, namedclass, properties,
10932 PL_PosixSpace, PL_XPosixSpace);
10935 DO_POSIX(ret, namedclass, properties,
10936 PL_PerlSpace, PL_XPerlSpace);
10939 DO_N_POSIX(ret, namedclass, properties,
10940 PL_PerlSpace, PL_XPerlSpace);
10942 case ANYOF_UPPER: /* Same as LOWER, above */
10949 if (FOLD && ! LOC) {
10950 ascii_source = PL_PosixAlpha;
10951 l1_source = PL_L1Cased;
10955 ascii_source = PL_PosixUpper;
10956 l1_source = PL_L1PosixUpper;
10957 Xname = "XPosixUpper";
10959 if (namedclass == ANYOF_UPPER) {
10960 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10961 ascii_source, l1_source, Xname, listsv);
10964 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10965 properties, ascii_source, l1_source, Xname, listsv);
10969 case ANYOF_ALNUM: /* Really is 'Word' */
10970 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10971 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10974 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10975 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10978 /* For these, we use the nonbitmap, as /d doesn't make a
10979 * difference in what these match. There would be problems
10980 * if these characters had folds other than themselves, as
10981 * nonbitmap is subject to folding */
10982 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
10984 case ANYOF_NVERTWS:
10985 _invlist_union_complement_2nd(nonbitmap,
10986 PL_VertSpace, &nonbitmap);
10989 DO_POSIX(ret, namedclass, properties,
10990 PL_PosixXDigit, PL_XPosixXDigit);
10992 case ANYOF_NXDIGIT:
10993 DO_N_POSIX(ret, namedclass, properties,
10994 PL_PosixXDigit, PL_XPosixXDigit);
10997 /* this is to handle \p and \P */
11000 vFAIL("Invalid [::] class");
11006 } /* end of namedclass \blah */
11009 if (prevvalue > (IV)value) /* b-a */ {
11010 const int w = RExC_parse - rangebegin;
11011 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11012 range = 0; /* not a valid range */
11016 prevvalue = value; /* save the beginning of the range */
11017 if (RExC_parse+1 < RExC_end
11018 && *RExC_parse == '-'
11019 && RExC_parse[1] != ']')
11023 /* a bad range like \w-, [:word:]- ? */
11024 if (namedclass > OOB_NAMEDCLASS) {
11025 if (ckWARN(WARN_REGEXP)) {
11027 RExC_parse >= rangebegin ?
11028 RExC_parse - rangebegin : 0;
11030 "False [] range \"%*.*s\"",
11035 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11037 range = 1; /* yeah, it's a range! */
11038 continue; /* but do it the next time */
11042 /* non-Latin1 code point implies unicode semantics. Must be set in
11043 * pass1 so is there for the whole of pass 2 */
11045 RExC_uni_semantics = 1;
11048 /* now is the next time */
11050 if (prevvalue < 256) {
11051 const IV ceilvalue = value < 256 ? value : 255;
11054 /* In EBCDIC [\x89-\x91] should include
11055 * the \x8e but [i-j] should not. */
11056 if (literal_endpoint == 2 &&
11057 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11058 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11060 if (isLOWER(prevvalue)) {
11061 for (i = prevvalue; i <= ceilvalue; i++)
11062 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11064 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11067 for (i = prevvalue; i <= ceilvalue; i++)
11068 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11070 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11076 for (i = prevvalue; i <= ceilvalue; i++) {
11077 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11081 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11082 const UV natvalue = NATIVE_TO_UNI(value);
11083 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11086 literal_endpoint = 0;
11090 range = 0; /* this range (if it was one) is done now */
11097 /****** !SIZE_ONLY AFTER HERE *********/
11099 /* If folding and there are code points above 255, we calculate all
11100 * characters that could fold to or from the ones already on the list */
11101 if (FOLD && nonbitmap) {
11102 UV start, end; /* End points of code point ranges */
11104 SV* fold_intersection = NULL;
11106 /* This is a list of all the characters that participate in folds
11107 * (except marks, etc in multi-char folds */
11108 if (! PL_utf8_foldable) {
11109 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11110 PL_utf8_foldable = _swash_to_invlist(swash);
11111 SvREFCNT_dec(swash);
11114 /* This is a hash that for a particular fold gives all characters
11115 * that are involved in it */
11116 if (! PL_utf8_foldclosures) {
11118 /* If we were unable to find any folds, then we likely won't be
11119 * able to find the closures. So just create an empty list.
11120 * Folding will effectively be restricted to the non-Unicode rules
11121 * hard-coded into Perl. (This case happens legitimately during
11122 * compilation of Perl itself before the Unicode tables are
11124 if (invlist_len(PL_utf8_foldable) == 0) {
11125 PL_utf8_foldclosures = newHV();
11127 /* If the folds haven't been read in, call a fold function
11129 if (! PL_utf8_tofold) {
11130 U8 dummy[UTF8_MAXBYTES+1];
11133 /* This particular string is above \xff in both UTF-8 and
11135 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11136 assert(PL_utf8_tofold); /* Verify that worked */
11138 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11142 /* Only the characters in this class that participate in folds need be
11143 * checked. Get the intersection of this class and all the possible
11144 * characters that are foldable. This can quickly narrow down a large
11146 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11148 /* Now look at the foldable characters in this class individually */
11149 invlist_iterinit(fold_intersection);
11150 while (invlist_iternext(fold_intersection, &start, &end)) {
11153 /* Look at every character in the range */
11154 for (j = start; j <= end; j++) {
11157 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11160 _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
11162 if (foldlen > (STRLEN)UNISKIP(f)) {
11164 /* Any multicharacter foldings (disallowed in lookbehind
11165 * patterns) require the following transform: [ABCDEF] ->
11166 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11167 * folds into "rst", all other characters fold to single
11168 * characters. We save away these multicharacter foldings,
11169 * to be later saved as part of the additional "s" data. */
11170 if (! RExC_in_lookbehind) {
11172 U8* e = foldbuf + foldlen;
11174 /* If any of the folded characters of this are in the
11175 * Latin1 range, tell the regex engine that this can
11176 * match a non-utf8 target string. The only multi-byte
11177 * fold whose source is in the Latin1 range (U+00DF)
11178 * applies only when the target string is utf8, or
11179 * under unicode rules */
11180 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11183 /* Can't mix ascii with non- under /aa */
11184 if (MORE_ASCII_RESTRICTED
11185 && (isASCII(*loc) != isASCII(j)))
11187 goto end_multi_fold;
11189 if (UTF8_IS_INVARIANT(*loc)
11190 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11192 /* Can't mix above and below 256 under LOC
11195 goto end_multi_fold;
11198 |= ANYOF_NONBITMAP_NON_UTF8;
11201 loc += UTF8SKIP(loc);
11205 add_alternate(&unicode_alternate, foldbuf, foldlen);
11209 /* This is special-cased, as it is the only letter which
11210 * has both a multi-fold and single-fold in Latin1. All
11211 * the other chars that have single and multi-folds are
11212 * always in utf8, and the utf8 folding algorithm catches
11214 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11215 stored += set_regclass_bit(pRExC_state,
11217 LATIN_SMALL_LETTER_SHARP_S,
11218 &l1_fold_invlist, &unicode_alternate);
11222 /* Single character fold. Add everything in its fold
11223 * closure to the list that this node should match */
11226 /* The fold closures data structure is a hash with the keys
11227 * being every character that is folded to, like 'k', and
11228 * the values each an array of everything that folds to its
11229 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
11230 if ((listp = hv_fetch(PL_utf8_foldclosures,
11231 (char *) foldbuf, foldlen, FALSE)))
11233 AV* list = (AV*) *listp;
11235 for (k = 0; k <= av_len(list); k++) {
11236 SV** c_p = av_fetch(list, k, FALSE);
11239 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11243 /* /aa doesn't allow folds between ASCII and non-;
11244 * /l doesn't allow them between above and below
11246 if ((MORE_ASCII_RESTRICTED
11247 && (isASCII(c) != isASCII(j)))
11248 || (LOC && ((c < 256) != (j < 256))))
11253 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11254 stored += set_regclass_bit(pRExC_state,
11257 &l1_fold_invlist, &unicode_alternate);
11259 /* It may be that the code point is already in
11260 * this range or already in the bitmap, in
11261 * which case we need do nothing */
11262 else if ((c < start || c > end)
11264 || ! ANYOF_BITMAP_TEST(ret, c)))
11266 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11273 SvREFCNT_dec(fold_intersection);
11276 /* Combine the two lists into one. */
11277 if (l1_fold_invlist) {
11279 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11280 SvREFCNT_dec(l1_fold_invlist);
11283 nonbitmap = l1_fold_invlist;
11287 /* And combine the result (if any) with any inversion list from properties.
11288 * The lists are kept separate up to now because we don't want to fold the
11292 _invlist_union(nonbitmap, properties, &nonbitmap);
11293 SvREFCNT_dec(properties);
11296 nonbitmap = properties;
11300 /* Here, <nonbitmap> contains all the code points we can determine at
11301 * compile time that we haven't put into the bitmap. Go through it, and
11302 * for things that belong in the bitmap, put them there, and delete from
11306 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11307 * possibly only should match when the target string is UTF-8 */
11308 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11310 /* This gets set if we actually need to modify things */
11311 bool change_invlist = FALSE;
11315 /* Start looking through <nonbitmap> */
11316 invlist_iterinit(nonbitmap);
11317 while (invlist_iternext(nonbitmap, &start, &end)) {
11321 /* Quit if are above what we should change */
11322 if (start > max_cp_to_set) {
11326 change_invlist = TRUE;
11328 /* Set all the bits in the range, up to the max that we are doing */
11329 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11330 for (i = start; i <= (int) high; i++) {
11331 if (! ANYOF_BITMAP_TEST(ret, i)) {
11332 ANYOF_BITMAP_SET(ret, i);
11340 /* Done with loop; remove any code points that are in the bitmap from
11342 if (change_invlist) {
11343 _invlist_subtract(nonbitmap,
11344 (DEPENDS_SEMANTICS)
11350 /* If have completely emptied it, remove it completely */
11351 if (invlist_len(nonbitmap) == 0) {
11352 SvREFCNT_dec(nonbitmap);
11357 /* Here, we have calculated what code points should be in the character
11358 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11359 * case of DEPENDS rules.
11361 * Now we can see about various optimizations. Fold calculation (which we
11362 * did above) needs to take place before inversion. Otherwise /[^k]/i
11363 * would invert to include K, which under /i would match k, which it
11366 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
11367 * set the FOLD flag yet, so this does optimize those. It doesn't
11368 * optimize locale. Doing so perhaps could be done as long as there is
11369 * nothing like \w in it; some thought also would have to be given to the
11370 * interaction with above 0x100 chars */
11371 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11373 && ! unicode_alternate
11374 /* In case of /d, there are some things that should match only when in
11375 * not in the bitmap, i.e., they require UTF8 to match. These are
11376 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11377 * case, they don't require UTF8, so can invert here */
11379 || ! DEPENDS_SEMANTICS
11380 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11381 && SvCUR(listsv) == initial_listsv_len)
11385 for (i = 0; i < 256; ++i) {
11386 if (ANYOF_BITMAP_TEST(ret, i)) {
11387 ANYOF_BITMAP_CLEAR(ret, i);
11390 ANYOF_BITMAP_SET(ret, i);
11395 /* The inversion means that everything above 255 is matched */
11396 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11399 /* Here, also has things outside the bitmap that may overlap with
11400 * the bitmap. We have to sync them up, so that they get inverted
11401 * in both places. Earlier, we removed all overlaps except in the
11402 * case of /d rules, so no syncing is needed except for this case
11404 SV *remove_list = NULL;
11406 if (DEPENDS_SEMANTICS) {
11409 /* Set the bits that correspond to the ones that aren't in the
11410 * bitmap. Otherwise, when we invert, we'll miss these.
11411 * Earlier, we removed from the nonbitmap all code points
11412 * < 128, so there is no extra work here */
11413 invlist_iterinit(nonbitmap);
11414 while (invlist_iternext(nonbitmap, &start, &end)) {
11415 if (start > 255) { /* The bit map goes to 255 */
11421 for (i = start; i <= (int) end; ++i) {
11422 ANYOF_BITMAP_SET(ret, i);
11429 /* Now invert both the bitmap and the nonbitmap. Anything in the
11430 * bitmap has to also be removed from the non-bitmap, but again,
11431 * there should not be overlap unless is /d rules. */
11432 _invlist_invert(nonbitmap);
11434 /* Any swash can't be used as-is, because we've inverted things */
11436 SvREFCNT_dec(swash);
11440 for (i = 0; i < 256; ++i) {
11441 if (ANYOF_BITMAP_TEST(ret, i)) {
11442 ANYOF_BITMAP_CLEAR(ret, i);
11443 if (DEPENDS_SEMANTICS) {
11444 if (! remove_list) {
11445 remove_list = _new_invlist(2);
11447 remove_list = add_cp_to_invlist(remove_list, i);
11451 ANYOF_BITMAP_SET(ret, i);
11457 /* And do the removal */
11458 if (DEPENDS_SEMANTICS) {
11460 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11461 SvREFCNT_dec(remove_list);
11465 /* There is no overlap for non-/d, so just delete anything
11467 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11471 stored = 256 - stored;
11473 /* Clear the invert flag since have just done it here */
11474 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11477 /* Folding in the bitmap is taken care of above, but not for locale (for
11478 * which we have to wait to see what folding is in effect at runtime), and
11479 * for some things not in the bitmap (only the upper latin folds in this
11480 * case, as all other single-char folding has been set above). Set
11481 * run-time fold flag for these */
11483 || (DEPENDS_SEMANTICS
11485 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11486 || unicode_alternate))
11488 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11491 /* A single character class can be "optimized" into an EXACTish node.
11492 * Note that since we don't currently count how many characters there are
11493 * outside the bitmap, we are XXX missing optimization possibilities for
11494 * them. This optimization can't happen unless this is a truly single
11495 * character class, which means that it can't be an inversion into a
11496 * many-character class, and there must be no possibility of there being
11497 * things outside the bitmap. 'stored' (only) for locales doesn't include
11498 * \w, etc, so have to make a special test that they aren't present
11500 * Similarly A 2-character class of the very special form like [bB] can be
11501 * optimized into an EXACTFish node, but only for non-locales, and for
11502 * characters which only have the two folds; so things like 'fF' and 'Ii'
11503 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11506 && ! unicode_alternate
11507 && SvCUR(listsv) == initial_listsv_len
11508 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11509 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11510 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11511 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11512 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11513 /* If the latest code point has a fold whose
11514 * bit is set, it must be the only other one */
11515 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11516 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11518 /* Note that the information needed to decide to do this optimization
11519 * is not currently available until the 2nd pass, and that the actually
11520 * used EXACTish node takes less space than the calculated ANYOF node,
11521 * and hence the amount of space calculated in the first pass is larger
11522 * than actually used, so this optimization doesn't gain us any space.
11523 * But an EXACT node is faster than an ANYOF node, and can be combined
11524 * with any adjacent EXACT nodes later by the optimizer for further
11525 * gains. The speed of executing an EXACTF is similar to an ANYOF
11526 * node, so the optimization advantage comes from the ability to join
11527 * it to adjacent EXACT nodes */
11529 const char * cur_parse= RExC_parse;
11531 RExC_emit = (regnode *)orig_emit;
11532 RExC_parse = (char *)orig_parse;
11536 /* A locale node with one point can be folded; all the other cases
11537 * with folding will have two points, since we calculate them above
11539 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11546 else { /* else 2 chars in the bit map: the folds of each other */
11548 /* Use the folded value, which for the cases where we get here,
11549 * is just the lower case of the current one (which may resolve to
11550 * itself, or to the other one */
11551 value = toLOWER_LATIN1(value);
11553 /* To join adjacent nodes, they must be the exact EXACTish type.
11554 * Try to use the most likely type, by using EXACTFA if possible,
11555 * then EXACTFU if the regex calls for it, or is required because
11556 * the character is non-ASCII. (If <value> is ASCII, its fold is
11557 * also ASCII for the cases where we get here.) */
11558 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11561 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11564 else { /* Otherwise, more likely to be EXACTF type */
11569 ret = reg_node(pRExC_state, op);
11570 RExC_parse = (char *)cur_parse;
11571 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11572 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11573 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11575 RExC_emit += STR_SZ(2);
11578 *STRING(ret)= (char)value;
11580 RExC_emit += STR_SZ(1);
11582 SvREFCNT_dec(listsv);
11586 /* If there is a swash and more than one element, we can't use the swash in
11587 * the optimization below. */
11588 if (swash && element_count > 1) {
11589 SvREFCNT_dec(swash);
11593 && SvCUR(listsv) == initial_listsv_len
11594 && ! unicode_alternate)
11596 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11597 SvREFCNT_dec(listsv);
11598 SvREFCNT_dec(unicode_alternate);
11601 /* av[0] stores the character class description in its textual form:
11602 * used later (regexec.c:Perl_regclass_swash()) to initialize the
11603 * appropriate swash, and is also useful for dumping the regnode.
11604 * av[1] if NULL, is a placeholder to later contain the swash computed
11605 * from av[0]. But if no further computation need be done, the
11606 * swash is stored there now.
11607 * av[2] stores the multicharacter foldings, used later in
11608 * regexec.c:S_reginclass().
11609 * av[3] stores the nonbitmap inversion list for use in addition or
11610 * instead of av[0]; not used if av[1] isn't NULL
11611 * av[4] is set if any component of the class is from a user-defined
11612 * property; not used if av[1] isn't NULL */
11613 AV * const av = newAV();
11616 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11620 av_store(av, 1, swash);
11621 SvREFCNT_dec(nonbitmap);
11624 av_store(av, 1, NULL);
11626 av_store(av, 3, nonbitmap);
11627 av_store(av, 4, newSVuv(has_user_defined_property));
11631 /* Store any computed multi-char folds only if we are allowing
11633 if (allow_full_fold) {
11634 av_store(av, 2, MUTABLE_SV(unicode_alternate));
11635 if (unicode_alternate) { /* This node is variable length */
11640 av_store(av, 2, NULL);
11642 rv = newRV_noinc(MUTABLE_SV(av));
11643 n = add_data(pRExC_state, 1, "s");
11644 RExC_rxi->data->data[n] = (void*)rv;
11651 /* reg_skipcomment()
11653 Absorbs an /x style # comments from the input stream.
11654 Returns true if there is more text remaining in the stream.
11655 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11656 terminates the pattern without including a newline.
11658 Note its the callers responsibility to ensure that we are
11659 actually in /x mode
11664 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11668 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11670 while (RExC_parse < RExC_end)
11671 if (*RExC_parse++ == '\n') {
11676 /* we ran off the end of the pattern without ending
11677 the comment, so we have to add an \n when wrapping */
11678 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11686 Advances the parse position, and optionally absorbs
11687 "whitespace" from the inputstream.
11689 Without /x "whitespace" means (?#...) style comments only,
11690 with /x this means (?#...) and # comments and whitespace proper.
11692 Returns the RExC_parse point from BEFORE the scan occurs.
11694 This is the /x friendly way of saying RExC_parse++.
11698 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11700 char* const retval = RExC_parse++;
11702 PERL_ARGS_ASSERT_NEXTCHAR;
11705 if (RExC_end - RExC_parse >= 3
11706 && *RExC_parse == '('
11707 && RExC_parse[1] == '?'
11708 && RExC_parse[2] == '#')
11710 while (*RExC_parse != ')') {
11711 if (RExC_parse == RExC_end)
11712 FAIL("Sequence (?#... not terminated");
11718 if (RExC_flags & RXf_PMf_EXTENDED) {
11719 if (isSPACE(*RExC_parse)) {
11723 else if (*RExC_parse == '#') {
11724 if ( reg_skipcomment( pRExC_state ) )
11733 - reg_node - emit a node
11735 STATIC regnode * /* Location. */
11736 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11739 register regnode *ptr;
11740 regnode * const ret = RExC_emit;
11741 GET_RE_DEBUG_FLAGS_DECL;
11743 PERL_ARGS_ASSERT_REG_NODE;
11746 SIZE_ALIGN(RExC_size);
11750 if (RExC_emit >= RExC_emit_bound)
11751 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11752 op, RExC_emit, RExC_emit_bound);
11754 NODE_ALIGN_FILL(ret);
11756 FILL_ADVANCE_NODE(ptr, op);
11757 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
11758 #ifdef RE_TRACK_PATTERN_OFFSETS
11759 if (RExC_offsets) { /* MJD */
11760 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11761 "reg_node", __LINE__,
11763 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11764 ? "Overwriting end of array!\n" : "OK",
11765 (UV)(RExC_emit - RExC_emit_start),
11766 (UV)(RExC_parse - RExC_start),
11767 (UV)RExC_offsets[0]));
11768 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11776 - reganode - emit a node with an argument
11778 STATIC regnode * /* Location. */
11779 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11782 register regnode *ptr;
11783 regnode * const ret = RExC_emit;
11784 GET_RE_DEBUG_FLAGS_DECL;
11786 PERL_ARGS_ASSERT_REGANODE;
11789 SIZE_ALIGN(RExC_size);
11794 assert(2==regarglen[op]+1);
11796 Anything larger than this has to allocate the extra amount.
11797 If we changed this to be:
11799 RExC_size += (1 + regarglen[op]);
11801 then it wouldn't matter. Its not clear what side effect
11802 might come from that so its not done so far.
11807 if (RExC_emit >= RExC_emit_bound)
11808 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11809 op, RExC_emit, RExC_emit_bound);
11811 NODE_ALIGN_FILL(ret);
11813 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11814 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
11815 #ifdef RE_TRACK_PATTERN_OFFSETS
11816 if (RExC_offsets) { /* MJD */
11817 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11821 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11822 "Overwriting end of array!\n" : "OK",
11823 (UV)(RExC_emit - RExC_emit_start),
11824 (UV)(RExC_parse - RExC_start),
11825 (UV)RExC_offsets[0]));
11826 Set_Cur_Node_Offset;
11834 - reguni - emit (if appropriate) a Unicode character
11837 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11841 PERL_ARGS_ASSERT_REGUNI;
11843 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11847 - reginsert - insert an operator in front of already-emitted operand
11849 * Means relocating the operand.
11852 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11855 register regnode *src;
11856 register regnode *dst;
11857 register regnode *place;
11858 const int offset = regarglen[(U8)op];
11859 const int size = NODE_STEP_REGNODE + offset;
11860 GET_RE_DEBUG_FLAGS_DECL;
11862 PERL_ARGS_ASSERT_REGINSERT;
11863 PERL_UNUSED_ARG(depth);
11864 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11865 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11874 if (RExC_open_parens) {
11876 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11877 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11878 if ( RExC_open_parens[paren] >= opnd ) {
11879 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11880 RExC_open_parens[paren] += size;
11882 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11884 if ( RExC_close_parens[paren] >= opnd ) {
11885 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11886 RExC_close_parens[paren] += size;
11888 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11893 while (src > opnd) {
11894 StructCopy(--src, --dst, regnode);
11895 #ifdef RE_TRACK_PATTERN_OFFSETS
11896 if (RExC_offsets) { /* MJD 20010112 */
11897 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11901 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11902 ? "Overwriting end of array!\n" : "OK",
11903 (UV)(src - RExC_emit_start),
11904 (UV)(dst - RExC_emit_start),
11905 (UV)RExC_offsets[0]));
11906 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11907 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11913 place = opnd; /* Op node, where operand used to be. */
11914 #ifdef RE_TRACK_PATTERN_OFFSETS
11915 if (RExC_offsets) { /* MJD */
11916 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11920 (UV)(place - RExC_emit_start) > RExC_offsets[0]
11921 ? "Overwriting end of array!\n" : "OK",
11922 (UV)(place - RExC_emit_start),
11923 (UV)(RExC_parse - RExC_start),
11924 (UV)RExC_offsets[0]));
11925 Set_Node_Offset(place, RExC_parse);
11926 Set_Node_Length(place, 1);
11929 src = NEXTOPER(place);
11930 FILL_ADVANCE_NODE(place, op);
11931 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
11932 Zero(src, offset, regnode);
11936 - regtail - set the next-pointer at the end of a node chain of p to val.
11937 - SEE ALSO: regtail_study
11939 /* TODO: All three parms should be const */
11941 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11944 register regnode *scan;
11945 GET_RE_DEBUG_FLAGS_DECL;
11947 PERL_ARGS_ASSERT_REGTAIL;
11949 PERL_UNUSED_ARG(depth);
11955 /* Find last node. */
11958 regnode * const temp = regnext(scan);
11960 SV * const mysv=sv_newmortal();
11961 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11962 regprop(RExC_rx, mysv, scan);
11963 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11964 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11965 (temp == NULL ? "->" : ""),
11966 (temp == NULL ? PL_reg_name[OP(val)] : "")
11974 if (reg_off_by_arg[OP(scan)]) {
11975 ARG_SET(scan, val - scan);
11978 NEXT_OFF(scan) = val - scan;
11984 - regtail_study - set the next-pointer at the end of a node chain of p to val.
11985 - Look for optimizable sequences at the same time.
11986 - currently only looks for EXACT chains.
11988 This is experimental code. The idea is to use this routine to perform
11989 in place optimizations on branches and groups as they are constructed,
11990 with the long term intention of removing optimization from study_chunk so
11991 that it is purely analytical.
11993 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
11994 to control which is which.
11997 /* TODO: All four parms should be const */
12000 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12003 register regnode *scan;
12005 #ifdef EXPERIMENTAL_INPLACESCAN
12008 GET_RE_DEBUG_FLAGS_DECL;
12010 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12016 /* Find last node. */
12020 regnode * const temp = regnext(scan);
12021 #ifdef EXPERIMENTAL_INPLACESCAN
12022 if (PL_regkind[OP(scan)] == EXACT) {
12023 bool has_exactf_sharp_s; /* Unexamined in this routine */
12024 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12029 switch (OP(scan)) {
12035 case EXACTFU_NO_TRIE:
12037 if( exact == PSEUDO )
12039 else if ( exact != OP(scan) )
12048 SV * const mysv=sv_newmortal();
12049 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12050 regprop(RExC_rx, mysv, scan);
12051 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12052 SvPV_nolen_const(mysv),
12053 REG_NODE_NUM(scan),
12054 PL_reg_name[exact]);
12061 SV * const mysv_val=sv_newmortal();
12062 DEBUG_PARSE_MSG("");
12063 regprop(RExC_rx, mysv_val, val);
12064 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12065 SvPV_nolen_const(mysv_val),
12066 (IV)REG_NODE_NUM(val),
12070 if (reg_off_by_arg[OP(scan)]) {
12071 ARG_SET(scan, val - scan);
12074 NEXT_OFF(scan) = val - scan;
12082 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12086 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12092 for (bit=0; bit<32; bit++) {
12093 if (flags & (1<<bit)) {
12094 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12097 if (!set++ && lead)
12098 PerlIO_printf(Perl_debug_log, "%s",lead);
12099 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12102 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12103 if (!set++ && lead) {
12104 PerlIO_printf(Perl_debug_log, "%s",lead);
12107 case REGEX_UNICODE_CHARSET:
12108 PerlIO_printf(Perl_debug_log, "UNICODE");
12110 case REGEX_LOCALE_CHARSET:
12111 PerlIO_printf(Perl_debug_log, "LOCALE");
12113 case REGEX_ASCII_RESTRICTED_CHARSET:
12114 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12116 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12117 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12120 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12126 PerlIO_printf(Perl_debug_log, "\n");
12128 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12134 Perl_regdump(pTHX_ const regexp *r)
12138 SV * const sv = sv_newmortal();
12139 SV *dsv= sv_newmortal();
12140 RXi_GET_DECL(r,ri);
12141 GET_RE_DEBUG_FLAGS_DECL;
12143 PERL_ARGS_ASSERT_REGDUMP;
12145 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12147 /* Header fields of interest. */
12148 if (r->anchored_substr) {
12149 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12150 RE_SV_DUMPLEN(r->anchored_substr), 30);
12151 PerlIO_printf(Perl_debug_log,
12152 "anchored %s%s at %"IVdf" ",
12153 s, RE_SV_TAIL(r->anchored_substr),
12154 (IV)r->anchored_offset);
12155 } else if (r->anchored_utf8) {
12156 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12157 RE_SV_DUMPLEN(r->anchored_utf8), 30);
12158 PerlIO_printf(Perl_debug_log,
12159 "anchored utf8 %s%s at %"IVdf" ",
12160 s, RE_SV_TAIL(r->anchored_utf8),
12161 (IV)r->anchored_offset);
12163 if (r->float_substr) {
12164 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12165 RE_SV_DUMPLEN(r->float_substr), 30);
12166 PerlIO_printf(Perl_debug_log,
12167 "floating %s%s at %"IVdf"..%"UVuf" ",
12168 s, RE_SV_TAIL(r->float_substr),
12169 (IV)r->float_min_offset, (UV)r->float_max_offset);
12170 } else if (r->float_utf8) {
12171 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12172 RE_SV_DUMPLEN(r->float_utf8), 30);
12173 PerlIO_printf(Perl_debug_log,
12174 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12175 s, RE_SV_TAIL(r->float_utf8),
12176 (IV)r->float_min_offset, (UV)r->float_max_offset);
12178 if (r->check_substr || r->check_utf8)
12179 PerlIO_printf(Perl_debug_log,
12181 (r->check_substr == r->float_substr
12182 && r->check_utf8 == r->float_utf8
12183 ? "(checking floating" : "(checking anchored"));
12184 if (r->extflags & RXf_NOSCAN)
12185 PerlIO_printf(Perl_debug_log, " noscan");
12186 if (r->extflags & RXf_CHECK_ALL)
12187 PerlIO_printf(Perl_debug_log, " isall");
12188 if (r->check_substr || r->check_utf8)
12189 PerlIO_printf(Perl_debug_log, ") ");
12191 if (ri->regstclass) {
12192 regprop(r, sv, ri->regstclass);
12193 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12195 if (r->extflags & RXf_ANCH) {
12196 PerlIO_printf(Perl_debug_log, "anchored");
12197 if (r->extflags & RXf_ANCH_BOL)
12198 PerlIO_printf(Perl_debug_log, "(BOL)");
12199 if (r->extflags & RXf_ANCH_MBOL)
12200 PerlIO_printf(Perl_debug_log, "(MBOL)");
12201 if (r->extflags & RXf_ANCH_SBOL)
12202 PerlIO_printf(Perl_debug_log, "(SBOL)");
12203 if (r->extflags & RXf_ANCH_GPOS)
12204 PerlIO_printf(Perl_debug_log, "(GPOS)");
12205 PerlIO_putc(Perl_debug_log, ' ');
12207 if (r->extflags & RXf_GPOS_SEEN)
12208 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12209 if (r->intflags & PREGf_SKIP)
12210 PerlIO_printf(Perl_debug_log, "plus ");
12211 if (r->intflags & PREGf_IMPLICIT)
12212 PerlIO_printf(Perl_debug_log, "implicit ");
12213 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12214 if (r->extflags & RXf_EVAL_SEEN)
12215 PerlIO_printf(Perl_debug_log, "with eval ");
12216 PerlIO_printf(Perl_debug_log, "\n");
12217 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12219 PERL_ARGS_ASSERT_REGDUMP;
12220 PERL_UNUSED_CONTEXT;
12221 PERL_UNUSED_ARG(r);
12222 #endif /* DEBUGGING */
12226 - regprop - printable representation of opcode
12228 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12231 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12232 if (flags & ANYOF_INVERT) \
12233 /*make sure the invert info is in each */ \
12234 sv_catpvs(sv, "^"); \
12240 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12245 RXi_GET_DECL(prog,progi);
12246 GET_RE_DEBUG_FLAGS_DECL;
12248 PERL_ARGS_ASSERT_REGPROP;
12252 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
12253 /* It would be nice to FAIL() here, but this may be called from
12254 regexec.c, and it would be hard to supply pRExC_state. */
12255 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12256 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12258 k = PL_regkind[OP(o)];
12261 sv_catpvs(sv, " ");
12262 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12263 * is a crude hack but it may be the best for now since
12264 * we have no flag "this EXACTish node was UTF-8"
12266 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12267 PERL_PV_ESCAPE_UNI_DETECT |
12268 PERL_PV_ESCAPE_NONASCII |
12269 PERL_PV_PRETTY_ELLIPSES |
12270 PERL_PV_PRETTY_LTGT |
12271 PERL_PV_PRETTY_NOCLEAR
12273 } else if (k == TRIE) {
12274 /* print the details of the trie in dumpuntil instead, as
12275 * progi->data isn't available here */
12276 const char op = OP(o);
12277 const U32 n = ARG(o);
12278 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12279 (reg_ac_data *)progi->data->data[n] :
12281 const reg_trie_data * const trie
12282 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12284 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12285 DEBUG_TRIE_COMPILE_r(
12286 Perl_sv_catpvf(aTHX_ sv,
12287 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12288 (UV)trie->startstate,
12289 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12290 (UV)trie->wordcount,
12293 (UV)TRIE_CHARCOUNT(trie),
12294 (UV)trie->uniquecharcount
12297 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12299 int rangestart = -1;
12300 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12301 sv_catpvs(sv, "[");
12302 for (i = 0; i <= 256; i++) {
12303 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12304 if (rangestart == -1)
12306 } else if (rangestart != -1) {
12307 if (i <= rangestart + 3)
12308 for (; rangestart < i; rangestart++)
12309 put_byte(sv, rangestart);
12311 put_byte(sv, rangestart);
12312 sv_catpvs(sv, "-");
12313 put_byte(sv, i - 1);
12318 sv_catpvs(sv, "]");
12321 } else if (k == CURLY) {
12322 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12323 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12324 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12326 else if (k == WHILEM && o->flags) /* Ordinal/of */
12327 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12328 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12329 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
12330 if ( RXp_PAREN_NAMES(prog) ) {
12331 if ( k != REF || (OP(o) < NREF)) {
12332 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12333 SV **name= av_fetch(list, ARG(o), 0 );
12335 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12338 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12339 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12340 I32 *nums=(I32*)SvPVX(sv_dat);
12341 SV **name= av_fetch(list, nums[0], 0 );
12344 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12345 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12346 (n ? "," : ""), (IV)nums[n]);
12348 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12352 } else if (k == GOSUB)
12353 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12354 else if (k == VERB) {
12356 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12357 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12358 } else if (k == LOGICAL)
12359 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
12360 else if (k == ANYOF) {
12361 int i, rangestart = -1;
12362 const U8 flags = ANYOF_FLAGS(o);
12365 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12366 static const char * const anyofs[] = {
12399 if (flags & ANYOF_LOCALE)
12400 sv_catpvs(sv, "{loc}");
12401 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12402 sv_catpvs(sv, "{i}");
12403 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12404 if (flags & ANYOF_INVERT)
12405 sv_catpvs(sv, "^");
12407 /* output what the standard cp 0-255 bitmap matches */
12408 for (i = 0; i <= 256; i++) {
12409 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12410 if (rangestart == -1)
12412 } else if (rangestart != -1) {
12413 if (i <= rangestart + 3)
12414 for (; rangestart < i; rangestart++)
12415 put_byte(sv, rangestart);
12417 put_byte(sv, rangestart);
12418 sv_catpvs(sv, "-");
12419 put_byte(sv, i - 1);
12426 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12427 /* output any special charclass tests (used entirely under use locale) */
12428 if (ANYOF_CLASS_TEST_ANY_SET(o))
12429 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12430 if (ANYOF_CLASS_TEST(o,i)) {
12431 sv_catpv(sv, anyofs[i]);
12435 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12437 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12438 sv_catpvs(sv, "{non-utf8-latin1-all}");
12441 /* output information about the unicode matching */
12442 if (flags & ANYOF_UNICODE_ALL)
12443 sv_catpvs(sv, "{unicode_all}");
12444 else if (ANYOF_NONBITMAP(o))
12445 sv_catpvs(sv, "{unicode}");
12446 if (flags & ANYOF_NONBITMAP_NON_UTF8)
12447 sv_catpvs(sv, "{outside bitmap}");
12449 if (ANYOF_NONBITMAP(o)) {
12450 SV *lv; /* Set if there is something outside the bit map */
12451 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12452 bool byte_output = FALSE; /* If something in the bitmap has been
12455 if (lv && lv != &PL_sv_undef) {
12457 U8 s[UTF8_MAXBYTES_CASE+1];
12459 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12460 uvchr_to_utf8(s, i);
12463 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
12467 && swash_fetch(sw, s, TRUE))
12469 if (rangestart == -1)
12471 } else if (rangestart != -1) {
12472 byte_output = TRUE;
12473 if (i <= rangestart + 3)
12474 for (; rangestart < i; rangestart++) {
12475 put_byte(sv, rangestart);
12478 put_byte(sv, rangestart);
12479 sv_catpvs(sv, "-");
12488 char *s = savesvpv(lv);
12489 char * const origs = s;
12491 while (*s && *s != '\n')
12495 const char * const t = ++s;
12498 sv_catpvs(sv, " ");
12504 /* Truncate very long output */
12505 if (s - origs > 256) {
12506 Perl_sv_catpvf(aTHX_ sv,
12508 (int) (s - origs - 1),
12514 else if (*s == '\t') {
12533 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12535 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12536 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12538 PERL_UNUSED_CONTEXT;
12539 PERL_UNUSED_ARG(sv);
12540 PERL_UNUSED_ARG(o);
12541 PERL_UNUSED_ARG(prog);
12542 #endif /* DEBUGGING */
12546 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12547 { /* Assume that RE_INTUIT is set */
12549 struct regexp *const prog = (struct regexp *)SvANY(r);
12550 GET_RE_DEBUG_FLAGS_DECL;
12552 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12553 PERL_UNUSED_CONTEXT;
12557 const char * const s = SvPV_nolen_const(prog->check_substr
12558 ? prog->check_substr : prog->check_utf8);
12560 if (!PL_colorset) reginitcolors();
12561 PerlIO_printf(Perl_debug_log,
12562 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12564 prog->check_substr ? "" : "utf8 ",
12565 PL_colors[5],PL_colors[0],
12568 (strlen(s) > 60 ? "..." : ""));
12571 return prog->check_substr ? prog->check_substr : prog->check_utf8;
12577 handles refcounting and freeing the perl core regexp structure. When
12578 it is necessary to actually free the structure the first thing it
12579 does is call the 'free' method of the regexp_engine associated to
12580 the regexp, allowing the handling of the void *pprivate; member
12581 first. (This routine is not overridable by extensions, which is why
12582 the extensions free is called first.)
12584 See regdupe and regdupe_internal if you change anything here.
12586 #ifndef PERL_IN_XSUB_RE
12588 Perl_pregfree(pTHX_ REGEXP *r)
12594 Perl_pregfree2(pTHX_ REGEXP *rx)
12597 struct regexp *const r = (struct regexp *)SvANY(rx);
12598 GET_RE_DEBUG_FLAGS_DECL;
12600 PERL_ARGS_ASSERT_PREGFREE2;
12602 if (r->mother_re) {
12603 ReREFCNT_dec(r->mother_re);
12605 CALLREGFREE_PVT(rx); /* free the private data */
12606 SvREFCNT_dec(RXp_PAREN_NAMES(r));
12609 SvREFCNT_dec(r->anchored_substr);
12610 SvREFCNT_dec(r->anchored_utf8);
12611 SvREFCNT_dec(r->float_substr);
12612 SvREFCNT_dec(r->float_utf8);
12613 Safefree(r->substrs);
12615 RX_MATCH_COPY_FREE(rx);
12616 #ifdef PERL_OLD_COPY_ON_WRITE
12617 SvREFCNT_dec(r->saved_copy);
12624 This is a hacky workaround to the structural issue of match results
12625 being stored in the regexp structure which is in turn stored in
12626 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12627 could be PL_curpm in multiple contexts, and could require multiple
12628 result sets being associated with the pattern simultaneously, such
12629 as when doing a recursive match with (??{$qr})
12631 The solution is to make a lightweight copy of the regexp structure
12632 when a qr// is returned from the code executed by (??{$qr}) this
12633 lightweight copy doesn't actually own any of its data except for
12634 the starp/end and the actual regexp structure itself.
12640 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12642 struct regexp *ret;
12643 struct regexp *const r = (struct regexp *)SvANY(rx);
12644 register const I32 npar = r->nparens+1;
12646 PERL_ARGS_ASSERT_REG_TEMP_COPY;
12649 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12650 ret = (struct regexp *)SvANY(ret_x);
12652 (void)ReREFCNT_inc(rx);
12653 /* We can take advantage of the existing "copied buffer" mechanism in SVs
12654 by pointing directly at the buffer, but flagging that the allocated
12655 space in the copy is zero. As we've just done a struct copy, it's now
12656 a case of zero-ing that, rather than copying the current length. */
12657 SvPV_set(ret_x, RX_WRAPPED(rx));
12658 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12659 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12660 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12661 SvLEN_set(ret_x, 0);
12662 SvSTASH_set(ret_x, NULL);
12663 SvMAGIC_set(ret_x, NULL);
12664 Newx(ret->offs, npar, regexp_paren_pair);
12665 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12667 Newx(ret->substrs, 1, struct reg_substr_data);
12668 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12670 SvREFCNT_inc_void(ret->anchored_substr);
12671 SvREFCNT_inc_void(ret->anchored_utf8);
12672 SvREFCNT_inc_void(ret->float_substr);
12673 SvREFCNT_inc_void(ret->float_utf8);
12675 /* check_substr and check_utf8, if non-NULL, point to either their
12676 anchored or float namesakes, and don't hold a second reference. */
12678 RX_MATCH_COPIED_off(ret_x);
12679 #ifdef PERL_OLD_COPY_ON_WRITE
12680 ret->saved_copy = NULL;
12682 ret->mother_re = rx;
12688 /* regfree_internal()
12690 Free the private data in a regexp. This is overloadable by
12691 extensions. Perl takes care of the regexp structure in pregfree(),
12692 this covers the *pprivate pointer which technically perl doesn't
12693 know about, however of course we have to handle the
12694 regexp_internal structure when no extension is in use.
12696 Note this is called before freeing anything in the regexp
12701 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12704 struct regexp *const r = (struct regexp *)SvANY(rx);
12705 RXi_GET_DECL(r,ri);
12706 GET_RE_DEBUG_FLAGS_DECL;
12708 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12714 SV *dsv= sv_newmortal();
12715 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12716 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12717 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12718 PL_colors[4],PL_colors[5],s);
12721 #ifdef RE_TRACK_PATTERN_OFFSETS
12723 Safefree(ri->u.offsets); /* 20010421 MJD */
12726 int n = ri->data->count;
12727 PAD* new_comppad = NULL;
12732 /* If you add a ->what type here, update the comment in regcomp.h */
12733 switch (ri->data->what[n]) {
12738 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12741 Safefree(ri->data->data[n]);
12744 new_comppad = MUTABLE_AV(ri->data->data[n]);
12747 if (new_comppad == NULL)
12748 Perl_croak(aTHX_ "panic: pregfree comppad");
12749 PAD_SAVE_LOCAL(old_comppad,
12750 /* Watch out for global destruction's random ordering. */
12751 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12754 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12757 op_free((OP_4tree*)ri->data->data[n]);
12759 PAD_RESTORE_LOCAL(old_comppad);
12760 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12761 new_comppad = NULL;
12766 { /* Aho Corasick add-on structure for a trie node.
12767 Used in stclass optimization only */
12769 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12771 refcount = --aho->refcount;
12774 PerlMemShared_free(aho->states);
12775 PerlMemShared_free(aho->fail);
12776 /* do this last!!!! */
12777 PerlMemShared_free(ri->data->data[n]);
12778 PerlMemShared_free(ri->regstclass);
12784 /* trie structure. */
12786 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12788 refcount = --trie->refcount;
12791 PerlMemShared_free(trie->charmap);
12792 PerlMemShared_free(trie->states);
12793 PerlMemShared_free(trie->trans);
12795 PerlMemShared_free(trie->bitmap);
12797 PerlMemShared_free(trie->jump);
12798 PerlMemShared_free(trie->wordinfo);
12799 /* do this last!!!! */
12800 PerlMemShared_free(ri->data->data[n]);
12805 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12808 Safefree(ri->data->what);
12809 Safefree(ri->data);
12815 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12816 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12817 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
12820 re_dup - duplicate a regexp.
12822 This routine is expected to clone a given regexp structure. It is only
12823 compiled under USE_ITHREADS.
12825 After all of the core data stored in struct regexp is duplicated
12826 the regexp_engine.dupe method is used to copy any private data
12827 stored in the *pprivate pointer. This allows extensions to handle
12828 any duplication it needs to do.
12830 See pregfree() and regfree_internal() if you change anything here.
12832 #if defined(USE_ITHREADS)
12833 #ifndef PERL_IN_XSUB_RE
12835 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12839 const struct regexp *r = (const struct regexp *)SvANY(sstr);
12840 struct regexp *ret = (struct regexp *)SvANY(dstr);
12842 PERL_ARGS_ASSERT_RE_DUP_GUTS;
12844 npar = r->nparens+1;
12845 Newx(ret->offs, npar, regexp_paren_pair);
12846 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12848 /* no need to copy these */
12849 Newx(ret->swap, npar, regexp_paren_pair);
12852 if (ret->substrs) {
12853 /* Do it this way to avoid reading from *r after the StructCopy().
12854 That way, if any of the sv_dup_inc()s dislodge *r from the L1
12855 cache, it doesn't matter. */
12856 const bool anchored = r->check_substr
12857 ? r->check_substr == r->anchored_substr
12858 : r->check_utf8 == r->anchored_utf8;
12859 Newx(ret->substrs, 1, struct reg_substr_data);
12860 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12862 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12863 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12864 ret->float_substr = sv_dup_inc(ret->float_substr, param);
12865 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12867 /* check_substr and check_utf8, if non-NULL, point to either their
12868 anchored or float namesakes, and don't hold a second reference. */
12870 if (ret->check_substr) {
12872 assert(r->check_utf8 == r->anchored_utf8);
12873 ret->check_substr = ret->anchored_substr;
12874 ret->check_utf8 = ret->anchored_utf8;
12876 assert(r->check_substr == r->float_substr);
12877 assert(r->check_utf8 == r->float_utf8);
12878 ret->check_substr = ret->float_substr;
12879 ret->check_utf8 = ret->float_utf8;
12881 } else if (ret->check_utf8) {
12883 ret->check_utf8 = ret->anchored_utf8;
12885 ret->check_utf8 = ret->float_utf8;
12890 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12893 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12895 if (RX_MATCH_COPIED(dstr))
12896 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
12898 ret->subbeg = NULL;
12899 #ifdef PERL_OLD_COPY_ON_WRITE
12900 ret->saved_copy = NULL;
12903 if (ret->mother_re) {
12904 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12905 /* Our storage points directly to our mother regexp, but that's
12906 1: a buffer in a different thread
12907 2: something we no longer hold a reference on
12908 so we need to copy it locally. */
12909 /* Note we need to use SvCUR(), rather than
12910 SvLEN(), on our mother_re, because it, in
12911 turn, may well be pointing to its own mother_re. */
12912 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12913 SvCUR(ret->mother_re)+1));
12914 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12916 ret->mother_re = NULL;
12920 #endif /* PERL_IN_XSUB_RE */
12925 This is the internal complement to regdupe() which is used to copy
12926 the structure pointed to by the *pprivate pointer in the regexp.
12927 This is the core version of the extension overridable cloning hook.
12928 The regexp structure being duplicated will be copied by perl prior
12929 to this and will be provided as the regexp *r argument, however
12930 with the /old/ structures pprivate pointer value. Thus this routine
12931 may override any copying normally done by perl.
12933 It returns a pointer to the new regexp_internal structure.
12937 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
12940 struct regexp *const r = (struct regexp *)SvANY(rx);
12941 regexp_internal *reti;
12943 RXi_GET_DECL(r,ri);
12945 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
12949 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
12950 Copy(ri->program, reti->program, len+1, regnode);
12953 reti->regstclass = NULL;
12956 struct reg_data *d;
12957 const int count = ri->data->count;
12960 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12961 char, struct reg_data);
12962 Newx(d->what, count, U8);
12965 for (i = 0; i < count; i++) {
12966 d->what[i] = ri->data->what[i];
12967 switch (d->what[i]) {
12968 /* legal options are one of: sSfpontTua
12969 see also regcomp.h and pregfree() */
12970 case 'a': /* actually an AV, but the dup function is identical. */
12973 case 'p': /* actually an AV, but the dup function is identical. */
12974 case 'u': /* actually an HV, but the dup function is identical. */
12975 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
12978 /* This is cheating. */
12979 Newx(d->data[i], 1, struct regnode_charclass_class);
12980 StructCopy(ri->data->data[i], d->data[i],
12981 struct regnode_charclass_class);
12982 reti->regstclass = (regnode*)d->data[i];
12985 /* Compiled op trees are readonly and in shared memory,
12986 and can thus be shared without duplication. */
12988 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
12992 /* Trie stclasses are readonly and can thus be shared
12993 * without duplication. We free the stclass in pregfree
12994 * when the corresponding reg_ac_data struct is freed.
12996 reti->regstclass= ri->regstclass;
13000 ((reg_trie_data*)ri->data->data[i])->refcount++;
13004 d->data[i] = ri->data->data[i];
13007 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13016 reti->name_list_idx = ri->name_list_idx;
13018 #ifdef RE_TRACK_PATTERN_OFFSETS
13019 if (ri->u.offsets) {
13020 Newx(reti->u.offsets, 2*len+1, U32);
13021 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13024 SetProgLen(reti,len);
13027 return (void*)reti;
13030 #endif /* USE_ITHREADS */
13032 #ifndef PERL_IN_XSUB_RE
13035 - regnext - dig the "next" pointer out of a node
13038 Perl_regnext(pTHX_ register regnode *p)
13041 register I32 offset;
13046 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13047 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13050 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13059 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13062 STRLEN l1 = strlen(pat1);
13063 STRLEN l2 = strlen(pat2);
13066 const char *message;
13068 PERL_ARGS_ASSERT_RE_CROAK2;
13074 Copy(pat1, buf, l1 , char);
13075 Copy(pat2, buf + l1, l2 , char);
13076 buf[l1 + l2] = '\n';
13077 buf[l1 + l2 + 1] = '\0';
13079 /* ANSI variant takes additional second argument */
13080 va_start(args, pat2);
13084 msv = vmess(buf, &args);
13086 message = SvPV_const(msv,l1);
13089 Copy(message, buf, l1 , char);
13090 buf[l1-1] = '\0'; /* Overwrite \n */
13091 Perl_croak(aTHX_ "%s", buf);
13094 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13096 #ifndef PERL_IN_XSUB_RE
13098 Perl_save_re_context(pTHX)
13102 struct re_save_state *state;
13104 SAVEVPTR(PL_curcop);
13105 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13107 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13108 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13109 SSPUSHUV(SAVEt_RE_STATE);
13111 Copy(&PL_reg_state, state, 1, struct re_save_state);
13113 PL_reg_start_tmp = 0;
13114 PL_reg_start_tmpl = 0;
13115 PL_reg_oldsaved = NULL;
13116 PL_reg_oldsavedlen = 0;
13117 PL_reg_maxiter = 0;
13118 PL_reg_leftiter = 0;
13119 PL_reg_poscache = NULL;
13120 PL_reg_poscache_size = 0;
13121 #ifdef PERL_OLD_COPY_ON_WRITE
13125 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13127 const REGEXP * const rx = PM_GETRE(PL_curpm);
13130 for (i = 1; i <= RX_NPARENS(rx); i++) {
13131 char digits[TYPE_CHARS(long)];
13132 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13133 GV *const *const gvp
13134 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13137 GV * const gv = *gvp;
13138 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13148 clear_re(pTHX_ void *r)
13151 ReREFCNT_dec((REGEXP *)r);
13157 S_put_byte(pTHX_ SV *sv, int c)
13159 PERL_ARGS_ASSERT_PUT_BYTE;
13161 /* Our definition of isPRINT() ignores locales, so only bytes that are
13162 not part of UTF-8 are considered printable. I assume that the same
13163 holds for UTF-EBCDIC.
13164 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13165 which Wikipedia says:
13167 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13168 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13169 identical, to the ASCII delete (DEL) or rubout control character.
13170 ) So the old condition can be simplified to !isPRINT(c) */
13173 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13176 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13180 const char string = c;
13181 if (c == '-' || c == ']' || c == '\\' || c == '^')
13182 sv_catpvs(sv, "\\");
13183 sv_catpvn(sv, &string, 1);
13188 #define CLEAR_OPTSTART \
13189 if (optstart) STMT_START { \
13190 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13194 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13196 STATIC const regnode *
13197 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13198 const regnode *last, const regnode *plast,
13199 SV* sv, I32 indent, U32 depth)
13202 register U8 op = PSEUDO; /* Arbitrary non-END op. */
13203 register const regnode *next;
13204 const regnode *optstart= NULL;
13206 RXi_GET_DECL(r,ri);
13207 GET_RE_DEBUG_FLAGS_DECL;
13209 PERL_ARGS_ASSERT_DUMPUNTIL;
13211 #ifdef DEBUG_DUMPUNTIL
13212 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13213 last ? last-start : 0,plast ? plast-start : 0);
13216 if (plast && plast < last)
13219 while (PL_regkind[op] != END && (!last || node < last)) {
13220 /* While that wasn't END last time... */
13223 if (op == CLOSE || op == WHILEM)
13225 next = regnext((regnode *)node);
13228 if (OP(node) == OPTIMIZED) {
13229 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13236 regprop(r, sv, node);
13237 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13238 (int)(2*indent + 1), "", SvPVX_const(sv));
13240 if (OP(node) != OPTIMIZED) {
13241 if (next == NULL) /* Next ptr. */
13242 PerlIO_printf(Perl_debug_log, " (0)");
13243 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13244 PerlIO_printf(Perl_debug_log, " (FAIL)");
13246 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13247 (void)PerlIO_putc(Perl_debug_log, '\n');
13251 if (PL_regkind[(U8)op] == BRANCHJ) {
13254 register const regnode *nnode = (OP(next) == LONGJMP
13255 ? regnext((regnode *)next)
13257 if (last && nnode > last)
13259 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13262 else if (PL_regkind[(U8)op] == BRANCH) {
13264 DUMPUNTIL(NEXTOPER(node), next);
13266 else if ( PL_regkind[(U8)op] == TRIE ) {
13267 const regnode *this_trie = node;
13268 const char op = OP(node);
13269 const U32 n = ARG(node);
13270 const reg_ac_data * const ac = op>=AHOCORASICK ?
13271 (reg_ac_data *)ri->data->data[n] :
13273 const reg_trie_data * const trie =
13274 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13276 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13278 const regnode *nextbranch= NULL;
13281 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13282 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13284 PerlIO_printf(Perl_debug_log, "%*s%s ",
13285 (int)(2*(indent+3)), "",
13286 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13287 PL_colors[0], PL_colors[1],
13288 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13289 PERL_PV_PRETTY_ELLIPSES |
13290 PERL_PV_PRETTY_LTGT
13295 U16 dist= trie->jump[word_idx+1];
13296 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13297 (UV)((dist ? this_trie + dist : next) - start));
13300 nextbranch= this_trie + trie->jump[0];
13301 DUMPUNTIL(this_trie + dist, nextbranch);
13303 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13304 nextbranch= regnext((regnode *)nextbranch);
13306 PerlIO_printf(Perl_debug_log, "\n");
13309 if (last && next > last)
13314 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13315 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13316 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13318 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13320 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13322 else if ( op == PLUS || op == STAR) {
13323 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13325 else if (PL_regkind[(U8)op] == ANYOF) {
13326 /* arglen 1 + class block */
13327 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13328 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13329 node = NEXTOPER(node);
13331 else if (PL_regkind[(U8)op] == EXACT) {
13332 /* Literal string, where present. */
13333 node += NODE_SZ_STR(node) - 1;
13334 node = NEXTOPER(node);
13337 node = NEXTOPER(node);
13338 node += regarglen[(U8)op];
13340 if (op == CURLYX || op == OPEN)
13344 #ifdef DEBUG_DUMPUNTIL
13345 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13350 #endif /* DEBUGGING */
13354 * c-indentation-style: bsd
13355 * c-basic-offset: 4
13356 * indent-tabs-mode: t
13359 * ex: set ts=8 sts=4 sw=4 noet: