5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *paren_names; /* Paren names */
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
142 char *starttry; /* -Dr: where regtry was called. */
143 #define RExC_starttry (pRExC_state->starttry)
146 const char *lastparse;
148 AV *paren_name_list; /* idx -> name */
149 #define RExC_lastparse (pRExC_state->lastparse)
150 #define RExC_lastnum (pRExC_state->lastnum)
151 #define RExC_paren_name_list (pRExC_state->paren_name_list)
155 #define RExC_flags (pRExC_state->flags)
156 #define RExC_precomp (pRExC_state->precomp)
157 #define RExC_rx_sv (pRExC_state->rx_sv)
158 #define RExC_rx (pRExC_state->rx)
159 #define RExC_rxi (pRExC_state->rxi)
160 #define RExC_start (pRExC_state->start)
161 #define RExC_end (pRExC_state->end)
162 #define RExC_parse (pRExC_state->parse)
163 #define RExC_whilem_seen (pRExC_state->whilem_seen)
164 #ifdef RE_TRACK_PATTERN_OFFSETS
165 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
167 #define RExC_emit (pRExC_state->emit)
168 #define RExC_emit_start (pRExC_state->emit_start)
169 #define RExC_emit_bound (pRExC_state->emit_bound)
170 #define RExC_naughty (pRExC_state->naughty)
171 #define RExC_sawback (pRExC_state->sawback)
172 #define RExC_seen (pRExC_state->seen)
173 #define RExC_size (pRExC_state->size)
174 #define RExC_npar (pRExC_state->npar)
175 #define RExC_nestroot (pRExC_state->nestroot)
176 #define RExC_extralen (pRExC_state->extralen)
177 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178 #define RExC_seen_evals (pRExC_state->seen_evals)
179 #define RExC_utf8 (pRExC_state->utf8)
180 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
205 #define SPSTART 0x04 /* Starts with * or +. */
206 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
207 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
211 /* whether trie related optimizations are enabled */
212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213 #define TRIE_STUDY_OPT
214 #define FULL_TRIE_STUDY
220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221 #define PBITVAL(paren) (1 << ((paren) & 7))
222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
226 /* If not already in utf8, do a longjmp back to the beginning */
227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228 #define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
232 /* About scan_data_t.
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
248 The strings can be composites, for instance
252 will result in a composite fixed substring 'foo'.
254 For each string some basic information is maintained:
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
307 typedef struct scan_data_t {
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
313 I32 last_end; /* min value, <0 unless valid. */
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
329 struct regnode_charclass_class *start_class;
333 * Forward declarations for pregcomp()'s friends.
336 static const scan_data_t zero_scan_data =
337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
339 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
340 #define SF_BEFORE_SEOL 0x0001
341 #define SF_BEFORE_MEOL 0x0002
342 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
346 # define SF_FIX_SHIFT_EOL (0+2)
347 # define SF_FL_SHIFT_EOL (0+4)
349 # define SF_FIX_SHIFT_EOL (+2)
350 # define SF_FL_SHIFT_EOL (+4)
353 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
356 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
358 #define SF_IS_INF 0x0040
359 #define SF_HAS_PAR 0x0080
360 #define SF_IN_PAR 0x0100
361 #define SF_HAS_EVAL 0x0200
362 #define SCF_DO_SUBSTR 0x0400
363 #define SCF_DO_STCLASS_AND 0x0800
364 #define SCF_DO_STCLASS_OR 0x1000
365 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
366 #define SCF_WHILEM_VISITED_POS 0x2000
368 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
369 #define SCF_SEEN_ACCEPT 0x8000
371 #define UTF cBOOL(RExC_utf8)
372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
376 #define OOB_UNICODE 12345678
377 #define OOB_NAMEDCLASS -1
379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
383 /* length of regex to show in messages that don't mark a position within */
384 #define RegexLengthToShowInErrorMessages 127
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
391 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
392 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
401 #define _FAIL(code) STMT_START { \
402 const char *ellipses = ""; \
403 IV len = RExC_end - RExC_precomp; \
406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
415 #define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
419 #define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
426 #define Simple_vFAIL(m) STMT_START { \
427 const IV offset = RExC_parse - RExC_precomp; \
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
435 #define vFAIL(m) STMT_START { \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
442 * Like Simple_vFAIL(), but accepts two arguments.
444 #define Simple_vFAIL2(m,a1) STMT_START { \
445 const IV offset = RExC_parse - RExC_precomp; \
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
453 #define vFAIL2(m,a1) STMT_START { \
455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
456 Simple_vFAIL2(m, a1); \
461 * Like Simple_vFAIL(), but accepts three arguments.
463 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
464 const IV offset = RExC_parse - RExC_precomp; \
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
472 #define vFAIL3(m,a1,a2) STMT_START { \
474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
475 Simple_vFAIL3(m, a1, a2); \
479 * Like Simple_vFAIL(), but accepts four arguments.
481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
482 const IV offset = RExC_parse - RExC_precomp; \
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define ckWARNreg(loc,m) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define ckWARNregdep(loc,m) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 #define ckWARN2reg(loc, m, a1) STMT_START { \
501 const IV offset = loc - RExC_precomp; \
502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
506 #define vWARN3(loc, m, a1, a2) STMT_START { \
507 const IV offset = loc - RExC_precomp; \
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
518 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
519 const IV offset = loc - RExC_precomp; \
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 /* Allow for side effects in s */
538 #define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
542 /* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
546 * Position is 1 indexed.
548 #ifndef RE_TRACK_PATTERN_OFFSETS
549 #define Set_Node_Offset_To_R(node,byte)
550 #define Set_Node_Offset(node,byte)
551 #define Set_Cur_Node_Offset
552 #define Set_Node_Length_To_R(node,len)
553 #define Set_Node_Length(node,len)
554 #define Set_Node_Cur_Length(node)
555 #define Node_Offset(n)
556 #define Node_Length(n)
557 #define Set_Node_Offset_Length(node,offset,len)
558 #define ProgLen(ri) ri->u.proglen
559 #define SetProgLen(ri,x) ri->u.proglen = x
561 #define ProgLen(ri) ri->u.offsets[0]
562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
563 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
566 __LINE__, (int)(node), (int)(byte))); \
568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
570 RExC_offsets[2*(node)-1] = (byte); \
575 #define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
579 #define Set_Node_Length_To_R(node,len) STMT_START { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
582 __LINE__, (int)(node), (int)(len))); \
584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
586 RExC_offsets[2*(node)] = (len); \
591 #define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594 #define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
597 /* Get offsets and lengths */
598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
601 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608 #define EXPERIMENTAL_INPLACESCAN
609 #endif /*RE_TRACK_PATTERN_OFFSETS*/
611 #define DEBUG_STUDYDATA(str,data,depth) \
612 DEBUG_OPTIMISE_MORE_r(if(data){ \
613 PerlIO_printf(Perl_debug_log, \
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
619 (UV)((data)->flags), \
620 (IV)((data)->whilem_c), \
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
642 PerlIO_printf(Perl_debug_log,"\n"); \
645 static void clear_re(pTHX_ void *r);
647 /* Mark that we cannot extend a found fixed substring at this point.
648 Update the longest found anchored substring and the longest found
649 floating substrings if needed. */
652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
656 GET_RE_DEBUG_FLAGS_DECL;
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
661 SvSetMagicSV(*data->longest, data->last_found);
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
668 data->flags &= ~SF_FIX_BEFORE_EOL;
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
672 else { /* *data->longest == data->longest_float */
673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
674 data->offset_float_max = (l
675 ? data->last_start_max
676 : data->pos_min + data->pos_delta);
677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
678 data->offset_float_max = I32_MAX;
679 if (data->flags & SF_BEFORE_EOL)
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
683 data->flags &= ~SF_FL_BEFORE_EOL;
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
688 SvCUR_set(data->last_found, 0);
690 SV * const sv = data->last_found;
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
698 data->flags &= ~SF_BEFORE_EOL;
699 DEBUG_STUDYDATA("commit: ",data,0);
702 /* Can match anything (initialization) */
704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 PERL_ARGS_ASSERT_CL_ANYTHING;
708 ANYOF_CLASS_ZERO(cl);
709 ANYOF_BITMAP_SETALL(cl);
710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
712 cl->flags |= ANYOF_LOCALE;
715 /* Can match anything (initialization) */
717 S_cl_is_anything(const struct regnode_charclass_class *cl)
721 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723 for (value = 0; value <= ANYOF_MAX; value += 2)
724 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
733 /* Can match anything (initialization) */
735 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
737 PERL_ARGS_ASSERT_CL_INIT;
739 Zero(cl, 1, struct regnode_charclass_class);
741 cl_anything(pRExC_state, cl);
745 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
747 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749 Zero(cl, 1, struct regnode_charclass_class);
751 cl_anything(pRExC_state, cl);
753 cl->flags |= ANYOF_LOCALE;
756 /* 'And' a given class with another one. Can create false positives */
757 /* We assume that cl is not inverted */
759 S_cl_and(struct regnode_charclass_class *cl,
760 const struct regnode_charclass_class *and_with)
762 PERL_ARGS_ASSERT_CL_AND;
764 assert(and_with->type == ANYOF);
765 if (!(and_with->flags & ANYOF_CLASS)
766 && !(cl->flags & ANYOF_CLASS)
767 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
768 && !(and_with->flags & ANYOF_FOLD)
769 && !(cl->flags & ANYOF_FOLD)) {
772 if (and_with->flags & ANYOF_INVERT)
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= ~and_with->bitmap[i];
776 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
777 cl->bitmap[i] &= and_with->bitmap[i];
778 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
779 if (!(and_with->flags & ANYOF_EOS))
780 cl->flags &= ~ANYOF_EOS;
782 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
783 !(and_with->flags & ANYOF_INVERT)) {
784 cl->flags &= ~ANYOF_UNICODE_ALL;
785 cl->flags |= ANYOF_UNICODE;
786 ARG_SET(cl, ARG(and_with));
788 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
789 !(and_with->flags & ANYOF_INVERT))
790 cl->flags &= ~ANYOF_UNICODE_ALL;
791 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
792 !(and_with->flags & ANYOF_INVERT))
793 cl->flags &= ~ANYOF_UNICODE;
796 /* 'OR' a given class with another one. Can create false positives */
797 /* We assume that cl is not inverted */
799 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
801 PERL_ARGS_ASSERT_CL_OR;
803 if (or_with->flags & ANYOF_INVERT) {
805 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
806 * <= (B1 | !B2) | (CL1 | !CL2)
807 * which is wasteful if CL2 is small, but we ignore CL2:
808 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
809 * XXXX Can we handle case-fold? Unclear:
810 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
811 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
813 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(or_with->flags & ANYOF_FOLD)
815 && !(cl->flags & ANYOF_FOLD) ) {
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] |= ~or_with->bitmap[i];
820 } /* XXXX: logic is complicated otherwise */
822 cl_anything(pRExC_state, cl);
825 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
826 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
827 && (!(or_with->flags & ANYOF_FOLD)
828 || (cl->flags & ANYOF_FOLD)) ) {
831 /* OR char bitmap and class bitmap separately */
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] |= or_with->bitmap[i];
834 if (or_with->flags & ANYOF_CLASS) {
835 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
836 cl->classflags[i] |= or_with->classflags[i];
837 cl->flags |= ANYOF_CLASS;
840 else { /* XXXX: logic is complicated, leave it along for a moment. */
841 cl_anything(pRExC_state, cl);
844 if (or_with->flags & ANYOF_EOS)
845 cl->flags |= ANYOF_EOS;
847 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
848 ARG(cl) != ARG(or_with)) {
849 cl->flags |= ANYOF_UNICODE_ALL;
850 cl->flags &= ~ANYOF_UNICODE;
852 if (or_with->flags & ANYOF_UNICODE_ALL) {
853 cl->flags |= ANYOF_UNICODE_ALL;
854 cl->flags &= ~ANYOF_UNICODE;
858 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
859 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
860 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
861 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
866 dump_trie(trie,widecharmap,revcharmap)
867 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
868 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
870 These routines dump out a trie in a somewhat readable format.
871 The _interim_ variants are used for debugging the interim
872 tables that are used to generate the final compressed
873 representation which is what dump_trie expects.
875 Part of the reason for their existance is to provide a form
876 of documentation as to how the different representations function.
881 Dumps the final compressed table form of the trie to Perl_debug_log.
882 Used for debugging make_trie().
886 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
887 AV *revcharmap, U32 depth)
890 SV *sv=sv_newmortal();
891 int colwidth= widecharmap ? 6 : 4;
893 GET_RE_DEBUG_FLAGS_DECL;
895 PERL_ARGS_ASSERT_DUMP_TRIE;
897 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
898 (int)depth * 2 + 2,"",
899 "Match","Base","Ofs" );
901 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
902 SV ** const tmp = av_fetch( revcharmap, state, 0);
904 PerlIO_printf( Perl_debug_log, "%*s",
906 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
907 PL_colors[0], PL_colors[1],
908 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
909 PERL_PV_ESCAPE_FIRSTCHAR
914 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
915 (int)depth * 2 + 2,"");
917 for( state = 0 ; state < trie->uniquecharcount ; state++ )
918 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
919 PerlIO_printf( Perl_debug_log, "\n");
921 for( state = 1 ; state < trie->statecount ; state++ ) {
922 const U32 base = trie->states[ state ].trans.base;
924 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
926 if ( trie->states[ state ].wordnum ) {
927 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
929 PerlIO_printf( Perl_debug_log, "%6s", "" );
932 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
937 while( ( base + ofs < trie->uniquecharcount ) ||
938 ( base + ofs - trie->uniquecharcount < trie->lasttrans
939 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
942 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
944 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
945 if ( ( base + ofs >= trie->uniquecharcount ) &&
946 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
947 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
949 PerlIO_printf( Perl_debug_log, "%*"UVXf,
951 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
953 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
957 PerlIO_printf( Perl_debug_log, "]");
960 PerlIO_printf( Perl_debug_log, "\n" );
962 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
963 for (word=1; word <= trie->wordcount; word++) {
964 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
965 (int)word, (int)(trie->wordinfo[word].prev),
966 (int)(trie->wordinfo[word].len));
968 PerlIO_printf(Perl_debug_log, "\n" );
971 Dumps a fully constructed but uncompressed trie in list form.
972 List tries normally only are used for construction when the number of
973 possible chars (trie->uniquecharcount) is very high.
974 Used for debugging make_trie().
977 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
982 SV *sv=sv_newmortal();
983 int colwidth= widecharmap ? 6 : 4;
984 GET_RE_DEBUG_FLAGS_DECL;
986 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
988 /* print out the table precompression. */
989 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
990 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
991 "------:-----+-----------------\n" );
993 for( state=1 ; state < next_alloc ; state ++ ) {
996 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
997 (int)depth * 2 + 2,"", (UV)state );
998 if ( ! trie->states[ state ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, "%5s| ","");
1001 PerlIO_printf( Perl_debug_log, "W%4x| ",
1002 trie->states[ state ].wordnum
1005 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1006 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1008 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1015 TRIE_LIST_ITEM(state,charid).forid,
1016 (UV)TRIE_LIST_ITEM(state,charid).newstate
1019 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1020 (int)((depth * 2) + 14), "");
1023 PerlIO_printf( Perl_debug_log, "\n");
1028 Dumps a fully constructed but uncompressed trie in table form.
1029 This is the normal DFA style state transition table, with a few
1030 twists to facilitate compression later.
1031 Used for debugging make_trie().
1034 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1035 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1040 SV *sv=sv_newmortal();
1041 int colwidth= widecharmap ? 6 : 4;
1042 GET_RE_DEBUG_FLAGS_DECL;
1044 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1047 print out the table precompression so that we can do a visual check
1048 that they are identical.
1051 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1054 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1056 PerlIO_printf( Perl_debug_log, "%*s",
1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1067 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1069 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1073 PerlIO_printf( Perl_debug_log, "\n" );
1075 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1077 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1078 (int)depth * 2 + 2,"",
1079 (UV)TRIE_NODENUM( state ) );
1081 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1082 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1084 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1086 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1088 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1091 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1092 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1100 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1101 startbranch: the first branch in the whole branch sequence
1102 first : start branch of sequence of branch-exact nodes.
1103 May be the same as startbranch
1104 last : Thing following the last branch.
1105 May be the same as tail.
1106 tail : item following the branch sequence
1107 count : words in the sequence
1108 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1109 depth : indent depth
1111 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1113 A trie is an N'ary tree where the branches are determined by digital
1114 decomposition of the key. IE, at the root node you look up the 1st character and
1115 follow that branch repeat until you find the end of the branches. Nodes can be
1116 marked as "accepting" meaning they represent a complete word. Eg:
1120 would convert into the following structure. Numbers represent states, letters
1121 following numbers represent valid transitions on the letter from that state, if
1122 the number is in square brackets it represents an accepting state, otherwise it
1123 will be in parenthesis.
1125 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1129 (1) +-i->(6)-+-s->[7]
1131 +-s->(3)-+-h->(4)-+-e->[5]
1133 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1135 This shows that when matching against the string 'hers' we will begin at state 1
1136 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1137 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1138 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1139 single traverse. We store a mapping from accepting to state to which word was
1140 matched, and then when we have multiple possibilities we try to complete the
1141 rest of the regex in the order in which they occured in the alternation.
1143 The only prior NFA like behaviour that would be changed by the TRIE support is
1144 the silent ignoring of duplicate alternations which are of the form:
1146 / (DUPE|DUPE) X? (?{ ... }) Y /x
1148 Thus EVAL blocks follwing a trie may be called a different number of times with
1149 and without the optimisation. With the optimisations dupes will be silently
1150 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1151 the following demonstrates:
1153 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1155 which prints out 'word' three times, but
1157 'words'=~/(word|word|word)(?{ print $1 })S/
1159 which doesnt print it out at all. This is due to other optimisations kicking in.
1161 Example of what happens on a structural level:
1163 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1165 1: CURLYM[1] {1,32767}(18)
1176 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1177 and should turn into:
1179 1: CURLYM[1] {1,32767}(18)
1181 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1189 Cases where tail != last would be like /(?foo|bar)baz/:
1199 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1200 and would end up looking like:
1203 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1210 d = uvuni_to_utf8_flags(d, uv, 0);
1212 is the recommended Unicode-aware way of saying
1217 #define TRIE_STORE_REVCHAR \
1220 SV *zlopp = newSV(2); \
1221 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1222 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1223 SvCUR_set(zlopp, kapow - flrbbbbb); \
1226 av_push(revcharmap, zlopp); \
1228 char ooooff = (char)uvc; \
1229 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1233 #define TRIE_READ_CHAR STMT_START { \
1237 if ( foldlen > 0 ) { \
1238 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1243 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1244 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1245 foldlen -= UNISKIP( uvc ); \
1246 scan = foldbuf + UNISKIP( uvc ); \
1249 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1259 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1260 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1261 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1262 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1265 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1266 TRIE_LIST_CUR( state )++; \
1269 #define TRIE_LIST_NEW(state) STMT_START { \
1270 Newxz( trie->states[ state ].trans.list, \
1271 4, reg_trie_trans_le ); \
1272 TRIE_LIST_CUR( state ) = 1; \
1273 TRIE_LIST_LEN( state ) = 4; \
1276 #define TRIE_HANDLE_WORD(state) STMT_START { \
1277 U16 dupe= trie->states[ state ].wordnum; \
1278 regnode * const noper_next = regnext( noper ); \
1281 /* store the word for dumping */ \
1283 if (OP(noper) != NOTHING) \
1284 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1286 tmp = newSVpvn_utf8( "", 0, UTF ); \
1287 av_push( trie_words, tmp ); \
1291 trie->wordinfo[curword].prev = 0; \
1292 trie->wordinfo[curword].len = wordlen; \
1293 trie->wordinfo[curword].accept = state; \
1295 if ( noper_next < tail ) { \
1297 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1298 trie->jump[curword] = (U16)(noper_next - convert); \
1300 jumper = noper_next; \
1302 nextbranch= regnext(cur); \
1306 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1307 /* chain, so that when the bits of chain are later */\
1308 /* linked together, the dups appear in the chain */\
1309 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1310 trie->wordinfo[dupe].prev = curword; \
1312 /* we haven't inserted this word yet. */ \
1313 trie->states[ state ].wordnum = curword; \
1318 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1319 ( ( base + charid >= ucharcount \
1320 && base + charid < ubound \
1321 && state == trie->trans[ base - ucharcount + charid ].check \
1322 && trie->trans[ base - ucharcount + charid ].next ) \
1323 ? trie->trans[ base - ucharcount + charid ].next \
1324 : ( state==1 ? special : 0 ) \
1328 #define MADE_JUMP_TRIE 2
1329 #define MADE_EXACT_TRIE 4
1332 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1335 /* first pass, loop through and scan words */
1336 reg_trie_data *trie;
1337 HV *widecharmap = NULL;
1338 AV *revcharmap = newAV();
1340 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1345 regnode *jumper = NULL;
1346 regnode *nextbranch = NULL;
1347 regnode *convert = NULL;
1348 U32 *prev_states; /* temp array mapping each state to previous one */
1349 /* we just use folder as a flag in utf8 */
1350 const U8 * const folder = ( flags == EXACTF
1352 : ( flags == EXACTFL
1359 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1360 AV *trie_words = NULL;
1361 /* along with revcharmap, this only used during construction but both are
1362 * useful during debugging so we store them in the struct when debugging.
1365 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1366 STRLEN trie_charcount=0;
1368 SV *re_trie_maxbuff;
1369 GET_RE_DEBUG_FLAGS_DECL;
1371 PERL_ARGS_ASSERT_MAKE_TRIE;
1373 PERL_UNUSED_ARG(depth);
1376 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1378 trie->startstate = 1;
1379 trie->wordcount = word_count;
1380 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1381 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1382 if (!(UTF && folder))
1383 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1384 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1385 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1388 trie_words = newAV();
1391 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1392 if (!SvIOK(re_trie_maxbuff)) {
1393 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1396 PerlIO_printf( Perl_debug_log,
1397 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1398 (int)depth * 2 + 2, "",
1399 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1400 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1404 /* Find the node we are going to overwrite */
1405 if ( first == startbranch && OP( last ) != BRANCH ) {
1406 /* whole branch chain */
1409 /* branch sub-chain */
1410 convert = NEXTOPER( first );
1413 /* -- First loop and Setup --
1415 We first traverse the branches and scan each word to determine if it
1416 contains widechars, and how many unique chars there are, this is
1417 important as we have to build a table with at least as many columns as we
1420 We use an array of integers to represent the character codes 0..255
1421 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1422 native representation of the character value as the key and IV's for the
1425 *TODO* If we keep track of how many times each character is used we can
1426 remap the columns so that the table compression later on is more
1427 efficient in terms of memory by ensuring most common value is in the
1428 middle and the least common are on the outside. IMO this would be better
1429 than a most to least common mapping as theres a decent chance the most
1430 common letter will share a node with the least common, meaning the node
1431 will not be compressable. With a middle is most common approach the worst
1432 case is when we have the least common nodes twice.
1436 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1437 regnode * const noper = NEXTOPER( cur );
1438 const U8 *uc = (U8*)STRING( noper );
1439 const U8 * const e = uc + STR_LEN( noper );
1441 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1442 const U8 *scan = (U8*)NULL;
1443 U32 wordlen = 0; /* required init */
1445 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1447 if (OP(noper) == NOTHING) {
1451 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1452 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1453 regardless of encoding */
1455 for ( ; uc < e ; uc += len ) {
1456 TRIE_CHARCOUNT(trie)++;
1460 if ( !trie->charmap[ uvc ] ) {
1461 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1463 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1467 /* store the codepoint in the bitmap, and if its ascii
1468 also store its folded equivelent. */
1469 TRIE_BITMAP_SET(trie,uvc);
1471 /* store the folded codepoint */
1472 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1475 /* store first byte of utf8 representation of
1476 codepoints in the 127 < uvc < 256 range */
1477 if (127 < uvc && uvc < 192) {
1478 TRIE_BITMAP_SET(trie,194);
1479 } else if (191 < uvc ) {
1480 TRIE_BITMAP_SET(trie,195);
1481 /* && uvc < 256 -- we know uvc is < 256 already */
1484 set_bit = 0; /* We've done our bit :-) */
1489 widecharmap = newHV();
1491 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1494 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1496 if ( !SvTRUE( *svpp ) ) {
1497 sv_setiv( *svpp, ++trie->uniquecharcount );
1502 if( cur == first ) {
1505 } else if (chars < trie->minlen) {
1507 } else if (chars > trie->maxlen) {
1511 } /* end first pass */
1512 DEBUG_TRIE_COMPILE_r(
1513 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1514 (int)depth * 2 + 2,"",
1515 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1516 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1517 (int)trie->minlen, (int)trie->maxlen )
1521 We now know what we are dealing with in terms of unique chars and
1522 string sizes so we can calculate how much memory a naive
1523 representation using a flat table will take. If it's over a reasonable
1524 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1525 conservative but potentially much slower representation using an array
1528 At the end we convert both representations into the same compressed
1529 form that will be used in regexec.c for matching with. The latter
1530 is a form that cannot be used to construct with but has memory
1531 properties similar to the list form and access properties similar
1532 to the table form making it both suitable for fast searches and
1533 small enough that its feasable to store for the duration of a program.
1535 See the comment in the code where the compressed table is produced
1536 inplace from the flat tabe representation for an explanation of how
1537 the compression works.
1542 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1545 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1547 Second Pass -- Array Of Lists Representation
1549 Each state will be represented by a list of charid:state records
1550 (reg_trie_trans_le) the first such element holds the CUR and LEN
1551 points of the allocated array. (See defines above).
1553 We build the initial structure using the lists, and then convert
1554 it into the compressed table form which allows faster lookups
1555 (but cant be modified once converted).
1558 STRLEN transcount = 1;
1560 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1561 "%*sCompiling trie using list compiler\n",
1562 (int)depth * 2 + 2, ""));
1564 trie->states = (reg_trie_state *)
1565 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1566 sizeof(reg_trie_state) );
1570 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1572 regnode * const noper = NEXTOPER( cur );
1573 U8 *uc = (U8*)STRING( noper );
1574 const U8 * const e = uc + STR_LEN( noper );
1575 U32 state = 1; /* required init */
1576 U16 charid = 0; /* sanity init */
1577 U8 *scan = (U8*)NULL; /* sanity init */
1578 STRLEN foldlen = 0; /* required init */
1579 U32 wordlen = 0; /* required init */
1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1582 if (OP(noper) != NOTHING) {
1583 for ( ; uc < e ; uc += len ) {
1588 charid = trie->charmap[ uvc ];
1590 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1594 charid=(U16)SvIV( *svpp );
1597 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1604 if ( !trie->states[ state ].trans.list ) {
1605 TRIE_LIST_NEW( state );
1607 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1608 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1609 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1614 newstate = next_alloc++;
1615 prev_states[newstate] = state;
1616 TRIE_LIST_PUSH( state, charid, newstate );
1621 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1625 TRIE_HANDLE_WORD(state);
1627 } /* end second pass */
1629 /* next alloc is the NEXT state to be allocated */
1630 trie->statecount = next_alloc;
1631 trie->states = (reg_trie_state *)
1632 PerlMemShared_realloc( trie->states,
1634 * sizeof(reg_trie_state) );
1636 /* and now dump it out before we compress it */
1637 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1638 revcharmap, next_alloc,
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1650 for( state=1 ; state < next_alloc ; state ++ ) {
1654 DEBUG_TRIE_COMPILE_MORE_r(
1655 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1659 if (trie->states[state].trans.list) {
1660 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1664 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1665 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1666 if ( forid < minid ) {
1668 } else if ( forid > maxid ) {
1672 if ( transcount < tp + maxid - minid + 1) {
1674 trie->trans = (reg_trie_trans *)
1675 PerlMemShared_realloc( trie->trans,
1677 * sizeof(reg_trie_trans) );
1678 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1680 base = trie->uniquecharcount + tp - minid;
1681 if ( maxid == minid ) {
1683 for ( ; zp < tp ; zp++ ) {
1684 if ( ! trie->trans[ zp ].next ) {
1685 base = trie->uniquecharcount + zp - minid;
1686 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1687 trie->trans[ zp ].check = state;
1693 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ tp ].check = state;
1699 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1700 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1701 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1702 trie->trans[ tid ].check = state;
1704 tp += ( maxid - minid + 1 );
1706 Safefree(trie->states[ state ].trans.list);
1709 DEBUG_TRIE_COMPILE_MORE_r(
1710 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1713 trie->states[ state ].trans.base=base;
1715 trie->lasttrans = tp + 1;
1719 Second Pass -- Flat Table Representation.
1721 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1722 We know that we will need Charcount+1 trans at most to store the data
1723 (one row per char at worst case) So we preallocate both structures
1724 assuming worst case.
1726 We then construct the trie using only the .next slots of the entry
1729 We use the .check field of the first entry of the node temporarily to
1730 make compression both faster and easier by keeping track of how many non
1731 zero fields are in the node.
1733 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1736 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1737 number representing the first entry of the node, and state as a
1738 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1739 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1740 are 2 entrys per node. eg:
1748 The table is internally in the right hand, idx form. However as we also
1749 have to deal with the states array which is indexed by nodenum we have to
1750 use TRIE_NODENUM() to convert.
1753 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1754 "%*sCompiling trie using table compiler\n",
1755 (int)depth * 2 + 2, ""));
1757 trie->trans = (reg_trie_trans *)
1758 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1759 * trie->uniquecharcount + 1,
1760 sizeof(reg_trie_trans) );
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
1764 next_alloc = trie->uniquecharcount + 1;
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1769 regnode * const noper = NEXTOPER( cur );
1770 const U8 *uc = (U8*)STRING( noper );
1771 const U8 * const e = uc + STR_LEN( noper );
1773 U32 state = 1; /* required init */
1775 U16 charid = 0; /* sanity init */
1776 U32 accept_state = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1779 STRLEN foldlen = 0; /* required init */
1780 U32 wordlen = 0; /* required init */
1781 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1783 if ( OP(noper) != NOTHING ) {
1784 for ( ; uc < e ; uc += len ) {
1789 charid = trie->charmap[ uvc ];
1791 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1792 charid = svpp ? (U16)SvIV(*svpp) : 0;
1796 if ( !trie->trans[ state + charid ].next ) {
1797 trie->trans[ state + charid ].next = next_alloc;
1798 trie->trans[ state ].check++;
1799 prev_states[TRIE_NODENUM(next_alloc)]
1800 = TRIE_NODENUM(state);
1801 next_alloc += trie->uniquecharcount;
1803 state = trie->trans[ state + charid ].next;
1805 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1810 accept_state = TRIE_NODENUM( state );
1811 TRIE_HANDLE_WORD(accept_state);
1813 } /* end second pass */
1815 /* and now dump it out before we compress it */
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1818 next_alloc, depth+1));
1822 * Inplace compress the table.*
1824 For sparse data sets the table constructed by the trie algorithm will
1825 be mostly 0/FAIL transitions or to put it another way mostly empty.
1826 (Note that leaf nodes will not contain any transitions.)
1828 This algorithm compresses the tables by eliminating most such
1829 transitions, at the cost of a modest bit of extra work during lookup:
1831 - Each states[] entry contains a .base field which indicates the
1832 index in the state[] array wheres its transition data is stored.
1834 - If .base is 0 there are no valid transitions from that node.
1836 - If .base is nonzero then charid is added to it to find an entry in
1839 -If trans[states[state].base+charid].check!=state then the
1840 transition is taken to be a 0/Fail transition. Thus if there are fail
1841 transitions at the front of the node then the .base offset will point
1842 somewhere inside the previous nodes data (or maybe even into a node
1843 even earlier), but the .check field determines if the transition is
1847 The following process inplace converts the table to the compressed
1848 table: We first do not compress the root node 1,and mark its all its
1849 .check pointers as 1 and set its .base pointer as 1 as well. This
1850 allows to do a DFA construction from the compressed table later, and
1851 ensures that any .base pointers we calculate later are greater than
1854 - We set 'pos' to indicate the first entry of the second node.
1856 - We then iterate over the columns of the node, finding the first and
1857 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1858 and set the .check pointers accordingly, and advance pos
1859 appropriately and repreat for the next node. Note that when we copy
1860 the next pointers we have to convert them from the original
1861 NODEIDX form to NODENUM form as the former is not valid post
1864 - If a node has no transitions used we mark its base as 0 and do not
1865 advance the pos pointer.
1867 - If a node only has one transition we use a second pointer into the
1868 structure to fill in allocated fail transitions from other states.
1869 This pointer is independent of the main pointer and scans forward
1870 looking for null transitions that are allocated to a state. When it
1871 finds one it writes the single transition into the "hole". If the
1872 pointer doesnt find one the single transition is appended as normal.
1874 - Once compressed we can Renew/realloc the structures to release the
1877 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1878 specifically Fig 3.47 and the associated pseudocode.
1882 const U32 laststate = TRIE_NODENUM( next_alloc );
1885 trie->statecount = laststate;
1887 for ( state = 1 ; state < laststate ; state++ ) {
1889 const U32 stateidx = TRIE_NODEIDX( state );
1890 const U32 o_used = trie->trans[ stateidx ].check;
1891 U32 used = trie->trans[ stateidx ].check;
1892 trie->trans[ stateidx ].check = 0;
1894 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1895 if ( flag || trie->trans[ stateidx + charid ].next ) {
1896 if ( trie->trans[ stateidx + charid ].next ) {
1898 for ( ; zp < pos ; zp++ ) {
1899 if ( ! trie->trans[ zp ].next ) {
1903 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1904 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ zp ].check = state;
1906 if ( ++zp > pos ) pos = zp;
1913 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1915 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ pos ].check = state;
1921 trie->lasttrans = pos + 1;
1922 trie->states = (reg_trie_state *)
1923 PerlMemShared_realloc( trie->states, laststate
1924 * sizeof(reg_trie_state) );
1925 DEBUG_TRIE_COMPILE_MORE_r(
1926 PerlIO_printf( Perl_debug_log,
1927 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1928 (int)depth * 2 + 2,"",
1929 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1932 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1935 } /* end table compress */
1937 DEBUG_TRIE_COMPILE_MORE_r(
1938 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1939 (int)depth * 2 + 2, "",
1940 (UV)trie->statecount,
1941 (UV)trie->lasttrans)
1943 /* resize the trans array to remove unused space */
1944 trie->trans = (reg_trie_trans *)
1945 PerlMemShared_realloc( trie->trans, trie->lasttrans
1946 * sizeof(reg_trie_trans) );
1948 { /* Modify the program and insert the new TRIE node*/
1949 U8 nodetype =(U8)(flags & 0xFF);
1953 regnode *optimize = NULL;
1954 #ifdef RE_TRACK_PATTERN_OFFSETS
1957 U32 mjd_nodelen = 0;
1958 #endif /* RE_TRACK_PATTERN_OFFSETS */
1959 #endif /* DEBUGGING */
1961 This means we convert either the first branch or the first Exact,
1962 depending on whether the thing following (in 'last') is a branch
1963 or not and whther first is the startbranch (ie is it a sub part of
1964 the alternation or is it the whole thing.)
1965 Assuming its a sub part we conver the EXACT otherwise we convert
1966 the whole branch sequence, including the first.
1968 /* Find the node we are going to overwrite */
1969 if ( first != startbranch || OP( last ) == BRANCH ) {
1970 /* branch sub-chain */
1971 NEXT_OFF( first ) = (U16)(last - first);
1972 #ifdef RE_TRACK_PATTERN_OFFSETS
1974 mjd_offset= Node_Offset((convert));
1975 mjd_nodelen= Node_Length((convert));
1978 /* whole branch chain */
1980 #ifdef RE_TRACK_PATTERN_OFFSETS
1983 const regnode *nop = NEXTOPER( convert );
1984 mjd_offset= Node_Offset((nop));
1985 mjd_nodelen= Node_Length((nop));
1989 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1990 (int)depth * 2 + 2, "",
1991 (UV)mjd_offset, (UV)mjd_nodelen)
1994 /* But first we check to see if there is a common prefix we can
1995 split out as an EXACT and put in front of the TRIE node. */
1996 trie->startstate= 1;
1997 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1999 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2003 const U32 base = trie->states[ state ].trans.base;
2005 if ( trie->states[state].wordnum )
2008 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2009 if ( ( base + ofs >= trie->uniquecharcount ) &&
2010 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2011 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2013 if ( ++count > 1 ) {
2014 SV **tmp = av_fetch( revcharmap, ofs, 0);
2015 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2016 if ( state == 1 ) break;
2018 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2020 PerlIO_printf(Perl_debug_log,
2021 "%*sNew Start State=%"UVuf" Class: [",
2022 (int)depth * 2 + 2, "",
2025 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2026 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2028 TRIE_BITMAP_SET(trie,*ch);
2030 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2032 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2036 TRIE_BITMAP_SET(trie,*ch);
2038 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2045 SV **tmp = av_fetch( revcharmap, idx, 0);
2047 char *ch = SvPV( *tmp, len );
2049 SV *sv=sv_newmortal();
2050 PerlIO_printf( Perl_debug_log,
2051 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2052 (int)depth * 2 + 2, "",
2054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2055 PL_colors[0], PL_colors[1],
2056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2057 PERL_PV_ESCAPE_FIRSTCHAR
2062 OP( convert ) = nodetype;
2063 str=STRING(convert);
2066 STR_LEN(convert) += len;
2072 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2077 trie->prefixlen = (state-1);
2079 regnode *n = convert+NODE_SZ_STR(convert);
2080 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2081 trie->startstate = state;
2082 trie->minlen -= (state - 1);
2083 trie->maxlen -= (state - 1);
2085 /* At least the UNICOS C compiler choked on this
2086 * being argument to DEBUG_r(), so let's just have
2089 #ifdef PERL_EXT_RE_BUILD
2095 regnode *fix = convert;
2096 U32 word = trie->wordcount;
2098 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2099 while( ++fix < n ) {
2100 Set_Node_Offset_Length(fix, 0, 0);
2103 SV ** const tmp = av_fetch( trie_words, word, 0 );
2105 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2106 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2108 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2116 NEXT_OFF(convert) = (U16)(tail - convert);
2117 DEBUG_r(optimize= n);
2123 if ( trie->maxlen ) {
2124 NEXT_OFF( convert ) = (U16)(tail - convert);
2125 ARG_SET( convert, data_slot );
2126 /* Store the offset to the first unabsorbed branch in
2127 jump[0], which is otherwise unused by the jump logic.
2128 We use this when dumping a trie and during optimisation. */
2130 trie->jump[0] = (U16)(nextbranch - convert);
2133 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2134 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2136 OP( convert ) = TRIEC;
2137 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2138 PerlMemShared_free(trie->bitmap);
2141 OP( convert ) = TRIE;
2143 /* store the type in the flags */
2144 convert->flags = nodetype;
2148 + regarglen[ OP( convert ) ];
2150 /* XXX We really should free up the resource in trie now,
2151 as we won't use them - (which resources?) dmq */
2153 /* needed for dumping*/
2154 DEBUG_r(if (optimize) {
2155 regnode *opt = convert;
2157 while ( ++opt < optimize) {
2158 Set_Node_Offset_Length(opt,0,0);
2161 Try to clean up some of the debris left after the
2164 while( optimize < jumper ) {
2165 mjd_nodelen += Node_Length((optimize));
2166 OP( optimize ) = OPTIMIZED;
2167 Set_Node_Offset_Length(optimize,0,0);
2170 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2172 } /* end node insert */
2174 /* Finish populating the prev field of the wordinfo array. Walk back
2175 * from each accept state until we find another accept state, and if
2176 * so, point the first word's .prev field at the second word. If the
2177 * second already has a .prev field set, stop now. This will be the
2178 * case either if we've already processed that word's accept state,
2179 * or that that state had multiple words, and the overspill words
2180 * were already linked up earlier.
2187 for (word=1; word <= trie->wordcount; word++) {
2189 if (trie->wordinfo[word].prev)
2191 state = trie->wordinfo[word].accept;
2193 state = prev_states[state];
2196 prev = trie->states[state].wordnum;
2200 trie->wordinfo[word].prev = prev;
2202 Safefree(prev_states);
2206 /* and now dump out the compressed format */
2207 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2209 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2211 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2212 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2214 SvREFCNT_dec(revcharmap);
2218 : trie->startstate>1
2224 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2226 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2228 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2229 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2232 We find the fail state for each state in the trie, this state is the longest proper
2233 suffix of the current states 'word' that is also a proper prefix of another word in our
2234 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2235 the DFA not to have to restart after its tried and failed a word at a given point, it
2236 simply continues as though it had been matching the other word in the first place.
2238 'abcdgu'=~/abcdefg|cdgu/
2239 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2240 fail, which would bring use to the state representing 'd' in the second word where we would
2241 try 'g' and succeed, prodceding to match 'cdgu'.
2243 /* add a fail transition */
2244 const U32 trie_offset = ARG(source);
2245 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2247 const U32 ucharcount = trie->uniquecharcount;
2248 const U32 numstates = trie->statecount;
2249 const U32 ubound = trie->lasttrans + ucharcount;
2253 U32 base = trie->states[ 1 ].trans.base;
2256 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2257 GET_RE_DEBUG_FLAGS_DECL;
2259 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2261 PERL_UNUSED_ARG(depth);
2265 ARG_SET( stclass, data_slot );
2266 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2267 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2268 aho->trie=trie_offset;
2269 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2270 Copy( trie->states, aho->states, numstates, reg_trie_state );
2271 Newxz( q, numstates, U32);
2272 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2275 /* initialize fail[0..1] to be 1 so that we always have
2276 a valid final fail state */
2277 fail[ 0 ] = fail[ 1 ] = 1;
2279 for ( charid = 0; charid < ucharcount ; charid++ ) {
2280 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2282 q[ q_write ] = newstate;
2283 /* set to point at the root */
2284 fail[ q[ q_write++ ] ]=1;
2287 while ( q_read < q_write) {
2288 const U32 cur = q[ q_read++ % numstates ];
2289 base = trie->states[ cur ].trans.base;
2291 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2292 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2294 U32 fail_state = cur;
2297 fail_state = fail[ fail_state ];
2298 fail_base = aho->states[ fail_state ].trans.base;
2299 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2301 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2302 fail[ ch_state ] = fail_state;
2303 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2305 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2307 q[ q_write++ % numstates] = ch_state;
2311 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2312 when we fail in state 1, this allows us to use the
2313 charclass scan to find a valid start char. This is based on the principle
2314 that theres a good chance the string being searched contains lots of stuff
2315 that cant be a start char.
2317 fail[ 0 ] = fail[ 1 ] = 0;
2318 DEBUG_TRIE_COMPILE_r({
2319 PerlIO_printf(Perl_debug_log,
2320 "%*sStclass Failtable (%"UVuf" states): 0",
2321 (int)(depth * 2), "", (UV)numstates
2323 for( q_read=1; q_read<numstates; q_read++ ) {
2324 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2326 PerlIO_printf(Perl_debug_log, "\n");
2329 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2334 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2335 * These need to be revisited when a newer toolchain becomes available.
2337 #if defined(__sparc64__) && defined(__GNUC__)
2338 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2339 # undef SPARC64_GCC_WORKAROUND
2340 # define SPARC64_GCC_WORKAROUND 1
2344 #define DEBUG_PEEP(str,scan,depth) \
2345 DEBUG_OPTIMISE_r({if (scan){ \
2346 SV * const mysv=sv_newmortal(); \
2347 regnode *Next = regnext(scan); \
2348 regprop(RExC_rx, mysv, scan); \
2349 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2350 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2351 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2358 #define JOIN_EXACT(scan,min,flags) \
2359 if (PL_regkind[OP(scan)] == EXACT) \
2360 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2363 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2364 /* Merge several consecutive EXACTish nodes into one. */
2365 regnode *n = regnext(scan);
2367 regnode *next = scan + NODE_SZ_STR(scan);
2371 regnode *stop = scan;
2372 GET_RE_DEBUG_FLAGS_DECL;
2374 PERL_UNUSED_ARG(depth);
2377 PERL_ARGS_ASSERT_JOIN_EXACT;
2378 #ifndef EXPERIMENTAL_INPLACESCAN
2379 PERL_UNUSED_ARG(flags);
2380 PERL_UNUSED_ARG(val);
2382 DEBUG_PEEP("join",scan,depth);
2384 /* Skip NOTHING, merge EXACT*. */
2386 ( PL_regkind[OP(n)] == NOTHING ||
2387 (stringok && (OP(n) == OP(scan))))
2389 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2391 if (OP(n) == TAIL || n > next)
2393 if (PL_regkind[OP(n)] == NOTHING) {
2394 DEBUG_PEEP("skip:",n,depth);
2395 NEXT_OFF(scan) += NEXT_OFF(n);
2396 next = n + NODE_STEP_REGNODE;
2403 else if (stringok) {
2404 const unsigned int oldl = STR_LEN(scan);
2405 regnode * const nnext = regnext(n);
2407 DEBUG_PEEP("merg",n,depth);
2410 if (oldl + STR_LEN(n) > U8_MAX)
2412 NEXT_OFF(scan) += NEXT_OFF(n);
2413 STR_LEN(scan) += STR_LEN(n);
2414 next = n + NODE_SZ_STR(n);
2415 /* Now we can overwrite *n : */
2416 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2424 #ifdef EXPERIMENTAL_INPLACESCAN
2425 if (flags && !NEXT_OFF(n)) {
2426 DEBUG_PEEP("atch", val, depth);
2427 if (reg_off_by_arg[OP(n)]) {
2428 ARG_SET(n, val - n);
2431 NEXT_OFF(n) = val - n;
2438 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2440 Two problematic code points in Unicode casefolding of EXACT nodes:
2442 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2443 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2449 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2450 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2452 This means that in case-insensitive matching (or "loose matching",
2453 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2454 length of the above casefolded versions) can match a target string
2455 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2456 This would rather mess up the minimum length computation.
2458 What we'll do is to look for the tail four bytes, and then peek
2459 at the preceding two bytes to see whether we need to decrease
2460 the minimum length by four (six minus two).
2462 Thanks to the design of UTF-8, there cannot be false matches:
2463 A sequence of valid UTF-8 bytes cannot be a subsequence of
2464 another valid sequence of UTF-8 bytes.
2467 char * const s0 = STRING(scan), *s, *t;
2468 char * const s1 = s0 + STR_LEN(scan) - 1;
2469 char * const s2 = s1 - 4;
2470 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2471 const char t0[] = "\xaf\x49\xaf\x42";
2473 const char t0[] = "\xcc\x88\xcc\x81";
2475 const char * const t1 = t0 + 3;
2478 s < s2 && (t = ninstr(s, s1, t0, t1));
2481 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2482 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2484 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2485 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2493 n = scan + NODE_SZ_STR(scan);
2495 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2502 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2506 /* REx optimizer. Converts nodes into quickier variants "in place".
2507 Finds fixed substrings. */
2509 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2510 to the position after last scanned or to NULL. */
2512 #define INIT_AND_WITHP \
2513 assert(!and_withp); \
2514 Newx(and_withp,1,struct regnode_charclass_class); \
2515 SAVEFREEPV(and_withp)
2517 /* this is a chain of data about sub patterns we are processing that
2518 need to be handled seperately/specially in study_chunk. Its so
2519 we can simulate recursion without losing state. */
2521 typedef struct scan_frame {
2522 regnode *last; /* last node to process in this frame */
2523 regnode *next; /* next node to process when last is reached */
2524 struct scan_frame *prev; /*previous frame*/
2525 I32 stop; /* what stopparen do we use */
2529 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2531 #define CASE_SYNST_FNC(nAmE) \
2533 if (flags & SCF_DO_STCLASS_AND) { \
2534 for (value = 0; value < 256; value++) \
2535 if (!is_ ## nAmE ## _cp(value)) \
2536 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2539 for (value = 0; value < 256; value++) \
2540 if (is_ ## nAmE ## _cp(value)) \
2541 ANYOF_BITMAP_SET(data->start_class, value); \
2545 if (flags & SCF_DO_STCLASS_AND) { \
2546 for (value = 0; value < 256; value++) \
2547 if (is_ ## nAmE ## _cp(value)) \
2548 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2551 for (value = 0; value < 256; value++) \
2552 if (!is_ ## nAmE ## _cp(value)) \
2553 ANYOF_BITMAP_SET(data->start_class, value); \
2560 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2561 I32 *minlenp, I32 *deltap,
2566 struct regnode_charclass_class *and_withp,
2567 U32 flags, U32 depth)
2568 /* scanp: Start here (read-write). */
2569 /* deltap: Write maxlen-minlen here. */
2570 /* last: Stop before this one. */
2571 /* data: string data about the pattern */
2572 /* stopparen: treat close N as END */
2573 /* recursed: which subroutines have we recursed into */
2574 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2577 I32 min = 0, pars = 0, code;
2578 regnode *scan = *scanp, *next;
2580 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2581 int is_inf_internal = 0; /* The studied chunk is infinite */
2582 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2583 scan_data_t data_fake;
2584 SV *re_trie_maxbuff = NULL;
2585 regnode *first_non_open = scan;
2586 I32 stopmin = I32_MAX;
2587 scan_frame *frame = NULL;
2588 GET_RE_DEBUG_FLAGS_DECL;
2590 PERL_ARGS_ASSERT_STUDY_CHUNK;
2593 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2597 while (first_non_open && OP(first_non_open) == OPEN)
2598 first_non_open=regnext(first_non_open);
2603 while ( scan && OP(scan) != END && scan < last ){
2604 /* Peephole optimizer: */
2605 DEBUG_STUDYDATA("Peep:", data,depth);
2606 DEBUG_PEEP("Peep",scan,depth);
2607 JOIN_EXACT(scan,&min,0);
2609 /* Follow the next-chain of the current node and optimize
2610 away all the NOTHINGs from it. */
2611 if (OP(scan) != CURLYX) {
2612 const int max = (reg_off_by_arg[OP(scan)]
2614 /* I32 may be smaller than U16 on CRAYs! */
2615 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2616 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2620 /* Skip NOTHING and LONGJMP. */
2621 while ((n = regnext(n))
2622 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2623 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2624 && off + noff < max)
2626 if (reg_off_by_arg[OP(scan)])
2629 NEXT_OFF(scan) = off;
2634 /* The principal pseudo-switch. Cannot be a switch, since we
2635 look into several different things. */
2636 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2637 || OP(scan) == IFTHEN) {
2638 next = regnext(scan);
2640 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2642 if (OP(next) == code || code == IFTHEN) {
2643 /* NOTE - There is similar code to this block below for handling
2644 TRIE nodes on a re-study. If you change stuff here check there
2646 I32 max1 = 0, min1 = I32_MAX, num = 0;
2647 struct regnode_charclass_class accum;
2648 regnode * const startbranch=scan;
2650 if (flags & SCF_DO_SUBSTR)
2651 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2652 if (flags & SCF_DO_STCLASS)
2653 cl_init_zero(pRExC_state, &accum);
2655 while (OP(scan) == code) {
2656 I32 deltanext, minnext, f = 0, fake;
2657 struct regnode_charclass_class this_class;
2660 data_fake.flags = 0;
2662 data_fake.whilem_c = data->whilem_c;
2663 data_fake.last_closep = data->last_closep;
2666 data_fake.last_closep = &fake;
2668 data_fake.pos_delta = delta;
2669 next = regnext(scan);
2670 scan = NEXTOPER(scan);
2672 scan = NEXTOPER(scan);
2673 if (flags & SCF_DO_STCLASS) {
2674 cl_init(pRExC_state, &this_class);
2675 data_fake.start_class = &this_class;
2676 f = SCF_DO_STCLASS_AND;
2678 if (flags & SCF_WHILEM_VISITED_POS)
2679 f |= SCF_WHILEM_VISITED_POS;
2681 /* we suppose the run is continuous, last=next...*/
2682 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2684 stopparen, recursed, NULL, f,depth+1);
2687 if (max1 < minnext + deltanext)
2688 max1 = minnext + deltanext;
2689 if (deltanext == I32_MAX)
2690 is_inf = is_inf_internal = 1;
2692 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2694 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2695 if ( stopmin > minnext)
2696 stopmin = min + min1;
2697 flags &= ~SCF_DO_SUBSTR;
2699 data->flags |= SCF_SEEN_ACCEPT;
2702 if (data_fake.flags & SF_HAS_EVAL)
2703 data->flags |= SF_HAS_EVAL;
2704 data->whilem_c = data_fake.whilem_c;
2706 if (flags & SCF_DO_STCLASS)
2707 cl_or(pRExC_state, &accum, &this_class);
2709 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2711 if (flags & SCF_DO_SUBSTR) {
2712 data->pos_min += min1;
2713 data->pos_delta += max1 - min1;
2714 if (max1 != min1 || is_inf)
2715 data->longest = &(data->longest_float);
2718 delta += max1 - min1;
2719 if (flags & SCF_DO_STCLASS_OR) {
2720 cl_or(pRExC_state, data->start_class, &accum);
2722 cl_and(data->start_class, and_withp);
2723 flags &= ~SCF_DO_STCLASS;
2726 else if (flags & SCF_DO_STCLASS_AND) {
2728 cl_and(data->start_class, &accum);
2729 flags &= ~SCF_DO_STCLASS;
2732 /* Switch to OR mode: cache the old value of
2733 * data->start_class */
2735 StructCopy(data->start_class, and_withp,
2736 struct regnode_charclass_class);
2737 flags &= ~SCF_DO_STCLASS_AND;
2738 StructCopy(&accum, data->start_class,
2739 struct regnode_charclass_class);
2740 flags |= SCF_DO_STCLASS_OR;
2741 data->start_class->flags |= ANYOF_EOS;
2745 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2748 Assuming this was/is a branch we are dealing with: 'scan' now
2749 points at the item that follows the branch sequence, whatever
2750 it is. We now start at the beginning of the sequence and look
2757 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2759 If we can find such a subseqence we need to turn the first
2760 element into a trie and then add the subsequent branch exact
2761 strings to the trie.
2765 1. patterns where the whole set of branch can be converted.
2767 2. patterns where only a subset can be converted.
2769 In case 1 we can replace the whole set with a single regop
2770 for the trie. In case 2 we need to keep the start and end
2773 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2774 becomes BRANCH TRIE; BRANCH X;
2776 There is an additional case, that being where there is a
2777 common prefix, which gets split out into an EXACT like node
2778 preceding the TRIE node.
2780 If x(1..n)==tail then we can do a simple trie, if not we make
2781 a "jump" trie, such that when we match the appropriate word
2782 we "jump" to the appopriate tail node. Essentailly we turn
2783 a nested if into a case structure of sorts.
2788 if (!re_trie_maxbuff) {
2789 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2790 if (!SvIOK(re_trie_maxbuff))
2791 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2793 if ( SvIV(re_trie_maxbuff)>=0 ) {
2795 regnode *first = (regnode *)NULL;
2796 regnode *last = (regnode *)NULL;
2797 regnode *tail = scan;
2802 SV * const mysv = sv_newmortal(); /* for dumping */
2804 /* var tail is used because there may be a TAIL
2805 regop in the way. Ie, the exacts will point to the
2806 thing following the TAIL, but the last branch will
2807 point at the TAIL. So we advance tail. If we
2808 have nested (?:) we may have to move through several
2812 while ( OP( tail ) == TAIL ) {
2813 /* this is the TAIL generated by (?:) */
2814 tail = regnext( tail );
2819 regprop(RExC_rx, mysv, tail );
2820 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2821 (int)depth * 2 + 2, "",
2822 "Looking for TRIE'able sequences. Tail node is: ",
2823 SvPV_nolen_const( mysv )
2829 step through the branches, cur represents each
2830 branch, noper is the first thing to be matched
2831 as part of that branch and noper_next is the
2832 regnext() of that node. if noper is an EXACT
2833 and noper_next is the same as scan (our current
2834 position in the regex) then the EXACT branch is
2835 a possible optimization target. Once we have
2836 two or more consequetive such branches we can
2837 create a trie of the EXACT's contents and stich
2838 it in place. If the sequence represents all of
2839 the branches we eliminate the whole thing and
2840 replace it with a single TRIE. If it is a
2841 subsequence then we need to stitch it in. This
2842 means the first branch has to remain, and needs
2843 to be repointed at the item on the branch chain
2844 following the last branch optimized. This could
2845 be either a BRANCH, in which case the
2846 subsequence is internal, or it could be the
2847 item following the branch sequence in which
2848 case the subsequence is at the end.
2852 /* dont use tail as the end marker for this traverse */
2853 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2854 regnode * const noper = NEXTOPER( cur );
2855 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2856 regnode * const noper_next = regnext( noper );
2860 regprop(RExC_rx, mysv, cur);
2861 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2862 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2864 regprop(RExC_rx, mysv, noper);
2865 PerlIO_printf( Perl_debug_log, " -> %s",
2866 SvPV_nolen_const(mysv));
2869 regprop(RExC_rx, mysv, noper_next );
2870 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2871 SvPV_nolen_const(mysv));
2873 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2874 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2876 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2877 : PL_regkind[ OP( noper ) ] == EXACT )
2878 || OP(noper) == NOTHING )
2880 && noper_next == tail
2885 if ( !first || optype == NOTHING ) {
2886 if (!first) first = cur;
2887 optype = OP( noper );
2893 Currently we do not believe that the trie logic can
2894 handle case insensitive matching properly when the
2895 pattern is not unicode (thus forcing unicode semantics).
2897 If/when this is fixed the following define can be swapped
2898 in below to fully enable trie logic.
2900 #define TRIE_TYPE_IS_SAFE 1
2903 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2905 if ( last && TRIE_TYPE_IS_SAFE ) {
2906 make_trie( pRExC_state,
2907 startbranch, first, cur, tail, count,
2910 if ( PL_regkind[ OP( noper ) ] == EXACT
2912 && noper_next == tail
2917 optype = OP( noper );
2927 regprop(RExC_rx, mysv, cur);
2928 PerlIO_printf( Perl_debug_log,
2929 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2930 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2934 if ( last && TRIE_TYPE_IS_SAFE ) {
2935 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2936 #ifdef TRIE_STUDY_OPT
2937 if ( ((made == MADE_EXACT_TRIE &&
2938 startbranch == first)
2939 || ( first_non_open == first )) &&
2941 flags |= SCF_TRIE_RESTUDY;
2942 if ( startbranch == first
2945 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2956 scan = NEXTOPER(NEXTOPER(scan));
2957 } else /* single branch is optimized. */
2958 scan = NEXTOPER(scan);
2960 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2961 scan_frame *newframe = NULL;
2966 if (OP(scan) != SUSPEND) {
2967 /* set the pointer */
2968 if (OP(scan) == GOSUB) {
2970 RExC_recurse[ARG2L(scan)] = scan;
2971 start = RExC_open_parens[paren-1];
2972 end = RExC_close_parens[paren-1];
2975 start = RExC_rxi->program + 1;
2979 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2980 SAVEFREEPV(recursed);
2982 if (!PAREN_TEST(recursed,paren+1)) {
2983 PAREN_SET(recursed,paren+1);
2984 Newx(newframe,1,scan_frame);
2986 if (flags & SCF_DO_SUBSTR) {
2987 SCAN_COMMIT(pRExC_state,data,minlenp);
2988 data->longest = &(data->longest_float);
2990 is_inf = is_inf_internal = 1;
2991 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2992 cl_anything(pRExC_state, data->start_class);
2993 flags &= ~SCF_DO_STCLASS;
2996 Newx(newframe,1,scan_frame);
2999 end = regnext(scan);
3004 SAVEFREEPV(newframe);
3005 newframe->next = regnext(scan);
3006 newframe->last = last;
3007 newframe->stop = stopparen;
3008 newframe->prev = frame;
3018 else if (OP(scan) == EXACT) {
3019 I32 l = STR_LEN(scan);
3022 const U8 * const s = (U8*)STRING(scan);
3023 l = utf8_length(s, s + l);
3024 uc = utf8_to_uvchr(s, NULL);
3026 uc = *((U8*)STRING(scan));
3029 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3030 /* The code below prefers earlier match for fixed
3031 offset, later match for variable offset. */
3032 if (data->last_end == -1) { /* Update the start info. */
3033 data->last_start_min = data->pos_min;
3034 data->last_start_max = is_inf
3035 ? I32_MAX : data->pos_min + data->pos_delta;
3037 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3039 SvUTF8_on(data->last_found);
3041 SV * const sv = data->last_found;
3042 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3043 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3044 if (mg && mg->mg_len >= 0)
3045 mg->mg_len += utf8_length((U8*)STRING(scan),
3046 (U8*)STRING(scan)+STR_LEN(scan));
3048 data->last_end = data->pos_min + l;
3049 data->pos_min += l; /* As in the first entry. */
3050 data->flags &= ~SF_BEFORE_EOL;
3052 if (flags & SCF_DO_STCLASS_AND) {
3053 /* Check whether it is compatible with what we know already! */
3057 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3058 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3059 && (!(data->start_class->flags & ANYOF_FOLD)
3060 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3063 ANYOF_CLASS_ZERO(data->start_class);
3064 ANYOF_BITMAP_ZERO(data->start_class);
3066 ANYOF_BITMAP_SET(data->start_class, uc);
3067 data->start_class->flags &= ~ANYOF_EOS;
3069 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3071 else if (flags & SCF_DO_STCLASS_OR) {
3072 /* false positive possible if the class is case-folded */
3074 ANYOF_BITMAP_SET(data->start_class, uc);
3076 data->start_class->flags |= ANYOF_UNICODE_ALL;
3077 data->start_class->flags &= ~ANYOF_EOS;
3078 cl_and(data->start_class, and_withp);
3080 flags &= ~SCF_DO_STCLASS;
3082 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3083 I32 l = STR_LEN(scan);
3084 UV uc = *((U8*)STRING(scan));
3086 /* Search for fixed substrings supports EXACT only. */
3087 if (flags & SCF_DO_SUBSTR) {
3089 SCAN_COMMIT(pRExC_state, data, minlenp);
3092 const U8 * const s = (U8 *)STRING(scan);
3093 l = utf8_length(s, s + l);
3094 uc = utf8_to_uvchr(s, NULL);
3097 if (flags & SCF_DO_SUBSTR)
3099 if (flags & SCF_DO_STCLASS_AND) {
3100 /* Check whether it is compatible with what we know already! */
3104 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3105 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3106 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3108 ANYOF_CLASS_ZERO(data->start_class);
3109 ANYOF_BITMAP_ZERO(data->start_class);
3111 ANYOF_BITMAP_SET(data->start_class, uc);
3112 data->start_class->flags &= ~ANYOF_EOS;
3113 data->start_class->flags |= ANYOF_FOLD;
3114 if (OP(scan) == EXACTFL)
3115 data->start_class->flags |= ANYOF_LOCALE;
3118 else if (flags & SCF_DO_STCLASS_OR) {
3119 if (data->start_class->flags & ANYOF_FOLD) {
3120 /* false positive possible if the class is case-folded.
3121 Assume that the locale settings are the same... */
3123 ANYOF_BITMAP_SET(data->start_class, uc);
3124 data->start_class->flags &= ~ANYOF_EOS;
3126 cl_and(data->start_class, and_withp);
3128 flags &= ~SCF_DO_STCLASS;
3130 else if (REGNODE_VARIES(OP(scan))) {
3131 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3132 I32 f = flags, pos_before = 0;
3133 regnode * const oscan = scan;
3134 struct regnode_charclass_class this_class;
3135 struct regnode_charclass_class *oclass = NULL;
3136 I32 next_is_eval = 0;
3138 switch (PL_regkind[OP(scan)]) {
3139 case WHILEM: /* End of (?:...)* . */
3140 scan = NEXTOPER(scan);
3143 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3144 next = NEXTOPER(scan);
3145 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3147 maxcount = REG_INFTY;
3148 next = regnext(scan);
3149 scan = NEXTOPER(scan);
3153 if (flags & SCF_DO_SUBSTR)
3158 if (flags & SCF_DO_STCLASS) {
3160 maxcount = REG_INFTY;
3161 next = regnext(scan);
3162 scan = NEXTOPER(scan);
3165 is_inf = is_inf_internal = 1;
3166 scan = regnext(scan);
3167 if (flags & SCF_DO_SUBSTR) {
3168 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3169 data->longest = &(data->longest_float);
3171 goto optimize_curly_tail;
3173 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3174 && (scan->flags == stopparen))
3179 mincount = ARG1(scan);
3180 maxcount = ARG2(scan);
3182 next = regnext(scan);
3183 if (OP(scan) == CURLYX) {
3184 I32 lp = (data ? *(data->last_closep) : 0);
3185 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3187 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3188 next_is_eval = (OP(scan) == EVAL);
3190 if (flags & SCF_DO_SUBSTR) {
3191 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3192 pos_before = data->pos_min;
3196 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3198 data->flags |= SF_IS_INF;
3200 if (flags & SCF_DO_STCLASS) {
3201 cl_init(pRExC_state, &this_class);
3202 oclass = data->start_class;
3203 data->start_class = &this_class;
3204 f |= SCF_DO_STCLASS_AND;
3205 f &= ~SCF_DO_STCLASS_OR;
3207 /* These are the cases when once a subexpression
3208 fails at a particular position, it cannot succeed
3209 even after backtracking at the enclosing scope.
3211 XXXX what if minimal match and we are at the
3212 initial run of {n,m}? */
3213 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3214 f &= ~SCF_WHILEM_VISITED_POS;
3216 /* This will finish on WHILEM, setting scan, or on NULL: */
3217 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3218 last, data, stopparen, recursed, NULL,
3220 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3222 if (flags & SCF_DO_STCLASS)
3223 data->start_class = oclass;
3224 if (mincount == 0 || minnext == 0) {
3225 if (flags & SCF_DO_STCLASS_OR) {
3226 cl_or(pRExC_state, data->start_class, &this_class);
3228 else if (flags & SCF_DO_STCLASS_AND) {
3229 /* Switch to OR mode: cache the old value of
3230 * data->start_class */
3232 StructCopy(data->start_class, and_withp,
3233 struct regnode_charclass_class);
3234 flags &= ~SCF_DO_STCLASS_AND;
3235 StructCopy(&this_class, data->start_class,
3236 struct regnode_charclass_class);
3237 flags |= SCF_DO_STCLASS_OR;
3238 data->start_class->flags |= ANYOF_EOS;
3240 } else { /* Non-zero len */
3241 if (flags & SCF_DO_STCLASS_OR) {
3242 cl_or(pRExC_state, data->start_class, &this_class);
3243 cl_and(data->start_class, and_withp);
3245 else if (flags & SCF_DO_STCLASS_AND)
3246 cl_and(data->start_class, &this_class);
3247 flags &= ~SCF_DO_STCLASS;
3249 if (!scan) /* It was not CURLYX, but CURLY. */
3251 if ( /* ? quantifier ok, except for (?{ ... }) */
3252 (next_is_eval || !(mincount == 0 && maxcount == 1))
3253 && (minnext == 0) && (deltanext == 0)
3254 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3255 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3257 ckWARNreg(RExC_parse,
3258 "Quantifier unexpected on zero-length expression");
3261 min += minnext * mincount;
3262 is_inf_internal |= ((maxcount == REG_INFTY
3263 && (minnext + deltanext) > 0)
3264 || deltanext == I32_MAX);
3265 is_inf |= is_inf_internal;
3266 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3268 /* Try powerful optimization CURLYX => CURLYN. */
3269 if ( OP(oscan) == CURLYX && data
3270 && data->flags & SF_IN_PAR
3271 && !(data->flags & SF_HAS_EVAL)
3272 && !deltanext && minnext == 1 ) {
3273 /* Try to optimize to CURLYN. */
3274 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3275 regnode * const nxt1 = nxt;
3282 if (!REGNODE_SIMPLE(OP(nxt))
3283 && !(PL_regkind[OP(nxt)] == EXACT
3284 && STR_LEN(nxt) == 1))
3290 if (OP(nxt) != CLOSE)
3292 if (RExC_open_parens) {
3293 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3294 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3296 /* Now we know that nxt2 is the only contents: */
3297 oscan->flags = (U8)ARG(nxt);
3299 OP(nxt1) = NOTHING; /* was OPEN. */
3302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3303 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3304 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3305 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3306 OP(nxt + 1) = OPTIMIZED; /* was count. */
3307 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3312 /* Try optimization CURLYX => CURLYM. */
3313 if ( OP(oscan) == CURLYX && data
3314 && !(data->flags & SF_HAS_PAR)
3315 && !(data->flags & SF_HAS_EVAL)
3316 && !deltanext /* atom is fixed width */
3317 && minnext != 0 /* CURLYM can't handle zero width */
3319 /* XXXX How to optimize if data == 0? */
3320 /* Optimize to a simpler form. */
3321 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3325 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3326 && (OP(nxt2) != WHILEM))
3328 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3329 /* Need to optimize away parenths. */
3330 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3331 /* Set the parenth number. */
3332 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3334 oscan->flags = (U8)ARG(nxt);
3335 if (RExC_open_parens) {
3336 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3337 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3339 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3340 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3343 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3344 OP(nxt + 1) = OPTIMIZED; /* was count. */
3345 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3346 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3349 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3350 regnode *nnxt = regnext(nxt1);
3352 if (reg_off_by_arg[OP(nxt1)])
3353 ARG_SET(nxt1, nxt2 - nxt1);
3354 else if (nxt2 - nxt1 < U16_MAX)
3355 NEXT_OFF(nxt1) = nxt2 - nxt1;
3357 OP(nxt) = NOTHING; /* Cannot beautify */
3362 /* Optimize again: */
3363 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3364 NULL, stopparen, recursed, NULL, 0,depth+1);
3369 else if ((OP(oscan) == CURLYX)
3370 && (flags & SCF_WHILEM_VISITED_POS)
3371 /* See the comment on a similar expression above.
3372 However, this time it not a subexpression
3373 we care about, but the expression itself. */
3374 && (maxcount == REG_INFTY)
3375 && data && ++data->whilem_c < 16) {
3376 /* This stays as CURLYX, we can put the count/of pair. */
3377 /* Find WHILEM (as in regexec.c) */
3378 regnode *nxt = oscan + NEXT_OFF(oscan);
3380 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3382 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3383 | (RExC_whilem_seen << 4)); /* On WHILEM */
3385 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3387 if (flags & SCF_DO_SUBSTR) {
3388 SV *last_str = NULL;
3389 int counted = mincount != 0;
3391 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3392 #if defined(SPARC64_GCC_WORKAROUND)
3395 const char *s = NULL;
3398 if (pos_before >= data->last_start_min)
3401 b = data->last_start_min;
3404 s = SvPV_const(data->last_found, l);
3405 old = b - data->last_start_min;
3408 I32 b = pos_before >= data->last_start_min
3409 ? pos_before : data->last_start_min;
3411 const char * const s = SvPV_const(data->last_found, l);
3412 I32 old = b - data->last_start_min;
3416 old = utf8_hop((U8*)s, old) - (U8*)s;
3418 /* Get the added string: */
3419 last_str = newSVpvn_utf8(s + old, l, UTF);
3420 if (deltanext == 0 && pos_before == b) {
3421 /* What was added is a constant string */
3423 SvGROW(last_str, (mincount * l) + 1);
3424 repeatcpy(SvPVX(last_str) + l,
3425 SvPVX_const(last_str), l, mincount - 1);
3426 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3427 /* Add additional parts. */
3428 SvCUR_set(data->last_found,
3429 SvCUR(data->last_found) - l);
3430 sv_catsv(data->last_found, last_str);
3432 SV * sv = data->last_found;
3434 SvUTF8(sv) && SvMAGICAL(sv) ?
3435 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436 if (mg && mg->mg_len >= 0)
3437 mg->mg_len += CHR_SVLEN(last_str) - l;
3439 data->last_end += l * (mincount - 1);
3442 /* start offset must point into the last copy */
3443 data->last_start_min += minnext * (mincount - 1);
3444 data->last_start_max += is_inf ? I32_MAX
3445 : (maxcount - 1) * (minnext + data->pos_delta);
3448 /* It is counted once already... */
3449 data->pos_min += minnext * (mincount - counted);
3450 data->pos_delta += - counted * deltanext +
3451 (minnext + deltanext) * maxcount - minnext * mincount;
3452 if (mincount != maxcount) {
3453 /* Cannot extend fixed substrings found inside
3455 SCAN_COMMIT(pRExC_state,data,minlenp);
3456 if (mincount && last_str) {
3457 SV * const sv = data->last_found;
3458 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3459 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3463 sv_setsv(sv, last_str);
3464 data->last_end = data->pos_min;
3465 data->last_start_min =
3466 data->pos_min - CHR_SVLEN(last_str);
3467 data->last_start_max = is_inf
3469 : data->pos_min + data->pos_delta
3470 - CHR_SVLEN(last_str);
3472 data->longest = &(data->longest_float);
3474 SvREFCNT_dec(last_str);
3476 if (data && (fl & SF_HAS_EVAL))
3477 data->flags |= SF_HAS_EVAL;
3478 optimize_curly_tail:
3479 if (OP(oscan) != CURLYX) {
3480 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3482 NEXT_OFF(oscan) += NEXT_OFF(next);
3485 default: /* REF and CLUMP only? */
3486 if (flags & SCF_DO_SUBSTR) {
3487 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3488 data->longest = &(data->longest_float);
3490 is_inf = is_inf_internal = 1;
3491 if (flags & SCF_DO_STCLASS_OR)
3492 cl_anything(pRExC_state, data->start_class);
3493 flags &= ~SCF_DO_STCLASS;
3497 else if (OP(scan) == LNBREAK) {
3498 if (flags & SCF_DO_STCLASS) {
3500 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3501 if (flags & SCF_DO_STCLASS_AND) {
3502 for (value = 0; value < 256; value++)
3503 if (!is_VERTWS_cp(value))
3504 ANYOF_BITMAP_CLEAR(data->start_class, value);
3507 for (value = 0; value < 256; value++)
3508 if (is_VERTWS_cp(value))
3509 ANYOF_BITMAP_SET(data->start_class, value);
3511 if (flags & SCF_DO_STCLASS_OR)
3512 cl_and(data->start_class, and_withp);
3513 flags &= ~SCF_DO_STCLASS;
3517 if (flags & SCF_DO_SUBSTR) {
3518 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3520 data->pos_delta += 1;
3521 data->longest = &(data->longest_float);
3524 else if (OP(scan) == FOLDCHAR) {
3525 int d = ARG(scan)==0xDF ? 1 : 2;
3526 flags &= ~SCF_DO_STCLASS;
3529 if (flags & SCF_DO_SUBSTR) {
3530 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3532 data->pos_delta += d;
3533 data->longest = &(data->longest_float);
3536 else if (REGNODE_SIMPLE(OP(scan))) {
3539 if (flags & SCF_DO_SUBSTR) {
3540 SCAN_COMMIT(pRExC_state,data,minlenp);
3544 if (flags & SCF_DO_STCLASS) {
3545 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3547 /* Some of the logic below assumes that switching
3548 locale on will only add false positives. */
3549 switch (PL_regkind[OP(scan)]) {
3553 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3554 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3555 cl_anything(pRExC_state, data->start_class);
3558 if (OP(scan) == SANY)
3560 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3561 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3562 || (data->start_class->flags & ANYOF_CLASS));
3563 cl_anything(pRExC_state, data->start_class);
3565 if (flags & SCF_DO_STCLASS_AND || !value)
3566 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3569 if (flags & SCF_DO_STCLASS_AND)
3570 cl_and(data->start_class,
3571 (struct regnode_charclass_class*)scan);
3573 cl_or(pRExC_state, data->start_class,
3574 (struct regnode_charclass_class*)scan);
3577 if (flags & SCF_DO_STCLASS_AND) {
3578 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3579 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3580 if (FLAGS(scan) & USE_UNI) {
3581 for (value = 0; value < 256; value++) {
3582 if (!isWORDCHAR_L1(value)) {
3583 ANYOF_BITMAP_CLEAR(data->start_class, value);
3587 for (value = 0; value < 256; value++) {
3588 if (!isALNUM(value)) {
3589 ANYOF_BITMAP_CLEAR(data->start_class, value);
3596 if (data->start_class->flags & ANYOF_LOCALE)
3597 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3598 else if (FLAGS(scan) & USE_UNI) {
3599 for (value = 0; value < 256; value++) {
3600 if (isWORDCHAR_L1(value)) {
3601 ANYOF_BITMAP_SET(data->start_class, value);
3605 for (value = 0; value < 256; value++) {
3606 if (isALNUM(value)) {
3607 ANYOF_BITMAP_SET(data->start_class, value);
3614 if (flags & SCF_DO_STCLASS_AND) {
3615 if (data->start_class->flags & ANYOF_LOCALE)
3616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3619 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3620 data->start_class->flags |= ANYOF_LOCALE;
3624 if (flags & SCF_DO_STCLASS_AND) {
3625 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3626 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3627 if (FLAGS(scan) & USE_UNI) {
3628 for (value = 0; value < 256; value++) {
3629 if (isWORDCHAR_L1(value)) {
3630 ANYOF_BITMAP_CLEAR(data->start_class, value);
3634 for (value = 0; value < 256; value++) {
3635 if (isALNUM(value)) {
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3643 if (data->start_class->flags & ANYOF_LOCALE)
3644 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3646 for (value = 0; value < 256; value++)
3647 if (!isALNUM(value))
3648 ANYOF_BITMAP_SET(data->start_class, value);
3653 if (flags & SCF_DO_STCLASS_AND) {
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3658 data->start_class->flags |= ANYOF_LOCALE;
3659 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3663 if (flags & SCF_DO_STCLASS_AND) {
3664 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3665 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3666 if (FLAGS(scan) & USE_UNI) {
3667 for (value = 0; value < 256; value++) {
3668 if (!isSPACE_L1(value)) {
3669 ANYOF_BITMAP_CLEAR(data->start_class, value);
3673 for (value = 0; value < 256; value++) {
3674 if (!isSPACE(value)) {
3675 ANYOF_BITMAP_CLEAR(data->start_class, value);
3682 if (data->start_class->flags & ANYOF_LOCALE) {
3683 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3685 else if (FLAGS(scan) & USE_UNI) {
3686 for (value = 0; value < 256; value++) {
3687 if (isSPACE_L1(value)) {
3688 ANYOF_BITMAP_SET(data->start_class, value);
3692 for (value = 0; value < 256; value++) {
3693 if (isSPACE(value)) {
3694 ANYOF_BITMAP_SET(data->start_class, value);
3701 if (flags & SCF_DO_STCLASS_AND) {
3702 if (data->start_class->flags & ANYOF_LOCALE)
3703 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3706 data->start_class->flags |= ANYOF_LOCALE;
3707 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3711 if (flags & SCF_DO_STCLASS_AND) {
3712 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3714 if (FLAGS(scan) & USE_UNI) {
3715 for (value = 0; value < 256; value++) {
3716 if (isSPACE_L1(value)) {
3717 ANYOF_BITMAP_CLEAR(data->start_class, value);
3721 for (value = 0; value < 256; value++) {
3722 if (isSPACE(value)) {
3723 ANYOF_BITMAP_CLEAR(data->start_class, value);
3730 if (data->start_class->flags & ANYOF_LOCALE)
3731 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3732 else if (FLAGS(scan) & USE_UNI) {
3733 for (value = 0; value < 256; value++) {
3734 if (!isSPACE_L1(value)) {
3735 ANYOF_BITMAP_SET(data->start_class, value);
3740 for (value = 0; value < 256; value++) {
3741 if (!isSPACE(value)) {
3742 ANYOF_BITMAP_SET(data->start_class, value);
3749 if (flags & SCF_DO_STCLASS_AND) {
3750 if (data->start_class->flags & ANYOF_LOCALE) {
3751 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3752 for (value = 0; value < 256; value++)
3753 if (!isSPACE(value))
3754 ANYOF_BITMAP_CLEAR(data->start_class, value);
3758 data->start_class->flags |= ANYOF_LOCALE;
3759 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3763 if (flags & SCF_DO_STCLASS_AND) {
3764 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3765 for (value = 0; value < 256; value++)
3766 if (!isDIGIT(value))
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3770 if (data->start_class->flags & ANYOF_LOCALE)
3771 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3773 for (value = 0; value < 256; value++)
3775 ANYOF_BITMAP_SET(data->start_class, value);
3780 if (flags & SCF_DO_STCLASS_AND) {
3781 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3782 for (value = 0; value < 256; value++)
3784 ANYOF_BITMAP_CLEAR(data->start_class, value);
3787 if (data->start_class->flags & ANYOF_LOCALE)
3788 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3790 for (value = 0; value < 256; value++)
3791 if (!isDIGIT(value))
3792 ANYOF_BITMAP_SET(data->start_class, value);
3796 CASE_SYNST_FNC(VERTWS);
3797 CASE_SYNST_FNC(HORIZWS);
3800 if (flags & SCF_DO_STCLASS_OR)
3801 cl_and(data->start_class, and_withp);
3802 flags &= ~SCF_DO_STCLASS;
3805 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3806 data->flags |= (OP(scan) == MEOL
3810 else if ( PL_regkind[OP(scan)] == BRANCHJ
3811 /* Lookbehind, or need to calculate parens/evals/stclass: */
3812 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3813 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3814 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3815 || OP(scan) == UNLESSM )
3817 /* Negative Lookahead/lookbehind
3818 In this case we can't do fixed string optimisation.
3821 I32 deltanext, minnext, fake = 0;
3823 struct regnode_charclass_class intrnl;
3826 data_fake.flags = 0;
3828 data_fake.whilem_c = data->whilem_c;
3829 data_fake.last_closep = data->last_closep;
3832 data_fake.last_closep = &fake;
3833 data_fake.pos_delta = delta;
3834 if ( flags & SCF_DO_STCLASS && !scan->flags
3835 && OP(scan) == IFMATCH ) { /* Lookahead */
3836 cl_init(pRExC_state, &intrnl);
3837 data_fake.start_class = &intrnl;
3838 f |= SCF_DO_STCLASS_AND;
3840 if (flags & SCF_WHILEM_VISITED_POS)
3841 f |= SCF_WHILEM_VISITED_POS;
3842 next = regnext(scan);
3843 nscan = NEXTOPER(NEXTOPER(scan));
3844 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3845 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3848 FAIL("Variable length lookbehind not implemented");
3850 else if (minnext > (I32)U8_MAX) {
3851 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3853 scan->flags = (U8)minnext;
3856 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3858 if (data_fake.flags & SF_HAS_EVAL)
3859 data->flags |= SF_HAS_EVAL;
3860 data->whilem_c = data_fake.whilem_c;
3862 if (f & SCF_DO_STCLASS_AND) {
3863 if (flags & SCF_DO_STCLASS_OR) {
3864 /* OR before, AND after: ideally we would recurse with
3865 * data_fake to get the AND applied by study of the
3866 * remainder of the pattern, and then derecurse;
3867 * *** HACK *** for now just treat as "no information".
3868 * See [perl #56690].
3870 cl_init(pRExC_state, data->start_class);
3872 /* AND before and after: combine and continue */
3873 const int was = (data->start_class->flags & ANYOF_EOS);
3875 cl_and(data->start_class, &intrnl);
3877 data->start_class->flags |= ANYOF_EOS;
3881 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3883 /* Positive Lookahead/lookbehind
3884 In this case we can do fixed string optimisation,
3885 but we must be careful about it. Note in the case of
3886 lookbehind the positions will be offset by the minimum
3887 length of the pattern, something we won't know about
3888 until after the recurse.
3890 I32 deltanext, fake = 0;
3892 struct regnode_charclass_class intrnl;
3894 /* We use SAVEFREEPV so that when the full compile
3895 is finished perl will clean up the allocated
3896 minlens when its all done. This was we don't
3897 have to worry about freeing them when we know
3898 they wont be used, which would be a pain.
3901 Newx( minnextp, 1, I32 );
3902 SAVEFREEPV(minnextp);
3905 StructCopy(data, &data_fake, scan_data_t);
3906 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3909 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3910 data_fake.last_found=newSVsv(data->last_found);
3914 data_fake.last_closep = &fake;
3915 data_fake.flags = 0;
3916 data_fake.pos_delta = delta;
3918 data_fake.flags |= SF_IS_INF;
3919 if ( flags & SCF_DO_STCLASS && !scan->flags
3920 && OP(scan) == IFMATCH ) { /* Lookahead */
3921 cl_init(pRExC_state, &intrnl);
3922 data_fake.start_class = &intrnl;
3923 f |= SCF_DO_STCLASS_AND;
3925 if (flags & SCF_WHILEM_VISITED_POS)
3926 f |= SCF_WHILEM_VISITED_POS;
3927 next = regnext(scan);
3928 nscan = NEXTOPER(NEXTOPER(scan));
3930 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3931 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3934 FAIL("Variable length lookbehind not implemented");
3936 else if (*minnextp > (I32)U8_MAX) {
3937 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3939 scan->flags = (U8)*minnextp;
3944 if (f & SCF_DO_STCLASS_AND) {
3945 const int was = (data->start_class->flags & ANYOF_EOS);
3947 cl_and(data->start_class, &intrnl);
3949 data->start_class->flags |= ANYOF_EOS;
3952 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3954 if (data_fake.flags & SF_HAS_EVAL)
3955 data->flags |= SF_HAS_EVAL;
3956 data->whilem_c = data_fake.whilem_c;
3957 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3958 if (RExC_rx->minlen<*minnextp)
3959 RExC_rx->minlen=*minnextp;
3960 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3961 SvREFCNT_dec(data_fake.last_found);
3963 if ( data_fake.minlen_fixed != minlenp )
3965 data->offset_fixed= data_fake.offset_fixed;
3966 data->minlen_fixed= data_fake.minlen_fixed;
3967 data->lookbehind_fixed+= scan->flags;
3969 if ( data_fake.minlen_float != minlenp )
3971 data->minlen_float= data_fake.minlen_float;
3972 data->offset_float_min=data_fake.offset_float_min;
3973 data->offset_float_max=data_fake.offset_float_max;
3974 data->lookbehind_float+= scan->flags;
3983 else if (OP(scan) == OPEN) {
3984 if (stopparen != (I32)ARG(scan))
3987 else if (OP(scan) == CLOSE) {
3988 if (stopparen == (I32)ARG(scan)) {
3991 if ((I32)ARG(scan) == is_par) {
3992 next = regnext(scan);
3994 if ( next && (OP(next) != WHILEM) && next < last)
3995 is_par = 0; /* Disable optimization */
3998 *(data->last_closep) = ARG(scan);
4000 else if (OP(scan) == EVAL) {
4002 data->flags |= SF_HAS_EVAL;
4004 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4005 if (flags & SCF_DO_SUBSTR) {
4006 SCAN_COMMIT(pRExC_state,data,minlenp);
4007 flags &= ~SCF_DO_SUBSTR;
4009 if (data && OP(scan)==ACCEPT) {
4010 data->flags |= SCF_SEEN_ACCEPT;
4015 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4017 if (flags & SCF_DO_SUBSTR) {
4018 SCAN_COMMIT(pRExC_state,data,minlenp);
4019 data->longest = &(data->longest_float);
4021 is_inf = is_inf_internal = 1;
4022 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4023 cl_anything(pRExC_state, data->start_class);
4024 flags &= ~SCF_DO_STCLASS;
4026 else if (OP(scan) == GPOS) {
4027 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4028 !(delta || is_inf || (data && data->pos_delta)))
4030 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4031 RExC_rx->extflags |= RXf_ANCH_GPOS;
4032 if (RExC_rx->gofs < (U32)min)
4033 RExC_rx->gofs = min;
4035 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4039 #ifdef TRIE_STUDY_OPT
4040 #ifdef FULL_TRIE_STUDY
4041 else if (PL_regkind[OP(scan)] == TRIE) {
4042 /* NOTE - There is similar code to this block above for handling
4043 BRANCH nodes on the initial study. If you change stuff here
4045 regnode *trie_node= scan;
4046 regnode *tail= regnext(scan);
4047 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4048 I32 max1 = 0, min1 = I32_MAX;
4049 struct regnode_charclass_class accum;
4051 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4052 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4053 if (flags & SCF_DO_STCLASS)
4054 cl_init_zero(pRExC_state, &accum);
4060 const regnode *nextbranch= NULL;
4063 for ( word=1 ; word <= trie->wordcount ; word++)
4065 I32 deltanext=0, minnext=0, f = 0, fake;
4066 struct regnode_charclass_class this_class;
4068 data_fake.flags = 0;
4070 data_fake.whilem_c = data->whilem_c;
4071 data_fake.last_closep = data->last_closep;
4074 data_fake.last_closep = &fake;
4075 data_fake.pos_delta = delta;
4076 if (flags & SCF_DO_STCLASS) {
4077 cl_init(pRExC_state, &this_class);
4078 data_fake.start_class = &this_class;
4079 f = SCF_DO_STCLASS_AND;
4081 if (flags & SCF_WHILEM_VISITED_POS)
4082 f |= SCF_WHILEM_VISITED_POS;
4084 if (trie->jump[word]) {
4086 nextbranch = trie_node + trie->jump[0];
4087 scan= trie_node + trie->jump[word];
4088 /* We go from the jump point to the branch that follows
4089 it. Note this means we need the vestigal unused branches
4090 even though they arent otherwise used.
4092 minnext = study_chunk(pRExC_state, &scan, minlenp,
4093 &deltanext, (regnode *)nextbranch, &data_fake,
4094 stopparen, recursed, NULL, f,depth+1);
4096 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4097 nextbranch= regnext((regnode*)nextbranch);
4099 if (min1 > (I32)(minnext + trie->minlen))
4100 min1 = minnext + trie->minlen;
4101 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4102 max1 = minnext + deltanext + trie->maxlen;
4103 if (deltanext == I32_MAX)
4104 is_inf = is_inf_internal = 1;
4106 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4108 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4109 if ( stopmin > min + min1)
4110 stopmin = min + min1;
4111 flags &= ~SCF_DO_SUBSTR;
4113 data->flags |= SCF_SEEN_ACCEPT;
4116 if (data_fake.flags & SF_HAS_EVAL)
4117 data->flags |= SF_HAS_EVAL;
4118 data->whilem_c = data_fake.whilem_c;
4120 if (flags & SCF_DO_STCLASS)
4121 cl_or(pRExC_state, &accum, &this_class);
4124 if (flags & SCF_DO_SUBSTR) {
4125 data->pos_min += min1;
4126 data->pos_delta += max1 - min1;
4127 if (max1 != min1 || is_inf)
4128 data->longest = &(data->longest_float);
4131 delta += max1 - min1;
4132 if (flags & SCF_DO_STCLASS_OR) {
4133 cl_or(pRExC_state, data->start_class, &accum);
4135 cl_and(data->start_class, and_withp);
4136 flags &= ~SCF_DO_STCLASS;
4139 else if (flags & SCF_DO_STCLASS_AND) {
4141 cl_and(data->start_class, &accum);
4142 flags &= ~SCF_DO_STCLASS;
4145 /* Switch to OR mode: cache the old value of
4146 * data->start_class */
4148 StructCopy(data->start_class, and_withp,
4149 struct regnode_charclass_class);
4150 flags &= ~SCF_DO_STCLASS_AND;
4151 StructCopy(&accum, data->start_class,
4152 struct regnode_charclass_class);
4153 flags |= SCF_DO_STCLASS_OR;
4154 data->start_class->flags |= ANYOF_EOS;
4161 else if (PL_regkind[OP(scan)] == TRIE) {
4162 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4165 min += trie->minlen;
4166 delta += (trie->maxlen - trie->minlen);
4167 flags &= ~SCF_DO_STCLASS; /* xxx */
4168 if (flags & SCF_DO_SUBSTR) {
4169 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4170 data->pos_min += trie->minlen;
4171 data->pos_delta += (trie->maxlen - trie->minlen);
4172 if (trie->maxlen != trie->minlen)
4173 data->longest = &(data->longest_float);
4175 if (trie->jump) /* no more substrings -- for now /grr*/
4176 flags &= ~SCF_DO_SUBSTR;
4178 #endif /* old or new */
4179 #endif /* TRIE_STUDY_OPT */
4181 /* Else: zero-length, ignore. */
4182 scan = regnext(scan);
4187 stopparen = frame->stop;
4188 frame = frame->prev;
4189 goto fake_study_recurse;
4194 DEBUG_STUDYDATA("pre-fin:",data,depth);
4197 *deltap = is_inf_internal ? I32_MAX : delta;
4198 if (flags & SCF_DO_SUBSTR && is_inf)
4199 data->pos_delta = I32_MAX - data->pos_min;
4200 if (is_par > (I32)U8_MAX)
4202 if (is_par && pars==1 && data) {
4203 data->flags |= SF_IN_PAR;
4204 data->flags &= ~SF_HAS_PAR;
4206 else if (pars && data) {
4207 data->flags |= SF_HAS_PAR;
4208 data->flags &= ~SF_IN_PAR;
4210 if (flags & SCF_DO_STCLASS_OR)
4211 cl_and(data->start_class, and_withp);
4212 if (flags & SCF_TRIE_RESTUDY)
4213 data->flags |= SCF_TRIE_RESTUDY;
4215 DEBUG_STUDYDATA("post-fin:",data,depth);
4217 return min < stopmin ? min : stopmin;
4221 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4223 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4225 PERL_ARGS_ASSERT_ADD_DATA;
4227 Renewc(RExC_rxi->data,
4228 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4229 char, struct reg_data);
4231 Renew(RExC_rxi->data->what, count + n, U8);
4233 Newx(RExC_rxi->data->what, n, U8);
4234 RExC_rxi->data->count = count + n;
4235 Copy(s, RExC_rxi->data->what + count, n, U8);
4239 /*XXX: todo make this not included in a non debugging perl */
4240 #ifndef PERL_IN_XSUB_RE
4242 Perl_reginitcolors(pTHX)
4245 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4247 char *t = savepv(s);
4251 t = strchr(t, '\t');
4257 PL_colors[i] = t = (char *)"";
4262 PL_colors[i++] = (char *)"";
4269 #ifdef TRIE_STUDY_OPT
4270 #define CHECK_RESTUDY_GOTO \
4272 (data.flags & SCF_TRIE_RESTUDY) \
4276 #define CHECK_RESTUDY_GOTO
4280 - pregcomp - compile a regular expression into internal code
4282 * We can't allocate space until we know how big the compiled form will be,
4283 * but we can't compile it (and thus know how big it is) until we've got a
4284 * place to put the code. So we cheat: we compile it twice, once with code
4285 * generation turned off and size counting turned on, and once "for real".
4286 * This also means that we don't allocate space until we are sure that the
4287 * thing really will compile successfully, and we never have to move the
4288 * code and thus invalidate pointers into it. (Note that it has to be in
4289 * one piece because free() must be able to free it all.) [NB: not true in perl]
4291 * Beware that the optimization-preparation code in here knows about some
4292 * of the structure of the compiled regexp. [I'll say.]
4297 #ifndef PERL_IN_XSUB_RE
4298 #define RE_ENGINE_PTR &PL_core_reg_engine
4300 extern const struct regexp_engine my_reg_engine;
4301 #define RE_ENGINE_PTR &my_reg_engine
4304 #ifndef PERL_IN_XSUB_RE
4306 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4309 HV * const table = GvHV(PL_hintgv);
4311 PERL_ARGS_ASSERT_PREGCOMP;
4313 /* Dispatch a request to compile a regexp to correct
4316 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4317 GET_RE_DEBUG_FLAGS_DECL;
4318 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4319 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4321 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4324 return CALLREGCOMP_ENG(eng, pattern, flags);
4327 return Perl_re_compile(aTHX_ pattern, flags);
4332 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4337 register regexp_internal *ri;
4349 RExC_state_t RExC_state;
4350 RExC_state_t * const pRExC_state = &RExC_state;
4351 #ifdef TRIE_STUDY_OPT
4353 RExC_state_t copyRExC_state;
4355 GET_RE_DEBUG_FLAGS_DECL;
4357 PERL_ARGS_ASSERT_RE_COMPILE;
4359 DEBUG_r(if (!PL_colorset) reginitcolors());
4361 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4364 /* Longjmp back to here if have to switch in midstream to utf8 */
4365 if (! RExC_orig_utf8) {
4366 JMPENV_PUSH(jump_ret);
4369 if (jump_ret == 0) { /* First time through */
4370 exp = SvPV(pattern, plen);
4374 SV *dsv= sv_newmortal();
4375 RE_PV_QUOTED_DECL(s, RExC_utf8,
4376 dsv, exp, plen, 60);
4377 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4378 PL_colors[4],PL_colors[5],s);
4381 else { /* longjumped back */
4384 /* If the cause for the longjmp was other than changing to utf8, pop
4385 * our own setjmp, and longjmp to the correct handler */
4386 if (jump_ret != UTF8_LONGJMP) {
4388 JMPENV_JUMP(jump_ret);
4393 /* It's possible to write a regexp in ascii that represents Unicode
4394 codepoints outside of the byte range, such as via \x{100}. If we
4395 detect such a sequence we have to convert the entire pattern to utf8
4396 and then recompile, as our sizing calculation will have been based
4397 on 1 byte == 1 character, but we will need to use utf8 to encode
4398 at least some part of the pattern, and therefore must convert the whole
4401 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4402 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4403 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4405 RExC_orig_utf8 = RExC_utf8 = 1;
4409 #ifdef TRIE_STUDY_OPT
4414 RExC_flags = pm_flags;
4418 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4419 RExC_seen_evals = 0;
4422 /* First pass: determine size, legality. */
4430 RExC_emit = &PL_regdummy;
4431 RExC_whilem_seen = 0;
4432 RExC_open_parens = NULL;
4433 RExC_close_parens = NULL;
4435 RExC_paren_names = NULL;
4437 RExC_paren_name_list = NULL;
4439 RExC_recurse = NULL;
4440 RExC_recurse_count = 0;
4442 #if 0 /* REGC() is (currently) a NOP at the first pass.
4443 * Clever compilers notice this and complain. --jhi */
4444 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4446 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4447 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4448 RExC_precomp = NULL;
4452 /* Here, finished first pass. Get rid of our setjmp, which we added for
4453 * efficiency only if the passed-in string wasn't in utf8, as shown by
4454 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4455 * 1 here even though the original string wasn't utf8, but in this case
4456 * there will have been a long jump */
4457 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4461 PerlIO_printf(Perl_debug_log,
4462 "Required size %"IVdf" nodes\n"
4463 "Starting second pass (creation)\n",
4466 RExC_lastparse=NULL;
4468 /* Small enough for pointer-storage convention?
4469 If extralen==0, this means that we will not need long jumps. */
4470 if (RExC_size >= 0x10000L && RExC_extralen)
4471 RExC_size += RExC_extralen;
4474 if (RExC_whilem_seen > 15)
4475 RExC_whilem_seen = 15;
4477 /* Allocate space and zero-initialize. Note, the two step process
4478 of zeroing when in debug mode, thus anything assigned has to
4479 happen after that */
4480 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4481 r = (struct regexp*)SvANY(rx);
4482 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4483 char, regexp_internal);
4484 if ( r == NULL || ri == NULL )
4485 FAIL("Regexp out of space");
4487 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4488 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4490 /* bulk initialize base fields with 0. */
4491 Zero(ri, sizeof(regexp_internal), char);
4494 /* non-zero initialization begins here */
4496 r->engine= RE_ENGINE_PTR;
4497 r->extflags = pm_flags;
4499 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4500 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4502 /* The caret is output if there are any defaults: if not all the STD
4503 * flags are set, or if no character set specifier is needed */
4505 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4507 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4508 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4509 >> RXf_PMf_STD_PMMOD_SHIFT);
4510 const char *fptr = STD_PAT_MODS; /*"msix"*/
4512 /* Allocate for the worst case, which is all the std flags are turned
4513 * on. If more precision is desired, we could do a population count of
4514 * the flags set. This could be done with a small lookup table, or by
4515 * shifting, masking and adding, or even, when available, assembly
4516 * language for a machine-language population count.
4517 * We never output a minus, as all those are defaults, so are
4518 * covered by the caret */
4519 const STRLEN wraplen = plen + has_p + has_runon
4520 + has_default /* If needs a caret */
4521 + has_charset /* If needs a character set specifier */
4522 + (sizeof(STD_PAT_MODS) - 1)
4523 + (sizeof("(?:)") - 1);
4525 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4527 SvFLAGS(rx) |= SvUTF8(pattern);
4530 /* If a default, cover it using the caret */
4532 *p++= DEFAULT_PAT_MOD;
4535 if (r->extflags & RXf_PMf_LOCALE) {
4536 *p++ = LOCALE_PAT_MOD;
4538 *p++ = UNICODE_PAT_MOD;
4542 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4545 while((ch = *fptr++)) {
4553 Copy(RExC_precomp, p, plen, char);
4554 assert ((RX_WRAPPED(rx) - p) < 16);
4555 r->pre_prefix = p - RX_WRAPPED(rx);
4561 SvCUR_set(rx, p - SvPVX_const(rx));
4565 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4567 if (RExC_seen & REG_SEEN_RECURSE) {
4568 Newxz(RExC_open_parens, RExC_npar,regnode *);
4569 SAVEFREEPV(RExC_open_parens);
4570 Newxz(RExC_close_parens,RExC_npar,regnode *);
4571 SAVEFREEPV(RExC_close_parens);
4574 /* Useful during FAIL. */
4575 #ifdef RE_TRACK_PATTERN_OFFSETS
4576 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4577 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4578 "%s %"UVuf" bytes for offset annotations.\n",
4579 ri->u.offsets ? "Got" : "Couldn't get",
4580 (UV)((2*RExC_size+1) * sizeof(U32))));
4582 SetProgLen(ri,RExC_size);
4587 /* Second pass: emit code. */
4588 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4593 RExC_emit_start = ri->program;
4594 RExC_emit = ri->program;
4595 RExC_emit_bound = ri->program + RExC_size + 1;
4597 /* Store the count of eval-groups for security checks: */
4598 RExC_rx->seen_evals = RExC_seen_evals;
4599 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4600 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4604 /* XXXX To minimize changes to RE engine we always allocate
4605 3-units-long substrs field. */
4606 Newx(r->substrs, 1, struct reg_substr_data);
4607 if (RExC_recurse_count) {
4608 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4609 SAVEFREEPV(RExC_recurse);
4613 r->minlen = minlen = sawplus = sawopen = 0;
4614 Zero(r->substrs, 1, struct reg_substr_data);
4616 #ifdef TRIE_STUDY_OPT
4618 StructCopy(&zero_scan_data, &data, scan_data_t);
4619 copyRExC_state = RExC_state;
4622 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4624 RExC_state = copyRExC_state;
4625 if (seen & REG_TOP_LEVEL_BRANCHES)
4626 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4628 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4629 if (data.last_found) {
4630 SvREFCNT_dec(data.longest_fixed);
4631 SvREFCNT_dec(data.longest_float);
4632 SvREFCNT_dec(data.last_found);
4634 StructCopy(&zero_scan_data, &data, scan_data_t);
4637 StructCopy(&zero_scan_data, &data, scan_data_t);
4640 /* Dig out information for optimizations. */
4641 r->extflags = RExC_flags; /* was pm_op */
4642 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4645 SvUTF8_on(rx); /* Unicode in it? */
4646 ri->regstclass = NULL;
4647 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4648 r->intflags |= PREGf_NAUGHTY;
4649 scan = ri->program + 1; /* First BRANCH. */
4651 /* testing for BRANCH here tells us whether there is "must appear"
4652 data in the pattern. If there is then we can use it for optimisations */
4653 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4655 STRLEN longest_float_length, longest_fixed_length;
4656 struct regnode_charclass_class ch_class; /* pointed to by data */
4658 I32 last_close = 0; /* pointed to by data */
4659 regnode *first= scan;
4660 regnode *first_next= regnext(first);
4663 * Skip introductions and multiplicators >= 1
4664 * so that we can extract the 'meat' of the pattern that must
4665 * match in the large if() sequence following.
4666 * NOTE that EXACT is NOT covered here, as it is normally
4667 * picked up by the optimiser separately.
4669 * This is unfortunate as the optimiser isnt handling lookahead
4670 * properly currently.
4673 while ((OP(first) == OPEN && (sawopen = 1)) ||
4674 /* An OR of *one* alternative - should not happen now. */
4675 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4676 /* for now we can't handle lookbehind IFMATCH*/
4677 (OP(first) == IFMATCH && !first->flags) ||
4678 (OP(first) == PLUS) ||
4679 (OP(first) == MINMOD) ||
4680 /* An {n,m} with n>0 */
4681 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4682 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4685 * the only op that could be a regnode is PLUS, all the rest
4686 * will be regnode_1 or regnode_2.
4689 if (OP(first) == PLUS)
4692 first += regarglen[OP(first)];
4694 first = NEXTOPER(first);
4695 first_next= regnext(first);
4698 /* Starting-point info. */
4700 DEBUG_PEEP("first:",first,0);
4701 /* Ignore EXACT as we deal with it later. */
4702 if (PL_regkind[OP(first)] == EXACT) {
4703 if (OP(first) == EXACT)
4704 NOOP; /* Empty, get anchored substr later. */
4705 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4706 ri->regstclass = first;
4709 else if (PL_regkind[OP(first)] == TRIE &&
4710 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4713 /* this can happen only on restudy */
4714 if ( OP(first) == TRIE ) {
4715 struct regnode_1 *trieop = (struct regnode_1 *)
4716 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4717 StructCopy(first,trieop,struct regnode_1);
4718 trie_op=(regnode *)trieop;
4720 struct regnode_charclass *trieop = (struct regnode_charclass *)
4721 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4722 StructCopy(first,trieop,struct regnode_charclass);
4723 trie_op=(regnode *)trieop;
4726 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4727 ri->regstclass = trie_op;
4730 else if (REGNODE_SIMPLE(OP(first)))
4731 ri->regstclass = first;
4732 else if (PL_regkind[OP(first)] == BOUND ||
4733 PL_regkind[OP(first)] == NBOUND)
4734 ri->regstclass = first;
4735 else if (PL_regkind[OP(first)] == BOL) {
4736 r->extflags |= (OP(first) == MBOL
4738 : (OP(first) == SBOL
4741 first = NEXTOPER(first);
4744 else if (OP(first) == GPOS) {
4745 r->extflags |= RXf_ANCH_GPOS;
4746 first = NEXTOPER(first);
4749 else if ((!sawopen || !RExC_sawback) &&
4750 (OP(first) == STAR &&
4751 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4752 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4754 /* turn .* into ^.* with an implied $*=1 */
4756 (OP(NEXTOPER(first)) == REG_ANY)
4759 r->extflags |= type;
4760 r->intflags |= PREGf_IMPLICIT;
4761 first = NEXTOPER(first);
4764 if (sawplus && (!sawopen || !RExC_sawback)
4765 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4766 /* x+ must match at the 1st pos of run of x's */
4767 r->intflags |= PREGf_SKIP;
4769 /* Scan is after the zeroth branch, first is atomic matcher. */
4770 #ifdef TRIE_STUDY_OPT
4773 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4774 (IV)(first - scan + 1))
4778 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4779 (IV)(first - scan + 1))
4785 * If there's something expensive in the r.e., find the
4786 * longest literal string that must appear and make it the
4787 * regmust. Resolve ties in favor of later strings, since
4788 * the regstart check works with the beginning of the r.e.
4789 * and avoiding duplication strengthens checking. Not a
4790 * strong reason, but sufficient in the absence of others.
4791 * [Now we resolve ties in favor of the earlier string if
4792 * it happens that c_offset_min has been invalidated, since the
4793 * earlier string may buy us something the later one won't.]
4796 data.longest_fixed = newSVpvs("");
4797 data.longest_float = newSVpvs("");
4798 data.last_found = newSVpvs("");
4799 data.longest = &(data.longest_fixed);
4801 if (!ri->regstclass) {
4802 cl_init(pRExC_state, &ch_class);
4803 data.start_class = &ch_class;
4804 stclass_flag = SCF_DO_STCLASS_AND;
4805 } else /* XXXX Check for BOUND? */
4807 data.last_closep = &last_close;
4809 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4810 &data, -1, NULL, NULL,
4811 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4817 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4818 && data.last_start_min == 0 && data.last_end > 0
4819 && !RExC_seen_zerolen
4820 && !(RExC_seen & REG_SEEN_VERBARG)
4821 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4822 r->extflags |= RXf_CHECK_ALL;
4823 scan_commit(pRExC_state, &data,&minlen,0);
4824 SvREFCNT_dec(data.last_found);
4826 /* Note that code very similar to this but for anchored string
4827 follows immediately below, changes may need to be made to both.
4830 longest_float_length = CHR_SVLEN(data.longest_float);
4831 if (longest_float_length
4832 || (data.flags & SF_FL_BEFORE_EOL
4833 && (!(data.flags & SF_FL_BEFORE_MEOL)
4834 || (RExC_flags & RXf_PMf_MULTILINE))))
4838 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4839 && data.offset_fixed == data.offset_float_min
4840 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4841 goto remove_float; /* As in (a)+. */
4843 /* copy the information about the longest float from the reg_scan_data
4844 over to the program. */
4845 if (SvUTF8(data.longest_float)) {
4846 r->float_utf8 = data.longest_float;
4847 r->float_substr = NULL;
4849 r->float_substr = data.longest_float;
4850 r->float_utf8 = NULL;
4852 /* float_end_shift is how many chars that must be matched that
4853 follow this item. We calculate it ahead of time as once the
4854 lookbehind offset is added in we lose the ability to correctly
4856 ml = data.minlen_float ? *(data.minlen_float)
4857 : (I32)longest_float_length;
4858 r->float_end_shift = ml - data.offset_float_min
4859 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4860 + data.lookbehind_float;
4861 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4862 r->float_max_offset = data.offset_float_max;
4863 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4864 r->float_max_offset -= data.lookbehind_float;
4866 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4867 && (!(data.flags & SF_FL_BEFORE_MEOL)
4868 || (RExC_flags & RXf_PMf_MULTILINE)));
4869 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4873 r->float_substr = r->float_utf8 = NULL;
4874 SvREFCNT_dec(data.longest_float);
4875 longest_float_length = 0;
4878 /* Note that code very similar to this but for floating string
4879 is immediately above, changes may need to be made to both.
4882 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4883 if (longest_fixed_length
4884 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4885 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4886 || (RExC_flags & RXf_PMf_MULTILINE))))
4890 /* copy the information about the longest fixed
4891 from the reg_scan_data over to the program. */
4892 if (SvUTF8(data.longest_fixed)) {
4893 r->anchored_utf8 = data.longest_fixed;
4894 r->anchored_substr = NULL;
4896 r->anchored_substr = data.longest_fixed;
4897 r->anchored_utf8 = NULL;
4899 /* fixed_end_shift is how many chars that must be matched that
4900 follow this item. We calculate it ahead of time as once the
4901 lookbehind offset is added in we lose the ability to correctly
4903 ml = data.minlen_fixed ? *(data.minlen_fixed)
4904 : (I32)longest_fixed_length;
4905 r->anchored_end_shift = ml - data.offset_fixed
4906 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4907 + data.lookbehind_fixed;
4908 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4910 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4911 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4912 || (RExC_flags & RXf_PMf_MULTILINE)));
4913 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4916 r->anchored_substr = r->anchored_utf8 = NULL;
4917 SvREFCNT_dec(data.longest_fixed);
4918 longest_fixed_length = 0;
4921 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4922 ri->regstclass = NULL;
4923 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4925 && !(data.start_class->flags & ANYOF_EOS)
4926 && !cl_is_anything(data.start_class))
4928 const U32 n = add_data(pRExC_state, 1, "f");
4930 Newx(RExC_rxi->data->data[n], 1,
4931 struct regnode_charclass_class);
4932 StructCopy(data.start_class,
4933 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4934 struct regnode_charclass_class);
4935 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4936 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4937 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4938 regprop(r, sv, (regnode*)data.start_class);
4939 PerlIO_printf(Perl_debug_log,
4940 "synthetic stclass \"%s\".\n",
4941 SvPVX_const(sv));});
4944 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4945 if (longest_fixed_length > longest_float_length) {
4946 r->check_end_shift = r->anchored_end_shift;
4947 r->check_substr = r->anchored_substr;
4948 r->check_utf8 = r->anchored_utf8;
4949 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4950 if (r->extflags & RXf_ANCH_SINGLE)
4951 r->extflags |= RXf_NOSCAN;
4954 r->check_end_shift = r->float_end_shift;
4955 r->check_substr = r->float_substr;
4956 r->check_utf8 = r->float_utf8;
4957 r->check_offset_min = r->float_min_offset;
4958 r->check_offset_max = r->float_max_offset;
4960 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4961 This should be changed ASAP! */
4962 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4963 r->extflags |= RXf_USE_INTUIT;
4964 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4965 r->extflags |= RXf_INTUIT_TAIL;
4967 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4968 if ( (STRLEN)minlen < longest_float_length )
4969 minlen= longest_float_length;
4970 if ( (STRLEN)minlen < longest_fixed_length )
4971 minlen= longest_fixed_length;
4975 /* Several toplevels. Best we can is to set minlen. */
4977 struct regnode_charclass_class ch_class;
4980 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4982 scan = ri->program + 1;
4983 cl_init(pRExC_state, &ch_class);
4984 data.start_class = &ch_class;
4985 data.last_closep = &last_close;
4988 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4989 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4993 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4994 = r->float_substr = r->float_utf8 = NULL;
4995 if (!(data.start_class->flags & ANYOF_EOS)
4996 && !cl_is_anything(data.start_class))
4998 const U32 n = add_data(pRExC_state, 1, "f");
5000 Newx(RExC_rxi->data->data[n], 1,
5001 struct regnode_charclass_class);
5002 StructCopy(data.start_class,
5003 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5004 struct regnode_charclass_class);
5005 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5006 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5007 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5008 regprop(r, sv, (regnode*)data.start_class);
5009 PerlIO_printf(Perl_debug_log,
5010 "synthetic stclass \"%s\".\n",
5011 SvPVX_const(sv));});
5015 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5016 the "real" pattern. */
5018 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5019 (IV)minlen, (IV)r->minlen);
5021 r->minlenret = minlen;
5022 if (r->minlen < minlen)
5025 if (RExC_seen & REG_SEEN_GPOS)
5026 r->extflags |= RXf_GPOS_SEEN;
5027 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5028 r->extflags |= RXf_LOOKBEHIND_SEEN;
5029 if (RExC_seen & REG_SEEN_EVAL)
5030 r->extflags |= RXf_EVAL_SEEN;
5031 if (RExC_seen & REG_SEEN_CANY)
5032 r->extflags |= RXf_CANY_SEEN;
5033 if (RExC_seen & REG_SEEN_VERBARG)
5034 r->intflags |= PREGf_VERBARG_SEEN;
5035 if (RExC_seen & REG_SEEN_CUTGROUP)
5036 r->intflags |= PREGf_CUTGROUP_SEEN;
5037 if (RExC_paren_names)
5038 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5040 RXp_PAREN_NAMES(r) = NULL;
5042 #ifdef STUPID_PATTERN_CHECKS
5043 if (RX_PRELEN(rx) == 0)
5044 r->extflags |= RXf_NULL;
5045 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5046 /* XXX: this should happen BEFORE we compile */
5047 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5048 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5049 r->extflags |= RXf_WHITE;
5050 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5051 r->extflags |= RXf_START_ONLY;
5053 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5054 /* XXX: this should happen BEFORE we compile */
5055 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5057 regnode *first = ri->program + 1;
5059 U8 nop = OP(NEXTOPER(first));
5061 if (PL_regkind[fop] == NOTHING && nop == END)
5062 r->extflags |= RXf_NULL;
5063 else if (PL_regkind[fop] == BOL && nop == END)
5064 r->extflags |= RXf_START_ONLY;
5065 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5066 r->extflags |= RXf_WHITE;
5070 if (RExC_paren_names) {
5071 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5072 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5075 ri->name_list_idx = 0;
5077 if (RExC_recurse_count) {
5078 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5079 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5080 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5083 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5084 /* assume we don't need to swap parens around before we match */
5087 PerlIO_printf(Perl_debug_log,"Final program:\n");
5090 #ifdef RE_TRACK_PATTERN_OFFSETS
5091 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5092 const U32 len = ri->u.offsets[0];
5094 GET_RE_DEBUG_FLAGS_DECL;
5095 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5096 for (i = 1; i <= len; i++) {
5097 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5098 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5099 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5101 PerlIO_printf(Perl_debug_log, "\n");
5107 #undef RE_ENGINE_PTR
5111 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5114 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5116 PERL_UNUSED_ARG(value);
5118 if (flags & RXapif_FETCH) {
5119 return reg_named_buff_fetch(rx, key, flags);
5120 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5121 Perl_croak_no_modify(aTHX);
5123 } else if (flags & RXapif_EXISTS) {
5124 return reg_named_buff_exists(rx, key, flags)
5127 } else if (flags & RXapif_REGNAMES) {
5128 return reg_named_buff_all(rx, flags);
5129 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5130 return reg_named_buff_scalar(rx, flags);
5132 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5138 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5141 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5142 PERL_UNUSED_ARG(lastkey);
5144 if (flags & RXapif_FIRSTKEY)
5145 return reg_named_buff_firstkey(rx, flags);
5146 else if (flags & RXapif_NEXTKEY)
5147 return reg_named_buff_nextkey(rx, flags);
5149 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5155 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5158 AV *retarray = NULL;
5160 struct regexp *const rx = (struct regexp *)SvANY(r);
5162 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5164 if (flags & RXapif_ALL)
5167 if (rx && RXp_PAREN_NAMES(rx)) {
5168 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5171 SV* sv_dat=HeVAL(he_str);
5172 I32 *nums=(I32*)SvPVX(sv_dat);
5173 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5174 if ((I32)(rx->nparens) >= nums[i]
5175 && rx->offs[nums[i]].start != -1
5176 && rx->offs[nums[i]].end != -1)
5179 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5183 ret = newSVsv(&PL_sv_undef);
5186 av_push(retarray, ret);
5189 return newRV_noinc(MUTABLE_SV(retarray));
5196 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5199 struct regexp *const rx = (struct regexp *)SvANY(r);
5201 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5203 if (rx && RXp_PAREN_NAMES(rx)) {
5204 if (flags & RXapif_ALL) {
5205 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5207 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5221 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5223 struct regexp *const rx = (struct regexp *)SvANY(r);
5225 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5227 if ( rx && RXp_PAREN_NAMES(rx) ) {
5228 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5230 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5237 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5239 struct regexp *const rx = (struct regexp *)SvANY(r);
5240 GET_RE_DEBUG_FLAGS_DECL;
5242 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5244 if (rx && RXp_PAREN_NAMES(rx)) {
5245 HV *hv = RXp_PAREN_NAMES(rx);
5247 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5250 SV* sv_dat = HeVAL(temphe);
5251 I32 *nums = (I32*)SvPVX(sv_dat);
5252 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5253 if ((I32)(rx->lastparen) >= nums[i] &&
5254 rx->offs[nums[i]].start != -1 &&
5255 rx->offs[nums[i]].end != -1)
5261 if (parno || flags & RXapif_ALL) {
5262 return newSVhek(HeKEY_hek(temphe));
5270 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5275 struct regexp *const rx = (struct regexp *)SvANY(r);
5277 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5279 if (rx && RXp_PAREN_NAMES(rx)) {
5280 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5281 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5282 } else if (flags & RXapif_ONE) {
5283 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5284 av = MUTABLE_AV(SvRV(ret));
5285 length = av_len(av);
5287 return newSViv(length + 1);
5289 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5293 return &PL_sv_undef;
5297 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5299 struct regexp *const rx = (struct regexp *)SvANY(r);
5302 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5304 if (rx && RXp_PAREN_NAMES(rx)) {
5305 HV *hv= RXp_PAREN_NAMES(rx);
5307 (void)hv_iterinit(hv);
5308 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5311 SV* sv_dat = HeVAL(temphe);
5312 I32 *nums = (I32*)SvPVX(sv_dat);
5313 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5314 if ((I32)(rx->lastparen) >= nums[i] &&
5315 rx->offs[nums[i]].start != -1 &&
5316 rx->offs[nums[i]].end != -1)
5322 if (parno || flags & RXapif_ALL) {
5323 av_push(av, newSVhek(HeKEY_hek(temphe)));
5328 return newRV_noinc(MUTABLE_SV(av));
5332 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5335 struct regexp *const rx = (struct regexp *)SvANY(r);
5340 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5343 sv_setsv(sv,&PL_sv_undef);
5347 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5349 i = rx->offs[0].start;
5353 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5355 s = rx->subbeg + rx->offs[0].end;
5356 i = rx->sublen - rx->offs[0].end;
5359 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5360 (s1 = rx->offs[paren].start) != -1 &&
5361 (t1 = rx->offs[paren].end) != -1)
5365 s = rx->subbeg + s1;
5367 sv_setsv(sv,&PL_sv_undef);
5370 assert(rx->sublen >= (s - rx->subbeg) + i );
5372 const int oldtainted = PL_tainted;
5374 sv_setpvn(sv, s, i);
5375 PL_tainted = oldtainted;
5376 if ( (rx->extflags & RXf_CANY_SEEN)
5377 ? (RXp_MATCH_UTF8(rx)
5378 && (!i || is_utf8_string((U8*)s, i)))
5379 : (RXp_MATCH_UTF8(rx)) )
5386 if (RXp_MATCH_TAINTED(rx)) {
5387 if (SvTYPE(sv) >= SVt_PVMG) {
5388 MAGIC* const mg = SvMAGIC(sv);
5391 SvMAGIC_set(sv, mg->mg_moremagic);
5393 if ((mgt = SvMAGIC(sv))) {
5394 mg->mg_moremagic = mgt;
5395 SvMAGIC_set(sv, mg);
5405 sv_setsv(sv,&PL_sv_undef);
5411 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5412 SV const * const value)
5414 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5416 PERL_UNUSED_ARG(rx);
5417 PERL_UNUSED_ARG(paren);
5418 PERL_UNUSED_ARG(value);
5421 Perl_croak_no_modify(aTHX);
5425 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5428 struct regexp *const rx = (struct regexp *)SvANY(r);
5432 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5434 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5436 /* $` / ${^PREMATCH} */
5437 case RX_BUFF_IDX_PREMATCH:
5438 if (rx->offs[0].start != -1) {
5439 i = rx->offs[0].start;
5447 /* $' / ${^POSTMATCH} */
5448 case RX_BUFF_IDX_POSTMATCH:
5449 if (rx->offs[0].end != -1) {
5450 i = rx->sublen - rx->offs[0].end;
5452 s1 = rx->offs[0].end;
5458 /* $& / ${^MATCH}, $1, $2, ... */
5460 if (paren <= (I32)rx->nparens &&
5461 (s1 = rx->offs[paren].start) != -1 &&
5462 (t1 = rx->offs[paren].end) != -1)
5467 if (ckWARN(WARN_UNINITIALIZED))
5468 report_uninit((const SV *)sv);
5473 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5474 const char * const s = rx->subbeg + s1;
5479 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5486 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5488 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5489 PERL_UNUSED_ARG(rx);
5493 return newSVpvs("Regexp");
5496 /* Scans the name of a named buffer from the pattern.
5497 * If flags is REG_RSN_RETURN_NULL returns null.
5498 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5499 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5500 * to the parsed name as looked up in the RExC_paren_names hash.
5501 * If there is an error throws a vFAIL().. type exception.
5504 #define REG_RSN_RETURN_NULL 0
5505 #define REG_RSN_RETURN_NAME 1
5506 #define REG_RSN_RETURN_DATA 2
5509 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5511 char *name_start = RExC_parse;
5513 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5515 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5516 /* skip IDFIRST by using do...while */
5519 RExC_parse += UTF8SKIP(RExC_parse);
5520 } while (isALNUM_utf8((U8*)RExC_parse));
5524 } while (isALNUM(*RExC_parse));
5529 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5530 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5531 if ( flags == REG_RSN_RETURN_NAME)
5533 else if (flags==REG_RSN_RETURN_DATA) {
5536 if ( ! sv_name ) /* should not happen*/
5537 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5538 if (RExC_paren_names)
5539 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5541 sv_dat = HeVAL(he_str);
5543 vFAIL("Reference to nonexistent named group");
5547 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5554 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5555 int rem=(int)(RExC_end - RExC_parse); \
5564 if (RExC_lastparse!=RExC_parse) \
5565 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5568 iscut ? "..." : "<" \
5571 PerlIO_printf(Perl_debug_log,"%16s",""); \
5574 num = RExC_size + 1; \
5576 num=REG_NODE_NUM(RExC_emit); \
5577 if (RExC_lastnum!=num) \
5578 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5580 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5581 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5582 (int)((depth*2)), "", \
5586 RExC_lastparse=RExC_parse; \
5591 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5592 DEBUG_PARSE_MSG((funcname)); \
5593 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5595 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5596 DEBUG_PARSE_MSG((funcname)); \
5597 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5600 - reg - regular expression, i.e. main body or parenthesized thing
5602 * Caller must absorb opening parenthesis.
5604 * Combining parenthesis handling with the base level of regular expression
5605 * is a trifle forced, but the need to tie the tails of the branches to what
5606 * follows makes it hard to avoid.
5608 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5610 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5612 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5616 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5617 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5620 register regnode *ret; /* Will be the head of the group. */
5621 register regnode *br;
5622 register regnode *lastbr;
5623 register regnode *ender = NULL;
5624 register I32 parno = 0;
5626 U32 oregflags = RExC_flags;
5627 bool have_branch = 0;
5629 I32 freeze_paren = 0;
5630 I32 after_freeze = 0;
5632 /* for (?g), (?gc), and (?o) warnings; warning
5633 about (?c) will warn about (?g) -- japhy */
5635 #define WASTED_O 0x01
5636 #define WASTED_G 0x02
5637 #define WASTED_C 0x04
5638 #define WASTED_GC (0x02|0x04)
5639 I32 wastedflags = 0x00;
5641 char * parse_start = RExC_parse; /* MJD */
5642 char * const oregcomp_parse = RExC_parse;
5644 GET_RE_DEBUG_FLAGS_DECL;
5646 PERL_ARGS_ASSERT_REG;
5647 DEBUG_PARSE("reg ");
5649 *flagp = 0; /* Tentatively. */
5652 /* Make an OPEN node, if parenthesized. */
5654 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5655 char *start_verb = RExC_parse;
5656 STRLEN verb_len = 0;
5657 char *start_arg = NULL;
5658 unsigned char op = 0;
5660 int internal_argval = 0; /* internal_argval is only useful if !argok */
5661 while ( *RExC_parse && *RExC_parse != ')' ) {
5662 if ( *RExC_parse == ':' ) {
5663 start_arg = RExC_parse + 1;
5669 verb_len = RExC_parse - start_verb;
5672 while ( *RExC_parse && *RExC_parse != ')' )
5674 if ( *RExC_parse != ')' )
5675 vFAIL("Unterminated verb pattern argument");
5676 if ( RExC_parse == start_arg )
5679 if ( *RExC_parse != ')' )
5680 vFAIL("Unterminated verb pattern");
5683 switch ( *start_verb ) {
5684 case 'A': /* (*ACCEPT) */
5685 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5687 internal_argval = RExC_nestroot;
5690 case 'C': /* (*COMMIT) */
5691 if ( memEQs(start_verb,verb_len,"COMMIT") )
5694 case 'F': /* (*FAIL) */
5695 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5700 case ':': /* (*:NAME) */
5701 case 'M': /* (*MARK:NAME) */
5702 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5707 case 'P': /* (*PRUNE) */
5708 if ( memEQs(start_verb,verb_len,"PRUNE") )
5711 case 'S': /* (*SKIP) */
5712 if ( memEQs(start_verb,verb_len,"SKIP") )
5715 case 'T': /* (*THEN) */
5716 /* [19:06] <TimToady> :: is then */
5717 if ( memEQs(start_verb,verb_len,"THEN") ) {
5719 RExC_seen |= REG_SEEN_CUTGROUP;
5725 vFAIL3("Unknown verb pattern '%.*s'",
5726 verb_len, start_verb);
5729 if ( start_arg && internal_argval ) {
5730 vFAIL3("Verb pattern '%.*s' may not have an argument",
5731 verb_len, start_verb);
5732 } else if ( argok < 0 && !start_arg ) {
5733 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5734 verb_len, start_verb);
5736 ret = reganode(pRExC_state, op, internal_argval);
5737 if ( ! internal_argval && ! SIZE_ONLY ) {
5739 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5740 ARG(ret) = add_data( pRExC_state, 1, "S" );
5741 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5748 if (!internal_argval)
5749 RExC_seen |= REG_SEEN_VERBARG;
5750 } else if ( start_arg ) {
5751 vFAIL3("Verb pattern '%.*s' may not have an argument",
5752 verb_len, start_verb);
5754 ret = reg_node(pRExC_state, op);
5756 nextchar(pRExC_state);
5759 if (*RExC_parse == '?') { /* (?...) */
5760 bool is_logical = 0;
5761 const char * const seqstart = RExC_parse;
5762 bool has_use_defaults = FALSE;
5765 paren = *RExC_parse++;
5766 ret = NULL; /* For look-ahead/behind. */
5769 case 'P': /* (?P...) variants for those used to PCRE/Python */
5770 paren = *RExC_parse++;
5771 if ( paren == '<') /* (?P<...>) named capture */
5773 else if (paren == '>') { /* (?P>name) named recursion */
5774 goto named_recursion;
5776 else if (paren == '=') { /* (?P=...) named backref */
5777 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5778 you change this make sure you change that */
5779 char* name_start = RExC_parse;
5781 SV *sv_dat = reg_scan_name(pRExC_state,
5782 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5783 if (RExC_parse == name_start || *RExC_parse != ')')
5784 vFAIL2("Sequence %.3s... not terminated",parse_start);
5787 num = add_data( pRExC_state, 1, "S" );
5788 RExC_rxi->data->data[num]=(void*)sv_dat;
5789 SvREFCNT_inc_simple_void(sv_dat);
5792 ret = reganode(pRExC_state,
5793 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5797 Set_Node_Offset(ret, parse_start+1);
5798 Set_Node_Cur_Length(ret); /* MJD */
5800 nextchar(pRExC_state);
5804 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5806 case '<': /* (?<...) */
5807 if (*RExC_parse == '!')
5809 else if (*RExC_parse != '=')
5815 case '\'': /* (?'...') */
5816 name_start= RExC_parse;
5817 svname = reg_scan_name(pRExC_state,
5818 SIZE_ONLY ? /* reverse test from the others */
5819 REG_RSN_RETURN_NAME :
5820 REG_RSN_RETURN_NULL);
5821 if (RExC_parse == name_start) {
5823 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5826 if (*RExC_parse != paren)
5827 vFAIL2("Sequence (?%c... not terminated",
5828 paren=='>' ? '<' : paren);
5832 if (!svname) /* shouldnt happen */
5834 "panic: reg_scan_name returned NULL");
5835 if (!RExC_paren_names) {
5836 RExC_paren_names= newHV();
5837 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5839 RExC_paren_name_list= newAV();
5840 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5843 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5845 sv_dat = HeVAL(he_str);
5847 /* croak baby croak */
5849 "panic: paren_name hash element allocation failed");
5850 } else if ( SvPOK(sv_dat) ) {
5851 /* (?|...) can mean we have dupes so scan to check
5852 its already been stored. Maybe a flag indicating
5853 we are inside such a construct would be useful,
5854 but the arrays are likely to be quite small, so
5855 for now we punt -- dmq */
5856 IV count = SvIV(sv_dat);
5857 I32 *pv = (I32*)SvPVX(sv_dat);
5859 for ( i = 0 ; i < count ; i++ ) {
5860 if ( pv[i] == RExC_npar ) {
5866 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5867 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5868 pv[count] = RExC_npar;
5869 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5872 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5873 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5875 SvIV_set(sv_dat, 1);
5878 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5879 SvREFCNT_dec(svname);
5882 /*sv_dump(sv_dat);*/
5884 nextchar(pRExC_state);
5886 goto capturing_parens;
5888 RExC_seen |= REG_SEEN_LOOKBEHIND;
5890 case '=': /* (?=...) */
5891 RExC_seen_zerolen++;
5893 case '!': /* (?!...) */
5894 RExC_seen_zerolen++;
5895 if (*RExC_parse == ')') {
5896 ret=reg_node(pRExC_state, OPFAIL);
5897 nextchar(pRExC_state);
5901 case '|': /* (?|...) */
5902 /* branch reset, behave like a (?:...) except that
5903 buffers in alternations share the same numbers */
5905 after_freeze = freeze_paren = RExC_npar;
5907 case ':': /* (?:...) */
5908 case '>': /* (?>...) */
5910 case '$': /* (?$...) */
5911 case '@': /* (?@...) */
5912 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5914 case '#': /* (?#...) */
5915 while (*RExC_parse && *RExC_parse != ')')
5917 if (*RExC_parse != ')')
5918 FAIL("Sequence (?#... not terminated");
5919 nextchar(pRExC_state);
5922 case '0' : /* (?0) */
5923 case 'R' : /* (?R) */
5924 if (*RExC_parse != ')')
5925 FAIL("Sequence (?R) not terminated");
5926 ret = reg_node(pRExC_state, GOSTART);
5927 *flagp |= POSTPONED;
5928 nextchar(pRExC_state);
5931 { /* named and numeric backreferences */
5933 case '&': /* (?&NAME) */
5934 parse_start = RExC_parse - 1;
5937 SV *sv_dat = reg_scan_name(pRExC_state,
5938 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5939 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5941 goto gen_recurse_regop;
5944 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5946 vFAIL("Illegal pattern");
5948 goto parse_recursion;
5950 case '-': /* (?-1) */
5951 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5952 RExC_parse--; /* rewind to let it be handled later */
5956 case '1': case '2': case '3': case '4': /* (?1) */
5957 case '5': case '6': case '7': case '8': case '9':
5960 num = atoi(RExC_parse);
5961 parse_start = RExC_parse - 1; /* MJD */
5962 if (*RExC_parse == '-')
5964 while (isDIGIT(*RExC_parse))
5966 if (*RExC_parse!=')')
5967 vFAIL("Expecting close bracket");
5970 if ( paren == '-' ) {
5972 Diagram of capture buffer numbering.
5973 Top line is the normal capture buffer numbers
5974 Botton line is the negative indexing as from
5978 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5982 num = RExC_npar + num;
5985 vFAIL("Reference to nonexistent group");
5987 } else if ( paren == '+' ) {
5988 num = RExC_npar + num - 1;
5991 ret = reganode(pRExC_state, GOSUB, num);
5993 if (num > (I32)RExC_rx->nparens) {
5995 vFAIL("Reference to nonexistent group");
5997 ARG2L_SET( ret, RExC_recurse_count++);
5999 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6000 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6004 RExC_seen |= REG_SEEN_RECURSE;
6005 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6006 Set_Node_Offset(ret, parse_start); /* MJD */
6008 *flagp |= POSTPONED;
6009 nextchar(pRExC_state);
6011 } /* named and numeric backreferences */
6014 case '?': /* (??...) */
6016 if (*RExC_parse != '{') {
6018 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6021 *flagp |= POSTPONED;
6022 paren = *RExC_parse++;
6024 case '{': /* (?{...}) */
6029 char *s = RExC_parse;
6031 RExC_seen_zerolen++;
6032 RExC_seen |= REG_SEEN_EVAL;
6033 while (count && (c = *RExC_parse)) {
6044 if (*RExC_parse != ')') {
6046 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6050 OP_4tree *sop, *rop;
6051 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6054 Perl_save_re_context(aTHX);
6055 rop = sv_compile_2op(sv, &sop, "re", &pad);
6056 sop->op_private |= OPpREFCOUNTED;
6057 /* re_dup will OpREFCNT_inc */
6058 OpREFCNT_set(sop, 1);
6061 n = add_data(pRExC_state, 3, "nop");
6062 RExC_rxi->data->data[n] = (void*)rop;
6063 RExC_rxi->data->data[n+1] = (void*)sop;
6064 RExC_rxi->data->data[n+2] = (void*)pad;
6067 else { /* First pass */
6068 if (PL_reginterp_cnt < ++RExC_seen_evals
6070 /* No compiled RE interpolated, has runtime
6071 components ===> unsafe. */
6072 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6073 if (PL_tainting && PL_tainted)
6074 FAIL("Eval-group in insecure regular expression");
6075 #if PERL_VERSION > 8
6076 if (IN_PERL_COMPILETIME)
6081 nextchar(pRExC_state);
6083 ret = reg_node(pRExC_state, LOGICAL);
6086 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6087 /* deal with the length of this later - MJD */
6090 ret = reganode(pRExC_state, EVAL, n);
6091 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6092 Set_Node_Offset(ret, parse_start);
6095 case '(': /* (?(?{...})...) and (?(?=...)...) */
6098 if (RExC_parse[0] == '?') { /* (?(?...)) */
6099 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6100 || RExC_parse[1] == '<'
6101 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6104 ret = reg_node(pRExC_state, LOGICAL);
6107 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6111 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6112 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6114 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6115 char *name_start= RExC_parse++;
6117 SV *sv_dat=reg_scan_name(pRExC_state,
6118 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6119 if (RExC_parse == name_start || *RExC_parse != ch)
6120 vFAIL2("Sequence (?(%c... not terminated",
6121 (ch == '>' ? '<' : ch));
6124 num = add_data( pRExC_state, 1, "S" );
6125 RExC_rxi->data->data[num]=(void*)sv_dat;
6126 SvREFCNT_inc_simple_void(sv_dat);
6128 ret = reganode(pRExC_state,NGROUPP,num);
6129 goto insert_if_check_paren;
6131 else if (RExC_parse[0] == 'D' &&
6132 RExC_parse[1] == 'E' &&
6133 RExC_parse[2] == 'F' &&
6134 RExC_parse[3] == 'I' &&
6135 RExC_parse[4] == 'N' &&
6136 RExC_parse[5] == 'E')
6138 ret = reganode(pRExC_state,DEFINEP,0);
6141 goto insert_if_check_paren;
6143 else if (RExC_parse[0] == 'R') {
6146 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6147 parno = atoi(RExC_parse++);
6148 while (isDIGIT(*RExC_parse))
6150 } else if (RExC_parse[0] == '&') {
6153 sv_dat = reg_scan_name(pRExC_state,
6154 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6155 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6157 ret = reganode(pRExC_state,INSUBP,parno);
6158 goto insert_if_check_paren;
6160 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6163 parno = atoi(RExC_parse++);
6165 while (isDIGIT(*RExC_parse))
6167 ret = reganode(pRExC_state, GROUPP, parno);
6169 insert_if_check_paren:
6170 if ((c = *nextchar(pRExC_state)) != ')')
6171 vFAIL("Switch condition not recognized");
6173 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6174 br = regbranch(pRExC_state, &flags, 1,depth+1);
6176 br = reganode(pRExC_state, LONGJMP, 0);
6178 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6179 c = *nextchar(pRExC_state);
6184 vFAIL("(?(DEFINE)....) does not allow branches");
6185 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6186 regbranch(pRExC_state, &flags, 1,depth+1);
6187 REGTAIL(pRExC_state, ret, lastbr);
6190 c = *nextchar(pRExC_state);
6195 vFAIL("Switch (?(condition)... contains too many branches");
6196 ender = reg_node(pRExC_state, TAIL);
6197 REGTAIL(pRExC_state, br, ender);
6199 REGTAIL(pRExC_state, lastbr, ender);
6200 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6203 REGTAIL(pRExC_state, ret, ender);
6204 RExC_size++; /* XXX WHY do we need this?!!
6205 For large programs it seems to be required
6206 but I can't figure out why. -- dmq*/
6210 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6214 RExC_parse--; /* for vFAIL to print correctly */
6215 vFAIL("Sequence (? incomplete");
6217 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6219 has_use_defaults = TRUE;
6220 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6221 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6225 parse_flags: /* (?i) */
6227 U32 posflags = 0, negflags = 0;
6228 U32 *flagsp = &posflags;
6229 bool has_charset_modifier = 0;
6231 while (*RExC_parse) {
6232 /* && strchr("iogcmsx", *RExC_parse) */
6233 /* (?g), (?gc) and (?o) are useless here
6234 and must be globally applied -- japhy */
6235 switch (*RExC_parse) {
6236 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6237 case LOCALE_PAT_MOD:
6238 if (has_charset_modifier || flagsp == &negflags) {
6239 goto fail_modifiers;
6241 *flagsp &= ~RXf_PMf_UNICODE;
6242 *flagsp |= RXf_PMf_LOCALE;
6243 has_charset_modifier = 1;
6245 case UNICODE_PAT_MOD:
6246 if (has_charset_modifier || flagsp == &negflags) {
6247 goto fail_modifiers;
6249 *flagsp &= ~RXf_PMf_LOCALE;
6250 *flagsp |= RXf_PMf_UNICODE;
6251 has_charset_modifier = 1;
6254 if (has_use_defaults
6255 || has_charset_modifier
6256 || flagsp == &negflags)
6258 goto fail_modifiers;
6260 *flagsp &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6261 has_charset_modifier = 1;
6263 case ONCE_PAT_MOD: /* 'o' */
6264 case GLOBAL_PAT_MOD: /* 'g' */
6265 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6266 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6267 if (! (wastedflags & wflagbit) ) {
6268 wastedflags |= wflagbit;
6271 "Useless (%s%c) - %suse /%c modifier",
6272 flagsp == &negflags ? "?-" : "?",
6274 flagsp == &negflags ? "don't " : "",
6281 case CONTINUE_PAT_MOD: /* 'c' */
6282 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6283 if (! (wastedflags & WASTED_C) ) {
6284 wastedflags |= WASTED_GC;
6287 "Useless (%sc) - %suse /gc modifier",
6288 flagsp == &negflags ? "?-" : "?",
6289 flagsp == &negflags ? "don't " : ""
6294 case KEEPCOPY_PAT_MOD: /* 'p' */
6295 if (flagsp == &negflags) {
6297 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6299 *flagsp |= RXf_PMf_KEEPCOPY;
6303 /* A flag is a default iff it is following a minus, so
6304 * if there is a minus, it means will be trying to
6305 * re-specify a default which is an error */
6306 if (has_use_defaults || flagsp == &negflags) {
6309 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6313 wastedflags = 0; /* reset so (?g-c) warns twice */
6319 RExC_flags |= posflags;
6320 RExC_flags &= ~negflags;
6322 oregflags |= posflags;
6323 oregflags &= ~negflags;
6325 nextchar(pRExC_state);
6336 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6341 }} /* one for the default block, one for the switch */
6348 ret = reganode(pRExC_state, OPEN, parno);
6351 RExC_nestroot = parno;
6352 if (RExC_seen & REG_SEEN_RECURSE
6353 && !RExC_open_parens[parno-1])
6355 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6356 "Setting open paren #%"IVdf" to %d\n",
6357 (IV)parno, REG_NODE_NUM(ret)));
6358 RExC_open_parens[parno-1]= ret;
6361 Set_Node_Length(ret, 1); /* MJD */
6362 Set_Node_Offset(ret, RExC_parse); /* MJD */
6370 /* Pick up the branches, linking them together. */
6371 parse_start = RExC_parse; /* MJD */
6372 br = regbranch(pRExC_state, &flags, 1,depth+1);
6375 if (RExC_npar > after_freeze)
6376 after_freeze = RExC_npar;
6377 RExC_npar = freeze_paren;
6380 /* branch_len = (paren != 0); */
6384 if (*RExC_parse == '|') {
6385 if (!SIZE_ONLY && RExC_extralen) {
6386 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6389 reginsert(pRExC_state, BRANCH, br, depth+1);
6390 Set_Node_Length(br, paren != 0);
6391 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6395 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6397 else if (paren == ':') {
6398 *flagp |= flags&SIMPLE;
6400 if (is_open) { /* Starts with OPEN. */
6401 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6403 else if (paren != '?') /* Not Conditional */
6405 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6407 while (*RExC_parse == '|') {
6408 if (!SIZE_ONLY && RExC_extralen) {
6409 ender = reganode(pRExC_state, LONGJMP,0);
6410 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6413 RExC_extralen += 2; /* Account for LONGJMP. */
6414 nextchar(pRExC_state);
6416 if (RExC_npar > after_freeze)
6417 after_freeze = RExC_npar;
6418 RExC_npar = freeze_paren;
6420 br = regbranch(pRExC_state, &flags, 0, depth+1);
6424 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6426 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6429 if (have_branch || paren != ':') {
6430 /* Make a closing node, and hook it on the end. */
6433 ender = reg_node(pRExC_state, TAIL);
6436 ender = reganode(pRExC_state, CLOSE, parno);
6437 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6438 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6439 "Setting close paren #%"IVdf" to %d\n",
6440 (IV)parno, REG_NODE_NUM(ender)));
6441 RExC_close_parens[parno-1]= ender;
6442 if (RExC_nestroot == parno)
6445 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6446 Set_Node_Length(ender,1); /* MJD */
6452 *flagp &= ~HASWIDTH;
6455 ender = reg_node(pRExC_state, SUCCEED);
6458 ender = reg_node(pRExC_state, END);
6460 assert(!RExC_opend); /* there can only be one! */
6465 REGTAIL(pRExC_state, lastbr, ender);
6467 if (have_branch && !SIZE_ONLY) {
6469 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6471 /* Hook the tails of the branches to the closing node. */
6472 for (br = ret; br; br = regnext(br)) {
6473 const U8 op = PL_regkind[OP(br)];
6475 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6477 else if (op == BRANCHJ) {
6478 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6486 static const char parens[] = "=!<,>";
6488 if (paren && (p = strchr(parens, paren))) {
6489 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6490 int flag = (p - parens) > 1;
6493 node = SUSPEND, flag = 0;
6494 reginsert(pRExC_state, node,ret, depth+1);
6495 Set_Node_Cur_Length(ret);
6496 Set_Node_Offset(ret, parse_start + 1);
6498 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6502 /* Check for proper termination. */
6504 RExC_flags = oregflags;
6505 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6506 RExC_parse = oregcomp_parse;
6507 vFAIL("Unmatched (");
6510 else if (!paren && RExC_parse < RExC_end) {
6511 if (*RExC_parse == ')') {
6513 vFAIL("Unmatched )");
6516 FAIL("Junk on end of regexp"); /* "Can't happen". */
6520 RExC_npar = after_freeze;
6525 - regbranch - one alternative of an | operator
6527 * Implements the concatenation operator.
6530 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6533 register regnode *ret;
6534 register regnode *chain = NULL;
6535 register regnode *latest;
6536 I32 flags = 0, c = 0;
6537 GET_RE_DEBUG_FLAGS_DECL;
6539 PERL_ARGS_ASSERT_REGBRANCH;
6541 DEBUG_PARSE("brnc");
6546 if (!SIZE_ONLY && RExC_extralen)
6547 ret = reganode(pRExC_state, BRANCHJ,0);
6549 ret = reg_node(pRExC_state, BRANCH);
6550 Set_Node_Length(ret, 1);
6554 if (!first && SIZE_ONLY)
6555 RExC_extralen += 1; /* BRANCHJ */
6557 *flagp = WORST; /* Tentatively. */
6560 nextchar(pRExC_state);
6561 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6563 latest = regpiece(pRExC_state, &flags,depth+1);
6564 if (latest == NULL) {
6565 if (flags & TRYAGAIN)
6569 else if (ret == NULL)
6571 *flagp |= flags&(HASWIDTH|POSTPONED);
6572 if (chain == NULL) /* First piece. */
6573 *flagp |= flags&SPSTART;
6576 REGTAIL(pRExC_state, chain, latest);
6581 if (chain == NULL) { /* Loop ran zero times. */
6582 chain = reg_node(pRExC_state, NOTHING);
6587 *flagp |= flags&SIMPLE;
6594 - regpiece - something followed by possible [*+?]
6596 * Note that the branching code sequences used for ? and the general cases
6597 * of * and + are somewhat optimized: they use the same NOTHING node as
6598 * both the endmarker for their branch list and the body of the last branch.
6599 * It might seem that this node could be dispensed with entirely, but the
6600 * endmarker role is not redundant.
6603 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6606 register regnode *ret;
6608 register char *next;
6610 const char * const origparse = RExC_parse;
6612 I32 max = REG_INFTY;
6614 const char *maxpos = NULL;
6615 GET_RE_DEBUG_FLAGS_DECL;
6617 PERL_ARGS_ASSERT_REGPIECE;
6619 DEBUG_PARSE("piec");
6621 ret = regatom(pRExC_state, &flags,depth+1);
6623 if (flags & TRYAGAIN)
6630 if (op == '{' && regcurly(RExC_parse)) {
6632 parse_start = RExC_parse; /* MJD */
6633 next = RExC_parse + 1;
6634 while (isDIGIT(*next) || *next == ',') {
6643 if (*next == '}') { /* got one */
6647 min = atoi(RExC_parse);
6651 maxpos = RExC_parse;
6653 if (!max && *maxpos != '0')
6654 max = REG_INFTY; /* meaning "infinity" */
6655 else if (max >= REG_INFTY)
6656 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6658 nextchar(pRExC_state);
6661 if ((flags&SIMPLE)) {
6662 RExC_naughty += 2 + RExC_naughty / 2;
6663 reginsert(pRExC_state, CURLY, ret, depth+1);
6664 Set_Node_Offset(ret, parse_start+1); /* MJD */
6665 Set_Node_Cur_Length(ret);
6668 regnode * const w = reg_node(pRExC_state, WHILEM);
6671 REGTAIL(pRExC_state, ret, w);
6672 if (!SIZE_ONLY && RExC_extralen) {
6673 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6674 reginsert(pRExC_state, NOTHING,ret, depth+1);
6675 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6677 reginsert(pRExC_state, CURLYX,ret, depth+1);
6679 Set_Node_Offset(ret, parse_start+1);
6680 Set_Node_Length(ret,
6681 op == '{' ? (RExC_parse - parse_start) : 1);
6683 if (!SIZE_ONLY && RExC_extralen)
6684 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6685 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6687 RExC_whilem_seen++, RExC_extralen += 3;
6688 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6697 vFAIL("Can't do {n,m} with n > m");
6699 ARG1_SET(ret, (U16)min);
6700 ARG2_SET(ret, (U16)max);
6712 #if 0 /* Now runtime fix should be reliable. */
6714 /* if this is reinstated, don't forget to put this back into perldiag:
6716 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6718 (F) The part of the regexp subject to either the * or + quantifier
6719 could match an empty string. The {#} shows in the regular
6720 expression about where the problem was discovered.
6724 if (!(flags&HASWIDTH) && op != '?')
6725 vFAIL("Regexp *+ operand could be empty");
6728 parse_start = RExC_parse;
6729 nextchar(pRExC_state);
6731 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6733 if (op == '*' && (flags&SIMPLE)) {
6734 reginsert(pRExC_state, STAR, ret, depth+1);
6738 else if (op == '*') {
6742 else if (op == '+' && (flags&SIMPLE)) {
6743 reginsert(pRExC_state, PLUS, ret, depth+1);
6747 else if (op == '+') {
6751 else if (op == '?') {
6756 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6757 ckWARN3reg(RExC_parse,
6758 "%.*s matches null string many times",
6759 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6763 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6764 nextchar(pRExC_state);
6765 reginsert(pRExC_state, MINMOD, ret, depth+1);
6766 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6768 #ifndef REG_ALLOW_MINMOD_SUSPEND
6771 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6773 nextchar(pRExC_state);
6774 ender = reg_node(pRExC_state, SUCCEED);
6775 REGTAIL(pRExC_state, ret, ender);
6776 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6778 ender = reg_node(pRExC_state, TAIL);
6779 REGTAIL(pRExC_state, ret, ender);
6783 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6785 vFAIL("Nested quantifiers");
6792 /* reg_namedseq(pRExC_state,UVp)
6794 This is expected to be called by a parser routine that has
6795 recognized '\N' and needs to handle the rest. RExC_parse is
6796 expected to point at the first char following the N at the time
6799 The \N may be inside (indicated by valuep not being NULL) or outside a
6802 \N may begin either a named sequence, or if outside a character class, mean
6803 to match a non-newline. For non single-quoted regexes, the tokenizer has
6804 attempted to decide which, and in the case of a named sequence converted it
6805 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6806 where c1... are the characters in the sequence. For single-quoted regexes,
6807 the tokenizer passes the \N sequence through unchanged; this code will not
6808 attempt to determine this nor expand those. The net effect is that if the
6809 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6810 signals that this \N occurrence means to match a non-newline.
6812 Only the \N{U+...} form should occur in a character class, for the same
6813 reason that '.' inside a character class means to just match a period: it
6814 just doesn't make sense.
6816 If valuep is non-null then it is assumed that we are parsing inside
6817 of a charclass definition and the first codepoint in the resolved
6818 string is returned via *valuep and the routine will return NULL.
6819 In this mode if a multichar string is returned from the charnames
6820 handler, a warning will be issued, and only the first char in the
6821 sequence will be examined. If the string returned is zero length
6822 then the value of *valuep is undefined and NON-NULL will
6823 be returned to indicate failure. (This will NOT be a valid pointer
6826 If valuep is null then it is assumed that we are parsing normal text and a
6827 new EXACT node is inserted into the program containing the resolved string,
6828 and a pointer to the new node is returned. But if the string is zero length
6829 a NOTHING node is emitted instead.
6831 On success RExC_parse is set to the char following the endbrace.
6832 Parsing failures will generate a fatal error via vFAIL(...)
6835 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6837 char * endbrace; /* '}' following the name */
6838 regnode *ret = NULL;
6840 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6844 GET_RE_DEBUG_FLAGS_DECL;
6846 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6850 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6851 * modifier. The other meaning does not */
6852 p = (RExC_flags & RXf_PMf_EXTENDED)
6853 ? regwhite( pRExC_state, RExC_parse )
6856 /* Disambiguate between \N meaning a named character versus \N meaning
6857 * [^\n]. The former is assumed when it can't be the latter. */
6858 if (*p != '{' || regcurly(p)) {
6861 /* no bare \N in a charclass */
6862 vFAIL("\\N in a character class must be a named character: \\N{...}");
6864 nextchar(pRExC_state);
6865 ret = reg_node(pRExC_state, REG_ANY);
6866 *flagp |= HASWIDTH|SIMPLE;
6869 Set_Node_Length(ret, 1); /* MJD */
6873 /* Here, we have decided it should be a named sequence */
6875 /* The test above made sure that the next real character is a '{', but
6876 * under the /x modifier, it could be separated by space (or a comment and
6877 * \n) and this is not allowed (for consistency with \x{...} and the
6878 * tokenizer handling of \N{NAME}). */
6879 if (*RExC_parse != '{') {
6880 vFAIL("Missing braces on \\N{}");
6883 RExC_parse++; /* Skip past the '{' */
6885 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6886 || ! (endbrace == RExC_parse /* nothing between the {} */
6887 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6888 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6890 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6891 vFAIL("\\N{NAME} must be resolved by the lexer");
6894 if (endbrace == RExC_parse) { /* empty: \N{} */
6896 RExC_parse = endbrace + 1;
6897 return reg_node(pRExC_state,NOTHING);
6901 ckWARNreg(RExC_parse,
6902 "Ignoring zero length \\N{} in character class"
6904 RExC_parse = endbrace + 1;
6907 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6910 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
6911 RExC_parse += 2; /* Skip past the 'U+' */
6913 if (valuep) { /* In a bracketed char class */
6914 /* We only pay attention to the first char of
6915 multichar strings being returned. I kinda wonder
6916 if this makes sense as it does change the behaviour
6917 from earlier versions, OTOH that behaviour was broken
6918 as well. XXX Solution is to recharacterize as
6919 [rest-of-class]|multi1|multi2... */
6921 STRLEN length_of_hex;
6922 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6923 | PERL_SCAN_DISALLOW_PREFIX
6924 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6926 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6927 if (endchar < endbrace) {
6928 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6931 length_of_hex = (STRLEN)(endchar - RExC_parse);
6932 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6934 /* The tokenizer should have guaranteed validity, but it's possible to
6935 * bypass it by using single quoting, so check */
6936 if (length_of_hex == 0
6937 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6939 RExC_parse += length_of_hex; /* Includes all the valid */
6940 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6941 ? UTF8SKIP(RExC_parse)
6943 /* Guard against malformed utf8 */
6944 if (RExC_parse >= endchar) RExC_parse = endchar;
6945 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6948 RExC_parse = endbrace + 1;
6949 if (endchar == endbrace) return NULL;
6951 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6953 else { /* Not a char class */
6954 char *s; /* String to put in generated EXACT node */
6955 STRLEN len = 0; /* Its current byte length */
6956 char *endchar; /* Points to '.' or '}' ending cur char in the input
6959 ret = reg_node(pRExC_state,
6960 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6963 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6964 * the input which is of the form now 'c1.c2.c3...}' until find the
6965 * ending brace or exceed length 255. The characters that exceed this
6966 * limit are dropped. The limit could be relaxed should it become
6967 * desirable by reparsing this as (?:\N{NAME}), so could generate
6968 * multiple EXACT nodes, as is done for just regular input. But this
6969 * is primarily a named character, and not intended to be a huge long
6970 * string, so 255 bytes should be good enough */
6972 STRLEN length_of_hex;
6973 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6974 | PERL_SCAN_DISALLOW_PREFIX
6975 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6976 UV cp; /* Ord of current character */
6978 /* Code points are separated by dots. If none, there is only one
6979 * code point, and is terminated by the brace */
6980 endchar = RExC_parse + strcspn(RExC_parse, ".}");
6982 /* The values are Unicode even on EBCDIC machines */
6983 length_of_hex = (STRLEN)(endchar - RExC_parse);
6984 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6985 if ( length_of_hex == 0
6986 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6988 RExC_parse += length_of_hex; /* Includes all the valid */
6989 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6990 ? UTF8SKIP(RExC_parse)
6992 /* Guard against malformed utf8 */
6993 if (RExC_parse >= endchar) RExC_parse = endchar;
6994 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6997 if (! FOLD) { /* Not folding, just append to the string */
7000 /* Quit before adding this character if would exceed limit */
7001 if (len + UNISKIP(cp) > U8_MAX) break;
7003 unilen = reguni(pRExC_state, cp, s);
7008 } else { /* Folding, output the folded equivalent */
7009 STRLEN foldlen,numlen;
7010 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7011 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7013 /* Quit before exceeding size limit */
7014 if (len + foldlen > U8_MAX) break;
7016 for (foldbuf = tmpbuf;
7020 cp = utf8_to_uvchr(foldbuf, &numlen);
7022 const STRLEN unilen = reguni(pRExC_state, cp, s);
7025 /* In EBCDIC the numlen and unilen can differ. */
7027 if (numlen >= foldlen)
7031 break; /* "Can't happen." */
7035 /* Point to the beginning of the next character in the sequence. */
7036 RExC_parse = endchar + 1;
7038 /* Quit if no more characters */
7039 if (RExC_parse >= endbrace) break;
7044 if (RExC_parse < endbrace) {
7045 ckWARNreg(RExC_parse - 1,
7046 "Using just the first characters returned by \\N{}");
7049 RExC_size += STR_SZ(len);
7052 RExC_emit += STR_SZ(len);
7055 RExC_parse = endbrace + 1;
7057 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7058 with malformed in t/re/pat_advanced.t */
7060 Set_Node_Cur_Length(ret); /* MJD */
7061 nextchar(pRExC_state);
7071 * It returns the code point in utf8 for the value in *encp.
7072 * value: a code value in the source encoding
7073 * encp: a pointer to an Encode object
7075 * If the result from Encode is not a single character,
7076 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7079 S_reg_recode(pTHX_ const char value, SV **encp)
7082 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7083 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7084 const STRLEN newlen = SvCUR(sv);
7085 UV uv = UNICODE_REPLACEMENT;
7087 PERL_ARGS_ASSERT_REG_RECODE;
7091 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7094 if (!newlen || numlen != newlen) {
7095 uv = UNICODE_REPLACEMENT;
7103 - regatom - the lowest level
7105 Try to identify anything special at the start of the pattern. If there
7106 is, then handle it as required. This may involve generating a single regop,
7107 such as for an assertion; or it may involve recursing, such as to
7108 handle a () structure.
7110 If the string doesn't start with something special then we gobble up
7111 as much literal text as we can.
7113 Once we have been able to handle whatever type of thing started the
7114 sequence, we return.
7116 Note: we have to be careful with escapes, as they can be both literal
7117 and special, and in the case of \10 and friends can either, depending
7118 on context. Specifically there are two seperate switches for handling
7119 escape sequences, with the one for handling literal escapes requiring
7120 a dummy entry for all of the special escapes that are actually handled
7125 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7128 register regnode *ret = NULL;
7130 char *parse_start = RExC_parse;
7131 GET_RE_DEBUG_FLAGS_DECL;
7132 DEBUG_PARSE("atom");
7133 *flagp = WORST; /* Tentatively. */
7135 PERL_ARGS_ASSERT_REGATOM;
7138 switch ((U8)*RExC_parse) {
7140 RExC_seen_zerolen++;
7141 nextchar(pRExC_state);
7142 if (RExC_flags & RXf_PMf_MULTILINE)
7143 ret = reg_node(pRExC_state, MBOL);
7144 else if (RExC_flags & RXf_PMf_SINGLELINE)
7145 ret = reg_node(pRExC_state, SBOL);
7147 ret = reg_node(pRExC_state, BOL);
7148 Set_Node_Length(ret, 1); /* MJD */
7151 nextchar(pRExC_state);
7153 RExC_seen_zerolen++;
7154 if (RExC_flags & RXf_PMf_MULTILINE)
7155 ret = reg_node(pRExC_state, MEOL);
7156 else if (RExC_flags & RXf_PMf_SINGLELINE)
7157 ret = reg_node(pRExC_state, SEOL);
7159 ret = reg_node(pRExC_state, EOL);
7160 Set_Node_Length(ret, 1); /* MJD */
7163 nextchar(pRExC_state);
7164 if (RExC_flags & RXf_PMf_SINGLELINE)
7165 ret = reg_node(pRExC_state, SANY);
7167 ret = reg_node(pRExC_state, REG_ANY);
7168 *flagp |= HASWIDTH|SIMPLE;
7170 Set_Node_Length(ret, 1); /* MJD */
7174 char * const oregcomp_parse = ++RExC_parse;
7175 ret = regclass(pRExC_state,depth+1);
7176 if (*RExC_parse != ']') {
7177 RExC_parse = oregcomp_parse;
7178 vFAIL("Unmatched [");
7180 nextchar(pRExC_state);
7181 *flagp |= HASWIDTH|SIMPLE;
7182 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7186 nextchar(pRExC_state);
7187 ret = reg(pRExC_state, 1, &flags,depth+1);
7189 if (flags & TRYAGAIN) {
7190 if (RExC_parse == RExC_end) {
7191 /* Make parent create an empty node if needed. */
7199 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7203 if (flags & TRYAGAIN) {
7207 vFAIL("Internal urp");
7208 /* Supposed to be caught earlier. */
7211 if (!regcurly(RExC_parse)) {
7220 vFAIL("Quantifier follows nothing");
7228 len=0; /* silence a spurious compiler warning */
7229 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7230 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7231 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7232 ret = reganode(pRExC_state, FOLDCHAR, cp);
7233 Set_Node_Length(ret, 1); /* MJD */
7234 nextchar(pRExC_state); /* kill whitespace under /x */
7242 This switch handles escape sequences that resolve to some kind
7243 of special regop and not to literal text. Escape sequnces that
7244 resolve to literal text are handled below in the switch marked
7247 Every entry in this switch *must* have a corresponding entry
7248 in the literal escape switch. However, the opposite is not
7249 required, as the default for this switch is to jump to the
7250 literal text handling code.
7252 switch ((U8)*++RExC_parse) {
7257 /* Special Escapes */
7259 RExC_seen_zerolen++;
7260 ret = reg_node(pRExC_state, SBOL);
7262 goto finish_meta_pat;
7264 ret = reg_node(pRExC_state, GPOS);
7265 RExC_seen |= REG_SEEN_GPOS;
7267 goto finish_meta_pat;
7269 RExC_seen_zerolen++;
7270 ret = reg_node(pRExC_state, KEEPS);
7272 /* XXX:dmq : disabling in-place substitution seems to
7273 * be necessary here to avoid cases of memory corruption, as
7274 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7276 RExC_seen |= REG_SEEN_LOOKBEHIND;
7277 goto finish_meta_pat;
7279 ret = reg_node(pRExC_state, SEOL);
7281 RExC_seen_zerolen++; /* Do not optimize RE away */
7282 goto finish_meta_pat;
7284 ret = reg_node(pRExC_state, EOS);
7286 RExC_seen_zerolen++; /* Do not optimize RE away */
7287 goto finish_meta_pat;
7289 ret = reg_node(pRExC_state, CANY);
7290 RExC_seen |= REG_SEEN_CANY;
7291 *flagp |= HASWIDTH|SIMPLE;
7292 goto finish_meta_pat;
7294 ret = reg_node(pRExC_state, CLUMP);
7296 goto finish_meta_pat;
7299 ret = reg_node(pRExC_state, (U8)(ALNUML));
7301 ret = reg_node(pRExC_state, (U8)(ALNUM));
7302 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7304 *flagp |= HASWIDTH|SIMPLE;
7305 goto finish_meta_pat;
7308 ret = reg_node(pRExC_state, (U8)(NALNUML));
7310 ret = reg_node(pRExC_state, (U8)(NALNUM));
7311 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7313 *flagp |= HASWIDTH|SIMPLE;
7314 goto finish_meta_pat;
7316 RExC_seen_zerolen++;
7317 RExC_seen |= REG_SEEN_LOOKBEHIND;
7319 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7321 ret = reg_node(pRExC_state, (U8)(BOUND));
7322 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7325 goto finish_meta_pat;
7327 RExC_seen_zerolen++;
7328 RExC_seen |= REG_SEEN_LOOKBEHIND;
7330 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7332 ret = reg_node(pRExC_state, (U8)(NBOUND));
7333 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7336 goto finish_meta_pat;
7339 ret = reg_node(pRExC_state, (U8)(SPACEL));
7341 ret = reg_node(pRExC_state, (U8)(SPACE));
7342 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7344 *flagp |= HASWIDTH|SIMPLE;
7345 goto finish_meta_pat;
7348 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7350 ret = reg_node(pRExC_state, (U8)(NSPACE));
7351 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7353 *flagp |= HASWIDTH|SIMPLE;
7354 goto finish_meta_pat;
7356 ret = reg_node(pRExC_state, DIGIT);
7357 *flagp |= HASWIDTH|SIMPLE;
7358 goto finish_meta_pat;
7360 ret = reg_node(pRExC_state, NDIGIT);
7361 *flagp |= HASWIDTH|SIMPLE;
7362 goto finish_meta_pat;
7364 ret = reg_node(pRExC_state, LNBREAK);
7365 *flagp |= HASWIDTH|SIMPLE;
7366 goto finish_meta_pat;
7368 ret = reg_node(pRExC_state, HORIZWS);
7369 *flagp |= HASWIDTH|SIMPLE;
7370 goto finish_meta_pat;
7372 ret = reg_node(pRExC_state, NHORIZWS);
7373 *flagp |= HASWIDTH|SIMPLE;
7374 goto finish_meta_pat;
7376 ret = reg_node(pRExC_state, VERTWS);
7377 *flagp |= HASWIDTH|SIMPLE;
7378 goto finish_meta_pat;
7380 ret = reg_node(pRExC_state, NVERTWS);
7381 *flagp |= HASWIDTH|SIMPLE;
7383 nextchar(pRExC_state);
7384 Set_Node_Length(ret, 2); /* MJD */
7389 char* const oldregxend = RExC_end;
7391 char* parse_start = RExC_parse - 2;
7394 if (RExC_parse[1] == '{') {
7395 /* a lovely hack--pretend we saw [\pX] instead */
7396 RExC_end = strchr(RExC_parse, '}');
7398 const U8 c = (U8)*RExC_parse;
7400 RExC_end = oldregxend;
7401 vFAIL2("Missing right brace on \\%c{}", c);
7406 RExC_end = RExC_parse + 2;
7407 if (RExC_end > oldregxend)
7408 RExC_end = oldregxend;
7412 ret = regclass(pRExC_state,depth+1);
7414 RExC_end = oldregxend;
7417 Set_Node_Offset(ret, parse_start + 2);
7418 Set_Node_Cur_Length(ret);
7419 nextchar(pRExC_state);
7420 *flagp |= HASWIDTH|SIMPLE;
7424 /* Handle \N and \N{NAME} here and not below because it can be
7425 multicharacter. join_exact() will join them up later on.
7426 Also this makes sure that things like /\N{BLAH}+/ and
7427 \N{BLAH} being multi char Just Happen. dmq*/
7429 ret= reg_namedseq(pRExC_state, NULL, flagp);
7431 case 'k': /* Handle \k<NAME> and \k'NAME' */
7434 char ch= RExC_parse[1];
7435 if (ch != '<' && ch != '\'' && ch != '{') {
7437 vFAIL2("Sequence %.2s... not terminated",parse_start);
7439 /* this pretty much dupes the code for (?P=...) in reg(), if
7440 you change this make sure you change that */
7441 char* name_start = (RExC_parse += 2);
7443 SV *sv_dat = reg_scan_name(pRExC_state,
7444 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7445 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7446 if (RExC_parse == name_start || *RExC_parse != ch)
7447 vFAIL2("Sequence %.3s... not terminated",parse_start);
7450 num = add_data( pRExC_state, 1, "S" );
7451 RExC_rxi->data->data[num]=(void*)sv_dat;
7452 SvREFCNT_inc_simple_void(sv_dat);
7456 ret = reganode(pRExC_state,
7457 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7461 /* override incorrect value set in reganode MJD */
7462 Set_Node_Offset(ret, parse_start+1);
7463 Set_Node_Cur_Length(ret); /* MJD */
7464 nextchar(pRExC_state);
7470 case '1': case '2': case '3': case '4':
7471 case '5': case '6': case '7': case '8': case '9':
7474 bool isg = *RExC_parse == 'g';
7479 if (*RExC_parse == '{') {
7483 if (*RExC_parse == '-') {
7487 if (hasbrace && !isDIGIT(*RExC_parse)) {
7488 if (isrel) RExC_parse--;
7490 goto parse_named_seq;
7492 num = atoi(RExC_parse);
7493 if (isg && num == 0)
7494 vFAIL("Reference to invalid group 0");
7496 num = RExC_npar - num;
7498 vFAIL("Reference to nonexistent or unclosed group");
7500 if (!isg && num > 9 && num >= RExC_npar)
7503 char * const parse_start = RExC_parse - 1; /* MJD */
7504 while (isDIGIT(*RExC_parse))
7506 if (parse_start == RExC_parse - 1)
7507 vFAIL("Unterminated \\g... pattern");
7509 if (*RExC_parse != '}')
7510 vFAIL("Unterminated \\g{...} pattern");
7514 if (num > (I32)RExC_rx->nparens)
7515 vFAIL("Reference to nonexistent group");
7518 ret = reganode(pRExC_state,
7519 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7523 /* override incorrect value set in reganode MJD */
7524 Set_Node_Offset(ret, parse_start+1);
7525 Set_Node_Cur_Length(ret); /* MJD */
7527 nextchar(pRExC_state);
7532 if (RExC_parse >= RExC_end)
7533 FAIL("Trailing \\");
7536 /* Do not generate "unrecognized" warnings here, we fall
7537 back into the quick-grab loop below */
7544 if (RExC_flags & RXf_PMf_EXTENDED) {
7545 if ( reg_skipcomment( pRExC_state ) )
7552 register STRLEN len;
7557 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7559 parse_start = RExC_parse - 1;
7565 ret = reg_node(pRExC_state,
7566 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7568 for (len = 0, p = RExC_parse - 1;
7569 len < 127 && p < RExC_end;
7572 char * const oldp = p;
7574 if (RExC_flags & RXf_PMf_EXTENDED)
7575 p = regwhite( pRExC_state, p );
7580 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7581 goto normal_default;
7591 /* Literal Escapes Switch
7593 This switch is meant to handle escape sequences that
7594 resolve to a literal character.
7596 Every escape sequence that represents something
7597 else, like an assertion or a char class, is handled
7598 in the switch marked 'Special Escapes' above in this
7599 routine, but also has an entry here as anything that
7600 isn't explicitly mentioned here will be treated as
7601 an unescaped equivalent literal.
7605 /* These are all the special escapes. */
7609 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7610 goto normal_default;
7611 case 'A': /* Start assertion */
7612 case 'b': case 'B': /* Word-boundary assertion*/
7613 case 'C': /* Single char !DANGEROUS! */
7614 case 'd': case 'D': /* digit class */
7615 case 'g': case 'G': /* generic-backref, pos assertion */
7616 case 'h': case 'H': /* HORIZWS */
7617 case 'k': case 'K': /* named backref, keep marker */
7618 case 'N': /* named char sequence */
7619 case 'p': case 'P': /* Unicode property */
7620 case 'R': /* LNBREAK */
7621 case 's': case 'S': /* space class */
7622 case 'v': case 'V': /* VERTWS */
7623 case 'w': case 'W': /* word class */
7624 case 'X': /* eXtended Unicode "combining character sequence" */
7625 case 'z': case 'Z': /* End of line/string assertion */
7629 /* Anything after here is an escape that resolves to a
7630 literal. (Except digits, which may or may not)
7649 ender = ASCII_TO_NATIVE('\033');
7653 ender = ASCII_TO_NATIVE('\007');
7658 STRLEN brace_len = len;
7660 const char* error_msg;
7662 bool valid = grok_bslash_o(p,
7669 RExC_parse = p; /* going to die anyway; point
7670 to exact spot of failure */
7677 if (PL_encoding && ender < 0x100) {
7678 goto recode_encoding;
7687 char* const e = strchr(p, '}');
7691 vFAIL("Missing right brace on \\x{}");
7694 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7695 | PERL_SCAN_DISALLOW_PREFIX;
7696 STRLEN numlen = e - p - 1;
7697 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7704 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7706 ender = grok_hex(p, &numlen, &flags, NULL);
7709 if (PL_encoding && ender < 0x100)
7710 goto recode_encoding;
7714 ender = grok_bslash_c(*p++, SIZE_ONLY);
7716 case '0': case '1': case '2': case '3':case '4':
7717 case '5': case '6': case '7': case '8':case '9':
7719 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7721 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7723 ender = grok_oct(p, &numlen, &flags, NULL);
7733 if (PL_encoding && ender < 0x100)
7734 goto recode_encoding;
7738 SV* enc = PL_encoding;
7739 ender = reg_recode((const char)(U8)ender, &enc);
7740 if (!enc && SIZE_ONLY)
7741 ckWARNreg(p, "Invalid escape in the specified encoding");
7747 FAIL("Trailing \\");
7750 if (!SIZE_ONLY&& isALPHA(*p))
7751 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7752 goto normal_default;
7757 if (UTF8_IS_START(*p) && UTF) {
7759 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7760 &numlen, UTF8_ALLOW_DEFAULT);
7767 if ( RExC_flags & RXf_PMf_EXTENDED)
7768 p = regwhite( pRExC_state, p );
7770 /* Prime the casefolded buffer. */
7771 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7773 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7778 /* Emit all the Unicode characters. */
7780 for (foldbuf = tmpbuf;
7782 foldlen -= numlen) {
7783 ender = utf8_to_uvchr(foldbuf, &numlen);
7785 const STRLEN unilen = reguni(pRExC_state, ender, s);
7788 /* In EBCDIC the numlen
7789 * and unilen can differ. */
7791 if (numlen >= foldlen)
7795 break; /* "Can't happen." */
7799 const STRLEN unilen = reguni(pRExC_state, ender, s);
7808 REGC((char)ender, s++);
7814 /* Emit all the Unicode characters. */
7816 for (foldbuf = tmpbuf;
7818 foldlen -= numlen) {
7819 ender = utf8_to_uvchr(foldbuf, &numlen);
7821 const STRLEN unilen = reguni(pRExC_state, ender, s);
7824 /* In EBCDIC the numlen
7825 * and unilen can differ. */
7827 if (numlen >= foldlen)
7835 const STRLEN unilen = reguni(pRExC_state, ender, s);
7844 REGC((char)ender, s++);
7848 Set_Node_Cur_Length(ret); /* MJD */
7849 nextchar(pRExC_state);
7851 /* len is STRLEN which is unsigned, need to copy to signed */
7854 vFAIL("Internal disaster");
7858 if (len == 1 && UNI_IS_INVARIANT(ender))
7862 RExC_size += STR_SZ(len);
7865 RExC_emit += STR_SZ(len);
7875 S_regwhite( RExC_state_t *pRExC_state, char *p )
7877 const char *e = RExC_end;
7879 PERL_ARGS_ASSERT_REGWHITE;
7884 else if (*p == '#') {
7893 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7901 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7902 Character classes ([:foo:]) can also be negated ([:^foo:]).
7903 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7904 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7905 but trigger failures because they are currently unimplemented. */
7907 #define POSIXCC_DONE(c) ((c) == ':')
7908 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7909 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7912 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7915 I32 namedclass = OOB_NAMEDCLASS;
7917 PERL_ARGS_ASSERT_REGPPOSIXCC;
7919 if (value == '[' && RExC_parse + 1 < RExC_end &&
7920 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7921 POSIXCC(UCHARAT(RExC_parse))) {
7922 const char c = UCHARAT(RExC_parse);
7923 char* const s = RExC_parse++;
7925 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7927 if (RExC_parse == RExC_end)
7928 /* Grandfather lone [:, [=, [. */
7931 const char* const t = RExC_parse++; /* skip over the c */
7934 if (UCHARAT(RExC_parse) == ']') {
7935 const char *posixcc = s + 1;
7936 RExC_parse++; /* skip over the ending ] */
7939 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7940 const I32 skip = t - posixcc;
7942 /* Initially switch on the length of the name. */
7945 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7946 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7949 /* Names all of length 5. */
7950 /* alnum alpha ascii blank cntrl digit graph lower
7951 print punct space upper */
7952 /* Offset 4 gives the best switch position. */
7953 switch (posixcc[4]) {
7955 if (memEQ(posixcc, "alph", 4)) /* alpha */
7956 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7959 if (memEQ(posixcc, "spac", 4)) /* space */
7960 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7963 if (memEQ(posixcc, "grap", 4)) /* graph */
7964 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7967 if (memEQ(posixcc, "asci", 4)) /* ascii */
7968 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7971 if (memEQ(posixcc, "blan", 4)) /* blank */
7972 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7975 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7976 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7979 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7980 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7983 if (memEQ(posixcc, "lowe", 4)) /* lower */
7984 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7985 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7986 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7989 if (memEQ(posixcc, "digi", 4)) /* digit */
7990 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7991 else if (memEQ(posixcc, "prin", 4)) /* print */
7992 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7993 else if (memEQ(posixcc, "punc", 4)) /* punct */
7994 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7999 if (memEQ(posixcc, "xdigit", 6))
8000 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8004 if (namedclass == OOB_NAMEDCLASS)
8005 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8007 assert (posixcc[skip] == ':');
8008 assert (posixcc[skip+1] == ']');
8009 } else if (!SIZE_ONLY) {
8010 /* [[=foo=]] and [[.foo.]] are still future. */
8012 /* adjust RExC_parse so the warning shows after
8014 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8016 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8019 /* Maternal grandfather:
8020 * "[:" ending in ":" but not in ":]" */
8030 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8034 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8036 if (POSIXCC(UCHARAT(RExC_parse))) {
8037 const char *s = RExC_parse;
8038 const char c = *s++;
8042 if (*s && c == *s && s[1] == ']') {
8044 "POSIX syntax [%c %c] belongs inside character classes",
8047 /* [[=foo=]] and [[.foo.]] are still future. */
8048 if (POSIXCC_NOTYET(c)) {
8049 /* adjust RExC_parse so the error shows after
8051 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8053 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8060 #define _C_C_T_(NAME,TEST,WORD) \
8063 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8065 for (value = 0; value < 256; value++) \
8067 ANYOF_BITMAP_SET(ret, value); \
8072 case ANYOF_N##NAME: \
8074 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8076 for (value = 0; value < 256; value++) \
8078 ANYOF_BITMAP_SET(ret, value); \
8084 /* Like above, but no locale test */
8085 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8087 for (value = 0; value < 256; value++) \
8089 ANYOF_BITMAP_SET(ret, value); \
8093 case ANYOF_N##NAME: \
8094 for (value = 0; value < 256; value++) \
8096 ANYOF_BITMAP_SET(ret, value); \
8101 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8102 * there are two tests passed in, to use depending on that. There aren't any
8103 * cases where the label is different from the name, so no need for that
8105 #define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD) \
8107 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8108 else if (UNI_SEMANTICS) { \
8109 for (value = 0; value < 256; value++) { \
8110 if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
8114 for (value = 0; value < 256; value++) { \
8115 if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
8121 case ANYOF_N##NAME: \
8122 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8123 else if (UNI_SEMANTICS) { \
8124 for (value = 0; value < 256; value++) { \
8125 if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
8129 for (value = 0; value < 256; value++) { \
8130 if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
8138 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8139 so that it is possible to override the option here without having to
8140 rebuild the entire core. as we are required to do if we change regcomp.h
8141 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8143 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8144 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8147 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8148 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8150 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8154 parse a class specification and produce either an ANYOF node that
8155 matches the pattern or if the pattern matches a single char only and
8156 that char is < 256 and we are case insensitive then we produce an
8161 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8164 register UV nextvalue;
8165 register IV prevvalue = OOB_UNICODE;
8166 register IV range = 0;
8167 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8168 register regnode *ret;
8171 char *rangebegin = NULL;
8172 bool need_class = 0;
8175 bool optimize_invert = TRUE;
8176 AV* unicode_alternate = NULL;
8178 UV literal_endpoint = 0;
8180 UV stored = 0; /* number of chars stored in the class */
8182 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8183 case we need to change the emitted regop to an EXACT. */
8184 const char * orig_parse = RExC_parse;
8185 GET_RE_DEBUG_FLAGS_DECL;
8187 PERL_ARGS_ASSERT_REGCLASS;
8189 PERL_UNUSED_ARG(depth);
8192 DEBUG_PARSE("clas");
8194 /* Assume we are going to generate an ANYOF node. */
8195 ret = reganode(pRExC_state, ANYOF, 0);
8198 ANYOF_FLAGS(ret) = 0;
8200 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
8204 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8208 RExC_size += ANYOF_SKIP;
8209 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8212 RExC_emit += ANYOF_SKIP;
8214 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8216 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8217 ANYOF_BITMAP_ZERO(ret);
8218 listsv = newSVpvs("# comment\n");
8221 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8223 if (!SIZE_ONLY && POSIXCC(nextvalue))
8224 checkposixcc(pRExC_state);
8226 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8227 if (UCHARAT(RExC_parse) == ']')
8231 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8235 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8238 rangebegin = RExC_parse;
8240 value = utf8n_to_uvchr((U8*)RExC_parse,
8241 RExC_end - RExC_parse,
8242 &numlen, UTF8_ALLOW_DEFAULT);
8243 RExC_parse += numlen;
8246 value = UCHARAT(RExC_parse++);
8248 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8249 if (value == '[' && POSIXCC(nextvalue))
8250 namedclass = regpposixcc(pRExC_state, value);
8251 else if (value == '\\') {
8253 value = utf8n_to_uvchr((U8*)RExC_parse,
8254 RExC_end - RExC_parse,
8255 &numlen, UTF8_ALLOW_DEFAULT);
8256 RExC_parse += numlen;
8259 value = UCHARAT(RExC_parse++);
8260 /* Some compilers cannot handle switching on 64-bit integer
8261 * values, therefore value cannot be an UV. Yes, this will
8262 * be a problem later if we want switch on Unicode.
8263 * A similar issue a little bit later when switching on
8264 * namedclass. --jhi */
8265 switch ((I32)value) {
8266 case 'w': namedclass = ANYOF_ALNUM; break;
8267 case 'W': namedclass = ANYOF_NALNUM; break;
8268 case 's': namedclass = ANYOF_SPACE; break;
8269 case 'S': namedclass = ANYOF_NSPACE; break;
8270 case 'd': namedclass = ANYOF_DIGIT; break;
8271 case 'D': namedclass = ANYOF_NDIGIT; break;
8272 case 'v': namedclass = ANYOF_VERTWS; break;
8273 case 'V': namedclass = ANYOF_NVERTWS; break;
8274 case 'h': namedclass = ANYOF_HORIZWS; break;
8275 case 'H': namedclass = ANYOF_NHORIZWS; break;
8276 case 'N': /* Handle \N{NAME} in class */
8278 /* We only pay attention to the first char of
8279 multichar strings being returned. I kinda wonder
8280 if this makes sense as it does change the behaviour
8281 from earlier versions, OTOH that behaviour was broken
8283 UV v; /* value is register so we cant & it /grrr */
8284 if (reg_namedseq(pRExC_state, &v, NULL)) {
8294 if (RExC_parse >= RExC_end)
8295 vFAIL2("Empty \\%c{}", (U8)value);
8296 if (*RExC_parse == '{') {
8297 const U8 c = (U8)value;
8298 e = strchr(RExC_parse++, '}');
8300 vFAIL2("Missing right brace on \\%c{}", c);
8301 while (isSPACE(UCHARAT(RExC_parse)))
8303 if (e == RExC_parse)
8304 vFAIL2("Empty \\%c{}", c);
8306 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8314 if (UCHARAT(RExC_parse) == '^') {
8317 value = value == 'p' ? 'P' : 'p'; /* toggle */
8318 while (isSPACE(UCHARAT(RExC_parse))) {
8323 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8324 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8327 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8328 namedclass = ANYOF_MAX; /* no official name, but it's named */
8331 case 'n': value = '\n'; break;
8332 case 'r': value = '\r'; break;
8333 case 't': value = '\t'; break;
8334 case 'f': value = '\f'; break;
8335 case 'b': value = '\b'; break;
8336 case 'e': value = ASCII_TO_NATIVE('\033');break;
8337 case 'a': value = ASCII_TO_NATIVE('\007');break;
8339 RExC_parse--; /* function expects to be pointed at the 'o' */
8341 const char* error_msg;
8342 bool valid = grok_bslash_o(RExC_parse,
8347 RExC_parse += numlen;
8352 if (PL_encoding && value < 0x100) {
8353 goto recode_encoding;
8357 if (*RExC_parse == '{') {
8358 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8359 | PERL_SCAN_DISALLOW_PREFIX;
8360 char * const e = strchr(RExC_parse++, '}');
8362 vFAIL("Missing right brace on \\x{}");
8364 numlen = e - RExC_parse;
8365 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8369 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8371 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8372 RExC_parse += numlen;
8374 if (PL_encoding && value < 0x100)
8375 goto recode_encoding;
8378 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8380 case '0': case '1': case '2': case '3': case '4':
8381 case '5': case '6': case '7':
8383 /* Take 1-3 octal digits */
8384 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8386 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8387 RExC_parse += numlen;
8388 if (PL_encoding && value < 0x100)
8389 goto recode_encoding;
8394 SV* enc = PL_encoding;
8395 value = reg_recode((const char)(U8)value, &enc);
8396 if (!enc && SIZE_ONLY)
8397 ckWARNreg(RExC_parse,
8398 "Invalid escape in the specified encoding");
8402 /* Allow \_ to not give an error */
8403 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8404 ckWARN2reg(RExC_parse,
8405 "Unrecognized escape \\%c in character class passed through",
8410 } /* end of \blah */
8416 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8418 if (!SIZE_ONLY && !need_class)
8419 ANYOF_CLASS_ZERO(ret);
8423 /* a bad range like a-\d, a-[:digit:] ? */
8427 RExC_parse >= rangebegin ?
8428 RExC_parse - rangebegin : 0;
8429 ckWARN4reg(RExC_parse,
8430 "False [] range \"%*.*s\"",
8433 if (prevvalue < 256) {
8434 ANYOF_BITMAP_SET(ret, prevvalue);
8435 ANYOF_BITMAP_SET(ret, '-');
8438 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8439 Perl_sv_catpvf(aTHX_ listsv,
8440 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8444 range = 0; /* this was not a true range */
8450 const char *what = NULL;
8453 if (namedclass > OOB_NAMEDCLASS)
8454 optimize_invert = FALSE;
8455 /* Possible truncation here but in some 64-bit environments
8456 * the compiler gets heartburn about switch on 64-bit values.
8457 * A similar issue a little earlier when switching on value.
8459 switch ((I32)namedclass) {
8461 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8462 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8463 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8464 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8465 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8466 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8467 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8468 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8469 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8470 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8471 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8472 /* \s, \w match all unicode if utf8. */
8473 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8474 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8476 /* \s, \w match ascii and locale only */
8477 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8478 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8480 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8481 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8482 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8485 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8488 for (value = 0; value < 128; value++)
8489 ANYOF_BITMAP_SET(ret, value);
8491 for (value = 0; value < 256; value++) {
8493 ANYOF_BITMAP_SET(ret, value);
8502 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8505 for (value = 128; value < 256; value++)
8506 ANYOF_BITMAP_SET(ret, value);
8508 for (value = 0; value < 256; value++) {
8509 if (!isASCII(value))
8510 ANYOF_BITMAP_SET(ret, value);
8519 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8521 /* consecutive digits assumed */
8522 for (value = '0'; value <= '9'; value++)
8523 ANYOF_BITMAP_SET(ret, value);
8526 what = POSIX_CC_UNI_NAME("Digit");
8530 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8532 /* consecutive digits assumed */
8533 for (value = 0; value < '0'; value++)
8534 ANYOF_BITMAP_SET(ret, value);
8535 for (value = '9' + 1; value < 256; value++)
8536 ANYOF_BITMAP_SET(ret, value);
8539 what = POSIX_CC_UNI_NAME("Digit");
8542 /* this is to handle \p and \P */
8545 vFAIL("Invalid [::] class");
8549 /* Strings such as "+utf8::isWord\n" */
8550 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8553 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8556 } /* end of namedclass \blah */
8559 if (prevvalue > (IV)value) /* b-a */ {
8560 const int w = RExC_parse - rangebegin;
8561 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8562 range = 0; /* not a valid range */
8566 prevvalue = value; /* save the beginning of the range */
8567 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8568 RExC_parse[1] != ']') {
8571 /* a bad range like \w-, [:word:]- ? */
8572 if (namedclass > OOB_NAMEDCLASS) {
8573 if (ckWARN(WARN_REGEXP)) {
8575 RExC_parse >= rangebegin ?
8576 RExC_parse - rangebegin : 0;
8578 "False [] range \"%*.*s\"",
8582 ANYOF_BITMAP_SET(ret, '-');
8584 range = 1; /* yeah, it's a range! */
8585 continue; /* but do it the next time */
8589 /* now is the next time */
8590 /*stored += (value - prevvalue + 1);*/
8592 if (prevvalue < 256) {
8593 const IV ceilvalue = value < 256 ? value : 255;
8596 /* In EBCDIC [\x89-\x91] should include
8597 * the \x8e but [i-j] should not. */
8598 if (literal_endpoint == 2 &&
8599 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8600 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8602 if (isLOWER(prevvalue)) {
8603 for (i = prevvalue; i <= ceilvalue; i++)
8604 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8606 ANYOF_BITMAP_SET(ret, i);
8609 for (i = prevvalue; i <= ceilvalue; i++)
8610 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8612 ANYOF_BITMAP_SET(ret, i);
8618 for (i = prevvalue; i <= ceilvalue; i++) {
8619 if (!ANYOF_BITMAP_TEST(ret,i)) {
8621 ANYOF_BITMAP_SET(ret, i);
8625 if (value > 255 || UTF) {
8626 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8627 const UV natvalue = NATIVE_TO_UNI(value);
8628 stored+=2; /* can't optimize this class */
8629 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8630 if (prevnatvalue < natvalue) { /* what about > ? */
8631 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8632 prevnatvalue, natvalue);
8634 else if (prevnatvalue == natvalue) {
8635 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8637 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8639 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8641 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8642 if (RExC_precomp[0] == ':' &&
8643 RExC_precomp[1] == '[' &&
8644 (f == 0xDF || f == 0x92)) {
8645 f = NATIVE_TO_UNI(f);
8648 /* If folding and foldable and a single
8649 * character, insert also the folded version
8650 * to the charclass. */
8652 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8653 if ((RExC_precomp[0] == ':' &&
8654 RExC_precomp[1] == '[' &&
8656 (value == 0xFB05 || value == 0xFB06))) ?
8657 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8658 foldlen == (STRLEN)UNISKIP(f) )
8660 if (foldlen == (STRLEN)UNISKIP(f))
8662 Perl_sv_catpvf(aTHX_ listsv,
8665 /* Any multicharacter foldings
8666 * require the following transform:
8667 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8668 * where E folds into "pq" and F folds
8669 * into "rst", all other characters
8670 * fold to single characters. We save
8671 * away these multicharacter foldings,
8672 * to be later saved as part of the
8673 * additional "s" data. */
8676 if (!unicode_alternate)
8677 unicode_alternate = newAV();
8678 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8680 av_push(unicode_alternate, sv);
8684 /* If folding and the value is one of the Greek
8685 * sigmas insert a few more sigmas to make the
8686 * folding rules of the sigmas to work right.
8687 * Note that not all the possible combinations
8688 * are handled here: some of them are handled
8689 * by the standard folding rules, and some of
8690 * them (literal or EXACTF cases) are handled
8691 * during runtime in regexec.c:S_find_byclass(). */
8692 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8693 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8694 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8695 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8696 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8698 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8699 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8700 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8705 literal_endpoint = 0;
8709 range = 0; /* this range (if it was one) is done now */
8713 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8715 RExC_size += ANYOF_CLASS_ADD_SKIP;
8717 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8723 /****** !SIZE_ONLY AFTER HERE *********/
8725 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8726 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8728 /* optimize single char class to an EXACT node
8729 but *only* when its not a UTF/high char */
8730 const char * cur_parse= RExC_parse;
8731 RExC_emit = (regnode *)orig_emit;
8732 RExC_parse = (char *)orig_parse;
8733 ret = reg_node(pRExC_state,
8734 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8735 RExC_parse = (char *)cur_parse;
8736 *STRING(ret)= (char)value;
8738 RExC_emit += STR_SZ(1);
8739 SvREFCNT_dec(listsv);
8742 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8743 if ( /* If the only flag is folding (plus possibly inversion). */
8744 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8746 for (value = 0; value < 256; ++value) {
8747 if (ANYOF_BITMAP_TEST(ret, value)) {
8748 UV fold = PL_fold[value];
8751 ANYOF_BITMAP_SET(ret, fold);
8754 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8757 /* optimize inverted simple patterns (e.g. [^a-z]) */
8758 if (optimize_invert &&
8759 /* If the only flag is inversion. */
8760 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8761 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8762 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8763 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8766 AV * const av = newAV();
8768 /* The 0th element stores the character class description
8769 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8770 * to initialize the appropriate swash (which gets stored in
8771 * the 1st element), and also useful for dumping the regnode.
8772 * The 2nd element stores the multicharacter foldings,
8773 * used later (regexec.c:S_reginclass()). */
8774 av_store(av, 0, listsv);
8775 av_store(av, 1, NULL);
8776 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8777 rv = newRV_noinc(MUTABLE_SV(av));
8778 n = add_data(pRExC_state, 1, "s");
8779 RExC_rxi->data->data[n] = (void*)rv;
8787 /* reg_skipcomment()
8789 Absorbs an /x style # comments from the input stream.
8790 Returns true if there is more text remaining in the stream.
8791 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8792 terminates the pattern without including a newline.
8794 Note its the callers responsibility to ensure that we are
8800 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8804 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8806 while (RExC_parse < RExC_end)
8807 if (*RExC_parse++ == '\n') {
8812 /* we ran off the end of the pattern without ending
8813 the comment, so we have to add an \n when wrapping */
8814 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8822 Advance that parse position, and optionall absorbs
8823 "whitespace" from the inputstream.
8825 Without /x "whitespace" means (?#...) style comments only,
8826 with /x this means (?#...) and # comments and whitespace proper.
8828 Returns the RExC_parse point from BEFORE the scan occurs.
8830 This is the /x friendly way of saying RExC_parse++.
8834 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8836 char* const retval = RExC_parse++;
8838 PERL_ARGS_ASSERT_NEXTCHAR;
8841 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8842 RExC_parse[2] == '#') {
8843 while (*RExC_parse != ')') {
8844 if (RExC_parse == RExC_end)
8845 FAIL("Sequence (?#... not terminated");
8851 if (RExC_flags & RXf_PMf_EXTENDED) {
8852 if (isSPACE(*RExC_parse)) {
8856 else if (*RExC_parse == '#') {
8857 if ( reg_skipcomment( pRExC_state ) )
8866 - reg_node - emit a node
8868 STATIC regnode * /* Location. */
8869 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8872 register regnode *ptr;
8873 regnode * const ret = RExC_emit;
8874 GET_RE_DEBUG_FLAGS_DECL;
8876 PERL_ARGS_ASSERT_REG_NODE;
8879 SIZE_ALIGN(RExC_size);
8883 if (RExC_emit >= RExC_emit_bound)
8884 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8886 NODE_ALIGN_FILL(ret);
8888 FILL_ADVANCE_NODE(ptr, op);
8889 #ifdef RE_TRACK_PATTERN_OFFSETS
8890 if (RExC_offsets) { /* MJD */
8891 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8892 "reg_node", __LINE__,
8894 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8895 ? "Overwriting end of array!\n" : "OK",
8896 (UV)(RExC_emit - RExC_emit_start),
8897 (UV)(RExC_parse - RExC_start),
8898 (UV)RExC_offsets[0]));
8899 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8907 - reganode - emit a node with an argument
8909 STATIC regnode * /* Location. */
8910 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8913 register regnode *ptr;
8914 regnode * const ret = RExC_emit;
8915 GET_RE_DEBUG_FLAGS_DECL;
8917 PERL_ARGS_ASSERT_REGANODE;
8920 SIZE_ALIGN(RExC_size);
8925 assert(2==regarglen[op]+1);
8927 Anything larger than this has to allocate the extra amount.
8928 If we changed this to be:
8930 RExC_size += (1 + regarglen[op]);
8932 then it wouldn't matter. Its not clear what side effect
8933 might come from that so its not done so far.
8938 if (RExC_emit >= RExC_emit_bound)
8939 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8941 NODE_ALIGN_FILL(ret);
8943 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8944 #ifdef RE_TRACK_PATTERN_OFFSETS
8945 if (RExC_offsets) { /* MJD */
8946 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8950 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8951 "Overwriting end of array!\n" : "OK",
8952 (UV)(RExC_emit - RExC_emit_start),
8953 (UV)(RExC_parse - RExC_start),
8954 (UV)RExC_offsets[0]));
8955 Set_Cur_Node_Offset;
8963 - reguni - emit (if appropriate) a Unicode character
8966 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8970 PERL_ARGS_ASSERT_REGUNI;
8972 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8976 - reginsert - insert an operator in front of already-emitted operand
8978 * Means relocating the operand.
8981 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8984 register regnode *src;
8985 register regnode *dst;
8986 register regnode *place;
8987 const int offset = regarglen[(U8)op];
8988 const int size = NODE_STEP_REGNODE + offset;
8989 GET_RE_DEBUG_FLAGS_DECL;
8991 PERL_ARGS_ASSERT_REGINSERT;
8992 PERL_UNUSED_ARG(depth);
8993 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8994 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9003 if (RExC_open_parens) {
9005 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9006 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9007 if ( RExC_open_parens[paren] >= opnd ) {
9008 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9009 RExC_open_parens[paren] += size;
9011 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9013 if ( RExC_close_parens[paren] >= opnd ) {
9014 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9015 RExC_close_parens[paren] += size;
9017 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9022 while (src > opnd) {
9023 StructCopy(--src, --dst, regnode);
9024 #ifdef RE_TRACK_PATTERN_OFFSETS
9025 if (RExC_offsets) { /* MJD 20010112 */
9026 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9030 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9031 ? "Overwriting end of array!\n" : "OK",
9032 (UV)(src - RExC_emit_start),
9033 (UV)(dst - RExC_emit_start),
9034 (UV)RExC_offsets[0]));
9035 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9036 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9042 place = opnd; /* Op node, where operand used to be. */
9043 #ifdef RE_TRACK_PATTERN_OFFSETS
9044 if (RExC_offsets) { /* MJD */
9045 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9049 (UV)(place - RExC_emit_start) > RExC_offsets[0]
9050 ? "Overwriting end of array!\n" : "OK",
9051 (UV)(place - RExC_emit_start),
9052 (UV)(RExC_parse - RExC_start),
9053 (UV)RExC_offsets[0]));
9054 Set_Node_Offset(place, RExC_parse);
9055 Set_Node_Length(place, 1);
9058 src = NEXTOPER(place);
9059 FILL_ADVANCE_NODE(place, op);
9060 Zero(src, offset, regnode);
9064 - regtail - set the next-pointer at the end of a node chain of p to val.
9065 - SEE ALSO: regtail_study
9067 /* TODO: All three parms should be const */
9069 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9072 register regnode *scan;
9073 GET_RE_DEBUG_FLAGS_DECL;
9075 PERL_ARGS_ASSERT_REGTAIL;
9077 PERL_UNUSED_ARG(depth);
9083 /* Find last node. */
9086 regnode * const temp = regnext(scan);
9088 SV * const mysv=sv_newmortal();
9089 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9090 regprop(RExC_rx, mysv, scan);
9091 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9092 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9093 (temp == NULL ? "->" : ""),
9094 (temp == NULL ? PL_reg_name[OP(val)] : "")
9102 if (reg_off_by_arg[OP(scan)]) {
9103 ARG_SET(scan, val - scan);
9106 NEXT_OFF(scan) = val - scan;
9112 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9113 - Look for optimizable sequences at the same time.
9114 - currently only looks for EXACT chains.
9116 This is expermental code. The idea is to use this routine to perform
9117 in place optimizations on branches and groups as they are constructed,
9118 with the long term intention of removing optimization from study_chunk so
9119 that it is purely analytical.
9121 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9122 to control which is which.
9125 /* TODO: All four parms should be const */
9128 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9131 register regnode *scan;
9133 #ifdef EXPERIMENTAL_INPLACESCAN
9136 GET_RE_DEBUG_FLAGS_DECL;
9138 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9144 /* Find last node. */
9148 regnode * const temp = regnext(scan);
9149 #ifdef EXPERIMENTAL_INPLACESCAN
9150 if (PL_regkind[OP(scan)] == EXACT)
9151 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9159 if( exact == PSEUDO )
9161 else if ( exact != OP(scan) )
9170 SV * const mysv=sv_newmortal();
9171 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9172 regprop(RExC_rx, mysv, scan);
9173 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9174 SvPV_nolen_const(mysv),
9176 PL_reg_name[exact]);
9183 SV * const mysv_val=sv_newmortal();
9184 DEBUG_PARSE_MSG("");
9185 regprop(RExC_rx, mysv_val, val);
9186 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9187 SvPV_nolen_const(mysv_val),
9188 (IV)REG_NODE_NUM(val),
9192 if (reg_off_by_arg[OP(scan)]) {
9193 ARG_SET(scan, val - scan);
9196 NEXT_OFF(scan) = val - scan;
9204 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9208 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9213 for (bit=0; bit<32; bit++) {
9214 if (flags & (1<<bit)) {
9216 PerlIO_printf(Perl_debug_log, "%s",lead);
9217 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9222 PerlIO_printf(Perl_debug_log, "\n");
9224 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9230 Perl_regdump(pTHX_ const regexp *r)
9234 SV * const sv = sv_newmortal();
9235 SV *dsv= sv_newmortal();
9237 GET_RE_DEBUG_FLAGS_DECL;
9239 PERL_ARGS_ASSERT_REGDUMP;
9241 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9243 /* Header fields of interest. */
9244 if (r->anchored_substr) {
9245 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9246 RE_SV_DUMPLEN(r->anchored_substr), 30);
9247 PerlIO_printf(Perl_debug_log,
9248 "anchored %s%s at %"IVdf" ",
9249 s, RE_SV_TAIL(r->anchored_substr),
9250 (IV)r->anchored_offset);
9251 } else if (r->anchored_utf8) {
9252 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9253 RE_SV_DUMPLEN(r->anchored_utf8), 30);
9254 PerlIO_printf(Perl_debug_log,
9255 "anchored utf8 %s%s at %"IVdf" ",
9256 s, RE_SV_TAIL(r->anchored_utf8),
9257 (IV)r->anchored_offset);
9259 if (r->float_substr) {
9260 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9261 RE_SV_DUMPLEN(r->float_substr), 30);
9262 PerlIO_printf(Perl_debug_log,
9263 "floating %s%s at %"IVdf"..%"UVuf" ",
9264 s, RE_SV_TAIL(r->float_substr),
9265 (IV)r->float_min_offset, (UV)r->float_max_offset);
9266 } else if (r->float_utf8) {
9267 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9268 RE_SV_DUMPLEN(r->float_utf8), 30);
9269 PerlIO_printf(Perl_debug_log,
9270 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9271 s, RE_SV_TAIL(r->float_utf8),
9272 (IV)r->float_min_offset, (UV)r->float_max_offset);
9274 if (r->check_substr || r->check_utf8)
9275 PerlIO_printf(Perl_debug_log,
9277 (r->check_substr == r->float_substr
9278 && r->check_utf8 == r->float_utf8
9279 ? "(checking floating" : "(checking anchored"));
9280 if (r->extflags & RXf_NOSCAN)
9281 PerlIO_printf(Perl_debug_log, " noscan");
9282 if (r->extflags & RXf_CHECK_ALL)
9283 PerlIO_printf(Perl_debug_log, " isall");
9284 if (r->check_substr || r->check_utf8)
9285 PerlIO_printf(Perl_debug_log, ") ");
9287 if (ri->regstclass) {
9288 regprop(r, sv, ri->regstclass);
9289 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9291 if (r->extflags & RXf_ANCH) {
9292 PerlIO_printf(Perl_debug_log, "anchored");
9293 if (r->extflags & RXf_ANCH_BOL)
9294 PerlIO_printf(Perl_debug_log, "(BOL)");
9295 if (r->extflags & RXf_ANCH_MBOL)
9296 PerlIO_printf(Perl_debug_log, "(MBOL)");
9297 if (r->extflags & RXf_ANCH_SBOL)
9298 PerlIO_printf(Perl_debug_log, "(SBOL)");
9299 if (r->extflags & RXf_ANCH_GPOS)
9300 PerlIO_printf(Perl_debug_log, "(GPOS)");
9301 PerlIO_putc(Perl_debug_log, ' ');
9303 if (r->extflags & RXf_GPOS_SEEN)
9304 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9305 if (r->intflags & PREGf_SKIP)
9306 PerlIO_printf(Perl_debug_log, "plus ");
9307 if (r->intflags & PREGf_IMPLICIT)
9308 PerlIO_printf(Perl_debug_log, "implicit ");
9309 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9310 if (r->extflags & RXf_EVAL_SEEN)
9311 PerlIO_printf(Perl_debug_log, "with eval ");
9312 PerlIO_printf(Perl_debug_log, "\n");
9313 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9315 PERL_ARGS_ASSERT_REGDUMP;
9316 PERL_UNUSED_CONTEXT;
9318 #endif /* DEBUGGING */
9322 - regprop - printable representation of opcode
9324 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9327 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9328 if (flags & ANYOF_INVERT) \
9329 /*make sure the invert info is in each */ \
9330 sv_catpvs(sv, "^"); \
9336 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9341 RXi_GET_DECL(prog,progi);
9342 GET_RE_DEBUG_FLAGS_DECL;
9344 PERL_ARGS_ASSERT_REGPROP;
9348 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9349 /* It would be nice to FAIL() here, but this may be called from
9350 regexec.c, and it would be hard to supply pRExC_state. */
9351 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9352 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9354 k = PL_regkind[OP(o)];
9358 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9359 * is a crude hack but it may be the best for now since
9360 * we have no flag "this EXACTish node was UTF-8"
9362 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9363 PERL_PV_ESCAPE_UNI_DETECT |
9364 PERL_PV_PRETTY_ELLIPSES |
9365 PERL_PV_PRETTY_LTGT |
9366 PERL_PV_PRETTY_NOCLEAR
9368 } else if (k == TRIE) {
9369 /* print the details of the trie in dumpuntil instead, as
9370 * progi->data isn't available here */
9371 const char op = OP(o);
9372 const U32 n = ARG(o);
9373 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9374 (reg_ac_data *)progi->data->data[n] :
9376 const reg_trie_data * const trie
9377 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9379 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9380 DEBUG_TRIE_COMPILE_r(
9381 Perl_sv_catpvf(aTHX_ sv,
9382 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9383 (UV)trie->startstate,
9384 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9385 (UV)trie->wordcount,
9388 (UV)TRIE_CHARCOUNT(trie),
9389 (UV)trie->uniquecharcount
9392 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9394 int rangestart = -1;
9395 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9397 for (i = 0; i <= 256; i++) {
9398 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9399 if (rangestart == -1)
9401 } else if (rangestart != -1) {
9402 if (i <= rangestart + 3)
9403 for (; rangestart < i; rangestart++)
9404 put_byte(sv, rangestart);
9406 put_byte(sv, rangestart);
9408 put_byte(sv, i - 1);
9416 } else if (k == CURLY) {
9417 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9418 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9419 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9421 else if (k == WHILEM && o->flags) /* Ordinal/of */
9422 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9423 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9424 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9425 if ( RXp_PAREN_NAMES(prog) ) {
9426 if ( k != REF || OP(o) < NREF) {
9427 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9428 SV **name= av_fetch(list, ARG(o), 0 );
9430 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9433 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9434 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9435 I32 *nums=(I32*)SvPVX(sv_dat);
9436 SV **name= av_fetch(list, nums[0], 0 );
9439 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9440 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9441 (n ? "," : ""), (IV)nums[n]);
9443 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9447 } else if (k == GOSUB)
9448 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9449 else if (k == VERB) {
9451 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9452 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9453 } else if (k == LOGICAL)
9454 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9455 else if (k == FOLDCHAR)
9456 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9457 else if (k == ANYOF) {
9458 int i, rangestart = -1;
9459 const U8 flags = ANYOF_FLAGS(o);
9462 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9463 static const char * const anyofs[] = {
9496 if (flags & ANYOF_LOCALE)
9497 sv_catpvs(sv, "{loc}");
9498 if (flags & ANYOF_FOLD)
9499 sv_catpvs(sv, "{i}");
9500 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9501 if (flags & ANYOF_INVERT)
9504 /* output what the standard cp 0-255 bitmap matches */
9505 for (i = 0; i <= 256; i++) {
9506 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9507 if (rangestart == -1)
9509 } else if (rangestart != -1) {
9510 if (i <= rangestart + 3)
9511 for (; rangestart < i; rangestart++)
9512 put_byte(sv, rangestart);
9514 put_byte(sv, rangestart);
9516 put_byte(sv, i - 1);
9523 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9524 /* output any special charclass tests (used mostly under use locale) */
9525 if (o->flags & ANYOF_CLASS)
9526 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9527 if (ANYOF_CLASS_TEST(o,i)) {
9528 sv_catpv(sv, anyofs[i]);
9532 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9534 /* output information about the unicode matching */
9535 if (flags & ANYOF_UNICODE)
9536 sv_catpvs(sv, "{unicode}");
9537 else if (flags & ANYOF_UNICODE_ALL)
9538 sv_catpvs(sv, "{unicode_all}");
9542 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9546 U8 s[UTF8_MAXBYTES_CASE+1];
9548 for (i = 0; i <= 256; i++) { /* just the first 256 */
9549 uvchr_to_utf8(s, i);
9551 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9552 if (rangestart == -1)
9554 } else if (rangestart != -1) {
9555 if (i <= rangestart + 3)
9556 for (; rangestart < i; rangestart++) {
9557 const U8 * const e = uvchr_to_utf8(s,rangestart);
9559 for(p = s; p < e; p++)
9563 const U8 *e = uvchr_to_utf8(s,rangestart);
9565 for (p = s; p < e; p++)
9568 e = uvchr_to_utf8(s, i-1);
9569 for (p = s; p < e; p++)
9576 sv_catpvs(sv, "..."); /* et cetera */
9580 char *s = savesvpv(lv);
9581 char * const origs = s;
9583 while (*s && *s != '\n')
9587 const char * const t = ++s;
9605 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9607 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9608 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9610 PERL_UNUSED_CONTEXT;
9611 PERL_UNUSED_ARG(sv);
9613 PERL_UNUSED_ARG(prog);
9614 #endif /* DEBUGGING */
9618 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9619 { /* Assume that RE_INTUIT is set */
9621 struct regexp *const prog = (struct regexp *)SvANY(r);
9622 GET_RE_DEBUG_FLAGS_DECL;
9624 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9625 PERL_UNUSED_CONTEXT;
9629 const char * const s = SvPV_nolen_const(prog->check_substr
9630 ? prog->check_substr : prog->check_utf8);
9632 if (!PL_colorset) reginitcolors();
9633 PerlIO_printf(Perl_debug_log,
9634 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9636 prog->check_substr ? "" : "utf8 ",
9637 PL_colors[5],PL_colors[0],
9640 (strlen(s) > 60 ? "..." : ""));
9643 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9649 handles refcounting and freeing the perl core regexp structure. When
9650 it is necessary to actually free the structure the first thing it
9651 does is call the 'free' method of the regexp_engine associated to to
9652 the regexp, allowing the handling of the void *pprivate; member
9653 first. (This routine is not overridable by extensions, which is why
9654 the extensions free is called first.)
9656 See regdupe and regdupe_internal if you change anything here.
9658 #ifndef PERL_IN_XSUB_RE
9660 Perl_pregfree(pTHX_ REGEXP *r)
9666 Perl_pregfree2(pTHX_ REGEXP *rx)
9669 struct regexp *const r = (struct regexp *)SvANY(rx);
9670 GET_RE_DEBUG_FLAGS_DECL;
9672 PERL_ARGS_ASSERT_PREGFREE2;
9675 ReREFCNT_dec(r->mother_re);
9677 CALLREGFREE_PVT(rx); /* free the private data */
9678 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9681 SvREFCNT_dec(r->anchored_substr);
9682 SvREFCNT_dec(r->anchored_utf8);
9683 SvREFCNT_dec(r->float_substr);
9684 SvREFCNT_dec(r->float_utf8);
9685 Safefree(r->substrs);
9687 RX_MATCH_COPY_FREE(rx);
9688 #ifdef PERL_OLD_COPY_ON_WRITE
9689 SvREFCNT_dec(r->saved_copy);
9696 This is a hacky workaround to the structural issue of match results
9697 being stored in the regexp structure which is in turn stored in
9698 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9699 could be PL_curpm in multiple contexts, and could require multiple
9700 result sets being associated with the pattern simultaneously, such
9701 as when doing a recursive match with (??{$qr})
9703 The solution is to make a lightweight copy of the regexp structure
9704 when a qr// is returned from the code executed by (??{$qr}) this
9705 lightweight copy doesnt actually own any of its data except for
9706 the starp/end and the actual regexp structure itself.
9712 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9715 struct regexp *const r = (struct regexp *)SvANY(rx);
9716 register const I32 npar = r->nparens+1;
9718 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9721 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9722 ret = (struct regexp *)SvANY(ret_x);
9724 (void)ReREFCNT_inc(rx);
9725 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9726 by pointing directly at the buffer, but flagging that the allocated
9727 space in the copy is zero. As we've just done a struct copy, it's now
9728 a case of zero-ing that, rather than copying the current length. */
9729 SvPV_set(ret_x, RX_WRAPPED(rx));
9730 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9731 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9732 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9733 SvLEN_set(ret_x, 0);
9734 SvSTASH_set(ret_x, NULL);
9735 SvMAGIC_set(ret_x, NULL);
9736 Newx(ret->offs, npar, regexp_paren_pair);
9737 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9739 Newx(ret->substrs, 1, struct reg_substr_data);
9740 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9742 SvREFCNT_inc_void(ret->anchored_substr);
9743 SvREFCNT_inc_void(ret->anchored_utf8);
9744 SvREFCNT_inc_void(ret->float_substr);
9745 SvREFCNT_inc_void(ret->float_utf8);
9747 /* check_substr and check_utf8, if non-NULL, point to either their
9748 anchored or float namesakes, and don't hold a second reference. */
9750 RX_MATCH_COPIED_off(ret_x);
9751 #ifdef PERL_OLD_COPY_ON_WRITE
9752 ret->saved_copy = NULL;
9754 ret->mother_re = rx;
9760 /* regfree_internal()
9762 Free the private data in a regexp. This is overloadable by
9763 extensions. Perl takes care of the regexp structure in pregfree(),
9764 this covers the *pprivate pointer which technically perldoesnt
9765 know about, however of course we have to handle the
9766 regexp_internal structure when no extension is in use.
9768 Note this is called before freeing anything in the regexp
9773 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9776 struct regexp *const r = (struct regexp *)SvANY(rx);
9778 GET_RE_DEBUG_FLAGS_DECL;
9780 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9786 SV *dsv= sv_newmortal();
9787 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9788 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9789 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9790 PL_colors[4],PL_colors[5],s);
9793 #ifdef RE_TRACK_PATTERN_OFFSETS
9795 Safefree(ri->u.offsets); /* 20010421 MJD */
9798 int n = ri->data->count;
9799 PAD* new_comppad = NULL;
9804 /* If you add a ->what type here, update the comment in regcomp.h */
9805 switch (ri->data->what[n]) {
9810 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9813 Safefree(ri->data->data[n]);
9816 new_comppad = MUTABLE_AV(ri->data->data[n]);
9819 if (new_comppad == NULL)
9820 Perl_croak(aTHX_ "panic: pregfree comppad");
9821 PAD_SAVE_LOCAL(old_comppad,
9822 /* Watch out for global destruction's random ordering. */
9823 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9826 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9829 op_free((OP_4tree*)ri->data->data[n]);
9831 PAD_RESTORE_LOCAL(old_comppad);
9832 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9838 { /* Aho Corasick add-on structure for a trie node.
9839 Used in stclass optimization only */
9841 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9843 refcount = --aho->refcount;
9846 PerlMemShared_free(aho->states);
9847 PerlMemShared_free(aho->fail);
9848 /* do this last!!!! */
9849 PerlMemShared_free(ri->data->data[n]);
9850 PerlMemShared_free(ri->regstclass);
9856 /* trie structure. */
9858 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9860 refcount = --trie->refcount;
9863 PerlMemShared_free(trie->charmap);
9864 PerlMemShared_free(trie->states);
9865 PerlMemShared_free(trie->trans);
9867 PerlMemShared_free(trie->bitmap);
9869 PerlMemShared_free(trie->jump);
9870 PerlMemShared_free(trie->wordinfo);
9871 /* do this last!!!! */
9872 PerlMemShared_free(ri->data->data[n]);
9877 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9880 Safefree(ri->data->what);
9887 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9888 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9889 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9892 re_dup - duplicate a regexp.
9894 This routine is expected to clone a given regexp structure. It is only
9895 compiled under USE_ITHREADS.
9897 After all of the core data stored in struct regexp is duplicated
9898 the regexp_engine.dupe method is used to copy any private data
9899 stored in the *pprivate pointer. This allows extensions to handle
9900 any duplication it needs to do.
9902 See pregfree() and regfree_internal() if you change anything here.
9904 #if defined(USE_ITHREADS)
9905 #ifndef PERL_IN_XSUB_RE
9907 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9911 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9912 struct regexp *ret = (struct regexp *)SvANY(dstr);
9914 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9916 npar = r->nparens+1;
9917 Newx(ret->offs, npar, regexp_paren_pair);
9918 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9920 /* no need to copy these */
9921 Newx(ret->swap, npar, regexp_paren_pair);
9925 /* Do it this way to avoid reading from *r after the StructCopy().
9926 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9927 cache, it doesn't matter. */
9928 const bool anchored = r->check_substr
9929 ? r->check_substr == r->anchored_substr
9930 : r->check_utf8 == r->anchored_utf8;
9931 Newx(ret->substrs, 1, struct reg_substr_data);
9932 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9934 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9935 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9936 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9937 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9939 /* check_substr and check_utf8, if non-NULL, point to either their
9940 anchored or float namesakes, and don't hold a second reference. */
9942 if (ret->check_substr) {
9944 assert(r->check_utf8 == r->anchored_utf8);
9945 ret->check_substr = ret->anchored_substr;
9946 ret->check_utf8 = ret->anchored_utf8;
9948 assert(r->check_substr == r->float_substr);
9949 assert(r->check_utf8 == r->float_utf8);
9950 ret->check_substr = ret->float_substr;
9951 ret->check_utf8 = ret->float_utf8;
9953 } else if (ret->check_utf8) {
9955 ret->check_utf8 = ret->anchored_utf8;
9957 ret->check_utf8 = ret->float_utf8;
9962 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9965 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9967 if (RX_MATCH_COPIED(dstr))
9968 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9971 #ifdef PERL_OLD_COPY_ON_WRITE
9972 ret->saved_copy = NULL;
9975 if (ret->mother_re) {
9976 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9977 /* Our storage points directly to our mother regexp, but that's
9978 1: a buffer in a different thread
9979 2: something we no longer hold a reference on
9980 so we need to copy it locally. */
9981 /* Note we need to sue SvCUR() on our mother_re, because it, in
9982 turn, may well be pointing to its own mother_re. */
9983 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9984 SvCUR(ret->mother_re)+1));
9985 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9987 ret->mother_re = NULL;
9991 #endif /* PERL_IN_XSUB_RE */
9996 This is the internal complement to regdupe() which is used to copy
9997 the structure pointed to by the *pprivate pointer in the regexp.
9998 This is the core version of the extension overridable cloning hook.
9999 The regexp structure being duplicated will be copied by perl prior
10000 to this and will be provided as the regexp *r argument, however
10001 with the /old/ structures pprivate pointer value. Thus this routine
10002 may override any copying normally done by perl.
10004 It returns a pointer to the new regexp_internal structure.
10008 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10011 struct regexp *const r = (struct regexp *)SvANY(rx);
10012 regexp_internal *reti;
10014 RXi_GET_DECL(r,ri);
10016 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10018 npar = r->nparens+1;
10021 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10022 Copy(ri->program, reti->program, len+1, regnode);
10025 reti->regstclass = NULL;
10028 struct reg_data *d;
10029 const int count = ri->data->count;
10032 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10033 char, struct reg_data);
10034 Newx(d->what, count, U8);
10037 for (i = 0; i < count; i++) {
10038 d->what[i] = ri->data->what[i];
10039 switch (d->what[i]) {
10040 /* legal options are one of: sSfpontTua
10041 see also regcomp.h and pregfree() */
10042 case 'a': /* actually an AV, but the dup function is identical. */
10045 case 'p': /* actually an AV, but the dup function is identical. */
10046 case 'u': /* actually an HV, but the dup function is identical. */
10047 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10050 /* This is cheating. */
10051 Newx(d->data[i], 1, struct regnode_charclass_class);
10052 StructCopy(ri->data->data[i], d->data[i],
10053 struct regnode_charclass_class);
10054 reti->regstclass = (regnode*)d->data[i];
10057 /* Compiled op trees are readonly and in shared memory,
10058 and can thus be shared without duplication. */
10060 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10064 /* Trie stclasses are readonly and can thus be shared
10065 * without duplication. We free the stclass in pregfree
10066 * when the corresponding reg_ac_data struct is freed.
10068 reti->regstclass= ri->regstclass;
10072 ((reg_trie_data*)ri->data->data[i])->refcount++;
10076 d->data[i] = ri->data->data[i];
10079 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10088 reti->name_list_idx = ri->name_list_idx;
10090 #ifdef RE_TRACK_PATTERN_OFFSETS
10091 if (ri->u.offsets) {
10092 Newx(reti->u.offsets, 2*len+1, U32);
10093 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10096 SetProgLen(reti,len);
10099 return (void*)reti;
10102 #endif /* USE_ITHREADS */
10104 #ifndef PERL_IN_XSUB_RE
10107 - regnext - dig the "next" pointer out of a node
10110 Perl_regnext(pTHX_ register regnode *p)
10113 register I32 offset;
10118 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10119 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10122 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10131 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10134 STRLEN l1 = strlen(pat1);
10135 STRLEN l2 = strlen(pat2);
10138 const char *message;
10140 PERL_ARGS_ASSERT_RE_CROAK2;
10146 Copy(pat1, buf, l1 , char);
10147 Copy(pat2, buf + l1, l2 , char);
10148 buf[l1 + l2] = '\n';
10149 buf[l1 + l2 + 1] = '\0';
10151 /* ANSI variant takes additional second argument */
10152 va_start(args, pat2);
10156 msv = vmess(buf, &args);
10158 message = SvPV_const(msv,l1);
10161 Copy(message, buf, l1 , char);
10162 buf[l1-1] = '\0'; /* Overwrite \n */
10163 Perl_croak(aTHX_ "%s", buf);
10166 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10168 #ifndef PERL_IN_XSUB_RE
10170 Perl_save_re_context(pTHX)
10174 struct re_save_state *state;
10176 SAVEVPTR(PL_curcop);
10177 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10179 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10180 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10181 SSPUSHUV(SAVEt_RE_STATE);
10183 Copy(&PL_reg_state, state, 1, struct re_save_state);
10185 PL_reg_start_tmp = 0;
10186 PL_reg_start_tmpl = 0;
10187 PL_reg_oldsaved = NULL;
10188 PL_reg_oldsavedlen = 0;
10189 PL_reg_maxiter = 0;
10190 PL_reg_leftiter = 0;
10191 PL_reg_poscache = NULL;
10192 PL_reg_poscache_size = 0;
10193 #ifdef PERL_OLD_COPY_ON_WRITE
10197 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10199 const REGEXP * const rx = PM_GETRE(PL_curpm);
10202 for (i = 1; i <= RX_NPARENS(rx); i++) {
10203 char digits[TYPE_CHARS(long)];
10204 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10205 GV *const *const gvp
10206 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10209 GV * const gv = *gvp;
10210 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10220 clear_re(pTHX_ void *r)
10223 ReREFCNT_dec((REGEXP *)r);
10229 S_put_byte(pTHX_ SV *sv, int c)
10231 PERL_ARGS_ASSERT_PUT_BYTE;
10233 /* Our definition of isPRINT() ignores locales, so only bytes that are
10234 not part of UTF-8 are considered printable. I assume that the same
10235 holds for UTF-EBCDIC.
10236 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10237 which Wikipedia says:
10239 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10240 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10241 identical, to the ASCII delete (DEL) or rubout control character.
10242 ) So the old condition can be simplified to !isPRINT(c) */
10244 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10246 const char string = c;
10247 if (c == '-' || c == ']' || c == '\\' || c == '^')
10248 sv_catpvs(sv, "\\");
10249 sv_catpvn(sv, &string, 1);
10254 #define CLEAR_OPTSTART \
10255 if (optstart) STMT_START { \
10256 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10260 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10262 STATIC const regnode *
10263 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10264 const regnode *last, const regnode *plast,
10265 SV* sv, I32 indent, U32 depth)
10268 register U8 op = PSEUDO; /* Arbitrary non-END op. */
10269 register const regnode *next;
10270 const regnode *optstart= NULL;
10272 RXi_GET_DECL(r,ri);
10273 GET_RE_DEBUG_FLAGS_DECL;
10275 PERL_ARGS_ASSERT_DUMPUNTIL;
10277 #ifdef DEBUG_DUMPUNTIL
10278 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10279 last ? last-start : 0,plast ? plast-start : 0);
10282 if (plast && plast < last)
10285 while (PL_regkind[op] != END && (!last || node < last)) {
10286 /* While that wasn't END last time... */
10289 if (op == CLOSE || op == WHILEM)
10291 next = regnext((regnode *)node);
10294 if (OP(node) == OPTIMIZED) {
10295 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10302 regprop(r, sv, node);
10303 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10304 (int)(2*indent + 1), "", SvPVX_const(sv));
10306 if (OP(node) != OPTIMIZED) {
10307 if (next == NULL) /* Next ptr. */
10308 PerlIO_printf(Perl_debug_log, " (0)");
10309 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10310 PerlIO_printf(Perl_debug_log, " (FAIL)");
10312 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10313 (void)PerlIO_putc(Perl_debug_log, '\n');
10317 if (PL_regkind[(U8)op] == BRANCHJ) {
10320 register const regnode *nnode = (OP(next) == LONGJMP
10321 ? regnext((regnode *)next)
10323 if (last && nnode > last)
10325 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10328 else if (PL_regkind[(U8)op] == BRANCH) {
10330 DUMPUNTIL(NEXTOPER(node), next);
10332 else if ( PL_regkind[(U8)op] == TRIE ) {
10333 const regnode *this_trie = node;
10334 const char op = OP(node);
10335 const U32 n = ARG(node);
10336 const reg_ac_data * const ac = op>=AHOCORASICK ?
10337 (reg_ac_data *)ri->data->data[n] :
10339 const reg_trie_data * const trie =
10340 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10342 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10344 const regnode *nextbranch= NULL;
10347 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10348 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10350 PerlIO_printf(Perl_debug_log, "%*s%s ",
10351 (int)(2*(indent+3)), "",
10352 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10353 PL_colors[0], PL_colors[1],
10354 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10355 PERL_PV_PRETTY_ELLIPSES |
10356 PERL_PV_PRETTY_LTGT
10361 U16 dist= trie->jump[word_idx+1];
10362 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10363 (UV)((dist ? this_trie + dist : next) - start));
10366 nextbranch= this_trie + trie->jump[0];
10367 DUMPUNTIL(this_trie + dist, nextbranch);
10369 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10370 nextbranch= regnext((regnode *)nextbranch);
10372 PerlIO_printf(Perl_debug_log, "\n");
10375 if (last && next > last)
10380 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10381 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10382 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10384 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10386 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10388 else if ( op == PLUS || op == STAR) {
10389 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10391 else if (op == ANYOF) {
10392 /* arglen 1 + class block */
10393 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10394 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10395 node = NEXTOPER(node);
10397 else if (PL_regkind[(U8)op] == EXACT) {
10398 /* Literal string, where present. */
10399 node += NODE_SZ_STR(node) - 1;
10400 node = NEXTOPER(node);
10403 node = NEXTOPER(node);
10404 node += regarglen[(U8)op];
10406 if (op == CURLYX || op == OPEN)
10410 #ifdef DEBUG_DUMPUNTIL
10411 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10416 #endif /* DEBUGGING */
10420 * c-indentation-style: bsd
10421 * c-basic-offset: 4
10422 * indent-tabs-mode: t
10425 * ex: set ts=8 sts=4 sw=4 noet: