5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *paren_names; /* Paren names */
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
142 char *starttry; /* -Dr: where regtry was called. */
143 #define RExC_starttry (pRExC_state->starttry)
146 const char *lastparse;
148 AV *paren_name_list; /* idx -> name */
149 #define RExC_lastparse (pRExC_state->lastparse)
150 #define RExC_lastnum (pRExC_state->lastnum)
151 #define RExC_paren_name_list (pRExC_state->paren_name_list)
155 #define RExC_flags (pRExC_state->flags)
156 #define RExC_precomp (pRExC_state->precomp)
157 #define RExC_rx_sv (pRExC_state->rx_sv)
158 #define RExC_rx (pRExC_state->rx)
159 #define RExC_rxi (pRExC_state->rxi)
160 #define RExC_start (pRExC_state->start)
161 #define RExC_end (pRExC_state->end)
162 #define RExC_parse (pRExC_state->parse)
163 #define RExC_whilem_seen (pRExC_state->whilem_seen)
164 #ifdef RE_TRACK_PATTERN_OFFSETS
165 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
167 #define RExC_emit (pRExC_state->emit)
168 #define RExC_emit_start (pRExC_state->emit_start)
169 #define RExC_emit_bound (pRExC_state->emit_bound)
170 #define RExC_naughty (pRExC_state->naughty)
171 #define RExC_sawback (pRExC_state->sawback)
172 #define RExC_seen (pRExC_state->seen)
173 #define RExC_size (pRExC_state->size)
174 #define RExC_npar (pRExC_state->npar)
175 #define RExC_nestroot (pRExC_state->nestroot)
176 #define RExC_extralen (pRExC_state->extralen)
177 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178 #define RExC_seen_evals (pRExC_state->seen_evals)
179 #define RExC_utf8 (pRExC_state->utf8)
180 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
205 #define SPSTART 0x04 /* Starts with * or +. */
206 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
207 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
211 /* whether trie related optimizations are enabled */
212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213 #define TRIE_STUDY_OPT
214 #define FULL_TRIE_STUDY
220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221 #define PBITVAL(paren) (1 << ((paren) & 7))
222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
226 /* If not already in utf8, do a longjmp back to the beginning */
227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228 #define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
232 /* About scan_data_t.
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
248 The strings can be composites, for instance
252 will result in a composite fixed substring 'foo'.
254 For each string some basic information is maintained:
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
307 typedef struct scan_data_t {
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
313 I32 last_end; /* min value, <0 unless valid. */
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
329 struct regnode_charclass_class *start_class;
333 * Forward declarations for pregcomp()'s friends.
336 static const scan_data_t zero_scan_data =
337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
339 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
340 #define SF_BEFORE_SEOL 0x0001
341 #define SF_BEFORE_MEOL 0x0002
342 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
346 # define SF_FIX_SHIFT_EOL (0+2)
347 # define SF_FL_SHIFT_EOL (0+4)
349 # define SF_FIX_SHIFT_EOL (+2)
350 # define SF_FL_SHIFT_EOL (+4)
353 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
356 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
358 #define SF_IS_INF 0x0040
359 #define SF_HAS_PAR 0x0080
360 #define SF_IN_PAR 0x0100
361 #define SF_HAS_EVAL 0x0200
362 #define SCF_DO_SUBSTR 0x0400
363 #define SCF_DO_STCLASS_AND 0x0800
364 #define SCF_DO_STCLASS_OR 0x1000
365 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
366 #define SCF_WHILEM_VISITED_POS 0x2000
368 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
369 #define SCF_SEEN_ACCEPT 0x8000
371 #define UTF cBOOL(RExC_utf8)
372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
376 #define OOB_UNICODE 12345678
377 #define OOB_NAMEDCLASS -1
379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
383 /* length of regex to show in messages that don't mark a position within */
384 #define RegexLengthToShowInErrorMessages 127
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
391 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
392 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
401 #define _FAIL(code) STMT_START { \
402 const char *ellipses = ""; \
403 IV len = RExC_end - RExC_precomp; \
406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
415 #define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
419 #define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
426 #define Simple_vFAIL(m) STMT_START { \
427 const IV offset = RExC_parse - RExC_precomp; \
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
435 #define vFAIL(m) STMT_START { \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
442 * Like Simple_vFAIL(), but accepts two arguments.
444 #define Simple_vFAIL2(m,a1) STMT_START { \
445 const IV offset = RExC_parse - RExC_precomp; \
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
453 #define vFAIL2(m,a1) STMT_START { \
455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
456 Simple_vFAIL2(m, a1); \
461 * Like Simple_vFAIL(), but accepts three arguments.
463 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
464 const IV offset = RExC_parse - RExC_precomp; \
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
472 #define vFAIL3(m,a1,a2) STMT_START { \
474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
475 Simple_vFAIL3(m, a1, a2); \
479 * Like Simple_vFAIL(), but accepts four arguments.
481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
482 const IV offset = RExC_parse - RExC_precomp; \
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define ckWARNreg(loc,m) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define ckWARNregdep(loc,m) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 #define ckWARN2reg(loc, m, a1) STMT_START { \
501 const IV offset = loc - RExC_precomp; \
502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
506 #define vWARN3(loc, m, a1, a2) STMT_START { \
507 const IV offset = loc - RExC_precomp; \
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
518 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
519 const IV offset = loc - RExC_precomp; \
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 /* Allow for side effects in s */
538 #define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
542 /* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
546 * Position is 1 indexed.
548 #ifndef RE_TRACK_PATTERN_OFFSETS
549 #define Set_Node_Offset_To_R(node,byte)
550 #define Set_Node_Offset(node,byte)
551 #define Set_Cur_Node_Offset
552 #define Set_Node_Length_To_R(node,len)
553 #define Set_Node_Length(node,len)
554 #define Set_Node_Cur_Length(node)
555 #define Node_Offset(n)
556 #define Node_Length(n)
557 #define Set_Node_Offset_Length(node,offset,len)
558 #define ProgLen(ri) ri->u.proglen
559 #define SetProgLen(ri,x) ri->u.proglen = x
561 #define ProgLen(ri) ri->u.offsets[0]
562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
563 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
566 __LINE__, (int)(node), (int)(byte))); \
568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
570 RExC_offsets[2*(node)-1] = (byte); \
575 #define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
579 #define Set_Node_Length_To_R(node,len) STMT_START { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
582 __LINE__, (int)(node), (int)(len))); \
584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
586 RExC_offsets[2*(node)] = (len); \
591 #define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594 #define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
597 /* Get offsets and lengths */
598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
601 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608 #define EXPERIMENTAL_INPLACESCAN
609 #endif /*RE_TRACK_PATTERN_OFFSETS*/
611 #define DEBUG_STUDYDATA(str,data,depth) \
612 DEBUG_OPTIMISE_MORE_r(if(data){ \
613 PerlIO_printf(Perl_debug_log, \
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
619 (UV)((data)->flags), \
620 (IV)((data)->whilem_c), \
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
642 PerlIO_printf(Perl_debug_log,"\n"); \
645 static void clear_re(pTHX_ void *r);
647 /* Mark that we cannot extend a found fixed substring at this point.
648 Update the longest found anchored substring and the longest found
649 floating substrings if needed. */
652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
656 GET_RE_DEBUG_FLAGS_DECL;
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
661 SvSetMagicSV(*data->longest, data->last_found);
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
668 data->flags &= ~SF_FIX_BEFORE_EOL;
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
672 else { /* *data->longest == data->longest_float */
673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
674 data->offset_float_max = (l
675 ? data->last_start_max
676 : data->pos_min + data->pos_delta);
677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
678 data->offset_float_max = I32_MAX;
679 if (data->flags & SF_BEFORE_EOL)
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
683 data->flags &= ~SF_FL_BEFORE_EOL;
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
688 SvCUR_set(data->last_found, 0);
690 SV * const sv = data->last_found;
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
698 data->flags &= ~SF_BEFORE_EOL;
699 DEBUG_STUDYDATA("commit: ",data,0);
702 /* Can match anything (initialization) */
704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 PERL_ARGS_ASSERT_CL_ANYTHING;
708 ANYOF_CLASS_ZERO(cl);
709 ANYOF_BITMAP_SETALL(cl);
710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
712 cl->flags |= ANYOF_LOCALE;
715 /* Can match anything (initialization) */
717 S_cl_is_anything(const struct regnode_charclass_class *cl)
721 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723 for (value = 0; value <= ANYOF_MAX; value += 2)
724 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
733 /* Can match anything (initialization) */
735 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
737 PERL_ARGS_ASSERT_CL_INIT;
739 Zero(cl, 1, struct regnode_charclass_class);
741 cl_anything(pRExC_state, cl);
745 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
747 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749 Zero(cl, 1, struct regnode_charclass_class);
751 cl_anything(pRExC_state, cl);
753 cl->flags |= ANYOF_LOCALE;
756 /* 'And' a given class with another one. Can create false positives */
757 /* We assume that cl is not inverted */
759 S_cl_and(struct regnode_charclass_class *cl,
760 const struct regnode_charclass_class *and_with)
762 PERL_ARGS_ASSERT_CL_AND;
764 assert(and_with->type == ANYOF);
765 if (!(and_with->flags & ANYOF_CLASS)
766 && !(cl->flags & ANYOF_CLASS)
767 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
768 && !(and_with->flags & ANYOF_FOLD)
769 && !(cl->flags & ANYOF_FOLD)) {
772 if (and_with->flags & ANYOF_INVERT)
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= ~and_with->bitmap[i];
776 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
777 cl->bitmap[i] &= and_with->bitmap[i];
778 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
779 if (!(and_with->flags & ANYOF_EOS))
780 cl->flags &= ~ANYOF_EOS;
782 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
783 !(and_with->flags & ANYOF_INVERT)) {
784 cl->flags &= ~ANYOF_UNICODE_ALL;
785 cl->flags |= ANYOF_UNICODE;
786 ARG_SET(cl, ARG(and_with));
788 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
789 !(and_with->flags & ANYOF_INVERT))
790 cl->flags &= ~ANYOF_UNICODE_ALL;
791 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
792 !(and_with->flags & ANYOF_INVERT))
793 cl->flags &= ~ANYOF_UNICODE;
796 /* 'OR' a given class with another one. Can create false positives */
797 /* We assume that cl is not inverted */
799 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
801 PERL_ARGS_ASSERT_CL_OR;
803 if (or_with->flags & ANYOF_INVERT) {
805 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
806 * <= (B1 | !B2) | (CL1 | !CL2)
807 * which is wasteful if CL2 is small, but we ignore CL2:
808 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
809 * XXXX Can we handle case-fold? Unclear:
810 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
811 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
813 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(or_with->flags & ANYOF_FOLD)
815 && !(cl->flags & ANYOF_FOLD) ) {
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] |= ~or_with->bitmap[i];
820 } /* XXXX: logic is complicated otherwise */
822 cl_anything(pRExC_state, cl);
825 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
826 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
827 && (!(or_with->flags & ANYOF_FOLD)
828 || (cl->flags & ANYOF_FOLD)) ) {
831 /* OR char bitmap and class bitmap separately */
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] |= or_with->bitmap[i];
834 if (or_with->flags & ANYOF_CLASS) {
835 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
836 cl->classflags[i] |= or_with->classflags[i];
837 cl->flags |= ANYOF_CLASS;
840 else { /* XXXX: logic is complicated, leave it along for a moment. */
841 cl_anything(pRExC_state, cl);
844 if (or_with->flags & ANYOF_EOS)
845 cl->flags |= ANYOF_EOS;
847 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
848 ARG(cl) != ARG(or_with)) {
849 cl->flags |= ANYOF_UNICODE_ALL;
850 cl->flags &= ~ANYOF_UNICODE;
852 if (or_with->flags & ANYOF_UNICODE_ALL) {
853 cl->flags |= ANYOF_UNICODE_ALL;
854 cl->flags &= ~ANYOF_UNICODE;
858 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
859 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
860 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
861 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
866 dump_trie(trie,widecharmap,revcharmap)
867 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
868 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
870 These routines dump out a trie in a somewhat readable format.
871 The _interim_ variants are used for debugging the interim
872 tables that are used to generate the final compressed
873 representation which is what dump_trie expects.
875 Part of the reason for their existance is to provide a form
876 of documentation as to how the different representations function.
881 Dumps the final compressed table form of the trie to Perl_debug_log.
882 Used for debugging make_trie().
886 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
887 AV *revcharmap, U32 depth)
890 SV *sv=sv_newmortal();
891 int colwidth= widecharmap ? 6 : 4;
893 GET_RE_DEBUG_FLAGS_DECL;
895 PERL_ARGS_ASSERT_DUMP_TRIE;
897 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
898 (int)depth * 2 + 2,"",
899 "Match","Base","Ofs" );
901 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
902 SV ** const tmp = av_fetch( revcharmap, state, 0);
904 PerlIO_printf( Perl_debug_log, "%*s",
906 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
907 PL_colors[0], PL_colors[1],
908 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
909 PERL_PV_ESCAPE_FIRSTCHAR
914 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
915 (int)depth * 2 + 2,"");
917 for( state = 0 ; state < trie->uniquecharcount ; state++ )
918 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
919 PerlIO_printf( Perl_debug_log, "\n");
921 for( state = 1 ; state < trie->statecount ; state++ ) {
922 const U32 base = trie->states[ state ].trans.base;
924 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
926 if ( trie->states[ state ].wordnum ) {
927 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
929 PerlIO_printf( Perl_debug_log, "%6s", "" );
932 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
937 while( ( base + ofs < trie->uniquecharcount ) ||
938 ( base + ofs - trie->uniquecharcount < trie->lasttrans
939 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
942 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
944 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
945 if ( ( base + ofs >= trie->uniquecharcount ) &&
946 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
947 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
949 PerlIO_printf( Perl_debug_log, "%*"UVXf,
951 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
953 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
957 PerlIO_printf( Perl_debug_log, "]");
960 PerlIO_printf( Perl_debug_log, "\n" );
962 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
963 for (word=1; word <= trie->wordcount; word++) {
964 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
965 (int)word, (int)(trie->wordinfo[word].prev),
966 (int)(trie->wordinfo[word].len));
968 PerlIO_printf(Perl_debug_log, "\n" );
971 Dumps a fully constructed but uncompressed trie in list form.
972 List tries normally only are used for construction when the number of
973 possible chars (trie->uniquecharcount) is very high.
974 Used for debugging make_trie().
977 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
982 SV *sv=sv_newmortal();
983 int colwidth= widecharmap ? 6 : 4;
984 GET_RE_DEBUG_FLAGS_DECL;
986 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
988 /* print out the table precompression. */
989 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
990 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
991 "------:-----+-----------------\n" );
993 for( state=1 ; state < next_alloc ; state ++ ) {
996 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
997 (int)depth * 2 + 2,"", (UV)state );
998 if ( ! trie->states[ state ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, "%5s| ","");
1001 PerlIO_printf( Perl_debug_log, "W%4x| ",
1002 trie->states[ state ].wordnum
1005 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1006 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1008 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1015 TRIE_LIST_ITEM(state,charid).forid,
1016 (UV)TRIE_LIST_ITEM(state,charid).newstate
1019 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1020 (int)((depth * 2) + 14), "");
1023 PerlIO_printf( Perl_debug_log, "\n");
1028 Dumps a fully constructed but uncompressed trie in table form.
1029 This is the normal DFA style state transition table, with a few
1030 twists to facilitate compression later.
1031 Used for debugging make_trie().
1034 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1035 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1040 SV *sv=sv_newmortal();
1041 int colwidth= widecharmap ? 6 : 4;
1042 GET_RE_DEBUG_FLAGS_DECL;
1044 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1047 print out the table precompression so that we can do a visual check
1048 that they are identical.
1051 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1054 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1056 PerlIO_printf( Perl_debug_log, "%*s",
1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1067 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1069 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1073 PerlIO_printf( Perl_debug_log, "\n" );
1075 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1077 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1078 (int)depth * 2 + 2,"",
1079 (UV)TRIE_NODENUM( state ) );
1081 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1082 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1084 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1086 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1088 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1091 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1092 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1100 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1101 startbranch: the first branch in the whole branch sequence
1102 first : start branch of sequence of branch-exact nodes.
1103 May be the same as startbranch
1104 last : Thing following the last branch.
1105 May be the same as tail.
1106 tail : item following the branch sequence
1107 count : words in the sequence
1108 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1109 depth : indent depth
1111 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1113 A trie is an N'ary tree where the branches are determined by digital
1114 decomposition of the key. IE, at the root node you look up the 1st character and
1115 follow that branch repeat until you find the end of the branches. Nodes can be
1116 marked as "accepting" meaning they represent a complete word. Eg:
1120 would convert into the following structure. Numbers represent states, letters
1121 following numbers represent valid transitions on the letter from that state, if
1122 the number is in square brackets it represents an accepting state, otherwise it
1123 will be in parenthesis.
1125 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1129 (1) +-i->(6)-+-s->[7]
1131 +-s->(3)-+-h->(4)-+-e->[5]
1133 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1135 This shows that when matching against the string 'hers' we will begin at state 1
1136 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1137 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1138 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1139 single traverse. We store a mapping from accepting to state to which word was
1140 matched, and then when we have multiple possibilities we try to complete the
1141 rest of the regex in the order in which they occured in the alternation.
1143 The only prior NFA like behaviour that would be changed by the TRIE support is
1144 the silent ignoring of duplicate alternations which are of the form:
1146 / (DUPE|DUPE) X? (?{ ... }) Y /x
1148 Thus EVAL blocks follwing a trie may be called a different number of times with
1149 and without the optimisation. With the optimisations dupes will be silently
1150 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1151 the following demonstrates:
1153 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1155 which prints out 'word' three times, but
1157 'words'=~/(word|word|word)(?{ print $1 })S/
1159 which doesnt print it out at all. This is due to other optimisations kicking in.
1161 Example of what happens on a structural level:
1163 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1165 1: CURLYM[1] {1,32767}(18)
1176 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1177 and should turn into:
1179 1: CURLYM[1] {1,32767}(18)
1181 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1189 Cases where tail != last would be like /(?foo|bar)baz/:
1199 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1200 and would end up looking like:
1203 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1210 d = uvuni_to_utf8_flags(d, uv, 0);
1212 is the recommended Unicode-aware way of saying
1217 #define TRIE_STORE_REVCHAR \
1220 SV *zlopp = newSV(2); \
1221 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1222 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1223 SvCUR_set(zlopp, kapow - flrbbbbb); \
1226 av_push(revcharmap, zlopp); \
1228 char ooooff = (char)uvc; \
1229 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1233 #define TRIE_READ_CHAR STMT_START { \
1237 if ( foldlen > 0 ) { \
1238 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1243 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1244 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1245 foldlen -= UNISKIP( uvc ); \
1246 scan = foldbuf + UNISKIP( uvc ); \
1249 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1259 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1260 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1261 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1262 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1265 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1266 TRIE_LIST_CUR( state )++; \
1269 #define TRIE_LIST_NEW(state) STMT_START { \
1270 Newxz( trie->states[ state ].trans.list, \
1271 4, reg_trie_trans_le ); \
1272 TRIE_LIST_CUR( state ) = 1; \
1273 TRIE_LIST_LEN( state ) = 4; \
1276 #define TRIE_HANDLE_WORD(state) STMT_START { \
1277 U16 dupe= trie->states[ state ].wordnum; \
1278 regnode * const noper_next = regnext( noper ); \
1281 /* store the word for dumping */ \
1283 if (OP(noper) != NOTHING) \
1284 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1286 tmp = newSVpvn_utf8( "", 0, UTF ); \
1287 av_push( trie_words, tmp ); \
1291 trie->wordinfo[curword].prev = 0; \
1292 trie->wordinfo[curword].len = wordlen; \
1293 trie->wordinfo[curword].accept = state; \
1295 if ( noper_next < tail ) { \
1297 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1298 trie->jump[curword] = (U16)(noper_next - convert); \
1300 jumper = noper_next; \
1302 nextbranch= regnext(cur); \
1306 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1307 /* chain, so that when the bits of chain are later */\
1308 /* linked together, the dups appear in the chain */\
1309 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1310 trie->wordinfo[dupe].prev = curword; \
1312 /* we haven't inserted this word yet. */ \
1313 trie->states[ state ].wordnum = curword; \
1318 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1319 ( ( base + charid >= ucharcount \
1320 && base + charid < ubound \
1321 && state == trie->trans[ base - ucharcount + charid ].check \
1322 && trie->trans[ base - ucharcount + charid ].next ) \
1323 ? trie->trans[ base - ucharcount + charid ].next \
1324 : ( state==1 ? special : 0 ) \
1328 #define MADE_JUMP_TRIE 2
1329 #define MADE_EXACT_TRIE 4
1332 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1335 /* first pass, loop through and scan words */
1336 reg_trie_data *trie;
1337 HV *widecharmap = NULL;
1338 AV *revcharmap = newAV();
1340 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1345 regnode *jumper = NULL;
1346 regnode *nextbranch = NULL;
1347 regnode *convert = NULL;
1348 U32 *prev_states; /* temp array mapping each state to previous one */
1349 /* we just use folder as a flag in utf8 */
1350 const U8 * const folder = ( flags == EXACTF
1352 : ( flags == EXACTFL
1359 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1360 AV *trie_words = NULL;
1361 /* along with revcharmap, this only used during construction but both are
1362 * useful during debugging so we store them in the struct when debugging.
1365 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1366 STRLEN trie_charcount=0;
1368 SV *re_trie_maxbuff;
1369 GET_RE_DEBUG_FLAGS_DECL;
1371 PERL_ARGS_ASSERT_MAKE_TRIE;
1373 PERL_UNUSED_ARG(depth);
1376 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1378 trie->startstate = 1;
1379 trie->wordcount = word_count;
1380 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1381 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1382 if (!(UTF && folder))
1383 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1384 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1385 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1388 trie_words = newAV();
1391 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1392 if (!SvIOK(re_trie_maxbuff)) {
1393 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1396 PerlIO_printf( Perl_debug_log,
1397 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1398 (int)depth * 2 + 2, "",
1399 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1400 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1404 /* Find the node we are going to overwrite */
1405 if ( first == startbranch && OP( last ) != BRANCH ) {
1406 /* whole branch chain */
1409 /* branch sub-chain */
1410 convert = NEXTOPER( first );
1413 /* -- First loop and Setup --
1415 We first traverse the branches and scan each word to determine if it
1416 contains widechars, and how many unique chars there are, this is
1417 important as we have to build a table with at least as many columns as we
1420 We use an array of integers to represent the character codes 0..255
1421 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1422 native representation of the character value as the key and IV's for the
1425 *TODO* If we keep track of how many times each character is used we can
1426 remap the columns so that the table compression later on is more
1427 efficient in terms of memory by ensuring most common value is in the
1428 middle and the least common are on the outside. IMO this would be better
1429 than a most to least common mapping as theres a decent chance the most
1430 common letter will share a node with the least common, meaning the node
1431 will not be compressable. With a middle is most common approach the worst
1432 case is when we have the least common nodes twice.
1436 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1437 regnode * const noper = NEXTOPER( cur );
1438 const U8 *uc = (U8*)STRING( noper );
1439 const U8 * const e = uc + STR_LEN( noper );
1441 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1442 const U8 *scan = (U8*)NULL;
1443 U32 wordlen = 0; /* required init */
1445 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1447 if (OP(noper) == NOTHING) {
1451 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1452 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1453 regardless of encoding */
1455 for ( ; uc < e ; uc += len ) {
1456 TRIE_CHARCOUNT(trie)++;
1460 if ( !trie->charmap[ uvc ] ) {
1461 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1463 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1467 /* store the codepoint in the bitmap, and if its ascii
1468 also store its folded equivelent. */
1469 TRIE_BITMAP_SET(trie,uvc);
1471 /* store the folded codepoint */
1472 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1475 /* store first byte of utf8 representation of
1476 codepoints in the 127 < uvc < 256 range */
1477 if (127 < uvc && uvc < 192) {
1478 TRIE_BITMAP_SET(trie,194);
1479 } else if (191 < uvc ) {
1480 TRIE_BITMAP_SET(trie,195);
1481 /* && uvc < 256 -- we know uvc is < 256 already */
1484 set_bit = 0; /* We've done our bit :-) */
1489 widecharmap = newHV();
1491 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1494 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1496 if ( !SvTRUE( *svpp ) ) {
1497 sv_setiv( *svpp, ++trie->uniquecharcount );
1502 if( cur == first ) {
1505 } else if (chars < trie->minlen) {
1507 } else if (chars > trie->maxlen) {
1511 } /* end first pass */
1512 DEBUG_TRIE_COMPILE_r(
1513 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1514 (int)depth * 2 + 2,"",
1515 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1516 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1517 (int)trie->minlen, (int)trie->maxlen )
1521 We now know what we are dealing with in terms of unique chars and
1522 string sizes so we can calculate how much memory a naive
1523 representation using a flat table will take. If it's over a reasonable
1524 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1525 conservative but potentially much slower representation using an array
1528 At the end we convert both representations into the same compressed
1529 form that will be used in regexec.c for matching with. The latter
1530 is a form that cannot be used to construct with but has memory
1531 properties similar to the list form and access properties similar
1532 to the table form making it both suitable for fast searches and
1533 small enough that its feasable to store for the duration of a program.
1535 See the comment in the code where the compressed table is produced
1536 inplace from the flat tabe representation for an explanation of how
1537 the compression works.
1542 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1545 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1547 Second Pass -- Array Of Lists Representation
1549 Each state will be represented by a list of charid:state records
1550 (reg_trie_trans_le) the first such element holds the CUR and LEN
1551 points of the allocated array. (See defines above).
1553 We build the initial structure using the lists, and then convert
1554 it into the compressed table form which allows faster lookups
1555 (but cant be modified once converted).
1558 STRLEN transcount = 1;
1560 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1561 "%*sCompiling trie using list compiler\n",
1562 (int)depth * 2 + 2, ""));
1564 trie->states = (reg_trie_state *)
1565 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1566 sizeof(reg_trie_state) );
1570 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1572 regnode * const noper = NEXTOPER( cur );
1573 U8 *uc = (U8*)STRING( noper );
1574 const U8 * const e = uc + STR_LEN( noper );
1575 U32 state = 1; /* required init */
1576 U16 charid = 0; /* sanity init */
1577 U8 *scan = (U8*)NULL; /* sanity init */
1578 STRLEN foldlen = 0; /* required init */
1579 U32 wordlen = 0; /* required init */
1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1582 if (OP(noper) != NOTHING) {
1583 for ( ; uc < e ; uc += len ) {
1588 charid = trie->charmap[ uvc ];
1590 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1594 charid=(U16)SvIV( *svpp );
1597 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1604 if ( !trie->states[ state ].trans.list ) {
1605 TRIE_LIST_NEW( state );
1607 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1608 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1609 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1614 newstate = next_alloc++;
1615 prev_states[newstate] = state;
1616 TRIE_LIST_PUSH( state, charid, newstate );
1621 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1625 TRIE_HANDLE_WORD(state);
1627 } /* end second pass */
1629 /* next alloc is the NEXT state to be allocated */
1630 trie->statecount = next_alloc;
1631 trie->states = (reg_trie_state *)
1632 PerlMemShared_realloc( trie->states,
1634 * sizeof(reg_trie_state) );
1636 /* and now dump it out before we compress it */
1637 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1638 revcharmap, next_alloc,
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1650 for( state=1 ; state < next_alloc ; state ++ ) {
1654 DEBUG_TRIE_COMPILE_MORE_r(
1655 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1659 if (trie->states[state].trans.list) {
1660 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1664 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1665 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1666 if ( forid < minid ) {
1668 } else if ( forid > maxid ) {
1672 if ( transcount < tp + maxid - minid + 1) {
1674 trie->trans = (reg_trie_trans *)
1675 PerlMemShared_realloc( trie->trans,
1677 * sizeof(reg_trie_trans) );
1678 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1680 base = trie->uniquecharcount + tp - minid;
1681 if ( maxid == minid ) {
1683 for ( ; zp < tp ; zp++ ) {
1684 if ( ! trie->trans[ zp ].next ) {
1685 base = trie->uniquecharcount + zp - minid;
1686 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1687 trie->trans[ zp ].check = state;
1693 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ tp ].check = state;
1699 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1700 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1701 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1702 trie->trans[ tid ].check = state;
1704 tp += ( maxid - minid + 1 );
1706 Safefree(trie->states[ state ].trans.list);
1709 DEBUG_TRIE_COMPILE_MORE_r(
1710 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1713 trie->states[ state ].trans.base=base;
1715 trie->lasttrans = tp + 1;
1719 Second Pass -- Flat Table Representation.
1721 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1722 We know that we will need Charcount+1 trans at most to store the data
1723 (one row per char at worst case) So we preallocate both structures
1724 assuming worst case.
1726 We then construct the trie using only the .next slots of the entry
1729 We use the .check field of the first entry of the node temporarily to
1730 make compression both faster and easier by keeping track of how many non
1731 zero fields are in the node.
1733 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1736 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1737 number representing the first entry of the node, and state as a
1738 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1739 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1740 are 2 entrys per node. eg:
1748 The table is internally in the right hand, idx form. However as we also
1749 have to deal with the states array which is indexed by nodenum we have to
1750 use TRIE_NODENUM() to convert.
1753 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1754 "%*sCompiling trie using table compiler\n",
1755 (int)depth * 2 + 2, ""));
1757 trie->trans = (reg_trie_trans *)
1758 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1759 * trie->uniquecharcount + 1,
1760 sizeof(reg_trie_trans) );
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
1764 next_alloc = trie->uniquecharcount + 1;
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1769 regnode * const noper = NEXTOPER( cur );
1770 const U8 *uc = (U8*)STRING( noper );
1771 const U8 * const e = uc + STR_LEN( noper );
1773 U32 state = 1; /* required init */
1775 U16 charid = 0; /* sanity init */
1776 U32 accept_state = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1779 STRLEN foldlen = 0; /* required init */
1780 U32 wordlen = 0; /* required init */
1781 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1783 if ( OP(noper) != NOTHING ) {
1784 for ( ; uc < e ; uc += len ) {
1789 charid = trie->charmap[ uvc ];
1791 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1792 charid = svpp ? (U16)SvIV(*svpp) : 0;
1796 if ( !trie->trans[ state + charid ].next ) {
1797 trie->trans[ state + charid ].next = next_alloc;
1798 trie->trans[ state ].check++;
1799 prev_states[TRIE_NODENUM(next_alloc)]
1800 = TRIE_NODENUM(state);
1801 next_alloc += trie->uniquecharcount;
1803 state = trie->trans[ state + charid ].next;
1805 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1810 accept_state = TRIE_NODENUM( state );
1811 TRIE_HANDLE_WORD(accept_state);
1813 } /* end second pass */
1815 /* and now dump it out before we compress it */
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1818 next_alloc, depth+1));
1822 * Inplace compress the table.*
1824 For sparse data sets the table constructed by the trie algorithm will
1825 be mostly 0/FAIL transitions or to put it another way mostly empty.
1826 (Note that leaf nodes will not contain any transitions.)
1828 This algorithm compresses the tables by eliminating most such
1829 transitions, at the cost of a modest bit of extra work during lookup:
1831 - Each states[] entry contains a .base field which indicates the
1832 index in the state[] array wheres its transition data is stored.
1834 - If .base is 0 there are no valid transitions from that node.
1836 - If .base is nonzero then charid is added to it to find an entry in
1839 -If trans[states[state].base+charid].check!=state then the
1840 transition is taken to be a 0/Fail transition. Thus if there are fail
1841 transitions at the front of the node then the .base offset will point
1842 somewhere inside the previous nodes data (or maybe even into a node
1843 even earlier), but the .check field determines if the transition is
1847 The following process inplace converts the table to the compressed
1848 table: We first do not compress the root node 1,and mark its all its
1849 .check pointers as 1 and set its .base pointer as 1 as well. This
1850 allows to do a DFA construction from the compressed table later, and
1851 ensures that any .base pointers we calculate later are greater than
1854 - We set 'pos' to indicate the first entry of the second node.
1856 - We then iterate over the columns of the node, finding the first and
1857 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1858 and set the .check pointers accordingly, and advance pos
1859 appropriately and repreat for the next node. Note that when we copy
1860 the next pointers we have to convert them from the original
1861 NODEIDX form to NODENUM form as the former is not valid post
1864 - If a node has no transitions used we mark its base as 0 and do not
1865 advance the pos pointer.
1867 - If a node only has one transition we use a second pointer into the
1868 structure to fill in allocated fail transitions from other states.
1869 This pointer is independent of the main pointer and scans forward
1870 looking for null transitions that are allocated to a state. When it
1871 finds one it writes the single transition into the "hole". If the
1872 pointer doesnt find one the single transition is appended as normal.
1874 - Once compressed we can Renew/realloc the structures to release the
1877 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1878 specifically Fig 3.47 and the associated pseudocode.
1882 const U32 laststate = TRIE_NODENUM( next_alloc );
1885 trie->statecount = laststate;
1887 for ( state = 1 ; state < laststate ; state++ ) {
1889 const U32 stateidx = TRIE_NODEIDX( state );
1890 const U32 o_used = trie->trans[ stateidx ].check;
1891 U32 used = trie->trans[ stateidx ].check;
1892 trie->trans[ stateidx ].check = 0;
1894 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1895 if ( flag || trie->trans[ stateidx + charid ].next ) {
1896 if ( trie->trans[ stateidx + charid ].next ) {
1898 for ( ; zp < pos ; zp++ ) {
1899 if ( ! trie->trans[ zp ].next ) {
1903 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1904 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ zp ].check = state;
1906 if ( ++zp > pos ) pos = zp;
1913 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1915 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ pos ].check = state;
1921 trie->lasttrans = pos + 1;
1922 trie->states = (reg_trie_state *)
1923 PerlMemShared_realloc( trie->states, laststate
1924 * sizeof(reg_trie_state) );
1925 DEBUG_TRIE_COMPILE_MORE_r(
1926 PerlIO_printf( Perl_debug_log,
1927 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1928 (int)depth * 2 + 2,"",
1929 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1932 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1935 } /* end table compress */
1937 DEBUG_TRIE_COMPILE_MORE_r(
1938 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1939 (int)depth * 2 + 2, "",
1940 (UV)trie->statecount,
1941 (UV)trie->lasttrans)
1943 /* resize the trans array to remove unused space */
1944 trie->trans = (reg_trie_trans *)
1945 PerlMemShared_realloc( trie->trans, trie->lasttrans
1946 * sizeof(reg_trie_trans) );
1948 { /* Modify the program and insert the new TRIE node*/
1949 U8 nodetype =(U8)(flags & 0xFF);
1953 regnode *optimize = NULL;
1954 #ifdef RE_TRACK_PATTERN_OFFSETS
1957 U32 mjd_nodelen = 0;
1958 #endif /* RE_TRACK_PATTERN_OFFSETS */
1959 #endif /* DEBUGGING */
1961 This means we convert either the first branch or the first Exact,
1962 depending on whether the thing following (in 'last') is a branch
1963 or not and whther first is the startbranch (ie is it a sub part of
1964 the alternation or is it the whole thing.)
1965 Assuming its a sub part we conver the EXACT otherwise we convert
1966 the whole branch sequence, including the first.
1968 /* Find the node we are going to overwrite */
1969 if ( first != startbranch || OP( last ) == BRANCH ) {
1970 /* branch sub-chain */
1971 NEXT_OFF( first ) = (U16)(last - first);
1972 #ifdef RE_TRACK_PATTERN_OFFSETS
1974 mjd_offset= Node_Offset((convert));
1975 mjd_nodelen= Node_Length((convert));
1978 /* whole branch chain */
1980 #ifdef RE_TRACK_PATTERN_OFFSETS
1983 const regnode *nop = NEXTOPER( convert );
1984 mjd_offset= Node_Offset((nop));
1985 mjd_nodelen= Node_Length((nop));
1989 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1990 (int)depth * 2 + 2, "",
1991 (UV)mjd_offset, (UV)mjd_nodelen)
1994 /* But first we check to see if there is a common prefix we can
1995 split out as an EXACT and put in front of the TRIE node. */
1996 trie->startstate= 1;
1997 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1999 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2003 const U32 base = trie->states[ state ].trans.base;
2005 if ( trie->states[state].wordnum )
2008 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2009 if ( ( base + ofs >= trie->uniquecharcount ) &&
2010 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2011 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2013 if ( ++count > 1 ) {
2014 SV **tmp = av_fetch( revcharmap, ofs, 0);
2015 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2016 if ( state == 1 ) break;
2018 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2020 PerlIO_printf(Perl_debug_log,
2021 "%*sNew Start State=%"UVuf" Class: [",
2022 (int)depth * 2 + 2, "",
2025 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2026 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2028 TRIE_BITMAP_SET(trie,*ch);
2030 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2032 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2036 TRIE_BITMAP_SET(trie,*ch);
2038 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2045 SV **tmp = av_fetch( revcharmap, idx, 0);
2047 char *ch = SvPV( *tmp, len );
2049 SV *sv=sv_newmortal();
2050 PerlIO_printf( Perl_debug_log,
2051 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2052 (int)depth * 2 + 2, "",
2054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2055 PL_colors[0], PL_colors[1],
2056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2057 PERL_PV_ESCAPE_FIRSTCHAR
2062 OP( convert ) = nodetype;
2063 str=STRING(convert);
2066 STR_LEN(convert) += len;
2072 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2077 trie->prefixlen = (state-1);
2079 regnode *n = convert+NODE_SZ_STR(convert);
2080 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2081 trie->startstate = state;
2082 trie->minlen -= (state - 1);
2083 trie->maxlen -= (state - 1);
2085 /* At least the UNICOS C compiler choked on this
2086 * being argument to DEBUG_r(), so let's just have
2089 #ifdef PERL_EXT_RE_BUILD
2095 regnode *fix = convert;
2096 U32 word = trie->wordcount;
2098 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2099 while( ++fix < n ) {
2100 Set_Node_Offset_Length(fix, 0, 0);
2103 SV ** const tmp = av_fetch( trie_words, word, 0 );
2105 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2106 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2108 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2116 NEXT_OFF(convert) = (U16)(tail - convert);
2117 DEBUG_r(optimize= n);
2123 if ( trie->maxlen ) {
2124 NEXT_OFF( convert ) = (U16)(tail - convert);
2125 ARG_SET( convert, data_slot );
2126 /* Store the offset to the first unabsorbed branch in
2127 jump[0], which is otherwise unused by the jump logic.
2128 We use this when dumping a trie and during optimisation. */
2130 trie->jump[0] = (U16)(nextbranch - convert);
2133 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2134 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2136 OP( convert ) = TRIEC;
2137 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2138 PerlMemShared_free(trie->bitmap);
2141 OP( convert ) = TRIE;
2143 /* store the type in the flags */
2144 convert->flags = nodetype;
2148 + regarglen[ OP( convert ) ];
2150 /* XXX We really should free up the resource in trie now,
2151 as we won't use them - (which resources?) dmq */
2153 /* needed for dumping*/
2154 DEBUG_r(if (optimize) {
2155 regnode *opt = convert;
2157 while ( ++opt < optimize) {
2158 Set_Node_Offset_Length(opt,0,0);
2161 Try to clean up some of the debris left after the
2164 while( optimize < jumper ) {
2165 mjd_nodelen += Node_Length((optimize));
2166 OP( optimize ) = OPTIMIZED;
2167 Set_Node_Offset_Length(optimize,0,0);
2170 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2172 } /* end node insert */
2174 /* Finish populating the prev field of the wordinfo array. Walk back
2175 * from each accept state until we find another accept state, and if
2176 * so, point the first word's .prev field at the second word. If the
2177 * second already has a .prev field set, stop now. This will be the
2178 * case either if we've already processed that word's accept state,
2179 * or that that state had multiple words, and the overspill words
2180 * were already linked up earlier.
2187 for (word=1; word <= trie->wordcount; word++) {
2189 if (trie->wordinfo[word].prev)
2191 state = trie->wordinfo[word].accept;
2193 state = prev_states[state];
2196 prev = trie->states[state].wordnum;
2200 trie->wordinfo[word].prev = prev;
2202 Safefree(prev_states);
2206 /* and now dump out the compressed format */
2207 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2209 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2211 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2212 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2214 SvREFCNT_dec(revcharmap);
2218 : trie->startstate>1
2224 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2226 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2228 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2229 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2232 We find the fail state for each state in the trie, this state is the longest proper
2233 suffix of the current states 'word' that is also a proper prefix of another word in our
2234 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2235 the DFA not to have to restart after its tried and failed a word at a given point, it
2236 simply continues as though it had been matching the other word in the first place.
2238 'abcdgu'=~/abcdefg|cdgu/
2239 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2240 fail, which would bring use to the state representing 'd' in the second word where we would
2241 try 'g' and succeed, prodceding to match 'cdgu'.
2243 /* add a fail transition */
2244 const U32 trie_offset = ARG(source);
2245 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2247 const U32 ucharcount = trie->uniquecharcount;
2248 const U32 numstates = trie->statecount;
2249 const U32 ubound = trie->lasttrans + ucharcount;
2253 U32 base = trie->states[ 1 ].trans.base;
2256 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2257 GET_RE_DEBUG_FLAGS_DECL;
2259 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2261 PERL_UNUSED_ARG(depth);
2265 ARG_SET( stclass, data_slot );
2266 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2267 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2268 aho->trie=trie_offset;
2269 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2270 Copy( trie->states, aho->states, numstates, reg_trie_state );
2271 Newxz( q, numstates, U32);
2272 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2275 /* initialize fail[0..1] to be 1 so that we always have
2276 a valid final fail state */
2277 fail[ 0 ] = fail[ 1 ] = 1;
2279 for ( charid = 0; charid < ucharcount ; charid++ ) {
2280 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2282 q[ q_write ] = newstate;
2283 /* set to point at the root */
2284 fail[ q[ q_write++ ] ]=1;
2287 while ( q_read < q_write) {
2288 const U32 cur = q[ q_read++ % numstates ];
2289 base = trie->states[ cur ].trans.base;
2291 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2292 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2294 U32 fail_state = cur;
2297 fail_state = fail[ fail_state ];
2298 fail_base = aho->states[ fail_state ].trans.base;
2299 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2301 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2302 fail[ ch_state ] = fail_state;
2303 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2305 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2307 q[ q_write++ % numstates] = ch_state;
2311 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2312 when we fail in state 1, this allows us to use the
2313 charclass scan to find a valid start char. This is based on the principle
2314 that theres a good chance the string being searched contains lots of stuff
2315 that cant be a start char.
2317 fail[ 0 ] = fail[ 1 ] = 0;
2318 DEBUG_TRIE_COMPILE_r({
2319 PerlIO_printf(Perl_debug_log,
2320 "%*sStclass Failtable (%"UVuf" states): 0",
2321 (int)(depth * 2), "", (UV)numstates
2323 for( q_read=1; q_read<numstates; q_read++ ) {
2324 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2326 PerlIO_printf(Perl_debug_log, "\n");
2329 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2334 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2335 * These need to be revisited when a newer toolchain becomes available.
2337 #if defined(__sparc64__) && defined(__GNUC__)
2338 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2339 # undef SPARC64_GCC_WORKAROUND
2340 # define SPARC64_GCC_WORKAROUND 1
2344 #define DEBUG_PEEP(str,scan,depth) \
2345 DEBUG_OPTIMISE_r({if (scan){ \
2346 SV * const mysv=sv_newmortal(); \
2347 regnode *Next = regnext(scan); \
2348 regprop(RExC_rx, mysv, scan); \
2349 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2350 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2351 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2358 #define JOIN_EXACT(scan,min,flags) \
2359 if (PL_regkind[OP(scan)] == EXACT) \
2360 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2363 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2364 /* Merge several consecutive EXACTish nodes into one. */
2365 regnode *n = regnext(scan);
2367 regnode *next = scan + NODE_SZ_STR(scan);
2371 regnode *stop = scan;
2372 GET_RE_DEBUG_FLAGS_DECL;
2374 PERL_UNUSED_ARG(depth);
2377 PERL_ARGS_ASSERT_JOIN_EXACT;
2378 #ifndef EXPERIMENTAL_INPLACESCAN
2379 PERL_UNUSED_ARG(flags);
2380 PERL_UNUSED_ARG(val);
2382 DEBUG_PEEP("join",scan,depth);
2384 /* Skip NOTHING, merge EXACT*. */
2386 ( PL_regkind[OP(n)] == NOTHING ||
2387 (stringok && (OP(n) == OP(scan))))
2389 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2391 if (OP(n) == TAIL || n > next)
2393 if (PL_regkind[OP(n)] == NOTHING) {
2394 DEBUG_PEEP("skip:",n,depth);
2395 NEXT_OFF(scan) += NEXT_OFF(n);
2396 next = n + NODE_STEP_REGNODE;
2403 else if (stringok) {
2404 const unsigned int oldl = STR_LEN(scan);
2405 regnode * const nnext = regnext(n);
2407 DEBUG_PEEP("merg",n,depth);
2410 if (oldl + STR_LEN(n) > U8_MAX)
2412 NEXT_OFF(scan) += NEXT_OFF(n);
2413 STR_LEN(scan) += STR_LEN(n);
2414 next = n + NODE_SZ_STR(n);
2415 /* Now we can overwrite *n : */
2416 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2424 #ifdef EXPERIMENTAL_INPLACESCAN
2425 if (flags && !NEXT_OFF(n)) {
2426 DEBUG_PEEP("atch", val, depth);
2427 if (reg_off_by_arg[OP(n)]) {
2428 ARG_SET(n, val - n);
2431 NEXT_OFF(n) = val - n;
2438 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2440 Two problematic code points in Unicode casefolding of EXACT nodes:
2442 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2443 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2449 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2450 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2452 This means that in case-insensitive matching (or "loose matching",
2453 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2454 length of the above casefolded versions) can match a target string
2455 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2456 This would rather mess up the minimum length computation.
2458 What we'll do is to look for the tail four bytes, and then peek
2459 at the preceding two bytes to see whether we need to decrease
2460 the minimum length by four (six minus two).
2462 Thanks to the design of UTF-8, there cannot be false matches:
2463 A sequence of valid UTF-8 bytes cannot be a subsequence of
2464 another valid sequence of UTF-8 bytes.
2467 char * const s0 = STRING(scan), *s, *t;
2468 char * const s1 = s0 + STR_LEN(scan) - 1;
2469 char * const s2 = s1 - 4;
2470 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2471 const char t0[] = "\xaf\x49\xaf\x42";
2473 const char t0[] = "\xcc\x88\xcc\x81";
2475 const char * const t1 = t0 + 3;
2478 s < s2 && (t = ninstr(s, s1, t0, t1));
2481 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2482 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2484 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2485 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2493 n = scan + NODE_SZ_STR(scan);
2495 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2502 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2506 /* REx optimizer. Converts nodes into quickier variants "in place".
2507 Finds fixed substrings. */
2509 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2510 to the position after last scanned or to NULL. */
2512 #define INIT_AND_WITHP \
2513 assert(!and_withp); \
2514 Newx(and_withp,1,struct regnode_charclass_class); \
2515 SAVEFREEPV(and_withp)
2517 /* this is a chain of data about sub patterns we are processing that
2518 need to be handled seperately/specially in study_chunk. Its so
2519 we can simulate recursion without losing state. */
2521 typedef struct scan_frame {
2522 regnode *last; /* last node to process in this frame */
2523 regnode *next; /* next node to process when last is reached */
2524 struct scan_frame *prev; /*previous frame*/
2525 I32 stop; /* what stopparen do we use */
2529 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2531 #define CASE_SYNST_FNC(nAmE) \
2533 if (flags & SCF_DO_STCLASS_AND) { \
2534 for (value = 0; value < 256; value++) \
2535 if (!is_ ## nAmE ## _cp(value)) \
2536 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2539 for (value = 0; value < 256; value++) \
2540 if (is_ ## nAmE ## _cp(value)) \
2541 ANYOF_BITMAP_SET(data->start_class, value); \
2545 if (flags & SCF_DO_STCLASS_AND) { \
2546 for (value = 0; value < 256; value++) \
2547 if (is_ ## nAmE ## _cp(value)) \
2548 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2551 for (value = 0; value < 256; value++) \
2552 if (!is_ ## nAmE ## _cp(value)) \
2553 ANYOF_BITMAP_SET(data->start_class, value); \
2560 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2561 I32 *minlenp, I32 *deltap,
2566 struct regnode_charclass_class *and_withp,
2567 U32 flags, U32 depth)
2568 /* scanp: Start here (read-write). */
2569 /* deltap: Write maxlen-minlen here. */
2570 /* last: Stop before this one. */
2571 /* data: string data about the pattern */
2572 /* stopparen: treat close N as END */
2573 /* recursed: which subroutines have we recursed into */
2574 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2577 I32 min = 0, pars = 0, code;
2578 regnode *scan = *scanp, *next;
2580 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2581 int is_inf_internal = 0; /* The studied chunk is infinite */
2582 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2583 scan_data_t data_fake;
2584 SV *re_trie_maxbuff = NULL;
2585 regnode *first_non_open = scan;
2586 I32 stopmin = I32_MAX;
2587 scan_frame *frame = NULL;
2588 GET_RE_DEBUG_FLAGS_DECL;
2590 PERL_ARGS_ASSERT_STUDY_CHUNK;
2593 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2597 while (first_non_open && OP(first_non_open) == OPEN)
2598 first_non_open=regnext(first_non_open);
2603 while ( scan && OP(scan) != END && scan < last ){
2604 /* Peephole optimizer: */
2605 DEBUG_STUDYDATA("Peep:", data,depth);
2606 DEBUG_PEEP("Peep",scan,depth);
2607 JOIN_EXACT(scan,&min,0);
2609 /* Follow the next-chain of the current node and optimize
2610 away all the NOTHINGs from it. */
2611 if (OP(scan) != CURLYX) {
2612 const int max = (reg_off_by_arg[OP(scan)]
2614 /* I32 may be smaller than U16 on CRAYs! */
2615 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2616 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2620 /* Skip NOTHING and LONGJMP. */
2621 while ((n = regnext(n))
2622 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2623 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2624 && off + noff < max)
2626 if (reg_off_by_arg[OP(scan)])
2629 NEXT_OFF(scan) = off;
2634 /* The principal pseudo-switch. Cannot be a switch, since we
2635 look into several different things. */
2636 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2637 || OP(scan) == IFTHEN) {
2638 next = regnext(scan);
2640 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2642 if (OP(next) == code || code == IFTHEN) {
2643 /* NOTE - There is similar code to this block below for handling
2644 TRIE nodes on a re-study. If you change stuff here check there
2646 I32 max1 = 0, min1 = I32_MAX, num = 0;
2647 struct regnode_charclass_class accum;
2648 regnode * const startbranch=scan;
2650 if (flags & SCF_DO_SUBSTR)
2651 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2652 if (flags & SCF_DO_STCLASS)
2653 cl_init_zero(pRExC_state, &accum);
2655 while (OP(scan) == code) {
2656 I32 deltanext, minnext, f = 0, fake;
2657 struct regnode_charclass_class this_class;
2660 data_fake.flags = 0;
2662 data_fake.whilem_c = data->whilem_c;
2663 data_fake.last_closep = data->last_closep;
2666 data_fake.last_closep = &fake;
2668 data_fake.pos_delta = delta;
2669 next = regnext(scan);
2670 scan = NEXTOPER(scan);
2672 scan = NEXTOPER(scan);
2673 if (flags & SCF_DO_STCLASS) {
2674 cl_init(pRExC_state, &this_class);
2675 data_fake.start_class = &this_class;
2676 f = SCF_DO_STCLASS_AND;
2678 if (flags & SCF_WHILEM_VISITED_POS)
2679 f |= SCF_WHILEM_VISITED_POS;
2681 /* we suppose the run is continuous, last=next...*/
2682 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2684 stopparen, recursed, NULL, f,depth+1);
2687 if (max1 < minnext + deltanext)
2688 max1 = minnext + deltanext;
2689 if (deltanext == I32_MAX)
2690 is_inf = is_inf_internal = 1;
2692 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2694 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2695 if ( stopmin > minnext)
2696 stopmin = min + min1;
2697 flags &= ~SCF_DO_SUBSTR;
2699 data->flags |= SCF_SEEN_ACCEPT;
2702 if (data_fake.flags & SF_HAS_EVAL)
2703 data->flags |= SF_HAS_EVAL;
2704 data->whilem_c = data_fake.whilem_c;
2706 if (flags & SCF_DO_STCLASS)
2707 cl_or(pRExC_state, &accum, &this_class);
2709 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2711 if (flags & SCF_DO_SUBSTR) {
2712 data->pos_min += min1;
2713 data->pos_delta += max1 - min1;
2714 if (max1 != min1 || is_inf)
2715 data->longest = &(data->longest_float);
2718 delta += max1 - min1;
2719 if (flags & SCF_DO_STCLASS_OR) {
2720 cl_or(pRExC_state, data->start_class, &accum);
2722 cl_and(data->start_class, and_withp);
2723 flags &= ~SCF_DO_STCLASS;
2726 else if (flags & SCF_DO_STCLASS_AND) {
2728 cl_and(data->start_class, &accum);
2729 flags &= ~SCF_DO_STCLASS;
2732 /* Switch to OR mode: cache the old value of
2733 * data->start_class */
2735 StructCopy(data->start_class, and_withp,
2736 struct regnode_charclass_class);
2737 flags &= ~SCF_DO_STCLASS_AND;
2738 StructCopy(&accum, data->start_class,
2739 struct regnode_charclass_class);
2740 flags |= SCF_DO_STCLASS_OR;
2741 data->start_class->flags |= ANYOF_EOS;
2745 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2748 Assuming this was/is a branch we are dealing with: 'scan' now
2749 points at the item that follows the branch sequence, whatever
2750 it is. We now start at the beginning of the sequence and look
2757 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2759 If we can find such a subseqence we need to turn the first
2760 element into a trie and then add the subsequent branch exact
2761 strings to the trie.
2765 1. patterns where the whole set of branch can be converted.
2767 2. patterns where only a subset can be converted.
2769 In case 1 we can replace the whole set with a single regop
2770 for the trie. In case 2 we need to keep the start and end
2773 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2774 becomes BRANCH TRIE; BRANCH X;
2776 There is an additional case, that being where there is a
2777 common prefix, which gets split out into an EXACT like node
2778 preceding the TRIE node.
2780 If x(1..n)==tail then we can do a simple trie, if not we make
2781 a "jump" trie, such that when we match the appropriate word
2782 we "jump" to the appopriate tail node. Essentailly we turn
2783 a nested if into a case structure of sorts.
2788 if (!re_trie_maxbuff) {
2789 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2790 if (!SvIOK(re_trie_maxbuff))
2791 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2793 if ( SvIV(re_trie_maxbuff)>=0 ) {
2795 regnode *first = (regnode *)NULL;
2796 regnode *last = (regnode *)NULL;
2797 regnode *tail = scan;
2802 SV * const mysv = sv_newmortal(); /* for dumping */
2804 /* var tail is used because there may be a TAIL
2805 regop in the way. Ie, the exacts will point to the
2806 thing following the TAIL, but the last branch will
2807 point at the TAIL. So we advance tail. If we
2808 have nested (?:) we may have to move through several
2812 while ( OP( tail ) == TAIL ) {
2813 /* this is the TAIL generated by (?:) */
2814 tail = regnext( tail );
2819 regprop(RExC_rx, mysv, tail );
2820 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2821 (int)depth * 2 + 2, "",
2822 "Looking for TRIE'able sequences. Tail node is: ",
2823 SvPV_nolen_const( mysv )
2829 step through the branches, cur represents each
2830 branch, noper is the first thing to be matched
2831 as part of that branch and noper_next is the
2832 regnext() of that node. if noper is an EXACT
2833 and noper_next is the same as scan (our current
2834 position in the regex) then the EXACT branch is
2835 a possible optimization target. Once we have
2836 two or more consequetive such branches we can
2837 create a trie of the EXACT's contents and stich
2838 it in place. If the sequence represents all of
2839 the branches we eliminate the whole thing and
2840 replace it with a single TRIE. If it is a
2841 subsequence then we need to stitch it in. This
2842 means the first branch has to remain, and needs
2843 to be repointed at the item on the branch chain
2844 following the last branch optimized. This could
2845 be either a BRANCH, in which case the
2846 subsequence is internal, or it could be the
2847 item following the branch sequence in which
2848 case the subsequence is at the end.
2852 /* dont use tail as the end marker for this traverse */
2853 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2854 regnode * const noper = NEXTOPER( cur );
2855 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2856 regnode * const noper_next = regnext( noper );
2860 regprop(RExC_rx, mysv, cur);
2861 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2862 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2864 regprop(RExC_rx, mysv, noper);
2865 PerlIO_printf( Perl_debug_log, " -> %s",
2866 SvPV_nolen_const(mysv));
2869 regprop(RExC_rx, mysv, noper_next );
2870 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2871 SvPV_nolen_const(mysv));
2873 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2874 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2876 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2877 : PL_regkind[ OP( noper ) ] == EXACT )
2878 || OP(noper) == NOTHING )
2880 && noper_next == tail
2885 if ( !first || optype == NOTHING ) {
2886 if (!first) first = cur;
2887 optype = OP( noper );
2893 Currently we do not believe that the trie logic can
2894 handle case insensitive matching properly when the
2895 pattern is not unicode (thus forcing unicode semantics).
2897 If/when this is fixed the following define can be swapped
2898 in below to fully enable trie logic.
2900 #define TRIE_TYPE_IS_SAFE 1
2903 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2905 if ( last && TRIE_TYPE_IS_SAFE ) {
2906 make_trie( pRExC_state,
2907 startbranch, first, cur, tail, count,
2910 if ( PL_regkind[ OP( noper ) ] == EXACT
2912 && noper_next == tail
2917 optype = OP( noper );
2927 regprop(RExC_rx, mysv, cur);
2928 PerlIO_printf( Perl_debug_log,
2929 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2930 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2934 if ( last && TRIE_TYPE_IS_SAFE ) {
2935 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2936 #ifdef TRIE_STUDY_OPT
2937 if ( ((made == MADE_EXACT_TRIE &&
2938 startbranch == first)
2939 || ( first_non_open == first )) &&
2941 flags |= SCF_TRIE_RESTUDY;
2942 if ( startbranch == first
2945 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2956 scan = NEXTOPER(NEXTOPER(scan));
2957 } else /* single branch is optimized. */
2958 scan = NEXTOPER(scan);
2960 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2961 scan_frame *newframe = NULL;
2966 if (OP(scan) != SUSPEND) {
2967 /* set the pointer */
2968 if (OP(scan) == GOSUB) {
2970 RExC_recurse[ARG2L(scan)] = scan;
2971 start = RExC_open_parens[paren-1];
2972 end = RExC_close_parens[paren-1];
2975 start = RExC_rxi->program + 1;
2979 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2980 SAVEFREEPV(recursed);
2982 if (!PAREN_TEST(recursed,paren+1)) {
2983 PAREN_SET(recursed,paren+1);
2984 Newx(newframe,1,scan_frame);
2986 if (flags & SCF_DO_SUBSTR) {
2987 SCAN_COMMIT(pRExC_state,data,minlenp);
2988 data->longest = &(data->longest_float);
2990 is_inf = is_inf_internal = 1;
2991 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2992 cl_anything(pRExC_state, data->start_class);
2993 flags &= ~SCF_DO_STCLASS;
2996 Newx(newframe,1,scan_frame);
2999 end = regnext(scan);
3004 SAVEFREEPV(newframe);
3005 newframe->next = regnext(scan);
3006 newframe->last = last;
3007 newframe->stop = stopparen;
3008 newframe->prev = frame;
3018 else if (OP(scan) == EXACT) {
3019 I32 l = STR_LEN(scan);
3022 const U8 * const s = (U8*)STRING(scan);
3023 l = utf8_length(s, s + l);
3024 uc = utf8_to_uvchr(s, NULL);
3026 uc = *((U8*)STRING(scan));
3029 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3030 /* The code below prefers earlier match for fixed
3031 offset, later match for variable offset. */
3032 if (data->last_end == -1) { /* Update the start info. */
3033 data->last_start_min = data->pos_min;
3034 data->last_start_max = is_inf
3035 ? I32_MAX : data->pos_min + data->pos_delta;
3037 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3039 SvUTF8_on(data->last_found);
3041 SV * const sv = data->last_found;
3042 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3043 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3044 if (mg && mg->mg_len >= 0)
3045 mg->mg_len += utf8_length((U8*)STRING(scan),
3046 (U8*)STRING(scan)+STR_LEN(scan));
3048 data->last_end = data->pos_min + l;
3049 data->pos_min += l; /* As in the first entry. */
3050 data->flags &= ~SF_BEFORE_EOL;
3052 if (flags & SCF_DO_STCLASS_AND) {
3053 /* Check whether it is compatible with what we know already! */
3057 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3058 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3059 && (!(data->start_class->flags & ANYOF_FOLD)
3060 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3063 ANYOF_CLASS_ZERO(data->start_class);
3064 ANYOF_BITMAP_ZERO(data->start_class);
3066 ANYOF_BITMAP_SET(data->start_class, uc);
3067 data->start_class->flags &= ~ANYOF_EOS;
3069 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3071 else if (flags & SCF_DO_STCLASS_OR) {
3072 /* false positive possible if the class is case-folded */
3074 ANYOF_BITMAP_SET(data->start_class, uc);
3076 data->start_class->flags |= ANYOF_UNICODE_ALL;
3077 data->start_class->flags &= ~ANYOF_EOS;
3078 cl_and(data->start_class, and_withp);
3080 flags &= ~SCF_DO_STCLASS;
3082 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3083 I32 l = STR_LEN(scan);
3084 UV uc = *((U8*)STRING(scan));
3086 /* Search for fixed substrings supports EXACT only. */
3087 if (flags & SCF_DO_SUBSTR) {
3089 SCAN_COMMIT(pRExC_state, data, minlenp);
3092 const U8 * const s = (U8 *)STRING(scan);
3093 l = utf8_length(s, s + l);
3094 uc = utf8_to_uvchr(s, NULL);
3097 if (flags & SCF_DO_SUBSTR)
3099 if (flags & SCF_DO_STCLASS_AND) {
3100 /* Check whether it is compatible with what we know already! */
3104 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3105 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3106 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3108 ANYOF_CLASS_ZERO(data->start_class);
3109 ANYOF_BITMAP_ZERO(data->start_class);
3111 ANYOF_BITMAP_SET(data->start_class, uc);
3112 data->start_class->flags &= ~ANYOF_EOS;
3113 data->start_class->flags |= ANYOF_FOLD;
3114 if (OP(scan) == EXACTFL)
3115 data->start_class->flags |= ANYOF_LOCALE;
3118 else if (flags & SCF_DO_STCLASS_OR) {
3119 if (data->start_class->flags & ANYOF_FOLD) {
3120 /* false positive possible if the class is case-folded.
3121 Assume that the locale settings are the same... */
3123 ANYOF_BITMAP_SET(data->start_class, uc);
3124 data->start_class->flags &= ~ANYOF_EOS;
3126 cl_and(data->start_class, and_withp);
3128 flags &= ~SCF_DO_STCLASS;
3130 else if (REGNODE_VARIES(OP(scan))) {
3131 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3132 I32 f = flags, pos_before = 0;
3133 regnode * const oscan = scan;
3134 struct regnode_charclass_class this_class;
3135 struct regnode_charclass_class *oclass = NULL;
3136 I32 next_is_eval = 0;
3138 switch (PL_regkind[OP(scan)]) {
3139 case WHILEM: /* End of (?:...)* . */
3140 scan = NEXTOPER(scan);
3143 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3144 next = NEXTOPER(scan);
3145 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3147 maxcount = REG_INFTY;
3148 next = regnext(scan);
3149 scan = NEXTOPER(scan);
3153 if (flags & SCF_DO_SUBSTR)
3158 if (flags & SCF_DO_STCLASS) {
3160 maxcount = REG_INFTY;
3161 next = regnext(scan);
3162 scan = NEXTOPER(scan);
3165 is_inf = is_inf_internal = 1;
3166 scan = regnext(scan);
3167 if (flags & SCF_DO_SUBSTR) {
3168 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3169 data->longest = &(data->longest_float);
3171 goto optimize_curly_tail;
3173 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3174 && (scan->flags == stopparen))
3179 mincount = ARG1(scan);
3180 maxcount = ARG2(scan);
3182 next = regnext(scan);
3183 if (OP(scan) == CURLYX) {
3184 I32 lp = (data ? *(data->last_closep) : 0);
3185 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3187 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3188 next_is_eval = (OP(scan) == EVAL);
3190 if (flags & SCF_DO_SUBSTR) {
3191 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3192 pos_before = data->pos_min;
3196 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3198 data->flags |= SF_IS_INF;
3200 if (flags & SCF_DO_STCLASS) {
3201 cl_init(pRExC_state, &this_class);
3202 oclass = data->start_class;
3203 data->start_class = &this_class;
3204 f |= SCF_DO_STCLASS_AND;
3205 f &= ~SCF_DO_STCLASS_OR;
3207 /* These are the cases when once a subexpression
3208 fails at a particular position, it cannot succeed
3209 even after backtracking at the enclosing scope.
3211 XXXX what if minimal match and we are at the
3212 initial run of {n,m}? */
3213 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3214 f &= ~SCF_WHILEM_VISITED_POS;
3216 /* This will finish on WHILEM, setting scan, or on NULL: */
3217 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3218 last, data, stopparen, recursed, NULL,
3220 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3222 if (flags & SCF_DO_STCLASS)
3223 data->start_class = oclass;
3224 if (mincount == 0 || minnext == 0) {
3225 if (flags & SCF_DO_STCLASS_OR) {
3226 cl_or(pRExC_state, data->start_class, &this_class);
3228 else if (flags & SCF_DO_STCLASS_AND) {
3229 /* Switch to OR mode: cache the old value of
3230 * data->start_class */
3232 StructCopy(data->start_class, and_withp,
3233 struct regnode_charclass_class);
3234 flags &= ~SCF_DO_STCLASS_AND;
3235 StructCopy(&this_class, data->start_class,
3236 struct regnode_charclass_class);
3237 flags |= SCF_DO_STCLASS_OR;
3238 data->start_class->flags |= ANYOF_EOS;
3240 } else { /* Non-zero len */
3241 if (flags & SCF_DO_STCLASS_OR) {
3242 cl_or(pRExC_state, data->start_class, &this_class);
3243 cl_and(data->start_class, and_withp);
3245 else if (flags & SCF_DO_STCLASS_AND)
3246 cl_and(data->start_class, &this_class);
3247 flags &= ~SCF_DO_STCLASS;
3249 if (!scan) /* It was not CURLYX, but CURLY. */
3251 if ( /* ? quantifier ok, except for (?{ ... }) */
3252 (next_is_eval || !(mincount == 0 && maxcount == 1))
3253 && (minnext == 0) && (deltanext == 0)
3254 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3255 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3257 ckWARNreg(RExC_parse,
3258 "Quantifier unexpected on zero-length expression");
3261 min += minnext * mincount;
3262 is_inf_internal |= ((maxcount == REG_INFTY
3263 && (minnext + deltanext) > 0)
3264 || deltanext == I32_MAX);
3265 is_inf |= is_inf_internal;
3266 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3268 /* Try powerful optimization CURLYX => CURLYN. */
3269 if ( OP(oscan) == CURLYX && data
3270 && data->flags & SF_IN_PAR
3271 && !(data->flags & SF_HAS_EVAL)
3272 && !deltanext && minnext == 1 ) {
3273 /* Try to optimize to CURLYN. */
3274 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3275 regnode * const nxt1 = nxt;
3282 if (!REGNODE_SIMPLE(OP(nxt))
3283 && !(PL_regkind[OP(nxt)] == EXACT
3284 && STR_LEN(nxt) == 1))
3290 if (OP(nxt) != CLOSE)
3292 if (RExC_open_parens) {
3293 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3294 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3296 /* Now we know that nxt2 is the only contents: */
3297 oscan->flags = (U8)ARG(nxt);
3299 OP(nxt1) = NOTHING; /* was OPEN. */
3302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3303 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3304 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3305 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3306 OP(nxt + 1) = OPTIMIZED; /* was count. */
3307 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3312 /* Try optimization CURLYX => CURLYM. */
3313 if ( OP(oscan) == CURLYX && data
3314 && !(data->flags & SF_HAS_PAR)
3315 && !(data->flags & SF_HAS_EVAL)
3316 && !deltanext /* atom is fixed width */
3317 && minnext != 0 /* CURLYM can't handle zero width */
3319 /* XXXX How to optimize if data == 0? */
3320 /* Optimize to a simpler form. */
3321 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3325 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3326 && (OP(nxt2) != WHILEM))
3328 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3329 /* Need to optimize away parenths. */
3330 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3331 /* Set the parenth number. */
3332 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3334 oscan->flags = (U8)ARG(nxt);
3335 if (RExC_open_parens) {
3336 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3337 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3339 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3340 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3343 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3344 OP(nxt + 1) = OPTIMIZED; /* was count. */
3345 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3346 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3349 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3350 regnode *nnxt = regnext(nxt1);
3352 if (reg_off_by_arg[OP(nxt1)])
3353 ARG_SET(nxt1, nxt2 - nxt1);
3354 else if (nxt2 - nxt1 < U16_MAX)
3355 NEXT_OFF(nxt1) = nxt2 - nxt1;
3357 OP(nxt) = NOTHING; /* Cannot beautify */
3362 /* Optimize again: */
3363 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3364 NULL, stopparen, recursed, NULL, 0,depth+1);
3369 else if ((OP(oscan) == CURLYX)
3370 && (flags & SCF_WHILEM_VISITED_POS)
3371 /* See the comment on a similar expression above.
3372 However, this time it not a subexpression
3373 we care about, but the expression itself. */
3374 && (maxcount == REG_INFTY)
3375 && data && ++data->whilem_c < 16) {
3376 /* This stays as CURLYX, we can put the count/of pair. */
3377 /* Find WHILEM (as in regexec.c) */
3378 regnode *nxt = oscan + NEXT_OFF(oscan);
3380 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3382 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3383 | (RExC_whilem_seen << 4)); /* On WHILEM */
3385 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3387 if (flags & SCF_DO_SUBSTR) {
3388 SV *last_str = NULL;
3389 int counted = mincount != 0;
3391 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3392 #if defined(SPARC64_GCC_WORKAROUND)
3395 const char *s = NULL;
3398 if (pos_before >= data->last_start_min)
3401 b = data->last_start_min;
3404 s = SvPV_const(data->last_found, l);
3405 old = b - data->last_start_min;
3408 I32 b = pos_before >= data->last_start_min
3409 ? pos_before : data->last_start_min;
3411 const char * const s = SvPV_const(data->last_found, l);
3412 I32 old = b - data->last_start_min;
3416 old = utf8_hop((U8*)s, old) - (U8*)s;
3418 /* Get the added string: */
3419 last_str = newSVpvn_utf8(s + old, l, UTF);
3420 if (deltanext == 0 && pos_before == b) {
3421 /* What was added is a constant string */
3423 SvGROW(last_str, (mincount * l) + 1);
3424 repeatcpy(SvPVX(last_str) + l,
3425 SvPVX_const(last_str), l, mincount - 1);
3426 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3427 /* Add additional parts. */
3428 SvCUR_set(data->last_found,
3429 SvCUR(data->last_found) - l);
3430 sv_catsv(data->last_found, last_str);
3432 SV * sv = data->last_found;
3434 SvUTF8(sv) && SvMAGICAL(sv) ?
3435 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436 if (mg && mg->mg_len >= 0)
3437 mg->mg_len += CHR_SVLEN(last_str) - l;
3439 data->last_end += l * (mincount - 1);
3442 /* start offset must point into the last copy */
3443 data->last_start_min += minnext * (mincount - 1);
3444 data->last_start_max += is_inf ? I32_MAX
3445 : (maxcount - 1) * (minnext + data->pos_delta);
3448 /* It is counted once already... */
3449 data->pos_min += minnext * (mincount - counted);
3450 data->pos_delta += - counted * deltanext +
3451 (minnext + deltanext) * maxcount - minnext * mincount;
3452 if (mincount != maxcount) {
3453 /* Cannot extend fixed substrings found inside
3455 SCAN_COMMIT(pRExC_state,data,minlenp);
3456 if (mincount && last_str) {
3457 SV * const sv = data->last_found;
3458 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3459 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3463 sv_setsv(sv, last_str);
3464 data->last_end = data->pos_min;
3465 data->last_start_min =
3466 data->pos_min - CHR_SVLEN(last_str);
3467 data->last_start_max = is_inf
3469 : data->pos_min + data->pos_delta
3470 - CHR_SVLEN(last_str);
3472 data->longest = &(data->longest_float);
3474 SvREFCNT_dec(last_str);
3476 if (data && (fl & SF_HAS_EVAL))
3477 data->flags |= SF_HAS_EVAL;
3478 optimize_curly_tail:
3479 if (OP(oscan) != CURLYX) {
3480 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3482 NEXT_OFF(oscan) += NEXT_OFF(next);
3485 default: /* REF and CLUMP only? */
3486 if (flags & SCF_DO_SUBSTR) {
3487 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3488 data->longest = &(data->longest_float);
3490 is_inf = is_inf_internal = 1;
3491 if (flags & SCF_DO_STCLASS_OR)
3492 cl_anything(pRExC_state, data->start_class);
3493 flags &= ~SCF_DO_STCLASS;
3497 else if (OP(scan) == LNBREAK) {
3498 if (flags & SCF_DO_STCLASS) {
3500 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3501 if (flags & SCF_DO_STCLASS_AND) {
3502 for (value = 0; value < 256; value++)
3503 if (!is_VERTWS_cp(value))
3504 ANYOF_BITMAP_CLEAR(data->start_class, value);
3507 for (value = 0; value < 256; value++)
3508 if (is_VERTWS_cp(value))
3509 ANYOF_BITMAP_SET(data->start_class, value);
3511 if (flags & SCF_DO_STCLASS_OR)
3512 cl_and(data->start_class, and_withp);
3513 flags &= ~SCF_DO_STCLASS;
3517 if (flags & SCF_DO_SUBSTR) {
3518 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3520 data->pos_delta += 1;
3521 data->longest = &(data->longest_float);
3524 else if (OP(scan) == FOLDCHAR) {
3525 int d = ARG(scan)==0xDF ? 1 : 2;
3526 flags &= ~SCF_DO_STCLASS;
3529 if (flags & SCF_DO_SUBSTR) {
3530 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3532 data->pos_delta += d;
3533 data->longest = &(data->longest_float);
3536 else if (REGNODE_SIMPLE(OP(scan))) {
3539 if (flags & SCF_DO_SUBSTR) {
3540 SCAN_COMMIT(pRExC_state,data,minlenp);
3544 if (flags & SCF_DO_STCLASS) {
3545 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3547 /* Some of the logic below assumes that switching
3548 locale on will only add false positives. */
3549 switch (PL_regkind[OP(scan)]) {
3553 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3554 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3555 cl_anything(pRExC_state, data->start_class);
3558 if (OP(scan) == SANY)
3560 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3561 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3562 || (data->start_class->flags & ANYOF_CLASS));
3563 cl_anything(pRExC_state, data->start_class);
3565 if (flags & SCF_DO_STCLASS_AND || !value)
3566 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3569 if (flags & SCF_DO_STCLASS_AND)
3570 cl_and(data->start_class,
3571 (struct regnode_charclass_class*)scan);
3573 cl_or(pRExC_state, data->start_class,
3574 (struct regnode_charclass_class*)scan);
3577 if (flags & SCF_DO_STCLASS_AND) {
3578 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3579 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3580 if (FLAGS(scan) & USE_UNI) {
3581 for (value = 0; value < 256; value++) {
3582 if (!isWORDCHAR_L1(value)) {
3583 ANYOF_BITMAP_CLEAR(data->start_class, value);
3587 for (value = 0; value < 256; value++) {
3588 if (!isALNUM(value)) {
3589 ANYOF_BITMAP_CLEAR(data->start_class, value);
3596 if (data->start_class->flags & ANYOF_LOCALE)
3597 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3598 else if (FLAGS(scan) & USE_UNI) {
3599 for (value = 0; value < 256; value++) {
3600 if (isWORDCHAR_L1(value)) {
3601 ANYOF_BITMAP_SET(data->start_class, value);
3605 for (value = 0; value < 256; value++) {
3606 if (isALNUM(value)) {
3607 ANYOF_BITMAP_SET(data->start_class, value);
3614 if (flags & SCF_DO_STCLASS_AND) {
3615 if (data->start_class->flags & ANYOF_LOCALE)
3616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3619 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3620 data->start_class->flags |= ANYOF_LOCALE;
3624 if (flags & SCF_DO_STCLASS_AND) {
3625 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3626 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3627 if (FLAGS(scan) & USE_UNI) {
3628 for (value = 0; value < 256; value++) {
3629 if (isWORDCHAR_L1(value)) {
3630 ANYOF_BITMAP_CLEAR(data->start_class, value);
3634 for (value = 0; value < 256; value++) {
3635 if (isALNUM(value)) {
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3643 if (data->start_class->flags & ANYOF_LOCALE)
3644 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3646 for (value = 0; value < 256; value++)
3647 if (!isALNUM(value))
3648 ANYOF_BITMAP_SET(data->start_class, value);
3653 if (flags & SCF_DO_STCLASS_AND) {
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3658 data->start_class->flags |= ANYOF_LOCALE;
3659 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3663 if (flags & SCF_DO_STCLASS_AND) {
3664 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3665 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3666 if (FLAGS(scan) & USE_UNI) {
3667 for (value = 0; value < 256; value++) {
3668 if (!isSPACE_L1(value)) {
3669 ANYOF_BITMAP_CLEAR(data->start_class, value);
3673 for (value = 0; value < 256; value++) {
3674 if (!isSPACE(value)) {
3675 ANYOF_BITMAP_CLEAR(data->start_class, value);
3682 if (data->start_class->flags & ANYOF_LOCALE) {
3683 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3685 else if (FLAGS(scan) & USE_UNI) {
3686 for (value = 0; value < 256; value++) {
3687 if (isSPACE_L1(value)) {
3688 ANYOF_BITMAP_SET(data->start_class, value);
3692 for (value = 0; value < 256; value++) {
3693 if (isSPACE(value)) {
3694 ANYOF_BITMAP_SET(data->start_class, value);
3701 if (flags & SCF_DO_STCLASS_AND) {
3702 if (data->start_class->flags & ANYOF_LOCALE)
3703 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3706 data->start_class->flags |= ANYOF_LOCALE;
3707 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3711 if (flags & SCF_DO_STCLASS_AND) {
3712 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3714 if (FLAGS(scan) & USE_UNI) {
3715 for (value = 0; value < 256; value++) {
3716 if (isSPACE_L1(value)) {
3717 ANYOF_BITMAP_CLEAR(data->start_class, value);
3721 for (value = 0; value < 256; value++) {
3722 if (isSPACE(value)) {
3723 ANYOF_BITMAP_CLEAR(data->start_class, value);
3730 if (data->start_class->flags & ANYOF_LOCALE)
3731 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3732 else if (FLAGS(scan) & USE_UNI) {
3733 for (value = 0; value < 256; value++) {
3734 if (!isSPACE_L1(value)) {
3735 ANYOF_BITMAP_SET(data->start_class, value);
3740 for (value = 0; value < 256; value++) {
3741 if (!isSPACE(value)) {
3742 ANYOF_BITMAP_SET(data->start_class, value);
3749 if (flags & SCF_DO_STCLASS_AND) {
3750 if (data->start_class->flags & ANYOF_LOCALE) {
3751 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3752 for (value = 0; value < 256; value++)
3753 if (!isSPACE(value))
3754 ANYOF_BITMAP_CLEAR(data->start_class, value);
3758 data->start_class->flags |= ANYOF_LOCALE;
3759 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3763 if (flags & SCF_DO_STCLASS_AND) {
3764 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3765 for (value = 0; value < 256; value++)
3766 if (!isDIGIT(value))
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3770 if (data->start_class->flags & ANYOF_LOCALE)
3771 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3773 for (value = 0; value < 256; value++)
3775 ANYOF_BITMAP_SET(data->start_class, value);
3780 if (flags & SCF_DO_STCLASS_AND) {
3781 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3782 for (value = 0; value < 256; value++)
3784 ANYOF_BITMAP_CLEAR(data->start_class, value);
3787 if (data->start_class->flags & ANYOF_LOCALE)
3788 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3790 for (value = 0; value < 256; value++)
3791 if (!isDIGIT(value))
3792 ANYOF_BITMAP_SET(data->start_class, value);
3796 CASE_SYNST_FNC(VERTWS);
3797 CASE_SYNST_FNC(HORIZWS);
3800 if (flags & SCF_DO_STCLASS_OR)
3801 cl_and(data->start_class, and_withp);
3802 flags &= ~SCF_DO_STCLASS;
3805 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3806 data->flags |= (OP(scan) == MEOL
3810 else if ( PL_regkind[OP(scan)] == BRANCHJ
3811 /* Lookbehind, or need to calculate parens/evals/stclass: */
3812 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3813 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3814 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3815 || OP(scan) == UNLESSM )
3817 /* Negative Lookahead/lookbehind
3818 In this case we can't do fixed string optimisation.
3821 I32 deltanext, minnext, fake = 0;
3823 struct regnode_charclass_class intrnl;
3826 data_fake.flags = 0;
3828 data_fake.whilem_c = data->whilem_c;
3829 data_fake.last_closep = data->last_closep;
3832 data_fake.last_closep = &fake;
3833 data_fake.pos_delta = delta;
3834 if ( flags & SCF_DO_STCLASS && !scan->flags
3835 && OP(scan) == IFMATCH ) { /* Lookahead */
3836 cl_init(pRExC_state, &intrnl);
3837 data_fake.start_class = &intrnl;
3838 f |= SCF_DO_STCLASS_AND;
3840 if (flags & SCF_WHILEM_VISITED_POS)
3841 f |= SCF_WHILEM_VISITED_POS;
3842 next = regnext(scan);
3843 nscan = NEXTOPER(NEXTOPER(scan));
3844 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3845 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3848 FAIL("Variable length lookbehind not implemented");
3850 else if (minnext > (I32)U8_MAX) {
3851 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3853 scan->flags = (U8)minnext;
3856 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3858 if (data_fake.flags & SF_HAS_EVAL)
3859 data->flags |= SF_HAS_EVAL;
3860 data->whilem_c = data_fake.whilem_c;
3862 if (f & SCF_DO_STCLASS_AND) {
3863 if (flags & SCF_DO_STCLASS_OR) {
3864 /* OR before, AND after: ideally we would recurse with
3865 * data_fake to get the AND applied by study of the
3866 * remainder of the pattern, and then derecurse;
3867 * *** HACK *** for now just treat as "no information".
3868 * See [perl #56690].
3870 cl_init(pRExC_state, data->start_class);
3872 /* AND before and after: combine and continue */
3873 const int was = (data->start_class->flags & ANYOF_EOS);
3875 cl_and(data->start_class, &intrnl);
3877 data->start_class->flags |= ANYOF_EOS;
3881 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3883 /* Positive Lookahead/lookbehind
3884 In this case we can do fixed string optimisation,
3885 but we must be careful about it. Note in the case of
3886 lookbehind the positions will be offset by the minimum
3887 length of the pattern, something we won't know about
3888 until after the recurse.
3890 I32 deltanext, fake = 0;
3892 struct regnode_charclass_class intrnl;
3894 /* We use SAVEFREEPV so that when the full compile
3895 is finished perl will clean up the allocated
3896 minlens when its all done. This was we don't
3897 have to worry about freeing them when we know
3898 they wont be used, which would be a pain.
3901 Newx( minnextp, 1, I32 );
3902 SAVEFREEPV(minnextp);
3905 StructCopy(data, &data_fake, scan_data_t);
3906 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3909 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3910 data_fake.last_found=newSVsv(data->last_found);
3914 data_fake.last_closep = &fake;
3915 data_fake.flags = 0;
3916 data_fake.pos_delta = delta;
3918 data_fake.flags |= SF_IS_INF;
3919 if ( flags & SCF_DO_STCLASS && !scan->flags
3920 && OP(scan) == IFMATCH ) { /* Lookahead */
3921 cl_init(pRExC_state, &intrnl);
3922 data_fake.start_class = &intrnl;
3923 f |= SCF_DO_STCLASS_AND;
3925 if (flags & SCF_WHILEM_VISITED_POS)
3926 f |= SCF_WHILEM_VISITED_POS;
3927 next = regnext(scan);
3928 nscan = NEXTOPER(NEXTOPER(scan));
3930 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3931 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3934 FAIL("Variable length lookbehind not implemented");
3936 else if (*minnextp > (I32)U8_MAX) {
3937 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3939 scan->flags = (U8)*minnextp;
3944 if (f & SCF_DO_STCLASS_AND) {
3945 const int was = (data->start_class->flags & ANYOF_EOS);
3947 cl_and(data->start_class, &intrnl);
3949 data->start_class->flags |= ANYOF_EOS;
3952 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3954 if (data_fake.flags & SF_HAS_EVAL)
3955 data->flags |= SF_HAS_EVAL;
3956 data->whilem_c = data_fake.whilem_c;
3957 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3958 if (RExC_rx->minlen<*minnextp)
3959 RExC_rx->minlen=*minnextp;
3960 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3961 SvREFCNT_dec(data_fake.last_found);
3963 if ( data_fake.minlen_fixed != minlenp )
3965 data->offset_fixed= data_fake.offset_fixed;
3966 data->minlen_fixed= data_fake.minlen_fixed;
3967 data->lookbehind_fixed+= scan->flags;
3969 if ( data_fake.minlen_float != minlenp )
3971 data->minlen_float= data_fake.minlen_float;
3972 data->offset_float_min=data_fake.offset_float_min;
3973 data->offset_float_max=data_fake.offset_float_max;
3974 data->lookbehind_float+= scan->flags;
3983 else if (OP(scan) == OPEN) {
3984 if (stopparen != (I32)ARG(scan))
3987 else if (OP(scan) == CLOSE) {
3988 if (stopparen == (I32)ARG(scan)) {
3991 if ((I32)ARG(scan) == is_par) {
3992 next = regnext(scan);
3994 if ( next && (OP(next) != WHILEM) && next < last)
3995 is_par = 0; /* Disable optimization */
3998 *(data->last_closep) = ARG(scan);
4000 else if (OP(scan) == EVAL) {
4002 data->flags |= SF_HAS_EVAL;
4004 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4005 if (flags & SCF_DO_SUBSTR) {
4006 SCAN_COMMIT(pRExC_state,data,minlenp);
4007 flags &= ~SCF_DO_SUBSTR;
4009 if (data && OP(scan)==ACCEPT) {
4010 data->flags |= SCF_SEEN_ACCEPT;
4015 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4017 if (flags & SCF_DO_SUBSTR) {
4018 SCAN_COMMIT(pRExC_state,data,minlenp);
4019 data->longest = &(data->longest_float);
4021 is_inf = is_inf_internal = 1;
4022 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4023 cl_anything(pRExC_state, data->start_class);
4024 flags &= ~SCF_DO_STCLASS;
4026 else if (OP(scan) == GPOS) {
4027 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4028 !(delta || is_inf || (data && data->pos_delta)))
4030 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4031 RExC_rx->extflags |= RXf_ANCH_GPOS;
4032 if (RExC_rx->gofs < (U32)min)
4033 RExC_rx->gofs = min;
4035 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4039 #ifdef TRIE_STUDY_OPT
4040 #ifdef FULL_TRIE_STUDY
4041 else if (PL_regkind[OP(scan)] == TRIE) {
4042 /* NOTE - There is similar code to this block above for handling
4043 BRANCH nodes on the initial study. If you change stuff here
4045 regnode *trie_node= scan;
4046 regnode *tail= regnext(scan);
4047 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4048 I32 max1 = 0, min1 = I32_MAX;
4049 struct regnode_charclass_class accum;
4051 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4052 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4053 if (flags & SCF_DO_STCLASS)
4054 cl_init_zero(pRExC_state, &accum);
4060 const regnode *nextbranch= NULL;
4063 for ( word=1 ; word <= trie->wordcount ; word++)
4065 I32 deltanext=0, minnext=0, f = 0, fake;
4066 struct regnode_charclass_class this_class;
4068 data_fake.flags = 0;
4070 data_fake.whilem_c = data->whilem_c;
4071 data_fake.last_closep = data->last_closep;
4074 data_fake.last_closep = &fake;
4075 data_fake.pos_delta = delta;
4076 if (flags & SCF_DO_STCLASS) {
4077 cl_init(pRExC_state, &this_class);
4078 data_fake.start_class = &this_class;
4079 f = SCF_DO_STCLASS_AND;
4081 if (flags & SCF_WHILEM_VISITED_POS)
4082 f |= SCF_WHILEM_VISITED_POS;
4084 if (trie->jump[word]) {
4086 nextbranch = trie_node + trie->jump[0];
4087 scan= trie_node + trie->jump[word];
4088 /* We go from the jump point to the branch that follows
4089 it. Note this means we need the vestigal unused branches
4090 even though they arent otherwise used.
4092 minnext = study_chunk(pRExC_state, &scan, minlenp,
4093 &deltanext, (regnode *)nextbranch, &data_fake,
4094 stopparen, recursed, NULL, f,depth+1);
4096 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4097 nextbranch= regnext((regnode*)nextbranch);
4099 if (min1 > (I32)(minnext + trie->minlen))
4100 min1 = minnext + trie->minlen;
4101 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4102 max1 = minnext + deltanext + trie->maxlen;
4103 if (deltanext == I32_MAX)
4104 is_inf = is_inf_internal = 1;
4106 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4108 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4109 if ( stopmin > min + min1)
4110 stopmin = min + min1;
4111 flags &= ~SCF_DO_SUBSTR;
4113 data->flags |= SCF_SEEN_ACCEPT;
4116 if (data_fake.flags & SF_HAS_EVAL)
4117 data->flags |= SF_HAS_EVAL;
4118 data->whilem_c = data_fake.whilem_c;
4120 if (flags & SCF_DO_STCLASS)
4121 cl_or(pRExC_state, &accum, &this_class);
4124 if (flags & SCF_DO_SUBSTR) {
4125 data->pos_min += min1;
4126 data->pos_delta += max1 - min1;
4127 if (max1 != min1 || is_inf)
4128 data->longest = &(data->longest_float);
4131 delta += max1 - min1;
4132 if (flags & SCF_DO_STCLASS_OR) {
4133 cl_or(pRExC_state, data->start_class, &accum);
4135 cl_and(data->start_class, and_withp);
4136 flags &= ~SCF_DO_STCLASS;
4139 else if (flags & SCF_DO_STCLASS_AND) {
4141 cl_and(data->start_class, &accum);
4142 flags &= ~SCF_DO_STCLASS;
4145 /* Switch to OR mode: cache the old value of
4146 * data->start_class */
4148 StructCopy(data->start_class, and_withp,
4149 struct regnode_charclass_class);
4150 flags &= ~SCF_DO_STCLASS_AND;
4151 StructCopy(&accum, data->start_class,
4152 struct regnode_charclass_class);
4153 flags |= SCF_DO_STCLASS_OR;
4154 data->start_class->flags |= ANYOF_EOS;
4161 else if (PL_regkind[OP(scan)] == TRIE) {
4162 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4165 min += trie->minlen;
4166 delta += (trie->maxlen - trie->minlen);
4167 flags &= ~SCF_DO_STCLASS; /* xxx */
4168 if (flags & SCF_DO_SUBSTR) {
4169 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4170 data->pos_min += trie->minlen;
4171 data->pos_delta += (trie->maxlen - trie->minlen);
4172 if (trie->maxlen != trie->minlen)
4173 data->longest = &(data->longest_float);
4175 if (trie->jump) /* no more substrings -- for now /grr*/
4176 flags &= ~SCF_DO_SUBSTR;
4178 #endif /* old or new */
4179 #endif /* TRIE_STUDY_OPT */
4181 /* Else: zero-length, ignore. */
4182 scan = regnext(scan);
4187 stopparen = frame->stop;
4188 frame = frame->prev;
4189 goto fake_study_recurse;
4194 DEBUG_STUDYDATA("pre-fin:",data,depth);
4197 *deltap = is_inf_internal ? I32_MAX : delta;
4198 if (flags & SCF_DO_SUBSTR && is_inf)
4199 data->pos_delta = I32_MAX - data->pos_min;
4200 if (is_par > (I32)U8_MAX)
4202 if (is_par && pars==1 && data) {
4203 data->flags |= SF_IN_PAR;
4204 data->flags &= ~SF_HAS_PAR;
4206 else if (pars && data) {
4207 data->flags |= SF_HAS_PAR;
4208 data->flags &= ~SF_IN_PAR;
4210 if (flags & SCF_DO_STCLASS_OR)
4211 cl_and(data->start_class, and_withp);
4212 if (flags & SCF_TRIE_RESTUDY)
4213 data->flags |= SCF_TRIE_RESTUDY;
4215 DEBUG_STUDYDATA("post-fin:",data,depth);
4217 return min < stopmin ? min : stopmin;
4221 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4223 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4225 PERL_ARGS_ASSERT_ADD_DATA;
4227 Renewc(RExC_rxi->data,
4228 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4229 char, struct reg_data);
4231 Renew(RExC_rxi->data->what, count + n, U8);
4233 Newx(RExC_rxi->data->what, n, U8);
4234 RExC_rxi->data->count = count + n;
4235 Copy(s, RExC_rxi->data->what + count, n, U8);
4239 /*XXX: todo make this not included in a non debugging perl */
4240 #ifndef PERL_IN_XSUB_RE
4242 Perl_reginitcolors(pTHX)
4245 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4247 char *t = savepv(s);
4251 t = strchr(t, '\t');
4257 PL_colors[i] = t = (char *)"";
4262 PL_colors[i++] = (char *)"";
4269 #ifdef TRIE_STUDY_OPT
4270 #define CHECK_RESTUDY_GOTO \
4272 (data.flags & SCF_TRIE_RESTUDY) \
4276 #define CHECK_RESTUDY_GOTO
4280 - pregcomp - compile a regular expression into internal code
4282 * We can't allocate space until we know how big the compiled form will be,
4283 * but we can't compile it (and thus know how big it is) until we've got a
4284 * place to put the code. So we cheat: we compile it twice, once with code
4285 * generation turned off and size counting turned on, and once "for real".
4286 * This also means that we don't allocate space until we are sure that the
4287 * thing really will compile successfully, and we never have to move the
4288 * code and thus invalidate pointers into it. (Note that it has to be in
4289 * one piece because free() must be able to free it all.) [NB: not true in perl]
4291 * Beware that the optimization-preparation code in here knows about some
4292 * of the structure of the compiled regexp. [I'll say.]
4297 #ifndef PERL_IN_XSUB_RE
4298 #define RE_ENGINE_PTR &PL_core_reg_engine
4300 extern const struct regexp_engine my_reg_engine;
4301 #define RE_ENGINE_PTR &my_reg_engine
4304 #ifndef PERL_IN_XSUB_RE
4306 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4309 HV * const table = GvHV(PL_hintgv);
4311 PERL_ARGS_ASSERT_PREGCOMP;
4313 /* Dispatch a request to compile a regexp to correct
4316 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4317 GET_RE_DEBUG_FLAGS_DECL;
4318 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4319 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4321 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4324 return CALLREGCOMP_ENG(eng, pattern, flags);
4327 return Perl_re_compile(aTHX_ pattern, flags);
4332 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4337 register regexp_internal *ri;
4349 RExC_state_t RExC_state;
4350 RExC_state_t * const pRExC_state = &RExC_state;
4351 #ifdef TRIE_STUDY_OPT
4353 RExC_state_t copyRExC_state;
4355 GET_RE_DEBUG_FLAGS_DECL;
4357 PERL_ARGS_ASSERT_RE_COMPILE;
4359 DEBUG_r(if (!PL_colorset) reginitcolors());
4361 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4364 /* Longjmp back to here if have to switch in midstream to utf8 */
4365 if (! RExC_orig_utf8) {
4366 JMPENV_PUSH(jump_ret);
4369 if (jump_ret == 0) { /* First time through */
4370 exp = SvPV(pattern, plen);
4374 SV *dsv= sv_newmortal();
4375 RE_PV_QUOTED_DECL(s, RExC_utf8,
4376 dsv, exp, plen, 60);
4377 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4378 PL_colors[4],PL_colors[5],s);
4381 else { /* longjumped back */
4384 /* If the cause for the longjmp was other than changing to utf8, pop
4385 * our own setjmp, and longjmp to the correct handler */
4386 if (jump_ret != UTF8_LONGJMP) {
4388 JMPENV_JUMP(jump_ret);
4393 /* It's possible to write a regexp in ascii that represents Unicode
4394 codepoints outside of the byte range, such as via \x{100}. If we
4395 detect such a sequence we have to convert the entire pattern to utf8
4396 and then recompile, as our sizing calculation will have been based
4397 on 1 byte == 1 character, but we will need to use utf8 to encode
4398 at least some part of the pattern, and therefore must convert the whole
4401 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4402 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4403 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4405 RExC_orig_utf8 = RExC_utf8 = 1;
4409 #ifdef TRIE_STUDY_OPT
4414 RExC_flags = pm_flags;
4418 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4419 RExC_seen_evals = 0;
4422 /* First pass: determine size, legality. */
4430 RExC_emit = &PL_regdummy;
4431 RExC_whilem_seen = 0;
4432 RExC_open_parens = NULL;
4433 RExC_close_parens = NULL;
4435 RExC_paren_names = NULL;
4437 RExC_paren_name_list = NULL;
4439 RExC_recurse = NULL;
4440 RExC_recurse_count = 0;
4442 #if 0 /* REGC() is (currently) a NOP at the first pass.
4443 * Clever compilers notice this and complain. --jhi */
4444 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4446 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4447 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4448 RExC_precomp = NULL;
4452 /* Here, finished first pass. Get rid of our setjmp, which we added for
4453 * efficiency only if the passed-in string wasn't in utf8, as shown by
4454 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4455 * 1 here even though the original string wasn't utf8, but in this case
4456 * there will have been a long jump */
4457 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4461 PerlIO_printf(Perl_debug_log,
4462 "Required size %"IVdf" nodes\n"
4463 "Starting second pass (creation)\n",
4466 RExC_lastparse=NULL;
4468 /* Small enough for pointer-storage convention?
4469 If extralen==0, this means that we will not need long jumps. */
4470 if (RExC_size >= 0x10000L && RExC_extralen)
4471 RExC_size += RExC_extralen;
4474 if (RExC_whilem_seen > 15)
4475 RExC_whilem_seen = 15;
4477 /* Allocate space and zero-initialize. Note, the two step process
4478 of zeroing when in debug mode, thus anything assigned has to
4479 happen after that */
4480 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4481 r = (struct regexp*)SvANY(rx);
4482 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4483 char, regexp_internal);
4484 if ( r == NULL || ri == NULL )
4485 FAIL("Regexp out of space");
4487 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4488 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4490 /* bulk initialize base fields with 0. */
4491 Zero(ri, sizeof(regexp_internal), char);
4494 /* non-zero initialization begins here */
4496 r->engine= RE_ENGINE_PTR;
4497 r->extflags = pm_flags;
4499 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4500 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4502 /* The caret is output if there are any defaults: if not all the STD
4503 * flags are set, or if no character set specifier is needed */
4505 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4507 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4508 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4509 >> RXf_PMf_STD_PMMOD_SHIFT);
4510 const char *fptr = STD_PAT_MODS; /*"msix"*/
4512 /* Allocate for the worst case, which is all the std flags are turned
4513 * on. If more precision is desired, we could do a population count of
4514 * the flags set. This could be done with a small lookup table, or by
4515 * shifting, masking and adding, or even, when available, assembly
4516 * language for a machine-language population count.
4517 * We never output a minus, as all those are defaults, so are
4518 * covered by the caret */
4519 const STRLEN wraplen = plen + has_p + has_runon
4520 + has_default /* If needs a caret */
4521 + has_charset /* If needs a character set specifier */
4522 + (sizeof(STD_PAT_MODS) - 1)
4523 + (sizeof("(?:)") - 1);
4525 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4527 SvFLAGS(rx) |= SvUTF8(pattern);
4530 /* If a default, cover it using the caret */
4532 *p++= DEFAULT_PAT_MOD;
4535 if (r->extflags & RXf_PMf_LOCALE) {
4536 *p++ = LOCALE_PAT_MOD;
4538 *p++ = UNICODE_PAT_MOD;
4542 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4545 while((ch = *fptr++)) {
4553 Copy(RExC_precomp, p, plen, char);
4554 assert ((RX_WRAPPED(rx) - p) < 16);
4555 r->pre_prefix = p - RX_WRAPPED(rx);
4561 SvCUR_set(rx, p - SvPVX_const(rx));
4565 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4567 if (RExC_seen & REG_SEEN_RECURSE) {
4568 Newxz(RExC_open_parens, RExC_npar,regnode *);
4569 SAVEFREEPV(RExC_open_parens);
4570 Newxz(RExC_close_parens,RExC_npar,regnode *);
4571 SAVEFREEPV(RExC_close_parens);
4574 /* Useful during FAIL. */
4575 #ifdef RE_TRACK_PATTERN_OFFSETS
4576 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4577 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4578 "%s %"UVuf" bytes for offset annotations.\n",
4579 ri->u.offsets ? "Got" : "Couldn't get",
4580 (UV)((2*RExC_size+1) * sizeof(U32))));
4582 SetProgLen(ri,RExC_size);
4586 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4588 /* Second pass: emit code. */
4589 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4594 RExC_emit_start = ri->program;
4595 RExC_emit = ri->program;
4596 RExC_emit_bound = ri->program + RExC_size + 1;
4598 /* Store the count of eval-groups for security checks: */
4599 RExC_rx->seen_evals = RExC_seen_evals;
4600 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4601 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4605 /* XXXX To minimize changes to RE engine we always allocate
4606 3-units-long substrs field. */
4607 Newx(r->substrs, 1, struct reg_substr_data);
4608 if (RExC_recurse_count) {
4609 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4610 SAVEFREEPV(RExC_recurse);
4614 r->minlen = minlen = sawplus = sawopen = 0;
4615 Zero(r->substrs, 1, struct reg_substr_data);
4617 #ifdef TRIE_STUDY_OPT
4619 StructCopy(&zero_scan_data, &data, scan_data_t);
4620 copyRExC_state = RExC_state;
4623 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4625 RExC_state = copyRExC_state;
4626 if (seen & REG_TOP_LEVEL_BRANCHES)
4627 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4629 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4630 if (data.last_found) {
4631 SvREFCNT_dec(data.longest_fixed);
4632 SvREFCNT_dec(data.longest_float);
4633 SvREFCNT_dec(data.last_found);
4635 StructCopy(&zero_scan_data, &data, scan_data_t);
4638 StructCopy(&zero_scan_data, &data, scan_data_t);
4641 /* Dig out information for optimizations. */
4642 r->extflags = RExC_flags; /* was pm_op */
4643 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4646 SvUTF8_on(rx); /* Unicode in it? */
4647 ri->regstclass = NULL;
4648 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4649 r->intflags |= PREGf_NAUGHTY;
4650 scan = ri->program + 1; /* First BRANCH. */
4652 /* testing for BRANCH here tells us whether there is "must appear"
4653 data in the pattern. If there is then we can use it for optimisations */
4654 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4656 STRLEN longest_float_length, longest_fixed_length;
4657 struct regnode_charclass_class ch_class; /* pointed to by data */
4659 I32 last_close = 0; /* pointed to by data */
4660 regnode *first= scan;
4661 regnode *first_next= regnext(first);
4664 * Skip introductions and multiplicators >= 1
4665 * so that we can extract the 'meat' of the pattern that must
4666 * match in the large if() sequence following.
4667 * NOTE that EXACT is NOT covered here, as it is normally
4668 * picked up by the optimiser separately.
4670 * This is unfortunate as the optimiser isnt handling lookahead
4671 * properly currently.
4674 while ((OP(first) == OPEN && (sawopen = 1)) ||
4675 /* An OR of *one* alternative - should not happen now. */
4676 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4677 /* for now we can't handle lookbehind IFMATCH*/
4678 (OP(first) == IFMATCH && !first->flags) ||
4679 (OP(first) == PLUS) ||
4680 (OP(first) == MINMOD) ||
4681 /* An {n,m} with n>0 */
4682 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4683 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4686 * the only op that could be a regnode is PLUS, all the rest
4687 * will be regnode_1 or regnode_2.
4690 if (OP(first) == PLUS)
4693 first += regarglen[OP(first)];
4695 first = NEXTOPER(first);
4696 first_next= regnext(first);
4699 /* Starting-point info. */
4701 DEBUG_PEEP("first:",first,0);
4702 /* Ignore EXACT as we deal with it later. */
4703 if (PL_regkind[OP(first)] == EXACT) {
4704 if (OP(first) == EXACT)
4705 NOOP; /* Empty, get anchored substr later. */
4706 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4707 ri->regstclass = first;
4710 else if (PL_regkind[OP(first)] == TRIE &&
4711 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4714 /* this can happen only on restudy */
4715 if ( OP(first) == TRIE ) {
4716 struct regnode_1 *trieop = (struct regnode_1 *)
4717 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4718 StructCopy(first,trieop,struct regnode_1);
4719 trie_op=(regnode *)trieop;
4721 struct regnode_charclass *trieop = (struct regnode_charclass *)
4722 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4723 StructCopy(first,trieop,struct regnode_charclass);
4724 trie_op=(regnode *)trieop;
4727 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4728 ri->regstclass = trie_op;
4731 else if (REGNODE_SIMPLE(OP(first)))
4732 ri->regstclass = first;
4733 else if (PL_regkind[OP(first)] == BOUND ||
4734 PL_regkind[OP(first)] == NBOUND)
4735 ri->regstclass = first;
4736 else if (PL_regkind[OP(first)] == BOL) {
4737 r->extflags |= (OP(first) == MBOL
4739 : (OP(first) == SBOL
4742 first = NEXTOPER(first);
4745 else if (OP(first) == GPOS) {
4746 r->extflags |= RXf_ANCH_GPOS;
4747 first = NEXTOPER(first);
4750 else if ((!sawopen || !RExC_sawback) &&
4751 (OP(first) == STAR &&
4752 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4753 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4755 /* turn .* into ^.* with an implied $*=1 */
4757 (OP(NEXTOPER(first)) == REG_ANY)
4760 r->extflags |= type;
4761 r->intflags |= PREGf_IMPLICIT;
4762 first = NEXTOPER(first);
4765 if (sawplus && (!sawopen || !RExC_sawback)
4766 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4767 /* x+ must match at the 1st pos of run of x's */
4768 r->intflags |= PREGf_SKIP;
4770 /* Scan is after the zeroth branch, first is atomic matcher. */
4771 #ifdef TRIE_STUDY_OPT
4774 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4775 (IV)(first - scan + 1))
4779 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4780 (IV)(first - scan + 1))
4786 * If there's something expensive in the r.e., find the
4787 * longest literal string that must appear and make it the
4788 * regmust. Resolve ties in favor of later strings, since
4789 * the regstart check works with the beginning of the r.e.
4790 * and avoiding duplication strengthens checking. Not a
4791 * strong reason, but sufficient in the absence of others.
4792 * [Now we resolve ties in favor of the earlier string if
4793 * it happens that c_offset_min has been invalidated, since the
4794 * earlier string may buy us something the later one won't.]
4797 data.longest_fixed = newSVpvs("");
4798 data.longest_float = newSVpvs("");
4799 data.last_found = newSVpvs("");
4800 data.longest = &(data.longest_fixed);
4802 if (!ri->regstclass) {
4803 cl_init(pRExC_state, &ch_class);
4804 data.start_class = &ch_class;
4805 stclass_flag = SCF_DO_STCLASS_AND;
4806 } else /* XXXX Check for BOUND? */
4808 data.last_closep = &last_close;
4810 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4811 &data, -1, NULL, NULL,
4812 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4818 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4819 && data.last_start_min == 0 && data.last_end > 0
4820 && !RExC_seen_zerolen
4821 && !(RExC_seen & REG_SEEN_VERBARG)
4822 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4823 r->extflags |= RXf_CHECK_ALL;
4824 scan_commit(pRExC_state, &data,&minlen,0);
4825 SvREFCNT_dec(data.last_found);
4827 /* Note that code very similar to this but for anchored string
4828 follows immediately below, changes may need to be made to both.
4831 longest_float_length = CHR_SVLEN(data.longest_float);
4832 if (longest_float_length
4833 || (data.flags & SF_FL_BEFORE_EOL
4834 && (!(data.flags & SF_FL_BEFORE_MEOL)
4835 || (RExC_flags & RXf_PMf_MULTILINE))))
4839 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4840 && data.offset_fixed == data.offset_float_min
4841 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4842 goto remove_float; /* As in (a)+. */
4844 /* copy the information about the longest float from the reg_scan_data
4845 over to the program. */
4846 if (SvUTF8(data.longest_float)) {
4847 r->float_utf8 = data.longest_float;
4848 r->float_substr = NULL;
4850 r->float_substr = data.longest_float;
4851 r->float_utf8 = NULL;
4853 /* float_end_shift is how many chars that must be matched that
4854 follow this item. We calculate it ahead of time as once the
4855 lookbehind offset is added in we lose the ability to correctly
4857 ml = data.minlen_float ? *(data.minlen_float)
4858 : (I32)longest_float_length;
4859 r->float_end_shift = ml - data.offset_float_min
4860 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4861 + data.lookbehind_float;
4862 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4863 r->float_max_offset = data.offset_float_max;
4864 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4865 r->float_max_offset -= data.lookbehind_float;
4867 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4868 && (!(data.flags & SF_FL_BEFORE_MEOL)
4869 || (RExC_flags & RXf_PMf_MULTILINE)));
4870 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4874 r->float_substr = r->float_utf8 = NULL;
4875 SvREFCNT_dec(data.longest_float);
4876 longest_float_length = 0;
4879 /* Note that code very similar to this but for floating string
4880 is immediately above, changes may need to be made to both.
4883 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4884 if (longest_fixed_length
4885 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4886 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4887 || (RExC_flags & RXf_PMf_MULTILINE))))
4891 /* copy the information about the longest fixed
4892 from the reg_scan_data over to the program. */
4893 if (SvUTF8(data.longest_fixed)) {
4894 r->anchored_utf8 = data.longest_fixed;
4895 r->anchored_substr = NULL;
4897 r->anchored_substr = data.longest_fixed;
4898 r->anchored_utf8 = NULL;
4900 /* fixed_end_shift is how many chars that must be matched that
4901 follow this item. We calculate it ahead of time as once the
4902 lookbehind offset is added in we lose the ability to correctly
4904 ml = data.minlen_fixed ? *(data.minlen_fixed)
4905 : (I32)longest_fixed_length;
4906 r->anchored_end_shift = ml - data.offset_fixed
4907 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4908 + data.lookbehind_fixed;
4909 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4911 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4912 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4913 || (RExC_flags & RXf_PMf_MULTILINE)));
4914 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4917 r->anchored_substr = r->anchored_utf8 = NULL;
4918 SvREFCNT_dec(data.longest_fixed);
4919 longest_fixed_length = 0;
4922 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4923 ri->regstclass = NULL;
4924 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4926 && !(data.start_class->flags & ANYOF_EOS)
4927 && !cl_is_anything(data.start_class))
4929 const U32 n = add_data(pRExC_state, 1, "f");
4931 Newx(RExC_rxi->data->data[n], 1,
4932 struct regnode_charclass_class);
4933 StructCopy(data.start_class,
4934 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4935 struct regnode_charclass_class);
4936 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4937 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4938 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4939 regprop(r, sv, (regnode*)data.start_class);
4940 PerlIO_printf(Perl_debug_log,
4941 "synthetic stclass \"%s\".\n",
4942 SvPVX_const(sv));});
4945 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4946 if (longest_fixed_length > longest_float_length) {
4947 r->check_end_shift = r->anchored_end_shift;
4948 r->check_substr = r->anchored_substr;
4949 r->check_utf8 = r->anchored_utf8;
4950 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4951 if (r->extflags & RXf_ANCH_SINGLE)
4952 r->extflags |= RXf_NOSCAN;
4955 r->check_end_shift = r->float_end_shift;
4956 r->check_substr = r->float_substr;
4957 r->check_utf8 = r->float_utf8;
4958 r->check_offset_min = r->float_min_offset;
4959 r->check_offset_max = r->float_max_offset;
4961 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4962 This should be changed ASAP! */
4963 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4964 r->extflags |= RXf_USE_INTUIT;
4965 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4966 r->extflags |= RXf_INTUIT_TAIL;
4968 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4969 if ( (STRLEN)minlen < longest_float_length )
4970 minlen= longest_float_length;
4971 if ( (STRLEN)minlen < longest_fixed_length )
4972 minlen= longest_fixed_length;
4976 /* Several toplevels. Best we can is to set minlen. */
4978 struct regnode_charclass_class ch_class;
4981 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4983 scan = ri->program + 1;
4984 cl_init(pRExC_state, &ch_class);
4985 data.start_class = &ch_class;
4986 data.last_closep = &last_close;
4989 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4990 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4994 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4995 = r->float_substr = r->float_utf8 = NULL;
4996 if (!(data.start_class->flags & ANYOF_EOS)
4997 && !cl_is_anything(data.start_class))
4999 const U32 n = add_data(pRExC_state, 1, "f");
5001 Newx(RExC_rxi->data->data[n], 1,
5002 struct regnode_charclass_class);
5003 StructCopy(data.start_class,
5004 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5005 struct regnode_charclass_class);
5006 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5007 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5008 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5009 regprop(r, sv, (regnode*)data.start_class);
5010 PerlIO_printf(Perl_debug_log,
5011 "synthetic stclass \"%s\".\n",
5012 SvPVX_const(sv));});
5016 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5017 the "real" pattern. */
5019 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5020 (IV)minlen, (IV)r->minlen);
5022 r->minlenret = minlen;
5023 if (r->minlen < minlen)
5026 if (RExC_seen & REG_SEEN_GPOS)
5027 r->extflags |= RXf_GPOS_SEEN;
5028 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5029 r->extflags |= RXf_LOOKBEHIND_SEEN;
5030 if (RExC_seen & REG_SEEN_EVAL)
5031 r->extflags |= RXf_EVAL_SEEN;
5032 if (RExC_seen & REG_SEEN_CANY)
5033 r->extflags |= RXf_CANY_SEEN;
5034 if (RExC_seen & REG_SEEN_VERBARG)
5035 r->intflags |= PREGf_VERBARG_SEEN;
5036 if (RExC_seen & REG_SEEN_CUTGROUP)
5037 r->intflags |= PREGf_CUTGROUP_SEEN;
5038 if (RExC_paren_names)
5039 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5041 RXp_PAREN_NAMES(r) = NULL;
5043 #ifdef STUPID_PATTERN_CHECKS
5044 if (RX_PRELEN(rx) == 0)
5045 r->extflags |= RXf_NULL;
5046 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5047 /* XXX: this should happen BEFORE we compile */
5048 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5049 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5050 r->extflags |= RXf_WHITE;
5051 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5052 r->extflags |= RXf_START_ONLY;
5054 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5055 /* XXX: this should happen BEFORE we compile */
5056 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5058 regnode *first = ri->program + 1;
5060 U8 nop = OP(NEXTOPER(first));
5062 if (PL_regkind[fop] == NOTHING && nop == END)
5063 r->extflags |= RXf_NULL;
5064 else if (PL_regkind[fop] == BOL && nop == END)
5065 r->extflags |= RXf_START_ONLY;
5066 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5067 r->extflags |= RXf_WHITE;
5071 if (RExC_paren_names) {
5072 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5073 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5076 ri->name_list_idx = 0;
5078 if (RExC_recurse_count) {
5079 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5080 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5081 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5084 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5085 /* assume we don't need to swap parens around before we match */
5088 PerlIO_printf(Perl_debug_log,"Final program:\n");
5091 #ifdef RE_TRACK_PATTERN_OFFSETS
5092 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5093 const U32 len = ri->u.offsets[0];
5095 GET_RE_DEBUG_FLAGS_DECL;
5096 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5097 for (i = 1; i <= len; i++) {
5098 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5099 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5100 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5102 PerlIO_printf(Perl_debug_log, "\n");
5108 #undef RE_ENGINE_PTR
5112 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5115 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5117 PERL_UNUSED_ARG(value);
5119 if (flags & RXapif_FETCH) {
5120 return reg_named_buff_fetch(rx, key, flags);
5121 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5122 Perl_croak_no_modify(aTHX);
5124 } else if (flags & RXapif_EXISTS) {
5125 return reg_named_buff_exists(rx, key, flags)
5128 } else if (flags & RXapif_REGNAMES) {
5129 return reg_named_buff_all(rx, flags);
5130 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5131 return reg_named_buff_scalar(rx, flags);
5133 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5139 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5142 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5143 PERL_UNUSED_ARG(lastkey);
5145 if (flags & RXapif_FIRSTKEY)
5146 return reg_named_buff_firstkey(rx, flags);
5147 else if (flags & RXapif_NEXTKEY)
5148 return reg_named_buff_nextkey(rx, flags);
5150 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5156 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5159 AV *retarray = NULL;
5161 struct regexp *const rx = (struct regexp *)SvANY(r);
5163 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5165 if (flags & RXapif_ALL)
5168 if (rx && RXp_PAREN_NAMES(rx)) {
5169 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5172 SV* sv_dat=HeVAL(he_str);
5173 I32 *nums=(I32*)SvPVX(sv_dat);
5174 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5175 if ((I32)(rx->nparens) >= nums[i]
5176 && rx->offs[nums[i]].start != -1
5177 && rx->offs[nums[i]].end != -1)
5180 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5184 ret = newSVsv(&PL_sv_undef);
5187 av_push(retarray, ret);
5190 return newRV_noinc(MUTABLE_SV(retarray));
5197 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5200 struct regexp *const rx = (struct regexp *)SvANY(r);
5202 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5204 if (rx && RXp_PAREN_NAMES(rx)) {
5205 if (flags & RXapif_ALL) {
5206 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5208 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5222 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5224 struct regexp *const rx = (struct regexp *)SvANY(r);
5226 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5228 if ( rx && RXp_PAREN_NAMES(rx) ) {
5229 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5231 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5238 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5240 struct regexp *const rx = (struct regexp *)SvANY(r);
5241 GET_RE_DEBUG_FLAGS_DECL;
5243 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5245 if (rx && RXp_PAREN_NAMES(rx)) {
5246 HV *hv = RXp_PAREN_NAMES(rx);
5248 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5251 SV* sv_dat = HeVAL(temphe);
5252 I32 *nums = (I32*)SvPVX(sv_dat);
5253 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5254 if ((I32)(rx->lastparen) >= nums[i] &&
5255 rx->offs[nums[i]].start != -1 &&
5256 rx->offs[nums[i]].end != -1)
5262 if (parno || flags & RXapif_ALL) {
5263 return newSVhek(HeKEY_hek(temphe));
5271 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5276 struct regexp *const rx = (struct regexp *)SvANY(r);
5278 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5280 if (rx && RXp_PAREN_NAMES(rx)) {
5281 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5282 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5283 } else if (flags & RXapif_ONE) {
5284 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5285 av = MUTABLE_AV(SvRV(ret));
5286 length = av_len(av);
5288 return newSViv(length + 1);
5290 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5294 return &PL_sv_undef;
5298 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5300 struct regexp *const rx = (struct regexp *)SvANY(r);
5303 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5305 if (rx && RXp_PAREN_NAMES(rx)) {
5306 HV *hv= RXp_PAREN_NAMES(rx);
5308 (void)hv_iterinit(hv);
5309 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5312 SV* sv_dat = HeVAL(temphe);
5313 I32 *nums = (I32*)SvPVX(sv_dat);
5314 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5315 if ((I32)(rx->lastparen) >= nums[i] &&
5316 rx->offs[nums[i]].start != -1 &&
5317 rx->offs[nums[i]].end != -1)
5323 if (parno || flags & RXapif_ALL) {
5324 av_push(av, newSVhek(HeKEY_hek(temphe)));
5329 return newRV_noinc(MUTABLE_SV(av));
5333 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5336 struct regexp *const rx = (struct regexp *)SvANY(r);
5341 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5344 sv_setsv(sv,&PL_sv_undef);
5348 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5350 i = rx->offs[0].start;
5354 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5356 s = rx->subbeg + rx->offs[0].end;
5357 i = rx->sublen - rx->offs[0].end;
5360 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5361 (s1 = rx->offs[paren].start) != -1 &&
5362 (t1 = rx->offs[paren].end) != -1)
5366 s = rx->subbeg + s1;
5368 sv_setsv(sv,&PL_sv_undef);
5371 assert(rx->sublen >= (s - rx->subbeg) + i );
5373 const int oldtainted = PL_tainted;
5375 sv_setpvn(sv, s, i);
5376 PL_tainted = oldtainted;
5377 if ( (rx->extflags & RXf_CANY_SEEN)
5378 ? (RXp_MATCH_UTF8(rx)
5379 && (!i || is_utf8_string((U8*)s, i)))
5380 : (RXp_MATCH_UTF8(rx)) )
5387 if (RXp_MATCH_TAINTED(rx)) {
5388 if (SvTYPE(sv) >= SVt_PVMG) {
5389 MAGIC* const mg = SvMAGIC(sv);
5392 SvMAGIC_set(sv, mg->mg_moremagic);
5394 if ((mgt = SvMAGIC(sv))) {
5395 mg->mg_moremagic = mgt;
5396 SvMAGIC_set(sv, mg);
5406 sv_setsv(sv,&PL_sv_undef);
5412 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5413 SV const * const value)
5415 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5417 PERL_UNUSED_ARG(rx);
5418 PERL_UNUSED_ARG(paren);
5419 PERL_UNUSED_ARG(value);
5422 Perl_croak_no_modify(aTHX);
5426 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5429 struct regexp *const rx = (struct regexp *)SvANY(r);
5433 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5435 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5437 /* $` / ${^PREMATCH} */
5438 case RX_BUFF_IDX_PREMATCH:
5439 if (rx->offs[0].start != -1) {
5440 i = rx->offs[0].start;
5448 /* $' / ${^POSTMATCH} */
5449 case RX_BUFF_IDX_POSTMATCH:
5450 if (rx->offs[0].end != -1) {
5451 i = rx->sublen - rx->offs[0].end;
5453 s1 = rx->offs[0].end;
5459 /* $& / ${^MATCH}, $1, $2, ... */
5461 if (paren <= (I32)rx->nparens &&
5462 (s1 = rx->offs[paren].start) != -1 &&
5463 (t1 = rx->offs[paren].end) != -1)
5468 if (ckWARN(WARN_UNINITIALIZED))
5469 report_uninit((const SV *)sv);
5474 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5475 const char * const s = rx->subbeg + s1;
5480 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5487 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5489 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5490 PERL_UNUSED_ARG(rx);
5494 return newSVpvs("Regexp");
5497 /* Scans the name of a named buffer from the pattern.
5498 * If flags is REG_RSN_RETURN_NULL returns null.
5499 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5500 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5501 * to the parsed name as looked up in the RExC_paren_names hash.
5502 * If there is an error throws a vFAIL().. type exception.
5505 #define REG_RSN_RETURN_NULL 0
5506 #define REG_RSN_RETURN_NAME 1
5507 #define REG_RSN_RETURN_DATA 2
5510 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5512 char *name_start = RExC_parse;
5514 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5516 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5517 /* skip IDFIRST by using do...while */
5520 RExC_parse += UTF8SKIP(RExC_parse);
5521 } while (isALNUM_utf8((U8*)RExC_parse));
5525 } while (isALNUM(*RExC_parse));
5530 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5531 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5532 if ( flags == REG_RSN_RETURN_NAME)
5534 else if (flags==REG_RSN_RETURN_DATA) {
5537 if ( ! sv_name ) /* should not happen*/
5538 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5539 if (RExC_paren_names)
5540 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5542 sv_dat = HeVAL(he_str);
5544 vFAIL("Reference to nonexistent named group");
5548 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5555 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5556 int rem=(int)(RExC_end - RExC_parse); \
5565 if (RExC_lastparse!=RExC_parse) \
5566 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5569 iscut ? "..." : "<" \
5572 PerlIO_printf(Perl_debug_log,"%16s",""); \
5575 num = RExC_size + 1; \
5577 num=REG_NODE_NUM(RExC_emit); \
5578 if (RExC_lastnum!=num) \
5579 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5581 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5582 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5583 (int)((depth*2)), "", \
5587 RExC_lastparse=RExC_parse; \
5592 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5593 DEBUG_PARSE_MSG((funcname)); \
5594 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5596 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5597 DEBUG_PARSE_MSG((funcname)); \
5598 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5601 - reg - regular expression, i.e. main body or parenthesized thing
5603 * Caller must absorb opening parenthesis.
5605 * Combining parenthesis handling with the base level of regular expression
5606 * is a trifle forced, but the need to tie the tails of the branches to what
5607 * follows makes it hard to avoid.
5609 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5611 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5613 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5617 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5618 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5621 register regnode *ret; /* Will be the head of the group. */
5622 register regnode *br;
5623 register regnode *lastbr;
5624 register regnode *ender = NULL;
5625 register I32 parno = 0;
5627 U32 oregflags = RExC_flags;
5628 bool have_branch = 0;
5630 I32 freeze_paren = 0;
5631 I32 after_freeze = 0;
5633 /* for (?g), (?gc), and (?o) warnings; warning
5634 about (?c) will warn about (?g) -- japhy */
5636 #define WASTED_O 0x01
5637 #define WASTED_G 0x02
5638 #define WASTED_C 0x04
5639 #define WASTED_GC (0x02|0x04)
5640 I32 wastedflags = 0x00;
5642 char * parse_start = RExC_parse; /* MJD */
5643 char * const oregcomp_parse = RExC_parse;
5645 GET_RE_DEBUG_FLAGS_DECL;
5647 PERL_ARGS_ASSERT_REG;
5648 DEBUG_PARSE("reg ");
5650 *flagp = 0; /* Tentatively. */
5653 /* Make an OPEN node, if parenthesized. */
5655 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5656 char *start_verb = RExC_parse;
5657 STRLEN verb_len = 0;
5658 char *start_arg = NULL;
5659 unsigned char op = 0;
5661 int internal_argval = 0; /* internal_argval is only useful if !argok */
5662 while ( *RExC_parse && *RExC_parse != ')' ) {
5663 if ( *RExC_parse == ':' ) {
5664 start_arg = RExC_parse + 1;
5670 verb_len = RExC_parse - start_verb;
5673 while ( *RExC_parse && *RExC_parse != ')' )
5675 if ( *RExC_parse != ')' )
5676 vFAIL("Unterminated verb pattern argument");
5677 if ( RExC_parse == start_arg )
5680 if ( *RExC_parse != ')' )
5681 vFAIL("Unterminated verb pattern");
5684 switch ( *start_verb ) {
5685 case 'A': /* (*ACCEPT) */
5686 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5688 internal_argval = RExC_nestroot;
5691 case 'C': /* (*COMMIT) */
5692 if ( memEQs(start_verb,verb_len,"COMMIT") )
5695 case 'F': /* (*FAIL) */
5696 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5701 case ':': /* (*:NAME) */
5702 case 'M': /* (*MARK:NAME) */
5703 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5708 case 'P': /* (*PRUNE) */
5709 if ( memEQs(start_verb,verb_len,"PRUNE") )
5712 case 'S': /* (*SKIP) */
5713 if ( memEQs(start_verb,verb_len,"SKIP") )
5716 case 'T': /* (*THEN) */
5717 /* [19:06] <TimToady> :: is then */
5718 if ( memEQs(start_verb,verb_len,"THEN") ) {
5720 RExC_seen |= REG_SEEN_CUTGROUP;
5726 vFAIL3("Unknown verb pattern '%.*s'",
5727 verb_len, start_verb);
5730 if ( start_arg && internal_argval ) {
5731 vFAIL3("Verb pattern '%.*s' may not have an argument",
5732 verb_len, start_verb);
5733 } else if ( argok < 0 && !start_arg ) {
5734 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5735 verb_len, start_verb);
5737 ret = reganode(pRExC_state, op, internal_argval);
5738 if ( ! internal_argval && ! SIZE_ONLY ) {
5740 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5741 ARG(ret) = add_data( pRExC_state, 1, "S" );
5742 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5749 if (!internal_argval)
5750 RExC_seen |= REG_SEEN_VERBARG;
5751 } else if ( start_arg ) {
5752 vFAIL3("Verb pattern '%.*s' may not have an argument",
5753 verb_len, start_verb);
5755 ret = reg_node(pRExC_state, op);
5757 nextchar(pRExC_state);
5760 if (*RExC_parse == '?') { /* (?...) */
5761 bool is_logical = 0;
5762 const char * const seqstart = RExC_parse;
5763 bool has_use_defaults = FALSE;
5766 paren = *RExC_parse++;
5767 ret = NULL; /* For look-ahead/behind. */
5770 case 'P': /* (?P...) variants for those used to PCRE/Python */
5771 paren = *RExC_parse++;
5772 if ( paren == '<') /* (?P<...>) named capture */
5774 else if (paren == '>') { /* (?P>name) named recursion */
5775 goto named_recursion;
5777 else if (paren == '=') { /* (?P=...) named backref */
5778 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5779 you change this make sure you change that */
5780 char* name_start = RExC_parse;
5782 SV *sv_dat = reg_scan_name(pRExC_state,
5783 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5784 if (RExC_parse == name_start || *RExC_parse != ')')
5785 vFAIL2("Sequence %.3s... not terminated",parse_start);
5788 num = add_data( pRExC_state, 1, "S" );
5789 RExC_rxi->data->data[num]=(void*)sv_dat;
5790 SvREFCNT_inc_simple_void(sv_dat);
5793 ret = reganode(pRExC_state,
5794 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5798 Set_Node_Offset(ret, parse_start+1);
5799 Set_Node_Cur_Length(ret); /* MJD */
5801 nextchar(pRExC_state);
5805 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5807 case '<': /* (?<...) */
5808 if (*RExC_parse == '!')
5810 else if (*RExC_parse != '=')
5816 case '\'': /* (?'...') */
5817 name_start= RExC_parse;
5818 svname = reg_scan_name(pRExC_state,
5819 SIZE_ONLY ? /* reverse test from the others */
5820 REG_RSN_RETURN_NAME :
5821 REG_RSN_RETURN_NULL);
5822 if (RExC_parse == name_start) {
5824 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5827 if (*RExC_parse != paren)
5828 vFAIL2("Sequence (?%c... not terminated",
5829 paren=='>' ? '<' : paren);
5833 if (!svname) /* shouldnt happen */
5835 "panic: reg_scan_name returned NULL");
5836 if (!RExC_paren_names) {
5837 RExC_paren_names= newHV();
5838 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5840 RExC_paren_name_list= newAV();
5841 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5844 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5846 sv_dat = HeVAL(he_str);
5848 /* croak baby croak */
5850 "panic: paren_name hash element allocation failed");
5851 } else if ( SvPOK(sv_dat) ) {
5852 /* (?|...) can mean we have dupes so scan to check
5853 its already been stored. Maybe a flag indicating
5854 we are inside such a construct would be useful,
5855 but the arrays are likely to be quite small, so
5856 for now we punt -- dmq */
5857 IV count = SvIV(sv_dat);
5858 I32 *pv = (I32*)SvPVX(sv_dat);
5860 for ( i = 0 ; i < count ; i++ ) {
5861 if ( pv[i] == RExC_npar ) {
5867 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5868 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5869 pv[count] = RExC_npar;
5870 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5873 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5874 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5876 SvIV_set(sv_dat, 1);
5879 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5880 SvREFCNT_dec(svname);
5883 /*sv_dump(sv_dat);*/
5885 nextchar(pRExC_state);
5887 goto capturing_parens;
5889 RExC_seen |= REG_SEEN_LOOKBEHIND;
5891 case '=': /* (?=...) */
5892 RExC_seen_zerolen++;
5894 case '!': /* (?!...) */
5895 RExC_seen_zerolen++;
5896 if (*RExC_parse == ')') {
5897 ret=reg_node(pRExC_state, OPFAIL);
5898 nextchar(pRExC_state);
5902 case '|': /* (?|...) */
5903 /* branch reset, behave like a (?:...) except that
5904 buffers in alternations share the same numbers */
5906 after_freeze = freeze_paren = RExC_npar;
5908 case ':': /* (?:...) */
5909 case '>': /* (?>...) */
5911 case '$': /* (?$...) */
5912 case '@': /* (?@...) */
5913 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5915 case '#': /* (?#...) */
5916 while (*RExC_parse && *RExC_parse != ')')
5918 if (*RExC_parse != ')')
5919 FAIL("Sequence (?#... not terminated");
5920 nextchar(pRExC_state);
5923 case '0' : /* (?0) */
5924 case 'R' : /* (?R) */
5925 if (*RExC_parse != ')')
5926 FAIL("Sequence (?R) not terminated");
5927 ret = reg_node(pRExC_state, GOSTART);
5928 *flagp |= POSTPONED;
5929 nextchar(pRExC_state);
5932 { /* named and numeric backreferences */
5934 case '&': /* (?&NAME) */
5935 parse_start = RExC_parse - 1;
5938 SV *sv_dat = reg_scan_name(pRExC_state,
5939 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5940 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5942 goto gen_recurse_regop;
5945 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5947 vFAIL("Illegal pattern");
5949 goto parse_recursion;
5951 case '-': /* (?-1) */
5952 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5953 RExC_parse--; /* rewind to let it be handled later */
5957 case '1': case '2': case '3': case '4': /* (?1) */
5958 case '5': case '6': case '7': case '8': case '9':
5961 num = atoi(RExC_parse);
5962 parse_start = RExC_parse - 1; /* MJD */
5963 if (*RExC_parse == '-')
5965 while (isDIGIT(*RExC_parse))
5967 if (*RExC_parse!=')')
5968 vFAIL("Expecting close bracket");
5971 if ( paren == '-' ) {
5973 Diagram of capture buffer numbering.
5974 Top line is the normal capture buffer numbers
5975 Botton line is the negative indexing as from
5979 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5983 num = RExC_npar + num;
5986 vFAIL("Reference to nonexistent group");
5988 } else if ( paren == '+' ) {
5989 num = RExC_npar + num - 1;
5992 ret = reganode(pRExC_state, GOSUB, num);
5994 if (num > (I32)RExC_rx->nparens) {
5996 vFAIL("Reference to nonexistent group");
5998 ARG2L_SET( ret, RExC_recurse_count++);
6000 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6001 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6005 RExC_seen |= REG_SEEN_RECURSE;
6006 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6007 Set_Node_Offset(ret, parse_start); /* MJD */
6009 *flagp |= POSTPONED;
6010 nextchar(pRExC_state);
6012 } /* named and numeric backreferences */
6015 case '?': /* (??...) */
6017 if (*RExC_parse != '{') {
6019 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6022 *flagp |= POSTPONED;
6023 paren = *RExC_parse++;
6025 case '{': /* (?{...}) */
6030 char *s = RExC_parse;
6032 RExC_seen_zerolen++;
6033 RExC_seen |= REG_SEEN_EVAL;
6034 while (count && (c = *RExC_parse)) {
6045 if (*RExC_parse != ')') {
6047 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6051 OP_4tree *sop, *rop;
6052 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6055 Perl_save_re_context(aTHX);
6056 rop = sv_compile_2op(sv, &sop, "re", &pad);
6057 sop->op_private |= OPpREFCOUNTED;
6058 /* re_dup will OpREFCNT_inc */
6059 OpREFCNT_set(sop, 1);
6062 n = add_data(pRExC_state, 3, "nop");
6063 RExC_rxi->data->data[n] = (void*)rop;
6064 RExC_rxi->data->data[n+1] = (void*)sop;
6065 RExC_rxi->data->data[n+2] = (void*)pad;
6068 else { /* First pass */
6069 if (PL_reginterp_cnt < ++RExC_seen_evals
6071 /* No compiled RE interpolated, has runtime
6072 components ===> unsafe. */
6073 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6074 if (PL_tainting && PL_tainted)
6075 FAIL("Eval-group in insecure regular expression");
6076 #if PERL_VERSION > 8
6077 if (IN_PERL_COMPILETIME)
6082 nextchar(pRExC_state);
6084 ret = reg_node(pRExC_state, LOGICAL);
6087 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6088 /* deal with the length of this later - MJD */
6091 ret = reganode(pRExC_state, EVAL, n);
6092 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6093 Set_Node_Offset(ret, parse_start);
6096 case '(': /* (?(?{...})...) and (?(?=...)...) */
6099 if (RExC_parse[0] == '?') { /* (?(?...)) */
6100 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6101 || RExC_parse[1] == '<'
6102 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6105 ret = reg_node(pRExC_state, LOGICAL);
6108 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6112 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6113 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6115 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6116 char *name_start= RExC_parse++;
6118 SV *sv_dat=reg_scan_name(pRExC_state,
6119 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6120 if (RExC_parse == name_start || *RExC_parse != ch)
6121 vFAIL2("Sequence (?(%c... not terminated",
6122 (ch == '>' ? '<' : ch));
6125 num = add_data( pRExC_state, 1, "S" );
6126 RExC_rxi->data->data[num]=(void*)sv_dat;
6127 SvREFCNT_inc_simple_void(sv_dat);
6129 ret = reganode(pRExC_state,NGROUPP,num);
6130 goto insert_if_check_paren;
6132 else if (RExC_parse[0] == 'D' &&
6133 RExC_parse[1] == 'E' &&
6134 RExC_parse[2] == 'F' &&
6135 RExC_parse[3] == 'I' &&
6136 RExC_parse[4] == 'N' &&
6137 RExC_parse[5] == 'E')
6139 ret = reganode(pRExC_state,DEFINEP,0);
6142 goto insert_if_check_paren;
6144 else if (RExC_parse[0] == 'R') {
6147 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6148 parno = atoi(RExC_parse++);
6149 while (isDIGIT(*RExC_parse))
6151 } else if (RExC_parse[0] == '&') {
6154 sv_dat = reg_scan_name(pRExC_state,
6155 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6156 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6158 ret = reganode(pRExC_state,INSUBP,parno);
6159 goto insert_if_check_paren;
6161 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6164 parno = atoi(RExC_parse++);
6166 while (isDIGIT(*RExC_parse))
6168 ret = reganode(pRExC_state, GROUPP, parno);
6170 insert_if_check_paren:
6171 if ((c = *nextchar(pRExC_state)) != ')')
6172 vFAIL("Switch condition not recognized");
6174 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6175 br = regbranch(pRExC_state, &flags, 1,depth+1);
6177 br = reganode(pRExC_state, LONGJMP, 0);
6179 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6180 c = *nextchar(pRExC_state);
6185 vFAIL("(?(DEFINE)....) does not allow branches");
6186 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6187 regbranch(pRExC_state, &flags, 1,depth+1);
6188 REGTAIL(pRExC_state, ret, lastbr);
6191 c = *nextchar(pRExC_state);
6196 vFAIL("Switch (?(condition)... contains too many branches");
6197 ender = reg_node(pRExC_state, TAIL);
6198 REGTAIL(pRExC_state, br, ender);
6200 REGTAIL(pRExC_state, lastbr, ender);
6201 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6204 REGTAIL(pRExC_state, ret, ender);
6205 RExC_size++; /* XXX WHY do we need this?!!
6206 For large programs it seems to be required
6207 but I can't figure out why. -- dmq*/
6211 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6215 RExC_parse--; /* for vFAIL to print correctly */
6216 vFAIL("Sequence (? incomplete");
6218 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6220 has_use_defaults = TRUE;
6221 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6222 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6226 parse_flags: /* (?i) */
6228 U32 posflags = 0, negflags = 0;
6229 U32 *flagsp = &posflags;
6230 bool has_charset_modifier = 0;
6232 while (*RExC_parse) {
6233 /* && strchr("iogcmsx", *RExC_parse) */
6234 /* (?g), (?gc) and (?o) are useless here
6235 and must be globally applied -- japhy */
6236 switch (*RExC_parse) {
6237 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6238 case LOCALE_PAT_MOD:
6239 if (has_charset_modifier || flagsp == &negflags) {
6240 goto fail_modifiers;
6242 *flagsp &= ~RXf_PMf_UNICODE;
6243 *flagsp |= RXf_PMf_LOCALE;
6244 has_charset_modifier = 1;
6246 case UNICODE_PAT_MOD:
6247 if (has_charset_modifier || flagsp == &negflags) {
6248 goto fail_modifiers;
6250 *flagsp &= ~RXf_PMf_LOCALE;
6251 *flagsp |= RXf_PMf_UNICODE;
6252 has_charset_modifier = 1;
6255 if (has_use_defaults
6256 || has_charset_modifier
6257 || flagsp == &negflags)
6259 goto fail_modifiers;
6261 *flagsp &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6262 has_charset_modifier = 1;
6264 case ONCE_PAT_MOD: /* 'o' */
6265 case GLOBAL_PAT_MOD: /* 'g' */
6266 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6267 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6268 if (! (wastedflags & wflagbit) ) {
6269 wastedflags |= wflagbit;
6272 "Useless (%s%c) - %suse /%c modifier",
6273 flagsp == &negflags ? "?-" : "?",
6275 flagsp == &negflags ? "don't " : "",
6282 case CONTINUE_PAT_MOD: /* 'c' */
6283 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6284 if (! (wastedflags & WASTED_C) ) {
6285 wastedflags |= WASTED_GC;
6288 "Useless (%sc) - %suse /gc modifier",
6289 flagsp == &negflags ? "?-" : "?",
6290 flagsp == &negflags ? "don't " : ""
6295 case KEEPCOPY_PAT_MOD: /* 'p' */
6296 if (flagsp == &negflags) {
6298 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6300 *flagsp |= RXf_PMf_KEEPCOPY;
6304 /* A flag is a default iff it is following a minus, so
6305 * if there is a minus, it means will be trying to
6306 * re-specify a default which is an error */
6307 if (has_use_defaults || flagsp == &negflags) {
6310 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6314 wastedflags = 0; /* reset so (?g-c) warns twice */
6320 RExC_flags |= posflags;
6321 RExC_flags &= ~negflags;
6323 oregflags |= posflags;
6324 oregflags &= ~negflags;
6326 nextchar(pRExC_state);
6337 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6342 }} /* one for the default block, one for the switch */
6349 ret = reganode(pRExC_state, OPEN, parno);
6352 RExC_nestroot = parno;
6353 if (RExC_seen & REG_SEEN_RECURSE
6354 && !RExC_open_parens[parno-1])
6356 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6357 "Setting open paren #%"IVdf" to %d\n",
6358 (IV)parno, REG_NODE_NUM(ret)));
6359 RExC_open_parens[parno-1]= ret;
6362 Set_Node_Length(ret, 1); /* MJD */
6363 Set_Node_Offset(ret, RExC_parse); /* MJD */
6371 /* Pick up the branches, linking them together. */
6372 parse_start = RExC_parse; /* MJD */
6373 br = regbranch(pRExC_state, &flags, 1,depth+1);
6376 if (RExC_npar > after_freeze)
6377 after_freeze = RExC_npar;
6378 RExC_npar = freeze_paren;
6381 /* branch_len = (paren != 0); */
6385 if (*RExC_parse == '|') {
6386 if (!SIZE_ONLY && RExC_extralen) {
6387 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6390 reginsert(pRExC_state, BRANCH, br, depth+1);
6391 Set_Node_Length(br, paren != 0);
6392 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6396 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6398 else if (paren == ':') {
6399 *flagp |= flags&SIMPLE;
6401 if (is_open) { /* Starts with OPEN. */
6402 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6404 else if (paren != '?') /* Not Conditional */
6406 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6408 while (*RExC_parse == '|') {
6409 if (!SIZE_ONLY && RExC_extralen) {
6410 ender = reganode(pRExC_state, LONGJMP,0);
6411 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6414 RExC_extralen += 2; /* Account for LONGJMP. */
6415 nextchar(pRExC_state);
6417 if (RExC_npar > after_freeze)
6418 after_freeze = RExC_npar;
6419 RExC_npar = freeze_paren;
6421 br = regbranch(pRExC_state, &flags, 0, depth+1);
6425 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6427 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6430 if (have_branch || paren != ':') {
6431 /* Make a closing node, and hook it on the end. */
6434 ender = reg_node(pRExC_state, TAIL);
6437 ender = reganode(pRExC_state, CLOSE, parno);
6438 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6439 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6440 "Setting close paren #%"IVdf" to %d\n",
6441 (IV)parno, REG_NODE_NUM(ender)));
6442 RExC_close_parens[parno-1]= ender;
6443 if (RExC_nestroot == parno)
6446 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6447 Set_Node_Length(ender,1); /* MJD */
6453 *flagp &= ~HASWIDTH;
6456 ender = reg_node(pRExC_state, SUCCEED);
6459 ender = reg_node(pRExC_state, END);
6461 assert(!RExC_opend); /* there can only be one! */
6466 REGTAIL(pRExC_state, lastbr, ender);
6468 if (have_branch && !SIZE_ONLY) {
6470 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6472 /* Hook the tails of the branches to the closing node. */
6473 for (br = ret; br; br = regnext(br)) {
6474 const U8 op = PL_regkind[OP(br)];
6476 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6478 else if (op == BRANCHJ) {
6479 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6487 static const char parens[] = "=!<,>";
6489 if (paren && (p = strchr(parens, paren))) {
6490 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6491 int flag = (p - parens) > 1;
6494 node = SUSPEND, flag = 0;
6495 reginsert(pRExC_state, node,ret, depth+1);
6496 Set_Node_Cur_Length(ret);
6497 Set_Node_Offset(ret, parse_start + 1);
6499 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6503 /* Check for proper termination. */
6505 RExC_flags = oregflags;
6506 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6507 RExC_parse = oregcomp_parse;
6508 vFAIL("Unmatched (");
6511 else if (!paren && RExC_parse < RExC_end) {
6512 if (*RExC_parse == ')') {
6514 vFAIL("Unmatched )");
6517 FAIL("Junk on end of regexp"); /* "Can't happen". */
6521 RExC_npar = after_freeze;
6526 - regbranch - one alternative of an | operator
6528 * Implements the concatenation operator.
6531 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6534 register regnode *ret;
6535 register regnode *chain = NULL;
6536 register regnode *latest;
6537 I32 flags = 0, c = 0;
6538 GET_RE_DEBUG_FLAGS_DECL;
6540 PERL_ARGS_ASSERT_REGBRANCH;
6542 DEBUG_PARSE("brnc");
6547 if (!SIZE_ONLY && RExC_extralen)
6548 ret = reganode(pRExC_state, BRANCHJ,0);
6550 ret = reg_node(pRExC_state, BRANCH);
6551 Set_Node_Length(ret, 1);
6555 if (!first && SIZE_ONLY)
6556 RExC_extralen += 1; /* BRANCHJ */
6558 *flagp = WORST; /* Tentatively. */
6561 nextchar(pRExC_state);
6562 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6564 latest = regpiece(pRExC_state, &flags,depth+1);
6565 if (latest == NULL) {
6566 if (flags & TRYAGAIN)
6570 else if (ret == NULL)
6572 *flagp |= flags&(HASWIDTH|POSTPONED);
6573 if (chain == NULL) /* First piece. */
6574 *flagp |= flags&SPSTART;
6577 REGTAIL(pRExC_state, chain, latest);
6582 if (chain == NULL) { /* Loop ran zero times. */
6583 chain = reg_node(pRExC_state, NOTHING);
6588 *flagp |= flags&SIMPLE;
6595 - regpiece - something followed by possible [*+?]
6597 * Note that the branching code sequences used for ? and the general cases
6598 * of * and + are somewhat optimized: they use the same NOTHING node as
6599 * both the endmarker for their branch list and the body of the last branch.
6600 * It might seem that this node could be dispensed with entirely, but the
6601 * endmarker role is not redundant.
6604 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6607 register regnode *ret;
6609 register char *next;
6611 const char * const origparse = RExC_parse;
6613 I32 max = REG_INFTY;
6615 const char *maxpos = NULL;
6616 GET_RE_DEBUG_FLAGS_DECL;
6618 PERL_ARGS_ASSERT_REGPIECE;
6620 DEBUG_PARSE("piec");
6622 ret = regatom(pRExC_state, &flags,depth+1);
6624 if (flags & TRYAGAIN)
6631 if (op == '{' && regcurly(RExC_parse)) {
6633 parse_start = RExC_parse; /* MJD */
6634 next = RExC_parse + 1;
6635 while (isDIGIT(*next) || *next == ',') {
6644 if (*next == '}') { /* got one */
6648 min = atoi(RExC_parse);
6652 maxpos = RExC_parse;
6654 if (!max && *maxpos != '0')
6655 max = REG_INFTY; /* meaning "infinity" */
6656 else if (max >= REG_INFTY)
6657 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6659 nextchar(pRExC_state);
6662 if ((flags&SIMPLE)) {
6663 RExC_naughty += 2 + RExC_naughty / 2;
6664 reginsert(pRExC_state, CURLY, ret, depth+1);
6665 Set_Node_Offset(ret, parse_start+1); /* MJD */
6666 Set_Node_Cur_Length(ret);
6669 regnode * const w = reg_node(pRExC_state, WHILEM);
6672 REGTAIL(pRExC_state, ret, w);
6673 if (!SIZE_ONLY && RExC_extralen) {
6674 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6675 reginsert(pRExC_state, NOTHING,ret, depth+1);
6676 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6678 reginsert(pRExC_state, CURLYX,ret, depth+1);
6680 Set_Node_Offset(ret, parse_start+1);
6681 Set_Node_Length(ret,
6682 op == '{' ? (RExC_parse - parse_start) : 1);
6684 if (!SIZE_ONLY && RExC_extralen)
6685 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6686 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6688 RExC_whilem_seen++, RExC_extralen += 3;
6689 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6698 vFAIL("Can't do {n,m} with n > m");
6700 ARG1_SET(ret, (U16)min);
6701 ARG2_SET(ret, (U16)max);
6713 #if 0 /* Now runtime fix should be reliable. */
6715 /* if this is reinstated, don't forget to put this back into perldiag:
6717 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6719 (F) The part of the regexp subject to either the * or + quantifier
6720 could match an empty string. The {#} shows in the regular
6721 expression about where the problem was discovered.
6725 if (!(flags&HASWIDTH) && op != '?')
6726 vFAIL("Regexp *+ operand could be empty");
6729 parse_start = RExC_parse;
6730 nextchar(pRExC_state);
6732 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6734 if (op == '*' && (flags&SIMPLE)) {
6735 reginsert(pRExC_state, STAR, ret, depth+1);
6739 else if (op == '*') {
6743 else if (op == '+' && (flags&SIMPLE)) {
6744 reginsert(pRExC_state, PLUS, ret, depth+1);
6748 else if (op == '+') {
6752 else if (op == '?') {
6757 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6758 ckWARN3reg(RExC_parse,
6759 "%.*s matches null string many times",
6760 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6764 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6765 nextchar(pRExC_state);
6766 reginsert(pRExC_state, MINMOD, ret, depth+1);
6767 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6769 #ifndef REG_ALLOW_MINMOD_SUSPEND
6772 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6774 nextchar(pRExC_state);
6775 ender = reg_node(pRExC_state, SUCCEED);
6776 REGTAIL(pRExC_state, ret, ender);
6777 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6779 ender = reg_node(pRExC_state, TAIL);
6780 REGTAIL(pRExC_state, ret, ender);
6784 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6786 vFAIL("Nested quantifiers");
6793 /* reg_namedseq(pRExC_state,UVp)
6795 This is expected to be called by a parser routine that has
6796 recognized '\N' and needs to handle the rest. RExC_parse is
6797 expected to point at the first char following the N at the time
6800 The \N may be inside (indicated by valuep not being NULL) or outside a
6803 \N may begin either a named sequence, or if outside a character class, mean
6804 to match a non-newline. For non single-quoted regexes, the tokenizer has
6805 attempted to decide which, and in the case of a named sequence converted it
6806 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6807 where c1... are the characters in the sequence. For single-quoted regexes,
6808 the tokenizer passes the \N sequence through unchanged; this code will not
6809 attempt to determine this nor expand those. The net effect is that if the
6810 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6811 signals that this \N occurrence means to match a non-newline.
6813 Only the \N{U+...} form should occur in a character class, for the same
6814 reason that '.' inside a character class means to just match a period: it
6815 just doesn't make sense.
6817 If valuep is non-null then it is assumed that we are parsing inside
6818 of a charclass definition and the first codepoint in the resolved
6819 string is returned via *valuep and the routine will return NULL.
6820 In this mode if a multichar string is returned from the charnames
6821 handler, a warning will be issued, and only the first char in the
6822 sequence will be examined. If the string returned is zero length
6823 then the value of *valuep is undefined and NON-NULL will
6824 be returned to indicate failure. (This will NOT be a valid pointer
6827 If valuep is null then it is assumed that we are parsing normal text and a
6828 new EXACT node is inserted into the program containing the resolved string,
6829 and a pointer to the new node is returned. But if the string is zero length
6830 a NOTHING node is emitted instead.
6832 On success RExC_parse is set to the char following the endbrace.
6833 Parsing failures will generate a fatal error via vFAIL(...)
6836 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6838 char * endbrace; /* '}' following the name */
6839 regnode *ret = NULL;
6841 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6845 GET_RE_DEBUG_FLAGS_DECL;
6847 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6851 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6852 * modifier. The other meaning does not */
6853 p = (RExC_flags & RXf_PMf_EXTENDED)
6854 ? regwhite( pRExC_state, RExC_parse )
6857 /* Disambiguate between \N meaning a named character versus \N meaning
6858 * [^\n]. The former is assumed when it can't be the latter. */
6859 if (*p != '{' || regcurly(p)) {
6862 /* no bare \N in a charclass */
6863 vFAIL("\\N in a character class must be a named character: \\N{...}");
6865 nextchar(pRExC_state);
6866 ret = reg_node(pRExC_state, REG_ANY);
6867 *flagp |= HASWIDTH|SIMPLE;
6870 Set_Node_Length(ret, 1); /* MJD */
6874 /* Here, we have decided it should be a named sequence */
6876 /* The test above made sure that the next real character is a '{', but
6877 * under the /x modifier, it could be separated by space (or a comment and
6878 * \n) and this is not allowed (for consistency with \x{...} and the
6879 * tokenizer handling of \N{NAME}). */
6880 if (*RExC_parse != '{') {
6881 vFAIL("Missing braces on \\N{}");
6884 RExC_parse++; /* Skip past the '{' */
6886 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6887 || ! (endbrace == RExC_parse /* nothing between the {} */
6888 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6889 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6891 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6892 vFAIL("\\N{NAME} must be resolved by the lexer");
6895 if (endbrace == RExC_parse) { /* empty: \N{} */
6897 RExC_parse = endbrace + 1;
6898 return reg_node(pRExC_state,NOTHING);
6902 ckWARNreg(RExC_parse,
6903 "Ignoring zero length \\N{} in character class"
6905 RExC_parse = endbrace + 1;
6908 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6911 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
6912 RExC_parse += 2; /* Skip past the 'U+' */
6914 if (valuep) { /* In a bracketed char class */
6915 /* We only pay attention to the first char of
6916 multichar strings being returned. I kinda wonder
6917 if this makes sense as it does change the behaviour
6918 from earlier versions, OTOH that behaviour was broken
6919 as well. XXX Solution is to recharacterize as
6920 [rest-of-class]|multi1|multi2... */
6922 STRLEN length_of_hex;
6923 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6924 | PERL_SCAN_DISALLOW_PREFIX
6925 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6927 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6928 if (endchar < endbrace) {
6929 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6932 length_of_hex = (STRLEN)(endchar - RExC_parse);
6933 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6935 /* The tokenizer should have guaranteed validity, but it's possible to
6936 * bypass it by using single quoting, so check */
6937 if (length_of_hex == 0
6938 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6940 RExC_parse += length_of_hex; /* Includes all the valid */
6941 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6942 ? UTF8SKIP(RExC_parse)
6944 /* Guard against malformed utf8 */
6945 if (RExC_parse >= endchar) RExC_parse = endchar;
6946 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6949 RExC_parse = endbrace + 1;
6950 if (endchar == endbrace) return NULL;
6952 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6954 else { /* Not a char class */
6955 char *s; /* String to put in generated EXACT node */
6956 STRLEN len = 0; /* Its current byte length */
6957 char *endchar; /* Points to '.' or '}' ending cur char in the input
6960 ret = reg_node(pRExC_state,
6961 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6964 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6965 * the input which is of the form now 'c1.c2.c3...}' until find the
6966 * ending brace or exceed length 255. The characters that exceed this
6967 * limit are dropped. The limit could be relaxed should it become
6968 * desirable by reparsing this as (?:\N{NAME}), so could generate
6969 * multiple EXACT nodes, as is done for just regular input. But this
6970 * is primarily a named character, and not intended to be a huge long
6971 * string, so 255 bytes should be good enough */
6973 STRLEN length_of_hex;
6974 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6975 | PERL_SCAN_DISALLOW_PREFIX
6976 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6977 UV cp; /* Ord of current character */
6979 /* Code points are separated by dots. If none, there is only one
6980 * code point, and is terminated by the brace */
6981 endchar = RExC_parse + strcspn(RExC_parse, ".}");
6983 /* The values are Unicode even on EBCDIC machines */
6984 length_of_hex = (STRLEN)(endchar - RExC_parse);
6985 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6986 if ( length_of_hex == 0
6987 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6989 RExC_parse += length_of_hex; /* Includes all the valid */
6990 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6991 ? UTF8SKIP(RExC_parse)
6993 /* Guard against malformed utf8 */
6994 if (RExC_parse >= endchar) RExC_parse = endchar;
6995 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6998 if (! FOLD) { /* Not folding, just append to the string */
7001 /* Quit before adding this character if would exceed limit */
7002 if (len + UNISKIP(cp) > U8_MAX) break;
7004 unilen = reguni(pRExC_state, cp, s);
7009 } else { /* Folding, output the folded equivalent */
7010 STRLEN foldlen,numlen;
7011 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7012 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7014 /* Quit before exceeding size limit */
7015 if (len + foldlen > U8_MAX) break;
7017 for (foldbuf = tmpbuf;
7021 cp = utf8_to_uvchr(foldbuf, &numlen);
7023 const STRLEN unilen = reguni(pRExC_state, cp, s);
7026 /* In EBCDIC the numlen and unilen can differ. */
7028 if (numlen >= foldlen)
7032 break; /* "Can't happen." */
7036 /* Point to the beginning of the next character in the sequence. */
7037 RExC_parse = endchar + 1;
7039 /* Quit if no more characters */
7040 if (RExC_parse >= endbrace) break;
7045 if (RExC_parse < endbrace) {
7046 ckWARNreg(RExC_parse - 1,
7047 "Using just the first characters returned by \\N{}");
7050 RExC_size += STR_SZ(len);
7053 RExC_emit += STR_SZ(len);
7056 RExC_parse = endbrace + 1;
7058 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7059 with malformed in t/re/pat_advanced.t */
7061 Set_Node_Cur_Length(ret); /* MJD */
7062 nextchar(pRExC_state);
7072 * It returns the code point in utf8 for the value in *encp.
7073 * value: a code value in the source encoding
7074 * encp: a pointer to an Encode object
7076 * If the result from Encode is not a single character,
7077 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7080 S_reg_recode(pTHX_ const char value, SV **encp)
7083 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7084 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7085 const STRLEN newlen = SvCUR(sv);
7086 UV uv = UNICODE_REPLACEMENT;
7088 PERL_ARGS_ASSERT_REG_RECODE;
7092 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7095 if (!newlen || numlen != newlen) {
7096 uv = UNICODE_REPLACEMENT;
7104 - regatom - the lowest level
7106 Try to identify anything special at the start of the pattern. If there
7107 is, then handle it as required. This may involve generating a single regop,
7108 such as for an assertion; or it may involve recursing, such as to
7109 handle a () structure.
7111 If the string doesn't start with something special then we gobble up
7112 as much literal text as we can.
7114 Once we have been able to handle whatever type of thing started the
7115 sequence, we return.
7117 Note: we have to be careful with escapes, as they can be both literal
7118 and special, and in the case of \10 and friends can either, depending
7119 on context. Specifically there are two seperate switches for handling
7120 escape sequences, with the one for handling literal escapes requiring
7121 a dummy entry for all of the special escapes that are actually handled
7126 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7129 register regnode *ret = NULL;
7131 char *parse_start = RExC_parse;
7132 GET_RE_DEBUG_FLAGS_DECL;
7133 DEBUG_PARSE("atom");
7134 *flagp = WORST; /* Tentatively. */
7136 PERL_ARGS_ASSERT_REGATOM;
7139 switch ((U8)*RExC_parse) {
7141 RExC_seen_zerolen++;
7142 nextchar(pRExC_state);
7143 if (RExC_flags & RXf_PMf_MULTILINE)
7144 ret = reg_node(pRExC_state, MBOL);
7145 else if (RExC_flags & RXf_PMf_SINGLELINE)
7146 ret = reg_node(pRExC_state, SBOL);
7148 ret = reg_node(pRExC_state, BOL);
7149 Set_Node_Length(ret, 1); /* MJD */
7152 nextchar(pRExC_state);
7154 RExC_seen_zerolen++;
7155 if (RExC_flags & RXf_PMf_MULTILINE)
7156 ret = reg_node(pRExC_state, MEOL);
7157 else if (RExC_flags & RXf_PMf_SINGLELINE)
7158 ret = reg_node(pRExC_state, SEOL);
7160 ret = reg_node(pRExC_state, EOL);
7161 Set_Node_Length(ret, 1); /* MJD */
7164 nextchar(pRExC_state);
7165 if (RExC_flags & RXf_PMf_SINGLELINE)
7166 ret = reg_node(pRExC_state, SANY);
7168 ret = reg_node(pRExC_state, REG_ANY);
7169 *flagp |= HASWIDTH|SIMPLE;
7171 Set_Node_Length(ret, 1); /* MJD */
7175 char * const oregcomp_parse = ++RExC_parse;
7176 ret = regclass(pRExC_state,depth+1);
7177 if (*RExC_parse != ']') {
7178 RExC_parse = oregcomp_parse;
7179 vFAIL("Unmatched [");
7181 nextchar(pRExC_state);
7182 *flagp |= HASWIDTH|SIMPLE;
7183 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7187 nextchar(pRExC_state);
7188 ret = reg(pRExC_state, 1, &flags,depth+1);
7190 if (flags & TRYAGAIN) {
7191 if (RExC_parse == RExC_end) {
7192 /* Make parent create an empty node if needed. */
7200 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7204 if (flags & TRYAGAIN) {
7208 vFAIL("Internal urp");
7209 /* Supposed to be caught earlier. */
7212 if (!regcurly(RExC_parse)) {
7221 vFAIL("Quantifier follows nothing");
7229 len=0; /* silence a spurious compiler warning */
7230 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7231 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7232 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7233 ret = reganode(pRExC_state, FOLDCHAR, cp);
7234 Set_Node_Length(ret, 1); /* MJD */
7235 nextchar(pRExC_state); /* kill whitespace under /x */
7243 This switch handles escape sequences that resolve to some kind
7244 of special regop and not to literal text. Escape sequnces that
7245 resolve to literal text are handled below in the switch marked
7248 Every entry in this switch *must* have a corresponding entry
7249 in the literal escape switch. However, the opposite is not
7250 required, as the default for this switch is to jump to the
7251 literal text handling code.
7253 switch ((U8)*++RExC_parse) {
7258 /* Special Escapes */
7260 RExC_seen_zerolen++;
7261 ret = reg_node(pRExC_state, SBOL);
7263 goto finish_meta_pat;
7265 ret = reg_node(pRExC_state, GPOS);
7266 RExC_seen |= REG_SEEN_GPOS;
7268 goto finish_meta_pat;
7270 RExC_seen_zerolen++;
7271 ret = reg_node(pRExC_state, KEEPS);
7273 /* XXX:dmq : disabling in-place substitution seems to
7274 * be necessary here to avoid cases of memory corruption, as
7275 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7277 RExC_seen |= REG_SEEN_LOOKBEHIND;
7278 goto finish_meta_pat;
7280 ret = reg_node(pRExC_state, SEOL);
7282 RExC_seen_zerolen++; /* Do not optimize RE away */
7283 goto finish_meta_pat;
7285 ret = reg_node(pRExC_state, EOS);
7287 RExC_seen_zerolen++; /* Do not optimize RE away */
7288 goto finish_meta_pat;
7290 ret = reg_node(pRExC_state, CANY);
7291 RExC_seen |= REG_SEEN_CANY;
7292 *flagp |= HASWIDTH|SIMPLE;
7293 goto finish_meta_pat;
7295 ret = reg_node(pRExC_state, CLUMP);
7297 goto finish_meta_pat;
7300 ret = reg_node(pRExC_state, (U8)(ALNUML));
7302 ret = reg_node(pRExC_state, (U8)(ALNUM));
7303 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7305 *flagp |= HASWIDTH|SIMPLE;
7306 goto finish_meta_pat;
7309 ret = reg_node(pRExC_state, (U8)(NALNUML));
7311 ret = reg_node(pRExC_state, (U8)(NALNUM));
7312 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7314 *flagp |= HASWIDTH|SIMPLE;
7315 goto finish_meta_pat;
7317 RExC_seen_zerolen++;
7318 RExC_seen |= REG_SEEN_LOOKBEHIND;
7320 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7322 ret = reg_node(pRExC_state, (U8)(BOUND));
7323 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7326 goto finish_meta_pat;
7328 RExC_seen_zerolen++;
7329 RExC_seen |= REG_SEEN_LOOKBEHIND;
7331 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7333 ret = reg_node(pRExC_state, (U8)(NBOUND));
7334 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7337 goto finish_meta_pat;
7340 ret = reg_node(pRExC_state, (U8)(SPACEL));
7342 ret = reg_node(pRExC_state, (U8)(SPACE));
7343 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7345 *flagp |= HASWIDTH|SIMPLE;
7346 goto finish_meta_pat;
7349 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7351 ret = reg_node(pRExC_state, (U8)(NSPACE));
7352 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7354 *flagp |= HASWIDTH|SIMPLE;
7355 goto finish_meta_pat;
7357 ret = reg_node(pRExC_state, DIGIT);
7358 *flagp |= HASWIDTH|SIMPLE;
7359 goto finish_meta_pat;
7361 ret = reg_node(pRExC_state, NDIGIT);
7362 *flagp |= HASWIDTH|SIMPLE;
7363 goto finish_meta_pat;
7365 ret = reg_node(pRExC_state, LNBREAK);
7366 *flagp |= HASWIDTH|SIMPLE;
7367 goto finish_meta_pat;
7369 ret = reg_node(pRExC_state, HORIZWS);
7370 *flagp |= HASWIDTH|SIMPLE;
7371 goto finish_meta_pat;
7373 ret = reg_node(pRExC_state, NHORIZWS);
7374 *flagp |= HASWIDTH|SIMPLE;
7375 goto finish_meta_pat;
7377 ret = reg_node(pRExC_state, VERTWS);
7378 *flagp |= HASWIDTH|SIMPLE;
7379 goto finish_meta_pat;
7381 ret = reg_node(pRExC_state, NVERTWS);
7382 *flagp |= HASWIDTH|SIMPLE;
7384 nextchar(pRExC_state);
7385 Set_Node_Length(ret, 2); /* MJD */
7390 char* const oldregxend = RExC_end;
7392 char* parse_start = RExC_parse - 2;
7395 if (RExC_parse[1] == '{') {
7396 /* a lovely hack--pretend we saw [\pX] instead */
7397 RExC_end = strchr(RExC_parse, '}');
7399 const U8 c = (U8)*RExC_parse;
7401 RExC_end = oldregxend;
7402 vFAIL2("Missing right brace on \\%c{}", c);
7407 RExC_end = RExC_parse + 2;
7408 if (RExC_end > oldregxend)
7409 RExC_end = oldregxend;
7413 ret = regclass(pRExC_state,depth+1);
7415 RExC_end = oldregxend;
7418 Set_Node_Offset(ret, parse_start + 2);
7419 Set_Node_Cur_Length(ret);
7420 nextchar(pRExC_state);
7421 *flagp |= HASWIDTH|SIMPLE;
7425 /* Handle \N and \N{NAME} here and not below because it can be
7426 multicharacter. join_exact() will join them up later on.
7427 Also this makes sure that things like /\N{BLAH}+/ and
7428 \N{BLAH} being multi char Just Happen. dmq*/
7430 ret= reg_namedseq(pRExC_state, NULL, flagp);
7432 case 'k': /* Handle \k<NAME> and \k'NAME' */
7435 char ch= RExC_parse[1];
7436 if (ch != '<' && ch != '\'' && ch != '{') {
7438 vFAIL2("Sequence %.2s... not terminated",parse_start);
7440 /* this pretty much dupes the code for (?P=...) in reg(), if
7441 you change this make sure you change that */
7442 char* name_start = (RExC_parse += 2);
7444 SV *sv_dat = reg_scan_name(pRExC_state,
7445 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7446 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7447 if (RExC_parse == name_start || *RExC_parse != ch)
7448 vFAIL2("Sequence %.3s... not terminated",parse_start);
7451 num = add_data( pRExC_state, 1, "S" );
7452 RExC_rxi->data->data[num]=(void*)sv_dat;
7453 SvREFCNT_inc_simple_void(sv_dat);
7457 ret = reganode(pRExC_state,
7458 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7462 /* override incorrect value set in reganode MJD */
7463 Set_Node_Offset(ret, parse_start+1);
7464 Set_Node_Cur_Length(ret); /* MJD */
7465 nextchar(pRExC_state);
7471 case '1': case '2': case '3': case '4':
7472 case '5': case '6': case '7': case '8': case '9':
7475 bool isg = *RExC_parse == 'g';
7480 if (*RExC_parse == '{') {
7484 if (*RExC_parse == '-') {
7488 if (hasbrace && !isDIGIT(*RExC_parse)) {
7489 if (isrel) RExC_parse--;
7491 goto parse_named_seq;
7493 num = atoi(RExC_parse);
7494 if (isg && num == 0)
7495 vFAIL("Reference to invalid group 0");
7497 num = RExC_npar - num;
7499 vFAIL("Reference to nonexistent or unclosed group");
7501 if (!isg && num > 9 && num >= RExC_npar)
7504 char * const parse_start = RExC_parse - 1; /* MJD */
7505 while (isDIGIT(*RExC_parse))
7507 if (parse_start == RExC_parse - 1)
7508 vFAIL("Unterminated \\g... pattern");
7510 if (*RExC_parse != '}')
7511 vFAIL("Unterminated \\g{...} pattern");
7515 if (num > (I32)RExC_rx->nparens)
7516 vFAIL("Reference to nonexistent group");
7519 ret = reganode(pRExC_state,
7520 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7524 /* override incorrect value set in reganode MJD */
7525 Set_Node_Offset(ret, parse_start+1);
7526 Set_Node_Cur_Length(ret); /* MJD */
7528 nextchar(pRExC_state);
7533 if (RExC_parse >= RExC_end)
7534 FAIL("Trailing \\");
7537 /* Do not generate "unrecognized" warnings here, we fall
7538 back into the quick-grab loop below */
7545 if (RExC_flags & RXf_PMf_EXTENDED) {
7546 if ( reg_skipcomment( pRExC_state ) )
7553 register STRLEN len;
7558 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7560 parse_start = RExC_parse - 1;
7566 ret = reg_node(pRExC_state,
7567 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7569 for (len = 0, p = RExC_parse - 1;
7570 len < 127 && p < RExC_end;
7573 char * const oldp = p;
7575 if (RExC_flags & RXf_PMf_EXTENDED)
7576 p = regwhite( pRExC_state, p );
7581 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7582 goto normal_default;
7592 /* Literal Escapes Switch
7594 This switch is meant to handle escape sequences that
7595 resolve to a literal character.
7597 Every escape sequence that represents something
7598 else, like an assertion or a char class, is handled
7599 in the switch marked 'Special Escapes' above in this
7600 routine, but also has an entry here as anything that
7601 isn't explicitly mentioned here will be treated as
7602 an unescaped equivalent literal.
7606 /* These are all the special escapes. */
7610 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7611 goto normal_default;
7612 case 'A': /* Start assertion */
7613 case 'b': case 'B': /* Word-boundary assertion*/
7614 case 'C': /* Single char !DANGEROUS! */
7615 case 'd': case 'D': /* digit class */
7616 case 'g': case 'G': /* generic-backref, pos assertion */
7617 case 'h': case 'H': /* HORIZWS */
7618 case 'k': case 'K': /* named backref, keep marker */
7619 case 'N': /* named char sequence */
7620 case 'p': case 'P': /* Unicode property */
7621 case 'R': /* LNBREAK */
7622 case 's': case 'S': /* space class */
7623 case 'v': case 'V': /* VERTWS */
7624 case 'w': case 'W': /* word class */
7625 case 'X': /* eXtended Unicode "combining character sequence" */
7626 case 'z': case 'Z': /* End of line/string assertion */
7630 /* Anything after here is an escape that resolves to a
7631 literal. (Except digits, which may or may not)
7650 ender = ASCII_TO_NATIVE('\033');
7654 ender = ASCII_TO_NATIVE('\007');
7659 STRLEN brace_len = len;
7661 const char* error_msg;
7663 bool valid = grok_bslash_o(p,
7670 RExC_parse = p; /* going to die anyway; point
7671 to exact spot of failure */
7678 if (PL_encoding && ender < 0x100) {
7679 goto recode_encoding;
7688 char* const e = strchr(p, '}');
7692 vFAIL("Missing right brace on \\x{}");
7695 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7696 | PERL_SCAN_DISALLOW_PREFIX;
7697 STRLEN numlen = e - p - 1;
7698 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7705 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7707 ender = grok_hex(p, &numlen, &flags, NULL);
7710 if (PL_encoding && ender < 0x100)
7711 goto recode_encoding;
7715 ender = grok_bslash_c(*p++, SIZE_ONLY);
7717 case '0': case '1': case '2': case '3':case '4':
7718 case '5': case '6': case '7': case '8':case '9':
7720 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7722 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7724 ender = grok_oct(p, &numlen, &flags, NULL);
7734 if (PL_encoding && ender < 0x100)
7735 goto recode_encoding;
7739 SV* enc = PL_encoding;
7740 ender = reg_recode((const char)(U8)ender, &enc);
7741 if (!enc && SIZE_ONLY)
7742 ckWARNreg(p, "Invalid escape in the specified encoding");
7748 FAIL("Trailing \\");
7751 if (!SIZE_ONLY&& isALPHA(*p))
7752 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7753 goto normal_default;
7758 if (UTF8_IS_START(*p) && UTF) {
7760 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7761 &numlen, UTF8_ALLOW_DEFAULT);
7768 if ( RExC_flags & RXf_PMf_EXTENDED)
7769 p = regwhite( pRExC_state, p );
7771 /* Prime the casefolded buffer. */
7772 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7774 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7779 /* Emit all the Unicode characters. */
7781 for (foldbuf = tmpbuf;
7783 foldlen -= numlen) {
7784 ender = utf8_to_uvchr(foldbuf, &numlen);
7786 const STRLEN unilen = reguni(pRExC_state, ender, s);
7789 /* In EBCDIC the numlen
7790 * and unilen can differ. */
7792 if (numlen >= foldlen)
7796 break; /* "Can't happen." */
7800 const STRLEN unilen = reguni(pRExC_state, ender, s);
7809 REGC((char)ender, s++);
7815 /* Emit all the Unicode characters. */
7817 for (foldbuf = tmpbuf;
7819 foldlen -= numlen) {
7820 ender = utf8_to_uvchr(foldbuf, &numlen);
7822 const STRLEN unilen = reguni(pRExC_state, ender, s);
7825 /* In EBCDIC the numlen
7826 * and unilen can differ. */
7828 if (numlen >= foldlen)
7836 const STRLEN unilen = reguni(pRExC_state, ender, s);
7845 REGC((char)ender, s++);
7849 Set_Node_Cur_Length(ret); /* MJD */
7850 nextchar(pRExC_state);
7852 /* len is STRLEN which is unsigned, need to copy to signed */
7855 vFAIL("Internal disaster");
7859 if (len == 1 && UNI_IS_INVARIANT(ender))
7863 RExC_size += STR_SZ(len);
7866 RExC_emit += STR_SZ(len);
7876 S_regwhite( RExC_state_t *pRExC_state, char *p )
7878 const char *e = RExC_end;
7880 PERL_ARGS_ASSERT_REGWHITE;
7885 else if (*p == '#') {
7894 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7902 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7903 Character classes ([:foo:]) can also be negated ([:^foo:]).
7904 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7905 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7906 but trigger failures because they are currently unimplemented. */
7908 #define POSIXCC_DONE(c) ((c) == ':')
7909 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7910 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7913 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7916 I32 namedclass = OOB_NAMEDCLASS;
7918 PERL_ARGS_ASSERT_REGPPOSIXCC;
7920 if (value == '[' && RExC_parse + 1 < RExC_end &&
7921 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7922 POSIXCC(UCHARAT(RExC_parse))) {
7923 const char c = UCHARAT(RExC_parse);
7924 char* const s = RExC_parse++;
7926 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7928 if (RExC_parse == RExC_end)
7929 /* Grandfather lone [:, [=, [. */
7932 const char* const t = RExC_parse++; /* skip over the c */
7935 if (UCHARAT(RExC_parse) == ']') {
7936 const char *posixcc = s + 1;
7937 RExC_parse++; /* skip over the ending ] */
7940 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7941 const I32 skip = t - posixcc;
7943 /* Initially switch on the length of the name. */
7946 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7947 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7950 /* Names all of length 5. */
7951 /* alnum alpha ascii blank cntrl digit graph lower
7952 print punct space upper */
7953 /* Offset 4 gives the best switch position. */
7954 switch (posixcc[4]) {
7956 if (memEQ(posixcc, "alph", 4)) /* alpha */
7957 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7960 if (memEQ(posixcc, "spac", 4)) /* space */
7961 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7964 if (memEQ(posixcc, "grap", 4)) /* graph */
7965 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7968 if (memEQ(posixcc, "asci", 4)) /* ascii */
7969 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7972 if (memEQ(posixcc, "blan", 4)) /* blank */
7973 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7976 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7977 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7980 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7981 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7984 if (memEQ(posixcc, "lowe", 4)) /* lower */
7985 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7986 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7987 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7990 if (memEQ(posixcc, "digi", 4)) /* digit */
7991 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7992 else if (memEQ(posixcc, "prin", 4)) /* print */
7993 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7994 else if (memEQ(posixcc, "punc", 4)) /* punct */
7995 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8000 if (memEQ(posixcc, "xdigit", 6))
8001 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8005 if (namedclass == OOB_NAMEDCLASS)
8006 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8008 assert (posixcc[skip] == ':');
8009 assert (posixcc[skip+1] == ']');
8010 } else if (!SIZE_ONLY) {
8011 /* [[=foo=]] and [[.foo.]] are still future. */
8013 /* adjust RExC_parse so the warning shows after
8015 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8017 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8020 /* Maternal grandfather:
8021 * "[:" ending in ":" but not in ":]" */
8031 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8035 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8037 if (POSIXCC(UCHARAT(RExC_parse))) {
8038 const char *s = RExC_parse;
8039 const char c = *s++;
8043 if (*s && c == *s && s[1] == ']') {
8045 "POSIX syntax [%c %c] belongs inside character classes",
8048 /* [[=foo=]] and [[.foo.]] are still future. */
8049 if (POSIXCC_NOTYET(c)) {
8050 /* adjust RExC_parse so the error shows after
8052 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8054 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8061 #define _C_C_T_(NAME,TEST,WORD) \
8064 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8066 for (value = 0; value < 256; value++) \
8068 ANYOF_BITMAP_SET(ret, value); \
8073 case ANYOF_N##NAME: \
8075 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8077 for (value = 0; value < 256; value++) \
8079 ANYOF_BITMAP_SET(ret, value); \
8085 /* Like above, but no locale test */
8086 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8088 for (value = 0; value < 256; value++) \
8090 ANYOF_BITMAP_SET(ret, value); \
8094 case ANYOF_N##NAME: \
8095 for (value = 0; value < 256; value++) \
8097 ANYOF_BITMAP_SET(ret, value); \
8102 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8103 * there are two tests passed in, to use depending on that. There aren't any
8104 * cases where the label is different from the name, so no need for that
8106 #define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD) \
8108 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8109 else if (UNI_SEMANTICS) { \
8110 for (value = 0; value < 256; value++) { \
8111 if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
8115 for (value = 0; value < 256; value++) { \
8116 if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
8122 case ANYOF_N##NAME: \
8123 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8124 else if (UNI_SEMANTICS) { \
8125 for (value = 0; value < 256; value++) { \
8126 if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
8130 for (value = 0; value < 256; value++) { \
8131 if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
8139 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8140 so that it is possible to override the option here without having to
8141 rebuild the entire core. as we are required to do if we change regcomp.h
8142 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8144 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8145 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8148 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8149 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8151 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8155 parse a class specification and produce either an ANYOF node that
8156 matches the pattern or if the pattern matches a single char only and
8157 that char is < 256 and we are case insensitive then we produce an
8162 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8165 register UV nextvalue;
8166 register IV prevvalue = OOB_UNICODE;
8167 register IV range = 0;
8168 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8169 register regnode *ret;
8172 char *rangebegin = NULL;
8173 bool need_class = 0;
8176 bool optimize_invert = TRUE;
8177 AV* unicode_alternate = NULL;
8179 UV literal_endpoint = 0;
8181 UV stored = 0; /* number of chars stored in the class */
8183 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8184 case we need to change the emitted regop to an EXACT. */
8185 const char * orig_parse = RExC_parse;
8186 GET_RE_DEBUG_FLAGS_DECL;
8188 PERL_ARGS_ASSERT_REGCLASS;
8190 PERL_UNUSED_ARG(depth);
8193 DEBUG_PARSE("clas");
8195 /* Assume we are going to generate an ANYOF node. */
8196 ret = reganode(pRExC_state, ANYOF, 0);
8199 ANYOF_FLAGS(ret) = 0;
8201 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
8205 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8209 RExC_size += ANYOF_SKIP;
8210 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8213 RExC_emit += ANYOF_SKIP;
8215 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8217 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8218 ANYOF_BITMAP_ZERO(ret);
8219 listsv = newSVpvs("# comment\n");
8222 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8224 if (!SIZE_ONLY && POSIXCC(nextvalue))
8225 checkposixcc(pRExC_state);
8227 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8228 if (UCHARAT(RExC_parse) == ']')
8232 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8236 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8239 rangebegin = RExC_parse;
8241 value = utf8n_to_uvchr((U8*)RExC_parse,
8242 RExC_end - RExC_parse,
8243 &numlen, UTF8_ALLOW_DEFAULT);
8244 RExC_parse += numlen;
8247 value = UCHARAT(RExC_parse++);
8249 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8250 if (value == '[' && POSIXCC(nextvalue))
8251 namedclass = regpposixcc(pRExC_state, value);
8252 else if (value == '\\') {
8254 value = utf8n_to_uvchr((U8*)RExC_parse,
8255 RExC_end - RExC_parse,
8256 &numlen, UTF8_ALLOW_DEFAULT);
8257 RExC_parse += numlen;
8260 value = UCHARAT(RExC_parse++);
8261 /* Some compilers cannot handle switching on 64-bit integer
8262 * values, therefore value cannot be an UV. Yes, this will
8263 * be a problem later if we want switch on Unicode.
8264 * A similar issue a little bit later when switching on
8265 * namedclass. --jhi */
8266 switch ((I32)value) {
8267 case 'w': namedclass = ANYOF_ALNUM; break;
8268 case 'W': namedclass = ANYOF_NALNUM; break;
8269 case 's': namedclass = ANYOF_SPACE; break;
8270 case 'S': namedclass = ANYOF_NSPACE; break;
8271 case 'd': namedclass = ANYOF_DIGIT; break;
8272 case 'D': namedclass = ANYOF_NDIGIT; break;
8273 case 'v': namedclass = ANYOF_VERTWS; break;
8274 case 'V': namedclass = ANYOF_NVERTWS; break;
8275 case 'h': namedclass = ANYOF_HORIZWS; break;
8276 case 'H': namedclass = ANYOF_NHORIZWS; break;
8277 case 'N': /* Handle \N{NAME} in class */
8279 /* We only pay attention to the first char of
8280 multichar strings being returned. I kinda wonder
8281 if this makes sense as it does change the behaviour
8282 from earlier versions, OTOH that behaviour was broken
8284 UV v; /* value is register so we cant & it /grrr */
8285 if (reg_namedseq(pRExC_state, &v, NULL)) {
8295 if (RExC_parse >= RExC_end)
8296 vFAIL2("Empty \\%c{}", (U8)value);
8297 if (*RExC_parse == '{') {
8298 const U8 c = (U8)value;
8299 e = strchr(RExC_parse++, '}');
8301 vFAIL2("Missing right brace on \\%c{}", c);
8302 while (isSPACE(UCHARAT(RExC_parse)))
8304 if (e == RExC_parse)
8305 vFAIL2("Empty \\%c{}", c);
8307 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8315 if (UCHARAT(RExC_parse) == '^') {
8318 value = value == 'p' ? 'P' : 'p'; /* toggle */
8319 while (isSPACE(UCHARAT(RExC_parse))) {
8324 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8325 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8328 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8329 namedclass = ANYOF_MAX; /* no official name, but it's named */
8332 case 'n': value = '\n'; break;
8333 case 'r': value = '\r'; break;
8334 case 't': value = '\t'; break;
8335 case 'f': value = '\f'; break;
8336 case 'b': value = '\b'; break;
8337 case 'e': value = ASCII_TO_NATIVE('\033');break;
8338 case 'a': value = ASCII_TO_NATIVE('\007');break;
8340 RExC_parse--; /* function expects to be pointed at the 'o' */
8342 const char* error_msg;
8343 bool valid = grok_bslash_o(RExC_parse,
8348 RExC_parse += numlen;
8353 if (PL_encoding && value < 0x100) {
8354 goto recode_encoding;
8358 if (*RExC_parse == '{') {
8359 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8360 | PERL_SCAN_DISALLOW_PREFIX;
8361 char * const e = strchr(RExC_parse++, '}');
8363 vFAIL("Missing right brace on \\x{}");
8365 numlen = e - RExC_parse;
8366 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8370 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8372 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8373 RExC_parse += numlen;
8375 if (PL_encoding && value < 0x100)
8376 goto recode_encoding;
8379 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8381 case '0': case '1': case '2': case '3': case '4':
8382 case '5': case '6': case '7':
8384 /* Take 1-3 octal digits */
8385 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8387 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8388 RExC_parse += numlen;
8389 if (PL_encoding && value < 0x100)
8390 goto recode_encoding;
8395 SV* enc = PL_encoding;
8396 value = reg_recode((const char)(U8)value, &enc);
8397 if (!enc && SIZE_ONLY)
8398 ckWARNreg(RExC_parse,
8399 "Invalid escape in the specified encoding");
8403 /* Allow \_ to not give an error */
8404 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8405 ckWARN2reg(RExC_parse,
8406 "Unrecognized escape \\%c in character class passed through",
8411 } /* end of \blah */
8417 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8419 if (!SIZE_ONLY && !need_class)
8420 ANYOF_CLASS_ZERO(ret);
8424 /* a bad range like a-\d, a-[:digit:] ? */
8428 RExC_parse >= rangebegin ?
8429 RExC_parse - rangebegin : 0;
8430 ckWARN4reg(RExC_parse,
8431 "False [] range \"%*.*s\"",
8434 if (prevvalue < 256) {
8435 ANYOF_BITMAP_SET(ret, prevvalue);
8436 ANYOF_BITMAP_SET(ret, '-');
8439 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8440 Perl_sv_catpvf(aTHX_ listsv,
8441 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8445 range = 0; /* this was not a true range */
8451 const char *what = NULL;
8454 if (namedclass > OOB_NAMEDCLASS)
8455 optimize_invert = FALSE;
8456 /* Possible truncation here but in some 64-bit environments
8457 * the compiler gets heartburn about switch on 64-bit values.
8458 * A similar issue a little earlier when switching on value.
8460 switch ((I32)namedclass) {
8462 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8463 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8464 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8465 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8466 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8467 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8468 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8469 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8470 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8471 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8472 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8473 /* \s, \w match all unicode if utf8. */
8474 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8475 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8477 /* \s, \w match ascii and locale only */
8478 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8479 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8481 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8482 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8483 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8486 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8489 for (value = 0; value < 128; value++)
8490 ANYOF_BITMAP_SET(ret, value);
8492 for (value = 0; value < 256; value++) {
8494 ANYOF_BITMAP_SET(ret, value);
8503 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8506 for (value = 128; value < 256; value++)
8507 ANYOF_BITMAP_SET(ret, value);
8509 for (value = 0; value < 256; value++) {
8510 if (!isASCII(value))
8511 ANYOF_BITMAP_SET(ret, value);
8520 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8522 /* consecutive digits assumed */
8523 for (value = '0'; value <= '9'; value++)
8524 ANYOF_BITMAP_SET(ret, value);
8527 what = POSIX_CC_UNI_NAME("Digit");
8531 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8533 /* consecutive digits assumed */
8534 for (value = 0; value < '0'; value++)
8535 ANYOF_BITMAP_SET(ret, value);
8536 for (value = '9' + 1; value < 256; value++)
8537 ANYOF_BITMAP_SET(ret, value);
8540 what = POSIX_CC_UNI_NAME("Digit");
8543 /* this is to handle \p and \P */
8546 vFAIL("Invalid [::] class");
8550 /* Strings such as "+utf8::isWord\n" */
8551 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8554 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8557 } /* end of namedclass \blah */
8560 if (prevvalue > (IV)value) /* b-a */ {
8561 const int w = RExC_parse - rangebegin;
8562 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8563 range = 0; /* not a valid range */
8567 prevvalue = value; /* save the beginning of the range */
8568 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8569 RExC_parse[1] != ']') {
8572 /* a bad range like \w-, [:word:]- ? */
8573 if (namedclass > OOB_NAMEDCLASS) {
8574 if (ckWARN(WARN_REGEXP)) {
8576 RExC_parse >= rangebegin ?
8577 RExC_parse - rangebegin : 0;
8579 "False [] range \"%*.*s\"",
8583 ANYOF_BITMAP_SET(ret, '-');
8585 range = 1; /* yeah, it's a range! */
8586 continue; /* but do it the next time */
8590 /* now is the next time */
8591 /*stored += (value - prevvalue + 1);*/
8593 if (prevvalue < 256) {
8594 const IV ceilvalue = value < 256 ? value : 255;
8597 /* In EBCDIC [\x89-\x91] should include
8598 * the \x8e but [i-j] should not. */
8599 if (literal_endpoint == 2 &&
8600 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8601 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8603 if (isLOWER(prevvalue)) {
8604 for (i = prevvalue; i <= ceilvalue; i++)
8605 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8607 ANYOF_BITMAP_SET(ret, i);
8610 for (i = prevvalue; i <= ceilvalue; i++)
8611 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8613 ANYOF_BITMAP_SET(ret, i);
8619 for (i = prevvalue; i <= ceilvalue; i++) {
8620 if (!ANYOF_BITMAP_TEST(ret,i)) {
8622 ANYOF_BITMAP_SET(ret, i);
8626 if (value > 255 || UTF) {
8627 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8628 const UV natvalue = NATIVE_TO_UNI(value);
8629 stored+=2; /* can't optimize this class */
8630 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8631 if (prevnatvalue < natvalue) { /* what about > ? */
8632 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8633 prevnatvalue, natvalue);
8635 else if (prevnatvalue == natvalue) {
8636 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8638 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8640 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8642 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8643 if (RExC_precomp[0] == ':' &&
8644 RExC_precomp[1] == '[' &&
8645 (f == 0xDF || f == 0x92)) {
8646 f = NATIVE_TO_UNI(f);
8649 /* If folding and foldable and a single
8650 * character, insert also the folded version
8651 * to the charclass. */
8653 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8654 if ((RExC_precomp[0] == ':' &&
8655 RExC_precomp[1] == '[' &&
8657 (value == 0xFB05 || value == 0xFB06))) ?
8658 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8659 foldlen == (STRLEN)UNISKIP(f) )
8661 if (foldlen == (STRLEN)UNISKIP(f))
8663 Perl_sv_catpvf(aTHX_ listsv,
8666 /* Any multicharacter foldings
8667 * require the following transform:
8668 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8669 * where E folds into "pq" and F folds
8670 * into "rst", all other characters
8671 * fold to single characters. We save
8672 * away these multicharacter foldings,
8673 * to be later saved as part of the
8674 * additional "s" data. */
8677 if (!unicode_alternate)
8678 unicode_alternate = newAV();
8679 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8681 av_push(unicode_alternate, sv);
8685 /* If folding and the value is one of the Greek
8686 * sigmas insert a few more sigmas to make the
8687 * folding rules of the sigmas to work right.
8688 * Note that not all the possible combinations
8689 * are handled here: some of them are handled
8690 * by the standard folding rules, and some of
8691 * them (literal or EXACTF cases) are handled
8692 * during runtime in regexec.c:S_find_byclass(). */
8693 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8694 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8695 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8696 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8697 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8699 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8700 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8701 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8706 literal_endpoint = 0;
8710 range = 0; /* this range (if it was one) is done now */
8714 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8716 RExC_size += ANYOF_CLASS_ADD_SKIP;
8718 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8724 /****** !SIZE_ONLY AFTER HERE *********/
8726 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8727 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8729 /* optimize single char class to an EXACT node
8730 but *only* when its not a UTF/high char */
8731 const char * cur_parse= RExC_parse;
8732 RExC_emit = (regnode *)orig_emit;
8733 RExC_parse = (char *)orig_parse;
8734 ret = reg_node(pRExC_state,
8735 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8736 RExC_parse = (char *)cur_parse;
8737 *STRING(ret)= (char)value;
8739 RExC_emit += STR_SZ(1);
8740 SvREFCNT_dec(listsv);
8743 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8744 if ( /* If the only flag is folding (plus possibly inversion). */
8745 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8747 for (value = 0; value < 256; ++value) {
8748 if (ANYOF_BITMAP_TEST(ret, value)) {
8749 UV fold = PL_fold[value];
8752 ANYOF_BITMAP_SET(ret, fold);
8755 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8758 /* optimize inverted simple patterns (e.g. [^a-z]) */
8759 if (optimize_invert &&
8760 /* If the only flag is inversion. */
8761 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8762 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8763 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8764 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8767 AV * const av = newAV();
8769 /* The 0th element stores the character class description
8770 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8771 * to initialize the appropriate swash (which gets stored in
8772 * the 1st element), and also useful for dumping the regnode.
8773 * The 2nd element stores the multicharacter foldings,
8774 * used later (regexec.c:S_reginclass()). */
8775 av_store(av, 0, listsv);
8776 av_store(av, 1, NULL);
8777 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8778 rv = newRV_noinc(MUTABLE_SV(av));
8779 n = add_data(pRExC_state, 1, "s");
8780 RExC_rxi->data->data[n] = (void*)rv;
8788 /* reg_skipcomment()
8790 Absorbs an /x style # comments from the input stream.
8791 Returns true if there is more text remaining in the stream.
8792 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8793 terminates the pattern without including a newline.
8795 Note its the callers responsibility to ensure that we are
8801 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8805 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8807 while (RExC_parse < RExC_end)
8808 if (*RExC_parse++ == '\n') {
8813 /* we ran off the end of the pattern without ending
8814 the comment, so we have to add an \n when wrapping */
8815 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8823 Advance that parse position, and optionall absorbs
8824 "whitespace" from the inputstream.
8826 Without /x "whitespace" means (?#...) style comments only,
8827 with /x this means (?#...) and # comments and whitespace proper.
8829 Returns the RExC_parse point from BEFORE the scan occurs.
8831 This is the /x friendly way of saying RExC_parse++.
8835 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8837 char* const retval = RExC_parse++;
8839 PERL_ARGS_ASSERT_NEXTCHAR;
8842 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8843 RExC_parse[2] == '#') {
8844 while (*RExC_parse != ')') {
8845 if (RExC_parse == RExC_end)
8846 FAIL("Sequence (?#... not terminated");
8852 if (RExC_flags & RXf_PMf_EXTENDED) {
8853 if (isSPACE(*RExC_parse)) {
8857 else if (*RExC_parse == '#') {
8858 if ( reg_skipcomment( pRExC_state ) )
8867 - reg_node - emit a node
8869 STATIC regnode * /* Location. */
8870 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8873 register regnode *ptr;
8874 regnode * const ret = RExC_emit;
8875 GET_RE_DEBUG_FLAGS_DECL;
8877 PERL_ARGS_ASSERT_REG_NODE;
8880 SIZE_ALIGN(RExC_size);
8884 if (RExC_emit >= RExC_emit_bound)
8885 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8887 NODE_ALIGN_FILL(ret);
8889 FILL_ADVANCE_NODE(ptr, op);
8890 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8891 #ifdef RE_TRACK_PATTERN_OFFSETS
8892 if (RExC_offsets) { /* MJD */
8893 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8894 "reg_node", __LINE__,
8896 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8897 ? "Overwriting end of array!\n" : "OK",
8898 (UV)(RExC_emit - RExC_emit_start),
8899 (UV)(RExC_parse - RExC_start),
8900 (UV)RExC_offsets[0]));
8901 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8909 - reganode - emit a node with an argument
8911 STATIC regnode * /* Location. */
8912 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8915 register regnode *ptr;
8916 regnode * const ret = RExC_emit;
8917 GET_RE_DEBUG_FLAGS_DECL;
8919 PERL_ARGS_ASSERT_REGANODE;
8922 SIZE_ALIGN(RExC_size);
8927 assert(2==regarglen[op]+1);
8929 Anything larger than this has to allocate the extra amount.
8930 If we changed this to be:
8932 RExC_size += (1 + regarglen[op]);
8934 then it wouldn't matter. Its not clear what side effect
8935 might come from that so its not done so far.
8940 if (RExC_emit >= RExC_emit_bound)
8941 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8943 NODE_ALIGN_FILL(ret);
8945 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8946 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8947 #ifdef RE_TRACK_PATTERN_OFFSETS
8948 if (RExC_offsets) { /* MJD */
8949 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8953 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8954 "Overwriting end of array!\n" : "OK",
8955 (UV)(RExC_emit - RExC_emit_start),
8956 (UV)(RExC_parse - RExC_start),
8957 (UV)RExC_offsets[0]));
8958 Set_Cur_Node_Offset;
8966 - reguni - emit (if appropriate) a Unicode character
8969 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8973 PERL_ARGS_ASSERT_REGUNI;
8975 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8979 - reginsert - insert an operator in front of already-emitted operand
8981 * Means relocating the operand.
8984 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8987 register regnode *src;
8988 register regnode *dst;
8989 register regnode *place;
8990 const int offset = regarglen[(U8)op];
8991 const int size = NODE_STEP_REGNODE + offset;
8992 GET_RE_DEBUG_FLAGS_DECL;
8994 PERL_ARGS_ASSERT_REGINSERT;
8995 PERL_UNUSED_ARG(depth);
8996 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8997 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9006 if (RExC_open_parens) {
9008 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9009 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9010 if ( RExC_open_parens[paren] >= opnd ) {
9011 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9012 RExC_open_parens[paren] += size;
9014 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9016 if ( RExC_close_parens[paren] >= opnd ) {
9017 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9018 RExC_close_parens[paren] += size;
9020 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9025 while (src > opnd) {
9026 StructCopy(--src, --dst, regnode);
9027 #ifdef RE_TRACK_PATTERN_OFFSETS
9028 if (RExC_offsets) { /* MJD 20010112 */
9029 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9033 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9034 ? "Overwriting end of array!\n" : "OK",
9035 (UV)(src - RExC_emit_start),
9036 (UV)(dst - RExC_emit_start),
9037 (UV)RExC_offsets[0]));
9038 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9039 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9045 place = opnd; /* Op node, where operand used to be. */
9046 #ifdef RE_TRACK_PATTERN_OFFSETS
9047 if (RExC_offsets) { /* MJD */
9048 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9052 (UV)(place - RExC_emit_start) > RExC_offsets[0]
9053 ? "Overwriting end of array!\n" : "OK",
9054 (UV)(place - RExC_emit_start),
9055 (UV)(RExC_parse - RExC_start),
9056 (UV)RExC_offsets[0]));
9057 Set_Node_Offset(place, RExC_parse);
9058 Set_Node_Length(place, 1);
9061 src = NEXTOPER(place);
9062 FILL_ADVANCE_NODE(place, op);
9063 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
9064 Zero(src, offset, regnode);
9068 - regtail - set the next-pointer at the end of a node chain of p to val.
9069 - SEE ALSO: regtail_study
9071 /* TODO: All three parms should be const */
9073 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9076 register regnode *scan;
9077 GET_RE_DEBUG_FLAGS_DECL;
9079 PERL_ARGS_ASSERT_REGTAIL;
9081 PERL_UNUSED_ARG(depth);
9087 /* Find last node. */
9090 regnode * const temp = regnext(scan);
9092 SV * const mysv=sv_newmortal();
9093 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9094 regprop(RExC_rx, mysv, scan);
9095 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9096 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9097 (temp == NULL ? "->" : ""),
9098 (temp == NULL ? PL_reg_name[OP(val)] : "")
9106 if (reg_off_by_arg[OP(scan)]) {
9107 ARG_SET(scan, val - scan);
9110 NEXT_OFF(scan) = val - scan;
9116 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9117 - Look for optimizable sequences at the same time.
9118 - currently only looks for EXACT chains.
9120 This is expermental code. The idea is to use this routine to perform
9121 in place optimizations on branches and groups as they are constructed,
9122 with the long term intention of removing optimization from study_chunk so
9123 that it is purely analytical.
9125 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9126 to control which is which.
9129 /* TODO: All four parms should be const */
9132 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9135 register regnode *scan;
9137 #ifdef EXPERIMENTAL_INPLACESCAN
9140 GET_RE_DEBUG_FLAGS_DECL;
9142 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9148 /* Find last node. */
9152 regnode * const temp = regnext(scan);
9153 #ifdef EXPERIMENTAL_INPLACESCAN
9154 if (PL_regkind[OP(scan)] == EXACT)
9155 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9163 if( exact == PSEUDO )
9165 else if ( exact != OP(scan) )
9174 SV * const mysv=sv_newmortal();
9175 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9176 regprop(RExC_rx, mysv, scan);
9177 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9178 SvPV_nolen_const(mysv),
9180 PL_reg_name[exact]);
9187 SV * const mysv_val=sv_newmortal();
9188 DEBUG_PARSE_MSG("");
9189 regprop(RExC_rx, mysv_val, val);
9190 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9191 SvPV_nolen_const(mysv_val),
9192 (IV)REG_NODE_NUM(val),
9196 if (reg_off_by_arg[OP(scan)]) {
9197 ARG_SET(scan, val - scan);
9200 NEXT_OFF(scan) = val - scan;
9208 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9212 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9217 for (bit=0; bit<32; bit++) {
9218 if (flags & (1<<bit)) {
9220 PerlIO_printf(Perl_debug_log, "%s",lead);
9221 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9226 PerlIO_printf(Perl_debug_log, "\n");
9228 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9234 Perl_regdump(pTHX_ const regexp *r)
9238 SV * const sv = sv_newmortal();
9239 SV *dsv= sv_newmortal();
9241 GET_RE_DEBUG_FLAGS_DECL;
9243 PERL_ARGS_ASSERT_REGDUMP;
9245 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9247 /* Header fields of interest. */
9248 if (r->anchored_substr) {
9249 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9250 RE_SV_DUMPLEN(r->anchored_substr), 30);
9251 PerlIO_printf(Perl_debug_log,
9252 "anchored %s%s at %"IVdf" ",
9253 s, RE_SV_TAIL(r->anchored_substr),
9254 (IV)r->anchored_offset);
9255 } else if (r->anchored_utf8) {
9256 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9257 RE_SV_DUMPLEN(r->anchored_utf8), 30);
9258 PerlIO_printf(Perl_debug_log,
9259 "anchored utf8 %s%s at %"IVdf" ",
9260 s, RE_SV_TAIL(r->anchored_utf8),
9261 (IV)r->anchored_offset);
9263 if (r->float_substr) {
9264 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9265 RE_SV_DUMPLEN(r->float_substr), 30);
9266 PerlIO_printf(Perl_debug_log,
9267 "floating %s%s at %"IVdf"..%"UVuf" ",
9268 s, RE_SV_TAIL(r->float_substr),
9269 (IV)r->float_min_offset, (UV)r->float_max_offset);
9270 } else if (r->float_utf8) {
9271 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9272 RE_SV_DUMPLEN(r->float_utf8), 30);
9273 PerlIO_printf(Perl_debug_log,
9274 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9275 s, RE_SV_TAIL(r->float_utf8),
9276 (IV)r->float_min_offset, (UV)r->float_max_offset);
9278 if (r->check_substr || r->check_utf8)
9279 PerlIO_printf(Perl_debug_log,
9281 (r->check_substr == r->float_substr
9282 && r->check_utf8 == r->float_utf8
9283 ? "(checking floating" : "(checking anchored"));
9284 if (r->extflags & RXf_NOSCAN)
9285 PerlIO_printf(Perl_debug_log, " noscan");
9286 if (r->extflags & RXf_CHECK_ALL)
9287 PerlIO_printf(Perl_debug_log, " isall");
9288 if (r->check_substr || r->check_utf8)
9289 PerlIO_printf(Perl_debug_log, ") ");
9291 if (ri->regstclass) {
9292 regprop(r, sv, ri->regstclass);
9293 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9295 if (r->extflags & RXf_ANCH) {
9296 PerlIO_printf(Perl_debug_log, "anchored");
9297 if (r->extflags & RXf_ANCH_BOL)
9298 PerlIO_printf(Perl_debug_log, "(BOL)");
9299 if (r->extflags & RXf_ANCH_MBOL)
9300 PerlIO_printf(Perl_debug_log, "(MBOL)");
9301 if (r->extflags & RXf_ANCH_SBOL)
9302 PerlIO_printf(Perl_debug_log, "(SBOL)");
9303 if (r->extflags & RXf_ANCH_GPOS)
9304 PerlIO_printf(Perl_debug_log, "(GPOS)");
9305 PerlIO_putc(Perl_debug_log, ' ');
9307 if (r->extflags & RXf_GPOS_SEEN)
9308 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9309 if (r->intflags & PREGf_SKIP)
9310 PerlIO_printf(Perl_debug_log, "plus ");
9311 if (r->intflags & PREGf_IMPLICIT)
9312 PerlIO_printf(Perl_debug_log, "implicit ");
9313 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9314 if (r->extflags & RXf_EVAL_SEEN)
9315 PerlIO_printf(Perl_debug_log, "with eval ");
9316 PerlIO_printf(Perl_debug_log, "\n");
9317 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9319 PERL_ARGS_ASSERT_REGDUMP;
9320 PERL_UNUSED_CONTEXT;
9322 #endif /* DEBUGGING */
9326 - regprop - printable representation of opcode
9328 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9331 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9332 if (flags & ANYOF_INVERT) \
9333 /*make sure the invert info is in each */ \
9334 sv_catpvs(sv, "^"); \
9340 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9345 RXi_GET_DECL(prog,progi);
9346 GET_RE_DEBUG_FLAGS_DECL;
9348 PERL_ARGS_ASSERT_REGPROP;
9352 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9353 /* It would be nice to FAIL() here, but this may be called from
9354 regexec.c, and it would be hard to supply pRExC_state. */
9355 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9356 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9358 k = PL_regkind[OP(o)];
9362 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9363 * is a crude hack but it may be the best for now since
9364 * we have no flag "this EXACTish node was UTF-8"
9366 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9367 PERL_PV_ESCAPE_UNI_DETECT |
9368 PERL_PV_PRETTY_ELLIPSES |
9369 PERL_PV_PRETTY_LTGT |
9370 PERL_PV_PRETTY_NOCLEAR
9372 } else if (k == TRIE) {
9373 /* print the details of the trie in dumpuntil instead, as
9374 * progi->data isn't available here */
9375 const char op = OP(o);
9376 const U32 n = ARG(o);
9377 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9378 (reg_ac_data *)progi->data->data[n] :
9380 const reg_trie_data * const trie
9381 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9383 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9384 DEBUG_TRIE_COMPILE_r(
9385 Perl_sv_catpvf(aTHX_ sv,
9386 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9387 (UV)trie->startstate,
9388 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9389 (UV)trie->wordcount,
9392 (UV)TRIE_CHARCOUNT(trie),
9393 (UV)trie->uniquecharcount
9396 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9398 int rangestart = -1;
9399 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9401 for (i = 0; i <= 256; i++) {
9402 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9403 if (rangestart == -1)
9405 } else if (rangestart != -1) {
9406 if (i <= rangestart + 3)
9407 for (; rangestart < i; rangestart++)
9408 put_byte(sv, rangestart);
9410 put_byte(sv, rangestart);
9412 put_byte(sv, i - 1);
9420 } else if (k == CURLY) {
9421 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9422 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9423 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9425 else if (k == WHILEM && o->flags) /* Ordinal/of */
9426 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9427 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9428 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9429 if ( RXp_PAREN_NAMES(prog) ) {
9430 if ( k != REF || OP(o) < NREF) {
9431 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9432 SV **name= av_fetch(list, ARG(o), 0 );
9434 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9437 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9438 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9439 I32 *nums=(I32*)SvPVX(sv_dat);
9440 SV **name= av_fetch(list, nums[0], 0 );
9443 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9444 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9445 (n ? "," : ""), (IV)nums[n]);
9447 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9451 } else if (k == GOSUB)
9452 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9453 else if (k == VERB) {
9455 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9456 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9457 } else if (k == LOGICAL)
9458 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9459 else if (k == FOLDCHAR)
9460 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9461 else if (k == ANYOF) {
9462 int i, rangestart = -1;
9463 const U8 flags = ANYOF_FLAGS(o);
9466 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9467 static const char * const anyofs[] = {
9500 if (flags & ANYOF_LOCALE)
9501 sv_catpvs(sv, "{loc}");
9502 if (flags & ANYOF_FOLD)
9503 sv_catpvs(sv, "{i}");
9504 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9505 if (flags & ANYOF_INVERT)
9508 /* output what the standard cp 0-255 bitmap matches */
9509 for (i = 0; i <= 256; i++) {
9510 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9511 if (rangestart == -1)
9513 } else if (rangestart != -1) {
9514 if (i <= rangestart + 3)
9515 for (; rangestart < i; rangestart++)
9516 put_byte(sv, rangestart);
9518 put_byte(sv, rangestart);
9520 put_byte(sv, i - 1);
9527 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9528 /* output any special charclass tests (used mostly under use locale) */
9529 if (o->flags & ANYOF_CLASS)
9530 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9531 if (ANYOF_CLASS_TEST(o,i)) {
9532 sv_catpv(sv, anyofs[i]);
9536 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9538 /* output information about the unicode matching */
9539 if (flags & ANYOF_UNICODE)
9540 sv_catpvs(sv, "{unicode}");
9541 else if (flags & ANYOF_UNICODE_ALL)
9542 sv_catpvs(sv, "{unicode_all}");
9546 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9550 U8 s[UTF8_MAXBYTES_CASE+1];
9552 for (i = 0; i <= 256; i++) { /* just the first 256 */
9553 uvchr_to_utf8(s, i);
9555 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9556 if (rangestart == -1)
9558 } else if (rangestart != -1) {
9559 if (i <= rangestart + 3)
9560 for (; rangestart < i; rangestart++) {
9561 const U8 * const e = uvchr_to_utf8(s,rangestart);
9563 for(p = s; p < e; p++)
9567 const U8 *e = uvchr_to_utf8(s,rangestart);
9569 for (p = s; p < e; p++)
9572 e = uvchr_to_utf8(s, i-1);
9573 for (p = s; p < e; p++)
9580 sv_catpvs(sv, "..."); /* et cetera */
9584 char *s = savesvpv(lv);
9585 char * const origs = s;
9587 while (*s && *s != '\n')
9591 const char * const t = ++s;
9609 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9611 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9612 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9614 PERL_UNUSED_CONTEXT;
9615 PERL_UNUSED_ARG(sv);
9617 PERL_UNUSED_ARG(prog);
9618 #endif /* DEBUGGING */
9622 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9623 { /* Assume that RE_INTUIT is set */
9625 struct regexp *const prog = (struct regexp *)SvANY(r);
9626 GET_RE_DEBUG_FLAGS_DECL;
9628 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9629 PERL_UNUSED_CONTEXT;
9633 const char * const s = SvPV_nolen_const(prog->check_substr
9634 ? prog->check_substr : prog->check_utf8);
9636 if (!PL_colorset) reginitcolors();
9637 PerlIO_printf(Perl_debug_log,
9638 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9640 prog->check_substr ? "" : "utf8 ",
9641 PL_colors[5],PL_colors[0],
9644 (strlen(s) > 60 ? "..." : ""));
9647 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9653 handles refcounting and freeing the perl core regexp structure. When
9654 it is necessary to actually free the structure the first thing it
9655 does is call the 'free' method of the regexp_engine associated to to
9656 the regexp, allowing the handling of the void *pprivate; member
9657 first. (This routine is not overridable by extensions, which is why
9658 the extensions free is called first.)
9660 See regdupe and regdupe_internal if you change anything here.
9662 #ifndef PERL_IN_XSUB_RE
9664 Perl_pregfree(pTHX_ REGEXP *r)
9670 Perl_pregfree2(pTHX_ REGEXP *rx)
9673 struct regexp *const r = (struct regexp *)SvANY(rx);
9674 GET_RE_DEBUG_FLAGS_DECL;
9676 PERL_ARGS_ASSERT_PREGFREE2;
9679 ReREFCNT_dec(r->mother_re);
9681 CALLREGFREE_PVT(rx); /* free the private data */
9682 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9685 SvREFCNT_dec(r->anchored_substr);
9686 SvREFCNT_dec(r->anchored_utf8);
9687 SvREFCNT_dec(r->float_substr);
9688 SvREFCNT_dec(r->float_utf8);
9689 Safefree(r->substrs);
9691 RX_MATCH_COPY_FREE(rx);
9692 #ifdef PERL_OLD_COPY_ON_WRITE
9693 SvREFCNT_dec(r->saved_copy);
9700 This is a hacky workaround to the structural issue of match results
9701 being stored in the regexp structure which is in turn stored in
9702 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9703 could be PL_curpm in multiple contexts, and could require multiple
9704 result sets being associated with the pattern simultaneously, such
9705 as when doing a recursive match with (??{$qr})
9707 The solution is to make a lightweight copy of the regexp structure
9708 when a qr// is returned from the code executed by (??{$qr}) this
9709 lightweight copy doesnt actually own any of its data except for
9710 the starp/end and the actual regexp structure itself.
9716 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9719 struct regexp *const r = (struct regexp *)SvANY(rx);
9720 register const I32 npar = r->nparens+1;
9722 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9725 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9726 ret = (struct regexp *)SvANY(ret_x);
9728 (void)ReREFCNT_inc(rx);
9729 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9730 by pointing directly at the buffer, but flagging that the allocated
9731 space in the copy is zero. As we've just done a struct copy, it's now
9732 a case of zero-ing that, rather than copying the current length. */
9733 SvPV_set(ret_x, RX_WRAPPED(rx));
9734 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9735 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9736 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9737 SvLEN_set(ret_x, 0);
9738 SvSTASH_set(ret_x, NULL);
9739 SvMAGIC_set(ret_x, NULL);
9740 Newx(ret->offs, npar, regexp_paren_pair);
9741 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9743 Newx(ret->substrs, 1, struct reg_substr_data);
9744 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9746 SvREFCNT_inc_void(ret->anchored_substr);
9747 SvREFCNT_inc_void(ret->anchored_utf8);
9748 SvREFCNT_inc_void(ret->float_substr);
9749 SvREFCNT_inc_void(ret->float_utf8);
9751 /* check_substr and check_utf8, if non-NULL, point to either their
9752 anchored or float namesakes, and don't hold a second reference. */
9754 RX_MATCH_COPIED_off(ret_x);
9755 #ifdef PERL_OLD_COPY_ON_WRITE
9756 ret->saved_copy = NULL;
9758 ret->mother_re = rx;
9764 /* regfree_internal()
9766 Free the private data in a regexp. This is overloadable by
9767 extensions. Perl takes care of the regexp structure in pregfree(),
9768 this covers the *pprivate pointer which technically perldoesnt
9769 know about, however of course we have to handle the
9770 regexp_internal structure when no extension is in use.
9772 Note this is called before freeing anything in the regexp
9777 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9780 struct regexp *const r = (struct regexp *)SvANY(rx);
9782 GET_RE_DEBUG_FLAGS_DECL;
9784 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9790 SV *dsv= sv_newmortal();
9791 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9792 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9793 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9794 PL_colors[4],PL_colors[5],s);
9797 #ifdef RE_TRACK_PATTERN_OFFSETS
9799 Safefree(ri->u.offsets); /* 20010421 MJD */
9802 int n = ri->data->count;
9803 PAD* new_comppad = NULL;
9808 /* If you add a ->what type here, update the comment in regcomp.h */
9809 switch (ri->data->what[n]) {
9814 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9817 Safefree(ri->data->data[n]);
9820 new_comppad = MUTABLE_AV(ri->data->data[n]);
9823 if (new_comppad == NULL)
9824 Perl_croak(aTHX_ "panic: pregfree comppad");
9825 PAD_SAVE_LOCAL(old_comppad,
9826 /* Watch out for global destruction's random ordering. */
9827 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9830 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9833 op_free((OP_4tree*)ri->data->data[n]);
9835 PAD_RESTORE_LOCAL(old_comppad);
9836 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9842 { /* Aho Corasick add-on structure for a trie node.
9843 Used in stclass optimization only */
9845 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9847 refcount = --aho->refcount;
9850 PerlMemShared_free(aho->states);
9851 PerlMemShared_free(aho->fail);
9852 /* do this last!!!! */
9853 PerlMemShared_free(ri->data->data[n]);
9854 PerlMemShared_free(ri->regstclass);
9860 /* trie structure. */
9862 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9864 refcount = --trie->refcount;
9867 PerlMemShared_free(trie->charmap);
9868 PerlMemShared_free(trie->states);
9869 PerlMemShared_free(trie->trans);
9871 PerlMemShared_free(trie->bitmap);
9873 PerlMemShared_free(trie->jump);
9874 PerlMemShared_free(trie->wordinfo);
9875 /* do this last!!!! */
9876 PerlMemShared_free(ri->data->data[n]);
9881 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9884 Safefree(ri->data->what);
9891 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9892 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9893 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9896 re_dup - duplicate a regexp.
9898 This routine is expected to clone a given regexp structure. It is only
9899 compiled under USE_ITHREADS.
9901 After all of the core data stored in struct regexp is duplicated
9902 the regexp_engine.dupe method is used to copy any private data
9903 stored in the *pprivate pointer. This allows extensions to handle
9904 any duplication it needs to do.
9906 See pregfree() and regfree_internal() if you change anything here.
9908 #if defined(USE_ITHREADS)
9909 #ifndef PERL_IN_XSUB_RE
9911 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9915 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9916 struct regexp *ret = (struct regexp *)SvANY(dstr);
9918 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9920 npar = r->nparens+1;
9921 Newx(ret->offs, npar, regexp_paren_pair);
9922 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9924 /* no need to copy these */
9925 Newx(ret->swap, npar, regexp_paren_pair);
9929 /* Do it this way to avoid reading from *r after the StructCopy().
9930 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9931 cache, it doesn't matter. */
9932 const bool anchored = r->check_substr
9933 ? r->check_substr == r->anchored_substr
9934 : r->check_utf8 == r->anchored_utf8;
9935 Newx(ret->substrs, 1, struct reg_substr_data);
9936 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9938 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9939 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9940 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9941 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9943 /* check_substr and check_utf8, if non-NULL, point to either their
9944 anchored or float namesakes, and don't hold a second reference. */
9946 if (ret->check_substr) {
9948 assert(r->check_utf8 == r->anchored_utf8);
9949 ret->check_substr = ret->anchored_substr;
9950 ret->check_utf8 = ret->anchored_utf8;
9952 assert(r->check_substr == r->float_substr);
9953 assert(r->check_utf8 == r->float_utf8);
9954 ret->check_substr = ret->float_substr;
9955 ret->check_utf8 = ret->float_utf8;
9957 } else if (ret->check_utf8) {
9959 ret->check_utf8 = ret->anchored_utf8;
9961 ret->check_utf8 = ret->float_utf8;
9966 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9969 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9971 if (RX_MATCH_COPIED(dstr))
9972 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9975 #ifdef PERL_OLD_COPY_ON_WRITE
9976 ret->saved_copy = NULL;
9979 if (ret->mother_re) {
9980 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9981 /* Our storage points directly to our mother regexp, but that's
9982 1: a buffer in a different thread
9983 2: something we no longer hold a reference on
9984 so we need to copy it locally. */
9985 /* Note we need to sue SvCUR() on our mother_re, because it, in
9986 turn, may well be pointing to its own mother_re. */
9987 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9988 SvCUR(ret->mother_re)+1));
9989 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9991 ret->mother_re = NULL;
9995 #endif /* PERL_IN_XSUB_RE */
10000 This is the internal complement to regdupe() which is used to copy
10001 the structure pointed to by the *pprivate pointer in the regexp.
10002 This is the core version of the extension overridable cloning hook.
10003 The regexp structure being duplicated will be copied by perl prior
10004 to this and will be provided as the regexp *r argument, however
10005 with the /old/ structures pprivate pointer value. Thus this routine
10006 may override any copying normally done by perl.
10008 It returns a pointer to the new regexp_internal structure.
10012 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10015 struct regexp *const r = (struct regexp *)SvANY(rx);
10016 regexp_internal *reti;
10018 RXi_GET_DECL(r,ri);
10020 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10022 npar = r->nparens+1;
10025 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10026 Copy(ri->program, reti->program, len+1, regnode);
10029 reti->regstclass = NULL;
10032 struct reg_data *d;
10033 const int count = ri->data->count;
10036 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10037 char, struct reg_data);
10038 Newx(d->what, count, U8);
10041 for (i = 0; i < count; i++) {
10042 d->what[i] = ri->data->what[i];
10043 switch (d->what[i]) {
10044 /* legal options are one of: sSfpontTua
10045 see also regcomp.h and pregfree() */
10046 case 'a': /* actually an AV, but the dup function is identical. */
10049 case 'p': /* actually an AV, but the dup function is identical. */
10050 case 'u': /* actually an HV, but the dup function is identical. */
10051 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10054 /* This is cheating. */
10055 Newx(d->data[i], 1, struct regnode_charclass_class);
10056 StructCopy(ri->data->data[i], d->data[i],
10057 struct regnode_charclass_class);
10058 reti->regstclass = (regnode*)d->data[i];
10061 /* Compiled op trees are readonly and in shared memory,
10062 and can thus be shared without duplication. */
10064 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10068 /* Trie stclasses are readonly and can thus be shared
10069 * without duplication. We free the stclass in pregfree
10070 * when the corresponding reg_ac_data struct is freed.
10072 reti->regstclass= ri->regstclass;
10076 ((reg_trie_data*)ri->data->data[i])->refcount++;
10080 d->data[i] = ri->data->data[i];
10083 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10092 reti->name_list_idx = ri->name_list_idx;
10094 #ifdef RE_TRACK_PATTERN_OFFSETS
10095 if (ri->u.offsets) {
10096 Newx(reti->u.offsets, 2*len+1, U32);
10097 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10100 SetProgLen(reti,len);
10103 return (void*)reti;
10106 #endif /* USE_ITHREADS */
10108 #ifndef PERL_IN_XSUB_RE
10111 - regnext - dig the "next" pointer out of a node
10114 Perl_regnext(pTHX_ register regnode *p)
10117 register I32 offset;
10122 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10123 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10126 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10135 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10138 STRLEN l1 = strlen(pat1);
10139 STRLEN l2 = strlen(pat2);
10142 const char *message;
10144 PERL_ARGS_ASSERT_RE_CROAK2;
10150 Copy(pat1, buf, l1 , char);
10151 Copy(pat2, buf + l1, l2 , char);
10152 buf[l1 + l2] = '\n';
10153 buf[l1 + l2 + 1] = '\0';
10155 /* ANSI variant takes additional second argument */
10156 va_start(args, pat2);
10160 msv = vmess(buf, &args);
10162 message = SvPV_const(msv,l1);
10165 Copy(message, buf, l1 , char);
10166 buf[l1-1] = '\0'; /* Overwrite \n */
10167 Perl_croak(aTHX_ "%s", buf);
10170 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10172 #ifndef PERL_IN_XSUB_RE
10174 Perl_save_re_context(pTHX)
10178 struct re_save_state *state;
10180 SAVEVPTR(PL_curcop);
10181 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10183 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10184 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10185 SSPUSHUV(SAVEt_RE_STATE);
10187 Copy(&PL_reg_state, state, 1, struct re_save_state);
10189 PL_reg_start_tmp = 0;
10190 PL_reg_start_tmpl = 0;
10191 PL_reg_oldsaved = NULL;
10192 PL_reg_oldsavedlen = 0;
10193 PL_reg_maxiter = 0;
10194 PL_reg_leftiter = 0;
10195 PL_reg_poscache = NULL;
10196 PL_reg_poscache_size = 0;
10197 #ifdef PERL_OLD_COPY_ON_WRITE
10201 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10203 const REGEXP * const rx = PM_GETRE(PL_curpm);
10206 for (i = 1; i <= RX_NPARENS(rx); i++) {
10207 char digits[TYPE_CHARS(long)];
10208 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10209 GV *const *const gvp
10210 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10213 GV * const gv = *gvp;
10214 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10224 clear_re(pTHX_ void *r)
10227 ReREFCNT_dec((REGEXP *)r);
10233 S_put_byte(pTHX_ SV *sv, int c)
10235 PERL_ARGS_ASSERT_PUT_BYTE;
10237 /* Our definition of isPRINT() ignores locales, so only bytes that are
10238 not part of UTF-8 are considered printable. I assume that the same
10239 holds for UTF-EBCDIC.
10240 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10241 which Wikipedia says:
10243 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10244 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10245 identical, to the ASCII delete (DEL) or rubout control character.
10246 ) So the old condition can be simplified to !isPRINT(c) */
10248 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10250 const char string = c;
10251 if (c == '-' || c == ']' || c == '\\' || c == '^')
10252 sv_catpvs(sv, "\\");
10253 sv_catpvn(sv, &string, 1);
10258 #define CLEAR_OPTSTART \
10259 if (optstart) STMT_START { \
10260 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10264 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10266 STATIC const regnode *
10267 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10268 const regnode *last, const regnode *plast,
10269 SV* sv, I32 indent, U32 depth)
10272 register U8 op = PSEUDO; /* Arbitrary non-END op. */
10273 register const regnode *next;
10274 const regnode *optstart= NULL;
10276 RXi_GET_DECL(r,ri);
10277 GET_RE_DEBUG_FLAGS_DECL;
10279 PERL_ARGS_ASSERT_DUMPUNTIL;
10281 #ifdef DEBUG_DUMPUNTIL
10282 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10283 last ? last-start : 0,plast ? plast-start : 0);
10286 if (plast && plast < last)
10289 while (PL_regkind[op] != END && (!last || node < last)) {
10290 /* While that wasn't END last time... */
10293 if (op == CLOSE || op == WHILEM)
10295 next = regnext((regnode *)node);
10298 if (OP(node) == OPTIMIZED) {
10299 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10306 regprop(r, sv, node);
10307 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10308 (int)(2*indent + 1), "", SvPVX_const(sv));
10310 if (OP(node) != OPTIMIZED) {
10311 if (next == NULL) /* Next ptr. */
10312 PerlIO_printf(Perl_debug_log, " (0)");
10313 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10314 PerlIO_printf(Perl_debug_log, " (FAIL)");
10316 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10317 (void)PerlIO_putc(Perl_debug_log, '\n');
10321 if (PL_regkind[(U8)op] == BRANCHJ) {
10324 register const regnode *nnode = (OP(next) == LONGJMP
10325 ? regnext((regnode *)next)
10327 if (last && nnode > last)
10329 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10332 else if (PL_regkind[(U8)op] == BRANCH) {
10334 DUMPUNTIL(NEXTOPER(node), next);
10336 else if ( PL_regkind[(U8)op] == TRIE ) {
10337 const regnode *this_trie = node;
10338 const char op = OP(node);
10339 const U32 n = ARG(node);
10340 const reg_ac_data * const ac = op>=AHOCORASICK ?
10341 (reg_ac_data *)ri->data->data[n] :
10343 const reg_trie_data * const trie =
10344 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10346 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10348 const regnode *nextbranch= NULL;
10351 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10352 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10354 PerlIO_printf(Perl_debug_log, "%*s%s ",
10355 (int)(2*(indent+3)), "",
10356 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10357 PL_colors[0], PL_colors[1],
10358 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10359 PERL_PV_PRETTY_ELLIPSES |
10360 PERL_PV_PRETTY_LTGT
10365 U16 dist= trie->jump[word_idx+1];
10366 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10367 (UV)((dist ? this_trie + dist : next) - start));
10370 nextbranch= this_trie + trie->jump[0];
10371 DUMPUNTIL(this_trie + dist, nextbranch);
10373 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10374 nextbranch= regnext((regnode *)nextbranch);
10376 PerlIO_printf(Perl_debug_log, "\n");
10379 if (last && next > last)
10384 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10385 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10386 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10388 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10390 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10392 else if ( op == PLUS || op == STAR) {
10393 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10395 else if (op == ANYOF) {
10396 /* arglen 1 + class block */
10397 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10398 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10399 node = NEXTOPER(node);
10401 else if (PL_regkind[(U8)op] == EXACT) {
10402 /* Literal string, where present. */
10403 node += NODE_SZ_STR(node) - 1;
10404 node = NEXTOPER(node);
10407 node = NEXTOPER(node);
10408 node += regarglen[(U8)op];
10410 if (op == CURLYX || op == OPEN)
10414 #ifdef DEBUG_DUMPUNTIL
10415 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10420 #endif /* DEBUGGING */
10424 * c-indentation-style: bsd
10425 * c-basic-offset: 4
10426 * indent-tabs-mode: t
10429 * ex: set ts=8 sts=4 sw=4 noet: