5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *paren_names; /* Paren names */
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
142 char *starttry; /* -Dr: where regtry was called. */
143 #define RExC_starttry (pRExC_state->starttry)
146 const char *lastparse;
148 AV *paren_name_list; /* idx -> name */
149 #define RExC_lastparse (pRExC_state->lastparse)
150 #define RExC_lastnum (pRExC_state->lastnum)
151 #define RExC_paren_name_list (pRExC_state->paren_name_list)
155 #define RExC_flags (pRExC_state->flags)
156 #define RExC_precomp (pRExC_state->precomp)
157 #define RExC_rx_sv (pRExC_state->rx_sv)
158 #define RExC_rx (pRExC_state->rx)
159 #define RExC_rxi (pRExC_state->rxi)
160 #define RExC_start (pRExC_state->start)
161 #define RExC_end (pRExC_state->end)
162 #define RExC_parse (pRExC_state->parse)
163 #define RExC_whilem_seen (pRExC_state->whilem_seen)
164 #ifdef RE_TRACK_PATTERN_OFFSETS
165 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
167 #define RExC_emit (pRExC_state->emit)
168 #define RExC_emit_start (pRExC_state->emit_start)
169 #define RExC_emit_bound (pRExC_state->emit_bound)
170 #define RExC_naughty (pRExC_state->naughty)
171 #define RExC_sawback (pRExC_state->sawback)
172 #define RExC_seen (pRExC_state->seen)
173 #define RExC_size (pRExC_state->size)
174 #define RExC_npar (pRExC_state->npar)
175 #define RExC_nestroot (pRExC_state->nestroot)
176 #define RExC_extralen (pRExC_state->extralen)
177 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178 #define RExC_seen_evals (pRExC_state->seen_evals)
179 #define RExC_utf8 (pRExC_state->utf8)
180 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
205 #define SPSTART 0x04 /* Starts with * or +. */
206 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
207 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
211 /* whether trie related optimizations are enabled */
212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213 #define TRIE_STUDY_OPT
214 #define FULL_TRIE_STUDY
220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221 #define PBITVAL(paren) (1 << ((paren) & 7))
222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
226 /* If not already in utf8, do a longjmp back to the beginning */
227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228 #define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
232 /* About scan_data_t.
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
248 The strings can be composites, for instance
252 will result in a composite fixed substring 'foo'.
254 For each string some basic information is maintained:
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
307 typedef struct scan_data_t {
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
313 I32 last_end; /* min value, <0 unless valid. */
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
329 struct regnode_charclass_class *start_class;
333 * Forward declarations for pregcomp()'s friends.
336 static const scan_data_t zero_scan_data =
337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
339 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
340 #define SF_BEFORE_SEOL 0x0001
341 #define SF_BEFORE_MEOL 0x0002
342 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
346 # define SF_FIX_SHIFT_EOL (0+2)
347 # define SF_FL_SHIFT_EOL (0+4)
349 # define SF_FIX_SHIFT_EOL (+2)
350 # define SF_FL_SHIFT_EOL (+4)
353 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
356 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
358 #define SF_IS_INF 0x0040
359 #define SF_HAS_PAR 0x0080
360 #define SF_IN_PAR 0x0100
361 #define SF_HAS_EVAL 0x0200
362 #define SCF_DO_SUBSTR 0x0400
363 #define SCF_DO_STCLASS_AND 0x0800
364 #define SCF_DO_STCLASS_OR 0x1000
365 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
366 #define SCF_WHILEM_VISITED_POS 0x2000
368 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
369 #define SCF_SEEN_ACCEPT 0x8000
371 #define UTF cBOOL(RExC_utf8)
372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
376 #define OOB_UNICODE 12345678
377 #define OOB_NAMEDCLASS -1
379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
383 /* length of regex to show in messages that don't mark a position within */
384 #define RegexLengthToShowInErrorMessages 127
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
391 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
392 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
401 #define _FAIL(code) STMT_START { \
402 const char *ellipses = ""; \
403 IV len = RExC_end - RExC_precomp; \
406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
415 #define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
419 #define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
426 #define Simple_vFAIL(m) STMT_START { \
427 const IV offset = RExC_parse - RExC_precomp; \
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
435 #define vFAIL(m) STMT_START { \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
442 * Like Simple_vFAIL(), but accepts two arguments.
444 #define Simple_vFAIL2(m,a1) STMT_START { \
445 const IV offset = RExC_parse - RExC_precomp; \
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
453 #define vFAIL2(m,a1) STMT_START { \
455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
456 Simple_vFAIL2(m, a1); \
461 * Like Simple_vFAIL(), but accepts three arguments.
463 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
464 const IV offset = RExC_parse - RExC_precomp; \
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
472 #define vFAIL3(m,a1,a2) STMT_START { \
474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
475 Simple_vFAIL3(m, a1, a2); \
479 * Like Simple_vFAIL(), but accepts four arguments.
481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
482 const IV offset = RExC_parse - RExC_precomp; \
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define ckWARNreg(loc,m) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define ckWARNregdep(loc,m) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 #define ckWARN2reg(loc, m, a1) STMT_START { \
501 const IV offset = loc - RExC_precomp; \
502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
506 #define vWARN3(loc, m, a1, a2) STMT_START { \
507 const IV offset = loc - RExC_precomp; \
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
518 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
519 const IV offset = loc - RExC_precomp; \
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 /* Allow for side effects in s */
538 #define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
542 /* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
546 * Position is 1 indexed.
548 #ifndef RE_TRACK_PATTERN_OFFSETS
549 #define Set_Node_Offset_To_R(node,byte)
550 #define Set_Node_Offset(node,byte)
551 #define Set_Cur_Node_Offset
552 #define Set_Node_Length_To_R(node,len)
553 #define Set_Node_Length(node,len)
554 #define Set_Node_Cur_Length(node)
555 #define Node_Offset(n)
556 #define Node_Length(n)
557 #define Set_Node_Offset_Length(node,offset,len)
558 #define ProgLen(ri) ri->u.proglen
559 #define SetProgLen(ri,x) ri->u.proglen = x
561 #define ProgLen(ri) ri->u.offsets[0]
562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
563 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
566 __LINE__, (int)(node), (int)(byte))); \
568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
570 RExC_offsets[2*(node)-1] = (byte); \
575 #define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
579 #define Set_Node_Length_To_R(node,len) STMT_START { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
582 __LINE__, (int)(node), (int)(len))); \
584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
586 RExC_offsets[2*(node)] = (len); \
591 #define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594 #define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
597 /* Get offsets and lengths */
598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
601 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608 #define EXPERIMENTAL_INPLACESCAN
609 #endif /*RE_TRACK_PATTERN_OFFSETS*/
611 #define DEBUG_STUDYDATA(str,data,depth) \
612 DEBUG_OPTIMISE_MORE_r(if(data){ \
613 PerlIO_printf(Perl_debug_log, \
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
619 (UV)((data)->flags), \
620 (IV)((data)->whilem_c), \
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
642 PerlIO_printf(Perl_debug_log,"\n"); \
645 static void clear_re(pTHX_ void *r);
647 /* Mark that we cannot extend a found fixed substring at this point.
648 Update the longest found anchored substring and the longest found
649 floating substrings if needed. */
652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
656 GET_RE_DEBUG_FLAGS_DECL;
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
661 SvSetMagicSV(*data->longest, data->last_found);
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
668 data->flags &= ~SF_FIX_BEFORE_EOL;
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
672 else { /* *data->longest == data->longest_float */
673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
674 data->offset_float_max = (l
675 ? data->last_start_max
676 : data->pos_min + data->pos_delta);
677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
678 data->offset_float_max = I32_MAX;
679 if (data->flags & SF_BEFORE_EOL)
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
683 data->flags &= ~SF_FL_BEFORE_EOL;
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
688 SvCUR_set(data->last_found, 0);
690 SV * const sv = data->last_found;
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
698 data->flags &= ~SF_BEFORE_EOL;
699 DEBUG_STUDYDATA("commit: ",data,0);
702 /* Can match anything (initialization) */
704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 PERL_ARGS_ASSERT_CL_ANYTHING;
708 ANYOF_CLASS_ZERO(cl);
709 ANYOF_BITMAP_SETALL(cl);
710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
712 cl->flags |= ANYOF_LOCALE;
715 /* Can match anything (initialization) */
717 S_cl_is_anything(const struct regnode_charclass_class *cl)
721 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723 for (value = 0; value <= ANYOF_MAX; value += 2)
724 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
733 /* Can match anything (initialization) */
735 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
737 PERL_ARGS_ASSERT_CL_INIT;
739 Zero(cl, 1, struct regnode_charclass_class);
741 cl_anything(pRExC_state, cl);
745 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
747 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749 Zero(cl, 1, struct regnode_charclass_class);
751 cl_anything(pRExC_state, cl);
753 cl->flags |= ANYOF_LOCALE;
756 /* 'And' a given class with another one. Can create false positives */
757 /* We assume that cl is not inverted */
759 S_cl_and(struct regnode_charclass_class *cl,
760 const struct regnode_charclass_class *and_with)
762 PERL_ARGS_ASSERT_CL_AND;
764 assert(and_with->type == ANYOF);
765 if (!(and_with->flags & ANYOF_CLASS)
766 && !(cl->flags & ANYOF_CLASS)
767 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
768 && !(and_with->flags & ANYOF_FOLD)
769 && !(cl->flags & ANYOF_FOLD)) {
772 if (and_with->flags & ANYOF_INVERT)
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= ~and_with->bitmap[i];
776 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
777 cl->bitmap[i] &= and_with->bitmap[i];
778 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
779 if (!(and_with->flags & ANYOF_EOS))
780 cl->flags &= ~ANYOF_EOS;
782 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
783 !(and_with->flags & ANYOF_INVERT)) {
784 cl->flags &= ~ANYOF_UNICODE_ALL;
785 cl->flags |= ANYOF_UNICODE;
786 ARG_SET(cl, ARG(and_with));
788 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
789 !(and_with->flags & ANYOF_INVERT))
790 cl->flags &= ~ANYOF_UNICODE_ALL;
791 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
792 !(and_with->flags & ANYOF_INVERT))
793 cl->flags &= ~ANYOF_UNICODE;
796 /* 'OR' a given class with another one. Can create false positives */
797 /* We assume that cl is not inverted */
799 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
801 PERL_ARGS_ASSERT_CL_OR;
803 if (or_with->flags & ANYOF_INVERT) {
805 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
806 * <= (B1 | !B2) | (CL1 | !CL2)
807 * which is wasteful if CL2 is small, but we ignore CL2:
808 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
809 * XXXX Can we handle case-fold? Unclear:
810 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
811 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
813 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(or_with->flags & ANYOF_FOLD)
815 && !(cl->flags & ANYOF_FOLD) ) {
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] |= ~or_with->bitmap[i];
820 } /* XXXX: logic is complicated otherwise */
822 cl_anything(pRExC_state, cl);
825 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
826 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
827 && (!(or_with->flags & ANYOF_FOLD)
828 || (cl->flags & ANYOF_FOLD)) ) {
831 /* OR char bitmap and class bitmap separately */
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] |= or_with->bitmap[i];
834 if (or_with->flags & ANYOF_CLASS) {
835 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
836 cl->classflags[i] |= or_with->classflags[i];
837 cl->flags |= ANYOF_CLASS;
840 else { /* XXXX: logic is complicated, leave it along for a moment. */
841 cl_anything(pRExC_state, cl);
844 if (or_with->flags & ANYOF_EOS)
845 cl->flags |= ANYOF_EOS;
847 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
848 ARG(cl) != ARG(or_with)) {
849 cl->flags |= ANYOF_UNICODE_ALL;
850 cl->flags &= ~ANYOF_UNICODE;
852 if (or_with->flags & ANYOF_UNICODE_ALL) {
853 cl->flags |= ANYOF_UNICODE_ALL;
854 cl->flags &= ~ANYOF_UNICODE;
858 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
859 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
860 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
861 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
866 dump_trie(trie,widecharmap,revcharmap)
867 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
868 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
870 These routines dump out a trie in a somewhat readable format.
871 The _interim_ variants are used for debugging the interim
872 tables that are used to generate the final compressed
873 representation which is what dump_trie expects.
875 Part of the reason for their existance is to provide a form
876 of documentation as to how the different representations function.
881 Dumps the final compressed table form of the trie to Perl_debug_log.
882 Used for debugging make_trie().
886 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
887 AV *revcharmap, U32 depth)
890 SV *sv=sv_newmortal();
891 int colwidth= widecharmap ? 6 : 4;
893 GET_RE_DEBUG_FLAGS_DECL;
895 PERL_ARGS_ASSERT_DUMP_TRIE;
897 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
898 (int)depth * 2 + 2,"",
899 "Match","Base","Ofs" );
901 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
902 SV ** const tmp = av_fetch( revcharmap, state, 0);
904 PerlIO_printf( Perl_debug_log, "%*s",
906 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
907 PL_colors[0], PL_colors[1],
908 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
909 PERL_PV_ESCAPE_FIRSTCHAR
914 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
915 (int)depth * 2 + 2,"");
917 for( state = 0 ; state < trie->uniquecharcount ; state++ )
918 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
919 PerlIO_printf( Perl_debug_log, "\n");
921 for( state = 1 ; state < trie->statecount ; state++ ) {
922 const U32 base = trie->states[ state ].trans.base;
924 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
926 if ( trie->states[ state ].wordnum ) {
927 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
929 PerlIO_printf( Perl_debug_log, "%6s", "" );
932 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
937 while( ( base + ofs < trie->uniquecharcount ) ||
938 ( base + ofs - trie->uniquecharcount < trie->lasttrans
939 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
942 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
944 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
945 if ( ( base + ofs >= trie->uniquecharcount ) &&
946 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
947 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
949 PerlIO_printf( Perl_debug_log, "%*"UVXf,
951 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
953 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
957 PerlIO_printf( Perl_debug_log, "]");
960 PerlIO_printf( Perl_debug_log, "\n" );
962 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
963 for (word=1; word <= trie->wordcount; word++) {
964 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
965 (int)word, (int)(trie->wordinfo[word].prev),
966 (int)(trie->wordinfo[word].len));
968 PerlIO_printf(Perl_debug_log, "\n" );
971 Dumps a fully constructed but uncompressed trie in list form.
972 List tries normally only are used for construction when the number of
973 possible chars (trie->uniquecharcount) is very high.
974 Used for debugging make_trie().
977 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
982 SV *sv=sv_newmortal();
983 int colwidth= widecharmap ? 6 : 4;
984 GET_RE_DEBUG_FLAGS_DECL;
986 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
988 /* print out the table precompression. */
989 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
990 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
991 "------:-----+-----------------\n" );
993 for( state=1 ; state < next_alloc ; state ++ ) {
996 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
997 (int)depth * 2 + 2,"", (UV)state );
998 if ( ! trie->states[ state ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, "%5s| ","");
1001 PerlIO_printf( Perl_debug_log, "W%4x| ",
1002 trie->states[ state ].wordnum
1005 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1006 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1008 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1015 TRIE_LIST_ITEM(state,charid).forid,
1016 (UV)TRIE_LIST_ITEM(state,charid).newstate
1019 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1020 (int)((depth * 2) + 14), "");
1023 PerlIO_printf( Perl_debug_log, "\n");
1028 Dumps a fully constructed but uncompressed trie in table form.
1029 This is the normal DFA style state transition table, with a few
1030 twists to facilitate compression later.
1031 Used for debugging make_trie().
1034 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1035 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1040 SV *sv=sv_newmortal();
1041 int colwidth= widecharmap ? 6 : 4;
1042 GET_RE_DEBUG_FLAGS_DECL;
1044 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1047 print out the table precompression so that we can do a visual check
1048 that they are identical.
1051 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1054 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1056 PerlIO_printf( Perl_debug_log, "%*s",
1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1067 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1069 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1073 PerlIO_printf( Perl_debug_log, "\n" );
1075 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1077 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1078 (int)depth * 2 + 2,"",
1079 (UV)TRIE_NODENUM( state ) );
1081 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1082 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1084 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1086 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1088 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1091 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1092 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1100 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1101 startbranch: the first branch in the whole branch sequence
1102 first : start branch of sequence of branch-exact nodes.
1103 May be the same as startbranch
1104 last : Thing following the last branch.
1105 May be the same as tail.
1106 tail : item following the branch sequence
1107 count : words in the sequence
1108 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1109 depth : indent depth
1111 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1113 A trie is an N'ary tree where the branches are determined by digital
1114 decomposition of the key. IE, at the root node you look up the 1st character and
1115 follow that branch repeat until you find the end of the branches. Nodes can be
1116 marked as "accepting" meaning they represent a complete word. Eg:
1120 would convert into the following structure. Numbers represent states, letters
1121 following numbers represent valid transitions on the letter from that state, if
1122 the number is in square brackets it represents an accepting state, otherwise it
1123 will be in parenthesis.
1125 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1129 (1) +-i->(6)-+-s->[7]
1131 +-s->(3)-+-h->(4)-+-e->[5]
1133 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1135 This shows that when matching against the string 'hers' we will begin at state 1
1136 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1137 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1138 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1139 single traverse. We store a mapping from accepting to state to which word was
1140 matched, and then when we have multiple possibilities we try to complete the
1141 rest of the regex in the order in which they occured in the alternation.
1143 The only prior NFA like behaviour that would be changed by the TRIE support is
1144 the silent ignoring of duplicate alternations which are of the form:
1146 / (DUPE|DUPE) X? (?{ ... }) Y /x
1148 Thus EVAL blocks follwing a trie may be called a different number of times with
1149 and without the optimisation. With the optimisations dupes will be silently
1150 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1151 the following demonstrates:
1153 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1155 which prints out 'word' three times, but
1157 'words'=~/(word|word|word)(?{ print $1 })S/
1159 which doesnt print it out at all. This is due to other optimisations kicking in.
1161 Example of what happens on a structural level:
1163 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1165 1: CURLYM[1] {1,32767}(18)
1176 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1177 and should turn into:
1179 1: CURLYM[1] {1,32767}(18)
1181 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1189 Cases where tail != last would be like /(?foo|bar)baz/:
1199 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1200 and would end up looking like:
1203 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1210 d = uvuni_to_utf8_flags(d, uv, 0);
1212 is the recommended Unicode-aware way of saying
1217 #define TRIE_STORE_REVCHAR \
1220 SV *zlopp = newSV(2); \
1221 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1222 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1223 SvCUR_set(zlopp, kapow - flrbbbbb); \
1226 av_push(revcharmap, zlopp); \
1228 char ooooff = (char)uvc; \
1229 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1233 #define TRIE_READ_CHAR STMT_START { \
1237 if ( foldlen > 0 ) { \
1238 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1243 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1244 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1245 foldlen -= UNISKIP( uvc ); \
1246 scan = foldbuf + UNISKIP( uvc ); \
1249 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1259 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1260 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1261 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1262 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1265 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1266 TRIE_LIST_CUR( state )++; \
1269 #define TRIE_LIST_NEW(state) STMT_START { \
1270 Newxz( trie->states[ state ].trans.list, \
1271 4, reg_trie_trans_le ); \
1272 TRIE_LIST_CUR( state ) = 1; \
1273 TRIE_LIST_LEN( state ) = 4; \
1276 #define TRIE_HANDLE_WORD(state) STMT_START { \
1277 U16 dupe= trie->states[ state ].wordnum; \
1278 regnode * const noper_next = regnext( noper ); \
1281 /* store the word for dumping */ \
1283 if (OP(noper) != NOTHING) \
1284 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1286 tmp = newSVpvn_utf8( "", 0, UTF ); \
1287 av_push( trie_words, tmp ); \
1291 trie->wordinfo[curword].prev = 0; \
1292 trie->wordinfo[curword].len = wordlen; \
1293 trie->wordinfo[curword].accept = state; \
1295 if ( noper_next < tail ) { \
1297 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1298 trie->jump[curword] = (U16)(noper_next - convert); \
1300 jumper = noper_next; \
1302 nextbranch= regnext(cur); \
1306 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1307 /* chain, so that when the bits of chain are later */\
1308 /* linked together, the dups appear in the chain */\
1309 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1310 trie->wordinfo[dupe].prev = curword; \
1312 /* we haven't inserted this word yet. */ \
1313 trie->states[ state ].wordnum = curword; \
1318 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1319 ( ( base + charid >= ucharcount \
1320 && base + charid < ubound \
1321 && state == trie->trans[ base - ucharcount + charid ].check \
1322 && trie->trans[ base - ucharcount + charid ].next ) \
1323 ? trie->trans[ base - ucharcount + charid ].next \
1324 : ( state==1 ? special : 0 ) \
1328 #define MADE_JUMP_TRIE 2
1329 #define MADE_EXACT_TRIE 4
1332 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1335 /* first pass, loop through and scan words */
1336 reg_trie_data *trie;
1337 HV *widecharmap = NULL;
1338 AV *revcharmap = newAV();
1340 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1345 regnode *jumper = NULL;
1346 regnode *nextbranch = NULL;
1347 regnode *convert = NULL;
1348 U32 *prev_states; /* temp array mapping each state to previous one */
1349 /* we just use folder as a flag in utf8 */
1350 const U8 * const folder = ( flags == EXACTF
1352 : ( flags == EXACTFL
1359 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1360 AV *trie_words = NULL;
1361 /* along with revcharmap, this only used during construction but both are
1362 * useful during debugging so we store them in the struct when debugging.
1365 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1366 STRLEN trie_charcount=0;
1368 SV *re_trie_maxbuff;
1369 GET_RE_DEBUG_FLAGS_DECL;
1371 PERL_ARGS_ASSERT_MAKE_TRIE;
1373 PERL_UNUSED_ARG(depth);
1376 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1378 trie->startstate = 1;
1379 trie->wordcount = word_count;
1380 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1381 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1382 if (!(UTF && folder))
1383 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1384 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1385 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1388 trie_words = newAV();
1391 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1392 if (!SvIOK(re_trie_maxbuff)) {
1393 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1396 PerlIO_printf( Perl_debug_log,
1397 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1398 (int)depth * 2 + 2, "",
1399 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1400 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1404 /* Find the node we are going to overwrite */
1405 if ( first == startbranch && OP( last ) != BRANCH ) {
1406 /* whole branch chain */
1409 /* branch sub-chain */
1410 convert = NEXTOPER( first );
1413 /* -- First loop and Setup --
1415 We first traverse the branches and scan each word to determine if it
1416 contains widechars, and how many unique chars there are, this is
1417 important as we have to build a table with at least as many columns as we
1420 We use an array of integers to represent the character codes 0..255
1421 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1422 native representation of the character value as the key and IV's for the
1425 *TODO* If we keep track of how many times each character is used we can
1426 remap the columns so that the table compression later on is more
1427 efficient in terms of memory by ensuring most common value is in the
1428 middle and the least common are on the outside. IMO this would be better
1429 than a most to least common mapping as theres a decent chance the most
1430 common letter will share a node with the least common, meaning the node
1431 will not be compressable. With a middle is most common approach the worst
1432 case is when we have the least common nodes twice.
1436 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1437 regnode * const noper = NEXTOPER( cur );
1438 const U8 *uc = (U8*)STRING( noper );
1439 const U8 * const e = uc + STR_LEN( noper );
1441 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1442 const U8 *scan = (U8*)NULL;
1443 U32 wordlen = 0; /* required init */
1445 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1447 if (OP(noper) == NOTHING) {
1451 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1452 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1453 regardless of encoding */
1455 for ( ; uc < e ; uc += len ) {
1456 TRIE_CHARCOUNT(trie)++;
1460 if ( !trie->charmap[ uvc ] ) {
1461 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1463 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1467 /* store the codepoint in the bitmap, and if its ascii
1468 also store its folded equivelent. */
1469 TRIE_BITMAP_SET(trie,uvc);
1471 /* store the folded codepoint */
1472 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1475 /* store first byte of utf8 representation of
1476 codepoints in the 127 < uvc < 256 range */
1477 if (127 < uvc && uvc < 192) {
1478 TRIE_BITMAP_SET(trie,194);
1479 } else if (191 < uvc ) {
1480 TRIE_BITMAP_SET(trie,195);
1481 /* && uvc < 256 -- we know uvc is < 256 already */
1484 set_bit = 0; /* We've done our bit :-) */
1489 widecharmap = newHV();
1491 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1494 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1496 if ( !SvTRUE( *svpp ) ) {
1497 sv_setiv( *svpp, ++trie->uniquecharcount );
1502 if( cur == first ) {
1505 } else if (chars < trie->minlen) {
1507 } else if (chars > trie->maxlen) {
1511 } /* end first pass */
1512 DEBUG_TRIE_COMPILE_r(
1513 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1514 (int)depth * 2 + 2,"",
1515 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1516 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1517 (int)trie->minlen, (int)trie->maxlen )
1521 We now know what we are dealing with in terms of unique chars and
1522 string sizes so we can calculate how much memory a naive
1523 representation using a flat table will take. If it's over a reasonable
1524 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1525 conservative but potentially much slower representation using an array
1528 At the end we convert both representations into the same compressed
1529 form that will be used in regexec.c for matching with. The latter
1530 is a form that cannot be used to construct with but has memory
1531 properties similar to the list form and access properties similar
1532 to the table form making it both suitable for fast searches and
1533 small enough that its feasable to store for the duration of a program.
1535 See the comment in the code where the compressed table is produced
1536 inplace from the flat tabe representation for an explanation of how
1537 the compression works.
1542 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1545 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1547 Second Pass -- Array Of Lists Representation
1549 Each state will be represented by a list of charid:state records
1550 (reg_trie_trans_le) the first such element holds the CUR and LEN
1551 points of the allocated array. (See defines above).
1553 We build the initial structure using the lists, and then convert
1554 it into the compressed table form which allows faster lookups
1555 (but cant be modified once converted).
1558 STRLEN transcount = 1;
1560 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1561 "%*sCompiling trie using list compiler\n",
1562 (int)depth * 2 + 2, ""));
1564 trie->states = (reg_trie_state *)
1565 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1566 sizeof(reg_trie_state) );
1570 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1572 regnode * const noper = NEXTOPER( cur );
1573 U8 *uc = (U8*)STRING( noper );
1574 const U8 * const e = uc + STR_LEN( noper );
1575 U32 state = 1; /* required init */
1576 U16 charid = 0; /* sanity init */
1577 U8 *scan = (U8*)NULL; /* sanity init */
1578 STRLEN foldlen = 0; /* required init */
1579 U32 wordlen = 0; /* required init */
1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1582 if (OP(noper) != NOTHING) {
1583 for ( ; uc < e ; uc += len ) {
1588 charid = trie->charmap[ uvc ];
1590 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1594 charid=(U16)SvIV( *svpp );
1597 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1604 if ( !trie->states[ state ].trans.list ) {
1605 TRIE_LIST_NEW( state );
1607 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1608 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1609 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1614 newstate = next_alloc++;
1615 prev_states[newstate] = state;
1616 TRIE_LIST_PUSH( state, charid, newstate );
1621 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1625 TRIE_HANDLE_WORD(state);
1627 } /* end second pass */
1629 /* next alloc is the NEXT state to be allocated */
1630 trie->statecount = next_alloc;
1631 trie->states = (reg_trie_state *)
1632 PerlMemShared_realloc( trie->states,
1634 * sizeof(reg_trie_state) );
1636 /* and now dump it out before we compress it */
1637 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1638 revcharmap, next_alloc,
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1650 for( state=1 ; state < next_alloc ; state ++ ) {
1654 DEBUG_TRIE_COMPILE_MORE_r(
1655 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1659 if (trie->states[state].trans.list) {
1660 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1664 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1665 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1666 if ( forid < minid ) {
1668 } else if ( forid > maxid ) {
1672 if ( transcount < tp + maxid - minid + 1) {
1674 trie->trans = (reg_trie_trans *)
1675 PerlMemShared_realloc( trie->trans,
1677 * sizeof(reg_trie_trans) );
1678 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1680 base = trie->uniquecharcount + tp - minid;
1681 if ( maxid == minid ) {
1683 for ( ; zp < tp ; zp++ ) {
1684 if ( ! trie->trans[ zp ].next ) {
1685 base = trie->uniquecharcount + zp - minid;
1686 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1687 trie->trans[ zp ].check = state;
1693 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ tp ].check = state;
1699 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1700 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1701 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1702 trie->trans[ tid ].check = state;
1704 tp += ( maxid - minid + 1 );
1706 Safefree(trie->states[ state ].trans.list);
1709 DEBUG_TRIE_COMPILE_MORE_r(
1710 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1713 trie->states[ state ].trans.base=base;
1715 trie->lasttrans = tp + 1;
1719 Second Pass -- Flat Table Representation.
1721 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1722 We know that we will need Charcount+1 trans at most to store the data
1723 (one row per char at worst case) So we preallocate both structures
1724 assuming worst case.
1726 We then construct the trie using only the .next slots of the entry
1729 We use the .check field of the first entry of the node temporarily to
1730 make compression both faster and easier by keeping track of how many non
1731 zero fields are in the node.
1733 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1736 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1737 number representing the first entry of the node, and state as a
1738 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1739 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1740 are 2 entrys per node. eg:
1748 The table is internally in the right hand, idx form. However as we also
1749 have to deal with the states array which is indexed by nodenum we have to
1750 use TRIE_NODENUM() to convert.
1753 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1754 "%*sCompiling trie using table compiler\n",
1755 (int)depth * 2 + 2, ""));
1757 trie->trans = (reg_trie_trans *)
1758 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1759 * trie->uniquecharcount + 1,
1760 sizeof(reg_trie_trans) );
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
1764 next_alloc = trie->uniquecharcount + 1;
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1769 regnode * const noper = NEXTOPER( cur );
1770 const U8 *uc = (U8*)STRING( noper );
1771 const U8 * const e = uc + STR_LEN( noper );
1773 U32 state = 1; /* required init */
1775 U16 charid = 0; /* sanity init */
1776 U32 accept_state = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1779 STRLEN foldlen = 0; /* required init */
1780 U32 wordlen = 0; /* required init */
1781 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1783 if ( OP(noper) != NOTHING ) {
1784 for ( ; uc < e ; uc += len ) {
1789 charid = trie->charmap[ uvc ];
1791 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1792 charid = svpp ? (U16)SvIV(*svpp) : 0;
1796 if ( !trie->trans[ state + charid ].next ) {
1797 trie->trans[ state + charid ].next = next_alloc;
1798 trie->trans[ state ].check++;
1799 prev_states[TRIE_NODENUM(next_alloc)]
1800 = TRIE_NODENUM(state);
1801 next_alloc += trie->uniquecharcount;
1803 state = trie->trans[ state + charid ].next;
1805 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1810 accept_state = TRIE_NODENUM( state );
1811 TRIE_HANDLE_WORD(accept_state);
1813 } /* end second pass */
1815 /* and now dump it out before we compress it */
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1818 next_alloc, depth+1));
1822 * Inplace compress the table.*
1824 For sparse data sets the table constructed by the trie algorithm will
1825 be mostly 0/FAIL transitions or to put it another way mostly empty.
1826 (Note that leaf nodes will not contain any transitions.)
1828 This algorithm compresses the tables by eliminating most such
1829 transitions, at the cost of a modest bit of extra work during lookup:
1831 - Each states[] entry contains a .base field which indicates the
1832 index in the state[] array wheres its transition data is stored.
1834 - If .base is 0 there are no valid transitions from that node.
1836 - If .base is nonzero then charid is added to it to find an entry in
1839 -If trans[states[state].base+charid].check!=state then the
1840 transition is taken to be a 0/Fail transition. Thus if there are fail
1841 transitions at the front of the node then the .base offset will point
1842 somewhere inside the previous nodes data (or maybe even into a node
1843 even earlier), but the .check field determines if the transition is
1847 The following process inplace converts the table to the compressed
1848 table: We first do not compress the root node 1,and mark its all its
1849 .check pointers as 1 and set its .base pointer as 1 as well. This
1850 allows to do a DFA construction from the compressed table later, and
1851 ensures that any .base pointers we calculate later are greater than
1854 - We set 'pos' to indicate the first entry of the second node.
1856 - We then iterate over the columns of the node, finding the first and
1857 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1858 and set the .check pointers accordingly, and advance pos
1859 appropriately and repreat for the next node. Note that when we copy
1860 the next pointers we have to convert them from the original
1861 NODEIDX form to NODENUM form as the former is not valid post
1864 - If a node has no transitions used we mark its base as 0 and do not
1865 advance the pos pointer.
1867 - If a node only has one transition we use a second pointer into the
1868 structure to fill in allocated fail transitions from other states.
1869 This pointer is independent of the main pointer and scans forward
1870 looking for null transitions that are allocated to a state. When it
1871 finds one it writes the single transition into the "hole". If the
1872 pointer doesnt find one the single transition is appended as normal.
1874 - Once compressed we can Renew/realloc the structures to release the
1877 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1878 specifically Fig 3.47 and the associated pseudocode.
1882 const U32 laststate = TRIE_NODENUM( next_alloc );
1885 trie->statecount = laststate;
1887 for ( state = 1 ; state < laststate ; state++ ) {
1889 const U32 stateidx = TRIE_NODEIDX( state );
1890 const U32 o_used = trie->trans[ stateidx ].check;
1891 U32 used = trie->trans[ stateidx ].check;
1892 trie->trans[ stateidx ].check = 0;
1894 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1895 if ( flag || trie->trans[ stateidx + charid ].next ) {
1896 if ( trie->trans[ stateidx + charid ].next ) {
1898 for ( ; zp < pos ; zp++ ) {
1899 if ( ! trie->trans[ zp ].next ) {
1903 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1904 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ zp ].check = state;
1906 if ( ++zp > pos ) pos = zp;
1913 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1915 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ pos ].check = state;
1921 trie->lasttrans = pos + 1;
1922 trie->states = (reg_trie_state *)
1923 PerlMemShared_realloc( trie->states, laststate
1924 * sizeof(reg_trie_state) );
1925 DEBUG_TRIE_COMPILE_MORE_r(
1926 PerlIO_printf( Perl_debug_log,
1927 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1928 (int)depth * 2 + 2,"",
1929 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1932 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1935 } /* end table compress */
1937 DEBUG_TRIE_COMPILE_MORE_r(
1938 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1939 (int)depth * 2 + 2, "",
1940 (UV)trie->statecount,
1941 (UV)trie->lasttrans)
1943 /* resize the trans array to remove unused space */
1944 trie->trans = (reg_trie_trans *)
1945 PerlMemShared_realloc( trie->trans, trie->lasttrans
1946 * sizeof(reg_trie_trans) );
1948 { /* Modify the program and insert the new TRIE node*/
1949 U8 nodetype =(U8)(flags & 0xFF);
1953 regnode *optimize = NULL;
1954 #ifdef RE_TRACK_PATTERN_OFFSETS
1957 U32 mjd_nodelen = 0;
1958 #endif /* RE_TRACK_PATTERN_OFFSETS */
1959 #endif /* DEBUGGING */
1961 This means we convert either the first branch or the first Exact,
1962 depending on whether the thing following (in 'last') is a branch
1963 or not and whther first is the startbranch (ie is it a sub part of
1964 the alternation or is it the whole thing.)
1965 Assuming its a sub part we conver the EXACT otherwise we convert
1966 the whole branch sequence, including the first.
1968 /* Find the node we are going to overwrite */
1969 if ( first != startbranch || OP( last ) == BRANCH ) {
1970 /* branch sub-chain */
1971 NEXT_OFF( first ) = (U16)(last - first);
1972 #ifdef RE_TRACK_PATTERN_OFFSETS
1974 mjd_offset= Node_Offset((convert));
1975 mjd_nodelen= Node_Length((convert));
1978 /* whole branch chain */
1980 #ifdef RE_TRACK_PATTERN_OFFSETS
1983 const regnode *nop = NEXTOPER( convert );
1984 mjd_offset= Node_Offset((nop));
1985 mjd_nodelen= Node_Length((nop));
1989 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1990 (int)depth * 2 + 2, "",
1991 (UV)mjd_offset, (UV)mjd_nodelen)
1994 /* But first we check to see if there is a common prefix we can
1995 split out as an EXACT and put in front of the TRIE node. */
1996 trie->startstate= 1;
1997 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1999 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2003 const U32 base = trie->states[ state ].trans.base;
2005 if ( trie->states[state].wordnum )
2008 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2009 if ( ( base + ofs >= trie->uniquecharcount ) &&
2010 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2011 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2013 if ( ++count > 1 ) {
2014 SV **tmp = av_fetch( revcharmap, ofs, 0);
2015 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2016 if ( state == 1 ) break;
2018 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2020 PerlIO_printf(Perl_debug_log,
2021 "%*sNew Start State=%"UVuf" Class: [",
2022 (int)depth * 2 + 2, "",
2025 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2026 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2028 TRIE_BITMAP_SET(trie,*ch);
2030 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2032 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2036 TRIE_BITMAP_SET(trie,*ch);
2038 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2045 SV **tmp = av_fetch( revcharmap, idx, 0);
2047 char *ch = SvPV( *tmp, len );
2049 SV *sv=sv_newmortal();
2050 PerlIO_printf( Perl_debug_log,
2051 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2052 (int)depth * 2 + 2, "",
2054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2055 PL_colors[0], PL_colors[1],
2056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2057 PERL_PV_ESCAPE_FIRSTCHAR
2062 OP( convert ) = nodetype;
2063 str=STRING(convert);
2066 STR_LEN(convert) += len;
2072 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2077 trie->prefixlen = (state-1);
2079 regnode *n = convert+NODE_SZ_STR(convert);
2080 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2081 trie->startstate = state;
2082 trie->minlen -= (state - 1);
2083 trie->maxlen -= (state - 1);
2085 /* At least the UNICOS C compiler choked on this
2086 * being argument to DEBUG_r(), so let's just have
2089 #ifdef PERL_EXT_RE_BUILD
2095 regnode *fix = convert;
2096 U32 word = trie->wordcount;
2098 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2099 while( ++fix < n ) {
2100 Set_Node_Offset_Length(fix, 0, 0);
2103 SV ** const tmp = av_fetch( trie_words, word, 0 );
2105 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2106 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2108 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2116 NEXT_OFF(convert) = (U16)(tail - convert);
2117 DEBUG_r(optimize= n);
2123 if ( trie->maxlen ) {
2124 NEXT_OFF( convert ) = (U16)(tail - convert);
2125 ARG_SET( convert, data_slot );
2126 /* Store the offset to the first unabsorbed branch in
2127 jump[0], which is otherwise unused by the jump logic.
2128 We use this when dumping a trie and during optimisation. */
2130 trie->jump[0] = (U16)(nextbranch - convert);
2133 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2134 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2136 OP( convert ) = TRIEC;
2137 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2138 PerlMemShared_free(trie->bitmap);
2141 OP( convert ) = TRIE;
2143 /* store the type in the flags */
2144 convert->flags = nodetype;
2148 + regarglen[ OP( convert ) ];
2150 /* XXX We really should free up the resource in trie now,
2151 as we won't use them - (which resources?) dmq */
2153 /* needed for dumping*/
2154 DEBUG_r(if (optimize) {
2155 regnode *opt = convert;
2157 while ( ++opt < optimize) {
2158 Set_Node_Offset_Length(opt,0,0);
2161 Try to clean up some of the debris left after the
2164 while( optimize < jumper ) {
2165 mjd_nodelen += Node_Length((optimize));
2166 OP( optimize ) = OPTIMIZED;
2167 Set_Node_Offset_Length(optimize,0,0);
2170 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2172 } /* end node insert */
2173 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2175 /* Finish populating the prev field of the wordinfo array. Walk back
2176 * from each accept state until we find another accept state, and if
2177 * so, point the first word's .prev field at the second word. If the
2178 * second already has a .prev field set, stop now. This will be the
2179 * case either if we've already processed that word's accept state,
2180 * or that that state had multiple words, and the overspill words
2181 * were already linked up earlier.
2188 for (word=1; word <= trie->wordcount; word++) {
2190 if (trie->wordinfo[word].prev)
2192 state = trie->wordinfo[word].accept;
2194 state = prev_states[state];
2197 prev = trie->states[state].wordnum;
2201 trie->wordinfo[word].prev = prev;
2203 Safefree(prev_states);
2207 /* and now dump out the compressed format */
2208 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2210 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2212 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2213 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2215 SvREFCNT_dec(revcharmap);
2219 : trie->startstate>1
2225 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2227 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2229 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2230 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2233 We find the fail state for each state in the trie, this state is the longest proper
2234 suffix of the current states 'word' that is also a proper prefix of another word in our
2235 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2236 the DFA not to have to restart after its tried and failed a word at a given point, it
2237 simply continues as though it had been matching the other word in the first place.
2239 'abcdgu'=~/abcdefg|cdgu/
2240 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2241 fail, which would bring use to the state representing 'd' in the second word where we would
2242 try 'g' and succeed, prodceding to match 'cdgu'.
2244 /* add a fail transition */
2245 const U32 trie_offset = ARG(source);
2246 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2248 const U32 ucharcount = trie->uniquecharcount;
2249 const U32 numstates = trie->statecount;
2250 const U32 ubound = trie->lasttrans + ucharcount;
2254 U32 base = trie->states[ 1 ].trans.base;
2257 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2258 GET_RE_DEBUG_FLAGS_DECL;
2260 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2262 PERL_UNUSED_ARG(depth);
2266 ARG_SET( stclass, data_slot );
2267 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2268 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2269 aho->trie=trie_offset;
2270 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2271 Copy( trie->states, aho->states, numstates, reg_trie_state );
2272 Newxz( q, numstates, U32);
2273 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2276 /* initialize fail[0..1] to be 1 so that we always have
2277 a valid final fail state */
2278 fail[ 0 ] = fail[ 1 ] = 1;
2280 for ( charid = 0; charid < ucharcount ; charid++ ) {
2281 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2283 q[ q_write ] = newstate;
2284 /* set to point at the root */
2285 fail[ q[ q_write++ ] ]=1;
2288 while ( q_read < q_write) {
2289 const U32 cur = q[ q_read++ % numstates ];
2290 base = trie->states[ cur ].trans.base;
2292 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2293 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2295 U32 fail_state = cur;
2298 fail_state = fail[ fail_state ];
2299 fail_base = aho->states[ fail_state ].trans.base;
2300 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2302 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2303 fail[ ch_state ] = fail_state;
2304 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2306 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2308 q[ q_write++ % numstates] = ch_state;
2312 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2313 when we fail in state 1, this allows us to use the
2314 charclass scan to find a valid start char. This is based on the principle
2315 that theres a good chance the string being searched contains lots of stuff
2316 that cant be a start char.
2318 fail[ 0 ] = fail[ 1 ] = 0;
2319 DEBUG_TRIE_COMPILE_r({
2320 PerlIO_printf(Perl_debug_log,
2321 "%*sStclass Failtable (%"UVuf" states): 0",
2322 (int)(depth * 2), "", (UV)numstates
2324 for( q_read=1; q_read<numstates; q_read++ ) {
2325 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2327 PerlIO_printf(Perl_debug_log, "\n");
2330 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2335 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2336 * These need to be revisited when a newer toolchain becomes available.
2338 #if defined(__sparc64__) && defined(__GNUC__)
2339 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2340 # undef SPARC64_GCC_WORKAROUND
2341 # define SPARC64_GCC_WORKAROUND 1
2345 #define DEBUG_PEEP(str,scan,depth) \
2346 DEBUG_OPTIMISE_r({if (scan){ \
2347 SV * const mysv=sv_newmortal(); \
2348 regnode *Next = regnext(scan); \
2349 regprop(RExC_rx, mysv, scan); \
2350 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2351 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2352 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2359 #define JOIN_EXACT(scan,min,flags) \
2360 if (PL_regkind[OP(scan)] == EXACT) \
2361 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2364 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2365 /* Merge several consecutive EXACTish nodes into one. */
2366 regnode *n = regnext(scan);
2368 regnode *next = scan + NODE_SZ_STR(scan);
2372 regnode *stop = scan;
2373 GET_RE_DEBUG_FLAGS_DECL;
2375 PERL_UNUSED_ARG(depth);
2378 PERL_ARGS_ASSERT_JOIN_EXACT;
2379 #ifndef EXPERIMENTAL_INPLACESCAN
2380 PERL_UNUSED_ARG(flags);
2381 PERL_UNUSED_ARG(val);
2383 DEBUG_PEEP("join",scan,depth);
2385 /* Skip NOTHING, merge EXACT*. */
2387 ( PL_regkind[OP(n)] == NOTHING ||
2388 (stringok && (OP(n) == OP(scan))))
2390 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2392 if (OP(n) == TAIL || n > next)
2394 if (PL_regkind[OP(n)] == NOTHING) {
2395 DEBUG_PEEP("skip:",n,depth);
2396 NEXT_OFF(scan) += NEXT_OFF(n);
2397 next = n + NODE_STEP_REGNODE;
2404 else if (stringok) {
2405 const unsigned int oldl = STR_LEN(scan);
2406 regnode * const nnext = regnext(n);
2408 DEBUG_PEEP("merg",n,depth);
2411 if (oldl + STR_LEN(n) > U8_MAX)
2413 NEXT_OFF(scan) += NEXT_OFF(n);
2414 STR_LEN(scan) += STR_LEN(n);
2415 next = n + NODE_SZ_STR(n);
2416 /* Now we can overwrite *n : */
2417 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2425 #ifdef EXPERIMENTAL_INPLACESCAN
2426 if (flags && !NEXT_OFF(n)) {
2427 DEBUG_PEEP("atch", val, depth);
2428 if (reg_off_by_arg[OP(n)]) {
2429 ARG_SET(n, val - n);
2432 NEXT_OFF(n) = val - n;
2439 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2441 Two problematic code points in Unicode casefolding of EXACT nodes:
2443 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2444 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2450 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2451 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2453 This means that in case-insensitive matching (or "loose matching",
2454 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2455 length of the above casefolded versions) can match a target string
2456 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2457 This would rather mess up the minimum length computation.
2459 What we'll do is to look for the tail four bytes, and then peek
2460 at the preceding two bytes to see whether we need to decrease
2461 the minimum length by four (six minus two).
2463 Thanks to the design of UTF-8, there cannot be false matches:
2464 A sequence of valid UTF-8 bytes cannot be a subsequence of
2465 another valid sequence of UTF-8 bytes.
2468 char * const s0 = STRING(scan), *s, *t;
2469 char * const s1 = s0 + STR_LEN(scan) - 1;
2470 char * const s2 = s1 - 4;
2471 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2472 const char t0[] = "\xaf\x49\xaf\x42";
2474 const char t0[] = "\xcc\x88\xcc\x81";
2476 const char * const t1 = t0 + 3;
2479 s < s2 && (t = ninstr(s, s1, t0, t1));
2482 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2483 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2485 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2486 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2494 n = scan + NODE_SZ_STR(scan);
2496 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2503 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2507 /* REx optimizer. Converts nodes into quickier variants "in place".
2508 Finds fixed substrings. */
2510 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2511 to the position after last scanned or to NULL. */
2513 #define INIT_AND_WITHP \
2514 assert(!and_withp); \
2515 Newx(and_withp,1,struct regnode_charclass_class); \
2516 SAVEFREEPV(and_withp)
2518 /* this is a chain of data about sub patterns we are processing that
2519 need to be handled seperately/specially in study_chunk. Its so
2520 we can simulate recursion without losing state. */
2522 typedef struct scan_frame {
2523 regnode *last; /* last node to process in this frame */
2524 regnode *next; /* next node to process when last is reached */
2525 struct scan_frame *prev; /*previous frame*/
2526 I32 stop; /* what stopparen do we use */
2530 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2532 #define CASE_SYNST_FNC(nAmE) \
2534 if (flags & SCF_DO_STCLASS_AND) { \
2535 for (value = 0; value < 256; value++) \
2536 if (!is_ ## nAmE ## _cp(value)) \
2537 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2540 for (value = 0; value < 256; value++) \
2541 if (is_ ## nAmE ## _cp(value)) \
2542 ANYOF_BITMAP_SET(data->start_class, value); \
2546 if (flags & SCF_DO_STCLASS_AND) { \
2547 for (value = 0; value < 256; value++) \
2548 if (is_ ## nAmE ## _cp(value)) \
2549 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2552 for (value = 0; value < 256; value++) \
2553 if (!is_ ## nAmE ## _cp(value)) \
2554 ANYOF_BITMAP_SET(data->start_class, value); \
2561 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2562 I32 *minlenp, I32 *deltap,
2567 struct regnode_charclass_class *and_withp,
2568 U32 flags, U32 depth)
2569 /* scanp: Start here (read-write). */
2570 /* deltap: Write maxlen-minlen here. */
2571 /* last: Stop before this one. */
2572 /* data: string data about the pattern */
2573 /* stopparen: treat close N as END */
2574 /* recursed: which subroutines have we recursed into */
2575 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2578 I32 min = 0, pars = 0, code;
2579 regnode *scan = *scanp, *next;
2581 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2582 int is_inf_internal = 0; /* The studied chunk is infinite */
2583 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2584 scan_data_t data_fake;
2585 SV *re_trie_maxbuff = NULL;
2586 regnode *first_non_open = scan;
2587 I32 stopmin = I32_MAX;
2588 scan_frame *frame = NULL;
2589 GET_RE_DEBUG_FLAGS_DECL;
2591 PERL_ARGS_ASSERT_STUDY_CHUNK;
2594 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2598 while (first_non_open && OP(first_non_open) == OPEN)
2599 first_non_open=regnext(first_non_open);
2604 while ( scan && OP(scan) != END && scan < last ){
2605 /* Peephole optimizer: */
2606 DEBUG_STUDYDATA("Peep:", data,depth);
2607 DEBUG_PEEP("Peep",scan,depth);
2608 JOIN_EXACT(scan,&min,0);
2610 /* Follow the next-chain of the current node and optimize
2611 away all the NOTHINGs from it. */
2612 if (OP(scan) != CURLYX) {
2613 const int max = (reg_off_by_arg[OP(scan)]
2615 /* I32 may be smaller than U16 on CRAYs! */
2616 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2617 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2621 /* Skip NOTHING and LONGJMP. */
2622 while ((n = regnext(n))
2623 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2624 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2625 && off + noff < max)
2627 if (reg_off_by_arg[OP(scan)])
2630 NEXT_OFF(scan) = off;
2635 /* The principal pseudo-switch. Cannot be a switch, since we
2636 look into several different things. */
2637 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2638 || OP(scan) == IFTHEN) {
2639 next = regnext(scan);
2641 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2643 if (OP(next) == code || code == IFTHEN) {
2644 /* NOTE - There is similar code to this block below for handling
2645 TRIE nodes on a re-study. If you change stuff here check there
2647 I32 max1 = 0, min1 = I32_MAX, num = 0;
2648 struct regnode_charclass_class accum;
2649 regnode * const startbranch=scan;
2651 if (flags & SCF_DO_SUBSTR)
2652 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2653 if (flags & SCF_DO_STCLASS)
2654 cl_init_zero(pRExC_state, &accum);
2656 while (OP(scan) == code) {
2657 I32 deltanext, minnext, f = 0, fake;
2658 struct regnode_charclass_class this_class;
2661 data_fake.flags = 0;
2663 data_fake.whilem_c = data->whilem_c;
2664 data_fake.last_closep = data->last_closep;
2667 data_fake.last_closep = &fake;
2669 data_fake.pos_delta = delta;
2670 next = regnext(scan);
2671 scan = NEXTOPER(scan);
2673 scan = NEXTOPER(scan);
2674 if (flags & SCF_DO_STCLASS) {
2675 cl_init(pRExC_state, &this_class);
2676 data_fake.start_class = &this_class;
2677 f = SCF_DO_STCLASS_AND;
2679 if (flags & SCF_WHILEM_VISITED_POS)
2680 f |= SCF_WHILEM_VISITED_POS;
2682 /* we suppose the run is continuous, last=next...*/
2683 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2685 stopparen, recursed, NULL, f,depth+1);
2688 if (max1 < minnext + deltanext)
2689 max1 = minnext + deltanext;
2690 if (deltanext == I32_MAX)
2691 is_inf = is_inf_internal = 1;
2693 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2695 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2696 if ( stopmin > minnext)
2697 stopmin = min + min1;
2698 flags &= ~SCF_DO_SUBSTR;
2700 data->flags |= SCF_SEEN_ACCEPT;
2703 if (data_fake.flags & SF_HAS_EVAL)
2704 data->flags |= SF_HAS_EVAL;
2705 data->whilem_c = data_fake.whilem_c;
2707 if (flags & SCF_DO_STCLASS)
2708 cl_or(pRExC_state, &accum, &this_class);
2710 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2712 if (flags & SCF_DO_SUBSTR) {
2713 data->pos_min += min1;
2714 data->pos_delta += max1 - min1;
2715 if (max1 != min1 || is_inf)
2716 data->longest = &(data->longest_float);
2719 delta += max1 - min1;
2720 if (flags & SCF_DO_STCLASS_OR) {
2721 cl_or(pRExC_state, data->start_class, &accum);
2723 cl_and(data->start_class, and_withp);
2724 flags &= ~SCF_DO_STCLASS;
2727 else if (flags & SCF_DO_STCLASS_AND) {
2729 cl_and(data->start_class, &accum);
2730 flags &= ~SCF_DO_STCLASS;
2733 /* Switch to OR mode: cache the old value of
2734 * data->start_class */
2736 StructCopy(data->start_class, and_withp,
2737 struct regnode_charclass_class);
2738 flags &= ~SCF_DO_STCLASS_AND;
2739 StructCopy(&accum, data->start_class,
2740 struct regnode_charclass_class);
2741 flags |= SCF_DO_STCLASS_OR;
2742 data->start_class->flags |= ANYOF_EOS;
2746 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2749 Assuming this was/is a branch we are dealing with: 'scan' now
2750 points at the item that follows the branch sequence, whatever
2751 it is. We now start at the beginning of the sequence and look
2758 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2760 If we can find such a subseqence we need to turn the first
2761 element into a trie and then add the subsequent branch exact
2762 strings to the trie.
2766 1. patterns where the whole set of branch can be converted.
2768 2. patterns where only a subset can be converted.
2770 In case 1 we can replace the whole set with a single regop
2771 for the trie. In case 2 we need to keep the start and end
2774 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2775 becomes BRANCH TRIE; BRANCH X;
2777 There is an additional case, that being where there is a
2778 common prefix, which gets split out into an EXACT like node
2779 preceding the TRIE node.
2781 If x(1..n)==tail then we can do a simple trie, if not we make
2782 a "jump" trie, such that when we match the appropriate word
2783 we "jump" to the appopriate tail node. Essentailly we turn
2784 a nested if into a case structure of sorts.
2789 if (!re_trie_maxbuff) {
2790 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2791 if (!SvIOK(re_trie_maxbuff))
2792 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2794 if ( SvIV(re_trie_maxbuff)>=0 ) {
2796 regnode *first = (regnode *)NULL;
2797 regnode *last = (regnode *)NULL;
2798 regnode *tail = scan;
2803 SV * const mysv = sv_newmortal(); /* for dumping */
2805 /* var tail is used because there may be a TAIL
2806 regop in the way. Ie, the exacts will point to the
2807 thing following the TAIL, but the last branch will
2808 point at the TAIL. So we advance tail. If we
2809 have nested (?:) we may have to move through several
2813 while ( OP( tail ) == TAIL ) {
2814 /* this is the TAIL generated by (?:) */
2815 tail = regnext( tail );
2820 regprop(RExC_rx, mysv, tail );
2821 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2822 (int)depth * 2 + 2, "",
2823 "Looking for TRIE'able sequences. Tail node is: ",
2824 SvPV_nolen_const( mysv )
2830 step through the branches, cur represents each
2831 branch, noper is the first thing to be matched
2832 as part of that branch and noper_next is the
2833 regnext() of that node. if noper is an EXACT
2834 and noper_next is the same as scan (our current
2835 position in the regex) then the EXACT branch is
2836 a possible optimization target. Once we have
2837 two or more consequetive such branches we can
2838 create a trie of the EXACT's contents and stich
2839 it in place. If the sequence represents all of
2840 the branches we eliminate the whole thing and
2841 replace it with a single TRIE. If it is a
2842 subsequence then we need to stitch it in. This
2843 means the first branch has to remain, and needs
2844 to be repointed at the item on the branch chain
2845 following the last branch optimized. This could
2846 be either a BRANCH, in which case the
2847 subsequence is internal, or it could be the
2848 item following the branch sequence in which
2849 case the subsequence is at the end.
2853 /* dont use tail as the end marker for this traverse */
2854 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2855 regnode * const noper = NEXTOPER( cur );
2856 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2857 regnode * const noper_next = regnext( noper );
2861 regprop(RExC_rx, mysv, cur);
2862 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2863 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2865 regprop(RExC_rx, mysv, noper);
2866 PerlIO_printf( Perl_debug_log, " -> %s",
2867 SvPV_nolen_const(mysv));
2870 regprop(RExC_rx, mysv, noper_next );
2871 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2872 SvPV_nolen_const(mysv));
2874 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2875 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2877 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2878 : PL_regkind[ OP( noper ) ] == EXACT )
2879 || OP(noper) == NOTHING )
2881 && noper_next == tail
2886 if ( !first || optype == NOTHING ) {
2887 if (!first) first = cur;
2888 optype = OP( noper );
2894 Currently we do not believe that the trie logic can
2895 handle case insensitive matching properly when the
2896 pattern is not unicode (thus forcing unicode semantics).
2898 If/when this is fixed the following define can be swapped
2899 in below to fully enable trie logic.
2901 #define TRIE_TYPE_IS_SAFE 1
2904 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2906 if ( last && TRIE_TYPE_IS_SAFE ) {
2907 make_trie( pRExC_state,
2908 startbranch, first, cur, tail, count,
2911 if ( PL_regkind[ OP( noper ) ] == EXACT
2913 && noper_next == tail
2918 optype = OP( noper );
2928 regprop(RExC_rx, mysv, cur);
2929 PerlIO_printf( Perl_debug_log,
2930 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2931 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2935 if ( last && TRIE_TYPE_IS_SAFE ) {
2936 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2937 #ifdef TRIE_STUDY_OPT
2938 if ( ((made == MADE_EXACT_TRIE &&
2939 startbranch == first)
2940 || ( first_non_open == first )) &&
2942 flags |= SCF_TRIE_RESTUDY;
2943 if ( startbranch == first
2946 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2956 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2957 scan = NEXTOPER(NEXTOPER(scan));
2958 } else /* single branch is optimized. */
2959 scan = NEXTOPER(scan);
2961 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2962 scan_frame *newframe = NULL;
2967 if (OP(scan) != SUSPEND) {
2968 /* set the pointer */
2969 if (OP(scan) == GOSUB) {
2971 RExC_recurse[ARG2L(scan)] = scan;
2972 start = RExC_open_parens[paren-1];
2973 end = RExC_close_parens[paren-1];
2976 start = RExC_rxi->program + 1;
2980 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2981 SAVEFREEPV(recursed);
2983 if (!PAREN_TEST(recursed,paren+1)) {
2984 PAREN_SET(recursed,paren+1);
2985 Newx(newframe,1,scan_frame);
2987 if (flags & SCF_DO_SUBSTR) {
2988 SCAN_COMMIT(pRExC_state,data,minlenp);
2989 data->longest = &(data->longest_float);
2991 is_inf = is_inf_internal = 1;
2992 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2993 cl_anything(pRExC_state, data->start_class);
2994 flags &= ~SCF_DO_STCLASS;
2997 Newx(newframe,1,scan_frame);
3000 end = regnext(scan);
3005 SAVEFREEPV(newframe);
3006 newframe->next = regnext(scan);
3007 newframe->last = last;
3008 newframe->stop = stopparen;
3009 newframe->prev = frame;
3019 else if (OP(scan) == EXACT) {
3020 I32 l = STR_LEN(scan);
3023 const U8 * const s = (U8*)STRING(scan);
3024 l = utf8_length(s, s + l);
3025 uc = utf8_to_uvchr(s, NULL);
3027 uc = *((U8*)STRING(scan));
3030 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3031 /* The code below prefers earlier match for fixed
3032 offset, later match for variable offset. */
3033 if (data->last_end == -1) { /* Update the start info. */
3034 data->last_start_min = data->pos_min;
3035 data->last_start_max = is_inf
3036 ? I32_MAX : data->pos_min + data->pos_delta;
3038 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3040 SvUTF8_on(data->last_found);
3042 SV * const sv = data->last_found;
3043 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3044 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3045 if (mg && mg->mg_len >= 0)
3046 mg->mg_len += utf8_length((U8*)STRING(scan),
3047 (U8*)STRING(scan)+STR_LEN(scan));
3049 data->last_end = data->pos_min + l;
3050 data->pos_min += l; /* As in the first entry. */
3051 data->flags &= ~SF_BEFORE_EOL;
3053 if (flags & SCF_DO_STCLASS_AND) {
3054 /* Check whether it is compatible with what we know already! */
3058 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3059 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3060 && (!(data->start_class->flags & ANYOF_FOLD)
3061 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3064 ANYOF_CLASS_ZERO(data->start_class);
3065 ANYOF_BITMAP_ZERO(data->start_class);
3067 ANYOF_BITMAP_SET(data->start_class, uc);
3068 data->start_class->flags &= ~ANYOF_EOS;
3070 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3072 else if (flags & SCF_DO_STCLASS_OR) {
3073 /* false positive possible if the class is case-folded */
3075 ANYOF_BITMAP_SET(data->start_class, uc);
3077 data->start_class->flags |= ANYOF_UNICODE_ALL;
3078 data->start_class->flags &= ~ANYOF_EOS;
3079 cl_and(data->start_class, and_withp);
3081 flags &= ~SCF_DO_STCLASS;
3083 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3084 I32 l = STR_LEN(scan);
3085 UV uc = *((U8*)STRING(scan));
3087 /* Search for fixed substrings supports EXACT only. */
3088 if (flags & SCF_DO_SUBSTR) {
3090 SCAN_COMMIT(pRExC_state, data, minlenp);
3093 const U8 * const s = (U8 *)STRING(scan);
3094 l = utf8_length(s, s + l);
3095 uc = utf8_to_uvchr(s, NULL);
3098 if (flags & SCF_DO_SUBSTR)
3100 if (flags & SCF_DO_STCLASS_AND) {
3101 /* Check whether it is compatible with what we know already! */
3105 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3106 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3107 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3109 ANYOF_CLASS_ZERO(data->start_class);
3110 ANYOF_BITMAP_ZERO(data->start_class);
3112 ANYOF_BITMAP_SET(data->start_class, uc);
3113 data->start_class->flags &= ~ANYOF_EOS;
3114 data->start_class->flags |= ANYOF_FOLD;
3115 if (OP(scan) == EXACTFL)
3116 data->start_class->flags |= ANYOF_LOCALE;
3119 else if (flags & SCF_DO_STCLASS_OR) {
3120 if (data->start_class->flags & ANYOF_FOLD) {
3121 /* false positive possible if the class is case-folded.
3122 Assume that the locale settings are the same... */
3124 ANYOF_BITMAP_SET(data->start_class, uc);
3125 data->start_class->flags &= ~ANYOF_EOS;
3127 cl_and(data->start_class, and_withp);
3129 flags &= ~SCF_DO_STCLASS;
3131 else if (REGNODE_VARIES(OP(scan))) {
3132 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3133 I32 f = flags, pos_before = 0;
3134 regnode * const oscan = scan;
3135 struct regnode_charclass_class this_class;
3136 struct regnode_charclass_class *oclass = NULL;
3137 I32 next_is_eval = 0;
3139 switch (PL_regkind[OP(scan)]) {
3140 case WHILEM: /* End of (?:...)* . */
3141 scan = NEXTOPER(scan);
3144 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3145 next = NEXTOPER(scan);
3146 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3148 maxcount = REG_INFTY;
3149 next = regnext(scan);
3150 scan = NEXTOPER(scan);
3154 if (flags & SCF_DO_SUBSTR)
3159 if (flags & SCF_DO_STCLASS) {
3161 maxcount = REG_INFTY;
3162 next = regnext(scan);
3163 scan = NEXTOPER(scan);
3166 is_inf = is_inf_internal = 1;
3167 scan = regnext(scan);
3168 if (flags & SCF_DO_SUBSTR) {
3169 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3170 data->longest = &(data->longest_float);
3172 goto optimize_curly_tail;
3174 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3175 && (scan->flags == stopparen))
3180 mincount = ARG1(scan);
3181 maxcount = ARG2(scan);
3183 next = regnext(scan);
3184 if (OP(scan) == CURLYX) {
3185 I32 lp = (data ? *(data->last_closep) : 0);
3186 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3188 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3189 next_is_eval = (OP(scan) == EVAL);
3191 if (flags & SCF_DO_SUBSTR) {
3192 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3193 pos_before = data->pos_min;
3197 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3199 data->flags |= SF_IS_INF;
3201 if (flags & SCF_DO_STCLASS) {
3202 cl_init(pRExC_state, &this_class);
3203 oclass = data->start_class;
3204 data->start_class = &this_class;
3205 f |= SCF_DO_STCLASS_AND;
3206 f &= ~SCF_DO_STCLASS_OR;
3208 /* These are the cases when once a subexpression
3209 fails at a particular position, it cannot succeed
3210 even after backtracking at the enclosing scope.
3212 XXXX what if minimal match and we are at the
3213 initial run of {n,m}? */
3214 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3215 f &= ~SCF_WHILEM_VISITED_POS;
3217 /* This will finish on WHILEM, setting scan, or on NULL: */
3218 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3219 last, data, stopparen, recursed, NULL,
3221 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3223 if (flags & SCF_DO_STCLASS)
3224 data->start_class = oclass;
3225 if (mincount == 0 || minnext == 0) {
3226 if (flags & SCF_DO_STCLASS_OR) {
3227 cl_or(pRExC_state, data->start_class, &this_class);
3229 else if (flags & SCF_DO_STCLASS_AND) {
3230 /* Switch to OR mode: cache the old value of
3231 * data->start_class */
3233 StructCopy(data->start_class, and_withp,
3234 struct regnode_charclass_class);
3235 flags &= ~SCF_DO_STCLASS_AND;
3236 StructCopy(&this_class, data->start_class,
3237 struct regnode_charclass_class);
3238 flags |= SCF_DO_STCLASS_OR;
3239 data->start_class->flags |= ANYOF_EOS;
3241 } else { /* Non-zero len */
3242 if (flags & SCF_DO_STCLASS_OR) {
3243 cl_or(pRExC_state, data->start_class, &this_class);
3244 cl_and(data->start_class, and_withp);
3246 else if (flags & SCF_DO_STCLASS_AND)
3247 cl_and(data->start_class, &this_class);
3248 flags &= ~SCF_DO_STCLASS;
3250 if (!scan) /* It was not CURLYX, but CURLY. */
3252 if ( /* ? quantifier ok, except for (?{ ... }) */
3253 (next_is_eval || !(mincount == 0 && maxcount == 1))
3254 && (minnext == 0) && (deltanext == 0)
3255 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3256 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3258 ckWARNreg(RExC_parse,
3259 "Quantifier unexpected on zero-length expression");
3262 min += minnext * mincount;
3263 is_inf_internal |= ((maxcount == REG_INFTY
3264 && (minnext + deltanext) > 0)
3265 || deltanext == I32_MAX);
3266 is_inf |= is_inf_internal;
3267 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3269 /* Try powerful optimization CURLYX => CURLYN. */
3270 if ( OP(oscan) == CURLYX && data
3271 && data->flags & SF_IN_PAR
3272 && !(data->flags & SF_HAS_EVAL)
3273 && !deltanext && minnext == 1 ) {
3274 /* Try to optimize to CURLYN. */
3275 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3276 regnode * const nxt1 = nxt;
3283 if (!REGNODE_SIMPLE(OP(nxt))
3284 && !(PL_regkind[OP(nxt)] == EXACT
3285 && STR_LEN(nxt) == 1))
3291 if (OP(nxt) != CLOSE)
3293 if (RExC_open_parens) {
3294 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3295 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3297 /* Now we know that nxt2 is the only contents: */
3298 oscan->flags = (U8)ARG(nxt);
3300 OP(nxt1) = NOTHING; /* was OPEN. */
3303 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3304 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3305 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3306 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3307 OP(nxt + 1) = OPTIMIZED; /* was count. */
3308 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3313 /* Try optimization CURLYX => CURLYM. */
3314 if ( OP(oscan) == CURLYX && data
3315 && !(data->flags & SF_HAS_PAR)
3316 && !(data->flags & SF_HAS_EVAL)
3317 && !deltanext /* atom is fixed width */
3318 && minnext != 0 /* CURLYM can't handle zero width */
3320 /* XXXX How to optimize if data == 0? */
3321 /* Optimize to a simpler form. */
3322 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3326 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3327 && (OP(nxt2) != WHILEM))
3329 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3330 /* Need to optimize away parenths. */
3331 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3332 /* Set the parenth number. */
3333 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3335 oscan->flags = (U8)ARG(nxt);
3336 if (RExC_open_parens) {
3337 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3338 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3340 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3341 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3344 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3345 OP(nxt + 1) = OPTIMIZED; /* was count. */
3346 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3347 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3350 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3351 regnode *nnxt = regnext(nxt1);
3353 if (reg_off_by_arg[OP(nxt1)])
3354 ARG_SET(nxt1, nxt2 - nxt1);
3355 else if (nxt2 - nxt1 < U16_MAX)
3356 NEXT_OFF(nxt1) = nxt2 - nxt1;
3358 OP(nxt) = NOTHING; /* Cannot beautify */
3363 /* Optimize again: */
3364 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3365 NULL, stopparen, recursed, NULL, 0,depth+1);
3370 else if ((OP(oscan) == CURLYX)
3371 && (flags & SCF_WHILEM_VISITED_POS)
3372 /* See the comment on a similar expression above.
3373 However, this time it not a subexpression
3374 we care about, but the expression itself. */
3375 && (maxcount == REG_INFTY)
3376 && data && ++data->whilem_c < 16) {
3377 /* This stays as CURLYX, we can put the count/of pair. */
3378 /* Find WHILEM (as in regexec.c) */
3379 regnode *nxt = oscan + NEXT_OFF(oscan);
3381 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3383 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3384 | (RExC_whilem_seen << 4)); /* On WHILEM */
3386 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3388 if (flags & SCF_DO_SUBSTR) {
3389 SV *last_str = NULL;
3390 int counted = mincount != 0;
3392 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3393 #if defined(SPARC64_GCC_WORKAROUND)
3396 const char *s = NULL;
3399 if (pos_before >= data->last_start_min)
3402 b = data->last_start_min;
3405 s = SvPV_const(data->last_found, l);
3406 old = b - data->last_start_min;
3409 I32 b = pos_before >= data->last_start_min
3410 ? pos_before : data->last_start_min;
3412 const char * const s = SvPV_const(data->last_found, l);
3413 I32 old = b - data->last_start_min;
3417 old = utf8_hop((U8*)s, old) - (U8*)s;
3419 /* Get the added string: */
3420 last_str = newSVpvn_utf8(s + old, l, UTF);
3421 if (deltanext == 0 && pos_before == b) {
3422 /* What was added is a constant string */
3424 SvGROW(last_str, (mincount * l) + 1);
3425 repeatcpy(SvPVX(last_str) + l,
3426 SvPVX_const(last_str), l, mincount - 1);
3427 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3428 /* Add additional parts. */
3429 SvCUR_set(data->last_found,
3430 SvCUR(data->last_found) - l);
3431 sv_catsv(data->last_found, last_str);
3433 SV * sv = data->last_found;
3435 SvUTF8(sv) && SvMAGICAL(sv) ?
3436 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3437 if (mg && mg->mg_len >= 0)
3438 mg->mg_len += CHR_SVLEN(last_str) - l;
3440 data->last_end += l * (mincount - 1);
3443 /* start offset must point into the last copy */
3444 data->last_start_min += minnext * (mincount - 1);
3445 data->last_start_max += is_inf ? I32_MAX
3446 : (maxcount - 1) * (minnext + data->pos_delta);
3449 /* It is counted once already... */
3450 data->pos_min += minnext * (mincount - counted);
3451 data->pos_delta += - counted * deltanext +
3452 (minnext + deltanext) * maxcount - minnext * mincount;
3453 if (mincount != maxcount) {
3454 /* Cannot extend fixed substrings found inside
3456 SCAN_COMMIT(pRExC_state,data,minlenp);
3457 if (mincount && last_str) {
3458 SV * const sv = data->last_found;
3459 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3460 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3464 sv_setsv(sv, last_str);
3465 data->last_end = data->pos_min;
3466 data->last_start_min =
3467 data->pos_min - CHR_SVLEN(last_str);
3468 data->last_start_max = is_inf
3470 : data->pos_min + data->pos_delta
3471 - CHR_SVLEN(last_str);
3473 data->longest = &(data->longest_float);
3475 SvREFCNT_dec(last_str);
3477 if (data && (fl & SF_HAS_EVAL))
3478 data->flags |= SF_HAS_EVAL;
3479 optimize_curly_tail:
3480 if (OP(oscan) != CURLYX) {
3481 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3483 NEXT_OFF(oscan) += NEXT_OFF(next);
3486 default: /* REF and CLUMP only? */
3487 if (flags & SCF_DO_SUBSTR) {
3488 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3489 data->longest = &(data->longest_float);
3491 is_inf = is_inf_internal = 1;
3492 if (flags & SCF_DO_STCLASS_OR)
3493 cl_anything(pRExC_state, data->start_class);
3494 flags &= ~SCF_DO_STCLASS;
3498 else if (OP(scan) == LNBREAK) {
3499 if (flags & SCF_DO_STCLASS) {
3501 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3502 if (flags & SCF_DO_STCLASS_AND) {
3503 for (value = 0; value < 256; value++)
3504 if (!is_VERTWS_cp(value))
3505 ANYOF_BITMAP_CLEAR(data->start_class, value);
3508 for (value = 0; value < 256; value++)
3509 if (is_VERTWS_cp(value))
3510 ANYOF_BITMAP_SET(data->start_class, value);
3512 if (flags & SCF_DO_STCLASS_OR)
3513 cl_and(data->start_class, and_withp);
3514 flags &= ~SCF_DO_STCLASS;
3518 if (flags & SCF_DO_SUBSTR) {
3519 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3521 data->pos_delta += 1;
3522 data->longest = &(data->longest_float);
3525 else if (OP(scan) == FOLDCHAR) {
3526 int d = ARG(scan)==0xDF ? 1 : 2;
3527 flags &= ~SCF_DO_STCLASS;
3530 if (flags & SCF_DO_SUBSTR) {
3531 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3533 data->pos_delta += d;
3534 data->longest = &(data->longest_float);
3537 else if (REGNODE_SIMPLE(OP(scan))) {
3540 if (flags & SCF_DO_SUBSTR) {
3541 SCAN_COMMIT(pRExC_state,data,minlenp);
3545 if (flags & SCF_DO_STCLASS) {
3546 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3548 /* Some of the logic below assumes that switching
3549 locale on will only add false positives. */
3550 switch (PL_regkind[OP(scan)]) {
3554 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3555 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3556 cl_anything(pRExC_state, data->start_class);
3559 if (OP(scan) == SANY)
3561 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3562 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3563 || (data->start_class->flags & ANYOF_CLASS));
3564 cl_anything(pRExC_state, data->start_class);
3566 if (flags & SCF_DO_STCLASS_AND || !value)
3567 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3570 if (flags & SCF_DO_STCLASS_AND)
3571 cl_and(data->start_class,
3572 (struct regnode_charclass_class*)scan);
3574 cl_or(pRExC_state, data->start_class,
3575 (struct regnode_charclass_class*)scan);
3578 if (flags & SCF_DO_STCLASS_AND) {
3579 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3580 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3581 if (FLAGS(scan) & USE_UNI) {
3582 for (value = 0; value < 256; value++) {
3583 if (!isWORDCHAR_L1(value)) {
3584 ANYOF_BITMAP_CLEAR(data->start_class, value);
3588 for (value = 0; value < 256; value++) {
3589 if (!isALNUM(value)) {
3590 ANYOF_BITMAP_CLEAR(data->start_class, value);
3597 if (data->start_class->flags & ANYOF_LOCALE)
3598 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3599 else if (FLAGS(scan) & USE_UNI) {
3600 for (value = 0; value < 256; value++) {
3601 if (isWORDCHAR_L1(value)) {
3602 ANYOF_BITMAP_SET(data->start_class, value);
3606 for (value = 0; value < 256; value++) {
3607 if (isALNUM(value)) {
3608 ANYOF_BITMAP_SET(data->start_class, value);
3615 if (flags & SCF_DO_STCLASS_AND) {
3616 if (data->start_class->flags & ANYOF_LOCALE)
3617 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3620 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3621 data->start_class->flags |= ANYOF_LOCALE;
3625 if (flags & SCF_DO_STCLASS_AND) {
3626 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3627 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3628 if (FLAGS(scan) & USE_UNI) {
3629 for (value = 0; value < 256; value++) {
3630 if (isWORDCHAR_L1(value)) {
3631 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 for (value = 0; value < 256; value++) {
3636 if (isALNUM(value)) {
3637 ANYOF_BITMAP_CLEAR(data->start_class, value);
3644 if (data->start_class->flags & ANYOF_LOCALE)
3645 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3647 for (value = 0; value < 256; value++)
3648 if (!isALNUM(value))
3649 ANYOF_BITMAP_SET(data->start_class, value);
3654 if (flags & SCF_DO_STCLASS_AND) {
3655 if (data->start_class->flags & ANYOF_LOCALE)
3656 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3659 data->start_class->flags |= ANYOF_LOCALE;
3660 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3664 if (flags & SCF_DO_STCLASS_AND) {
3665 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3666 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3667 if (FLAGS(scan) & USE_UNI) {
3668 for (value = 0; value < 256; value++) {
3669 if (!isSPACE_L1(value)) {
3670 ANYOF_BITMAP_CLEAR(data->start_class, value);
3674 for (value = 0; value < 256; value++) {
3675 if (!isSPACE(value)) {
3676 ANYOF_BITMAP_CLEAR(data->start_class, value);
3683 if (data->start_class->flags & ANYOF_LOCALE) {
3684 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3686 else if (FLAGS(scan) & USE_UNI) {
3687 for (value = 0; value < 256; value++) {
3688 if (isSPACE_L1(value)) {
3689 ANYOF_BITMAP_SET(data->start_class, value);
3693 for (value = 0; value < 256; value++) {
3694 if (isSPACE(value)) {
3695 ANYOF_BITMAP_SET(data->start_class, value);
3702 if (flags & SCF_DO_STCLASS_AND) {
3703 if (data->start_class->flags & ANYOF_LOCALE)
3704 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3707 data->start_class->flags |= ANYOF_LOCALE;
3708 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3712 if (flags & SCF_DO_STCLASS_AND) {
3713 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3714 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3715 if (FLAGS(scan) & USE_UNI) {
3716 for (value = 0; value < 256; value++) {
3717 if (isSPACE_L1(value)) {
3718 ANYOF_BITMAP_CLEAR(data->start_class, value);
3722 for (value = 0; value < 256; value++) {
3723 if (isSPACE(value)) {
3724 ANYOF_BITMAP_CLEAR(data->start_class, value);
3731 if (data->start_class->flags & ANYOF_LOCALE)
3732 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3733 else if (FLAGS(scan) & USE_UNI) {
3734 for (value = 0; value < 256; value++) {
3735 if (!isSPACE_L1(value)) {
3736 ANYOF_BITMAP_SET(data->start_class, value);
3741 for (value = 0; value < 256; value++) {
3742 if (!isSPACE(value)) {
3743 ANYOF_BITMAP_SET(data->start_class, value);
3750 if (flags & SCF_DO_STCLASS_AND) {
3751 if (data->start_class->flags & ANYOF_LOCALE) {
3752 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3753 for (value = 0; value < 256; value++)
3754 if (!isSPACE(value))
3755 ANYOF_BITMAP_CLEAR(data->start_class, value);
3759 data->start_class->flags |= ANYOF_LOCALE;
3760 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3764 if (flags & SCF_DO_STCLASS_AND) {
3765 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3766 for (value = 0; value < 256; value++)
3767 if (!isDIGIT(value))
3768 ANYOF_BITMAP_CLEAR(data->start_class, value);
3771 if (data->start_class->flags & ANYOF_LOCALE)
3772 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3774 for (value = 0; value < 256; value++)
3776 ANYOF_BITMAP_SET(data->start_class, value);
3781 if (flags & SCF_DO_STCLASS_AND) {
3782 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3783 for (value = 0; value < 256; value++)
3785 ANYOF_BITMAP_CLEAR(data->start_class, value);
3788 if (data->start_class->flags & ANYOF_LOCALE)
3789 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3791 for (value = 0; value < 256; value++)
3792 if (!isDIGIT(value))
3793 ANYOF_BITMAP_SET(data->start_class, value);
3797 CASE_SYNST_FNC(VERTWS);
3798 CASE_SYNST_FNC(HORIZWS);
3801 if (flags & SCF_DO_STCLASS_OR)
3802 cl_and(data->start_class, and_withp);
3803 flags &= ~SCF_DO_STCLASS;
3806 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3807 data->flags |= (OP(scan) == MEOL
3811 else if ( PL_regkind[OP(scan)] == BRANCHJ
3812 /* Lookbehind, or need to calculate parens/evals/stclass: */
3813 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3814 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3815 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3816 || OP(scan) == UNLESSM )
3818 /* Negative Lookahead/lookbehind
3819 In this case we can't do fixed string optimisation.
3822 I32 deltanext, minnext, fake = 0;
3824 struct regnode_charclass_class intrnl;
3827 data_fake.flags = 0;
3829 data_fake.whilem_c = data->whilem_c;
3830 data_fake.last_closep = data->last_closep;
3833 data_fake.last_closep = &fake;
3834 data_fake.pos_delta = delta;
3835 if ( flags & SCF_DO_STCLASS && !scan->flags
3836 && OP(scan) == IFMATCH ) { /* Lookahead */
3837 cl_init(pRExC_state, &intrnl);
3838 data_fake.start_class = &intrnl;
3839 f |= SCF_DO_STCLASS_AND;
3841 if (flags & SCF_WHILEM_VISITED_POS)
3842 f |= SCF_WHILEM_VISITED_POS;
3843 next = regnext(scan);
3844 nscan = NEXTOPER(NEXTOPER(scan));
3845 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3846 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3849 FAIL("Variable length lookbehind not implemented");
3851 else if (minnext > (I32)U8_MAX) {
3852 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3854 scan->flags = (U8)minnext;
3857 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3859 if (data_fake.flags & SF_HAS_EVAL)
3860 data->flags |= SF_HAS_EVAL;
3861 data->whilem_c = data_fake.whilem_c;
3863 if (f & SCF_DO_STCLASS_AND) {
3864 if (flags & SCF_DO_STCLASS_OR) {
3865 /* OR before, AND after: ideally we would recurse with
3866 * data_fake to get the AND applied by study of the
3867 * remainder of the pattern, and then derecurse;
3868 * *** HACK *** for now just treat as "no information".
3869 * See [perl #56690].
3871 cl_init(pRExC_state, data->start_class);
3873 /* AND before and after: combine and continue */
3874 const int was = (data->start_class->flags & ANYOF_EOS);
3876 cl_and(data->start_class, &intrnl);
3878 data->start_class->flags |= ANYOF_EOS;
3882 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3884 /* Positive Lookahead/lookbehind
3885 In this case we can do fixed string optimisation,
3886 but we must be careful about it. Note in the case of
3887 lookbehind the positions will be offset by the minimum
3888 length of the pattern, something we won't know about
3889 until after the recurse.
3891 I32 deltanext, fake = 0;
3893 struct regnode_charclass_class intrnl;
3895 /* We use SAVEFREEPV so that when the full compile
3896 is finished perl will clean up the allocated
3897 minlens when its all done. This was we don't
3898 have to worry about freeing them when we know
3899 they wont be used, which would be a pain.
3902 Newx( minnextp, 1, I32 );
3903 SAVEFREEPV(minnextp);
3906 StructCopy(data, &data_fake, scan_data_t);
3907 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3910 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3911 data_fake.last_found=newSVsv(data->last_found);
3915 data_fake.last_closep = &fake;
3916 data_fake.flags = 0;
3917 data_fake.pos_delta = delta;
3919 data_fake.flags |= SF_IS_INF;
3920 if ( flags & SCF_DO_STCLASS && !scan->flags
3921 && OP(scan) == IFMATCH ) { /* Lookahead */
3922 cl_init(pRExC_state, &intrnl);
3923 data_fake.start_class = &intrnl;
3924 f |= SCF_DO_STCLASS_AND;
3926 if (flags & SCF_WHILEM_VISITED_POS)
3927 f |= SCF_WHILEM_VISITED_POS;
3928 next = regnext(scan);
3929 nscan = NEXTOPER(NEXTOPER(scan));
3931 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3932 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3935 FAIL("Variable length lookbehind not implemented");
3937 else if (*minnextp > (I32)U8_MAX) {
3938 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3940 scan->flags = (U8)*minnextp;
3945 if (f & SCF_DO_STCLASS_AND) {
3946 const int was = (data->start_class->flags & ANYOF_EOS);
3948 cl_and(data->start_class, &intrnl);
3950 data->start_class->flags |= ANYOF_EOS;
3953 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3955 if (data_fake.flags & SF_HAS_EVAL)
3956 data->flags |= SF_HAS_EVAL;
3957 data->whilem_c = data_fake.whilem_c;
3958 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3959 if (RExC_rx->minlen<*minnextp)
3960 RExC_rx->minlen=*minnextp;
3961 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3962 SvREFCNT_dec(data_fake.last_found);
3964 if ( data_fake.minlen_fixed != minlenp )
3966 data->offset_fixed= data_fake.offset_fixed;
3967 data->minlen_fixed= data_fake.minlen_fixed;
3968 data->lookbehind_fixed+= scan->flags;
3970 if ( data_fake.minlen_float != minlenp )
3972 data->minlen_float= data_fake.minlen_float;
3973 data->offset_float_min=data_fake.offset_float_min;
3974 data->offset_float_max=data_fake.offset_float_max;
3975 data->lookbehind_float+= scan->flags;
3984 else if (OP(scan) == OPEN) {
3985 if (stopparen != (I32)ARG(scan))
3988 else if (OP(scan) == CLOSE) {
3989 if (stopparen == (I32)ARG(scan)) {
3992 if ((I32)ARG(scan) == is_par) {
3993 next = regnext(scan);
3995 if ( next && (OP(next) != WHILEM) && next < last)
3996 is_par = 0; /* Disable optimization */
3999 *(data->last_closep) = ARG(scan);
4001 else if (OP(scan) == EVAL) {
4003 data->flags |= SF_HAS_EVAL;
4005 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4006 if (flags & SCF_DO_SUBSTR) {
4007 SCAN_COMMIT(pRExC_state,data,minlenp);
4008 flags &= ~SCF_DO_SUBSTR;
4010 if (data && OP(scan)==ACCEPT) {
4011 data->flags |= SCF_SEEN_ACCEPT;
4016 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4018 if (flags & SCF_DO_SUBSTR) {
4019 SCAN_COMMIT(pRExC_state,data,minlenp);
4020 data->longest = &(data->longest_float);
4022 is_inf = is_inf_internal = 1;
4023 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4024 cl_anything(pRExC_state, data->start_class);
4025 flags &= ~SCF_DO_STCLASS;
4027 else if (OP(scan) == GPOS) {
4028 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4029 !(delta || is_inf || (data && data->pos_delta)))
4031 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4032 RExC_rx->extflags |= RXf_ANCH_GPOS;
4033 if (RExC_rx->gofs < (U32)min)
4034 RExC_rx->gofs = min;
4036 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4040 #ifdef TRIE_STUDY_OPT
4041 #ifdef FULL_TRIE_STUDY
4042 else if (PL_regkind[OP(scan)] == TRIE) {
4043 /* NOTE - There is similar code to this block above for handling
4044 BRANCH nodes on the initial study. If you change stuff here
4046 regnode *trie_node= scan;
4047 regnode *tail= regnext(scan);
4048 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4049 I32 max1 = 0, min1 = I32_MAX;
4050 struct regnode_charclass_class accum;
4052 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4053 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4054 if (flags & SCF_DO_STCLASS)
4055 cl_init_zero(pRExC_state, &accum);
4061 const regnode *nextbranch= NULL;
4064 for ( word=1 ; word <= trie->wordcount ; word++)
4066 I32 deltanext=0, minnext=0, f = 0, fake;
4067 struct regnode_charclass_class this_class;
4069 data_fake.flags = 0;
4071 data_fake.whilem_c = data->whilem_c;
4072 data_fake.last_closep = data->last_closep;
4075 data_fake.last_closep = &fake;
4076 data_fake.pos_delta = delta;
4077 if (flags & SCF_DO_STCLASS) {
4078 cl_init(pRExC_state, &this_class);
4079 data_fake.start_class = &this_class;
4080 f = SCF_DO_STCLASS_AND;
4082 if (flags & SCF_WHILEM_VISITED_POS)
4083 f |= SCF_WHILEM_VISITED_POS;
4085 if (trie->jump[word]) {
4087 nextbranch = trie_node + trie->jump[0];
4088 scan= trie_node + trie->jump[word];
4089 /* We go from the jump point to the branch that follows
4090 it. Note this means we need the vestigal unused branches
4091 even though they arent otherwise used.
4093 minnext = study_chunk(pRExC_state, &scan, minlenp,
4094 &deltanext, (regnode *)nextbranch, &data_fake,
4095 stopparen, recursed, NULL, f,depth+1);
4097 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4098 nextbranch= regnext((regnode*)nextbranch);
4100 if (min1 > (I32)(minnext + trie->minlen))
4101 min1 = minnext + trie->minlen;
4102 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4103 max1 = minnext + deltanext + trie->maxlen;
4104 if (deltanext == I32_MAX)
4105 is_inf = is_inf_internal = 1;
4107 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4109 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4110 if ( stopmin > min + min1)
4111 stopmin = min + min1;
4112 flags &= ~SCF_DO_SUBSTR;
4114 data->flags |= SCF_SEEN_ACCEPT;
4117 if (data_fake.flags & SF_HAS_EVAL)
4118 data->flags |= SF_HAS_EVAL;
4119 data->whilem_c = data_fake.whilem_c;
4121 if (flags & SCF_DO_STCLASS)
4122 cl_or(pRExC_state, &accum, &this_class);
4125 if (flags & SCF_DO_SUBSTR) {
4126 data->pos_min += min1;
4127 data->pos_delta += max1 - min1;
4128 if (max1 != min1 || is_inf)
4129 data->longest = &(data->longest_float);
4132 delta += max1 - min1;
4133 if (flags & SCF_DO_STCLASS_OR) {
4134 cl_or(pRExC_state, data->start_class, &accum);
4136 cl_and(data->start_class, and_withp);
4137 flags &= ~SCF_DO_STCLASS;
4140 else if (flags & SCF_DO_STCLASS_AND) {
4142 cl_and(data->start_class, &accum);
4143 flags &= ~SCF_DO_STCLASS;
4146 /* Switch to OR mode: cache the old value of
4147 * data->start_class */
4149 StructCopy(data->start_class, and_withp,
4150 struct regnode_charclass_class);
4151 flags &= ~SCF_DO_STCLASS_AND;
4152 StructCopy(&accum, data->start_class,
4153 struct regnode_charclass_class);
4154 flags |= SCF_DO_STCLASS_OR;
4155 data->start_class->flags |= ANYOF_EOS;
4162 else if (PL_regkind[OP(scan)] == TRIE) {
4163 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4166 min += trie->minlen;
4167 delta += (trie->maxlen - trie->minlen);
4168 flags &= ~SCF_DO_STCLASS; /* xxx */
4169 if (flags & SCF_DO_SUBSTR) {
4170 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4171 data->pos_min += trie->minlen;
4172 data->pos_delta += (trie->maxlen - trie->minlen);
4173 if (trie->maxlen != trie->minlen)
4174 data->longest = &(data->longest_float);
4176 if (trie->jump) /* no more substrings -- for now /grr*/
4177 flags &= ~SCF_DO_SUBSTR;
4179 #endif /* old or new */
4180 #endif /* TRIE_STUDY_OPT */
4182 /* Else: zero-length, ignore. */
4183 scan = regnext(scan);
4188 stopparen = frame->stop;
4189 frame = frame->prev;
4190 goto fake_study_recurse;
4195 DEBUG_STUDYDATA("pre-fin:",data,depth);
4198 *deltap = is_inf_internal ? I32_MAX : delta;
4199 if (flags & SCF_DO_SUBSTR && is_inf)
4200 data->pos_delta = I32_MAX - data->pos_min;
4201 if (is_par > (I32)U8_MAX)
4203 if (is_par && pars==1 && data) {
4204 data->flags |= SF_IN_PAR;
4205 data->flags &= ~SF_HAS_PAR;
4207 else if (pars && data) {
4208 data->flags |= SF_HAS_PAR;
4209 data->flags &= ~SF_IN_PAR;
4211 if (flags & SCF_DO_STCLASS_OR)
4212 cl_and(data->start_class, and_withp);
4213 if (flags & SCF_TRIE_RESTUDY)
4214 data->flags |= SCF_TRIE_RESTUDY;
4216 DEBUG_STUDYDATA("post-fin:",data,depth);
4218 return min < stopmin ? min : stopmin;
4222 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4224 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4226 PERL_ARGS_ASSERT_ADD_DATA;
4228 Renewc(RExC_rxi->data,
4229 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4230 char, struct reg_data);
4232 Renew(RExC_rxi->data->what, count + n, U8);
4234 Newx(RExC_rxi->data->what, n, U8);
4235 RExC_rxi->data->count = count + n;
4236 Copy(s, RExC_rxi->data->what + count, n, U8);
4240 /*XXX: todo make this not included in a non debugging perl */
4241 #ifndef PERL_IN_XSUB_RE
4243 Perl_reginitcolors(pTHX)
4246 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4248 char *t = savepv(s);
4252 t = strchr(t, '\t');
4258 PL_colors[i] = t = (char *)"";
4263 PL_colors[i++] = (char *)"";
4270 #ifdef TRIE_STUDY_OPT
4271 #define CHECK_RESTUDY_GOTO \
4273 (data.flags & SCF_TRIE_RESTUDY) \
4277 #define CHECK_RESTUDY_GOTO
4281 - pregcomp - compile a regular expression into internal code
4283 * We can't allocate space until we know how big the compiled form will be,
4284 * but we can't compile it (and thus know how big it is) until we've got a
4285 * place to put the code. So we cheat: we compile it twice, once with code
4286 * generation turned off and size counting turned on, and once "for real".
4287 * This also means that we don't allocate space until we are sure that the
4288 * thing really will compile successfully, and we never have to move the
4289 * code and thus invalidate pointers into it. (Note that it has to be in
4290 * one piece because free() must be able to free it all.) [NB: not true in perl]
4292 * Beware that the optimization-preparation code in here knows about some
4293 * of the structure of the compiled regexp. [I'll say.]
4298 #ifndef PERL_IN_XSUB_RE
4299 #define RE_ENGINE_PTR &PL_core_reg_engine
4301 extern const struct regexp_engine my_reg_engine;
4302 #define RE_ENGINE_PTR &my_reg_engine
4305 #ifndef PERL_IN_XSUB_RE
4307 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4310 HV * const table = GvHV(PL_hintgv);
4312 PERL_ARGS_ASSERT_PREGCOMP;
4314 /* Dispatch a request to compile a regexp to correct
4317 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4318 GET_RE_DEBUG_FLAGS_DECL;
4319 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4320 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4322 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4325 return CALLREGCOMP_ENG(eng, pattern, flags);
4328 return Perl_re_compile(aTHX_ pattern, flags);
4333 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4338 register regexp_internal *ri;
4350 RExC_state_t RExC_state;
4351 RExC_state_t * const pRExC_state = &RExC_state;
4352 #ifdef TRIE_STUDY_OPT
4354 RExC_state_t copyRExC_state;
4356 GET_RE_DEBUG_FLAGS_DECL;
4358 PERL_ARGS_ASSERT_RE_COMPILE;
4360 DEBUG_r(if (!PL_colorset) reginitcolors());
4362 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4365 /* Longjmp back to here if have to switch in midstream to utf8 */
4366 if (! RExC_orig_utf8) {
4367 JMPENV_PUSH(jump_ret);
4370 if (jump_ret == 0) { /* First time through */
4371 exp = SvPV(pattern, plen);
4375 SV *dsv= sv_newmortal();
4376 RE_PV_QUOTED_DECL(s, RExC_utf8,
4377 dsv, exp, plen, 60);
4378 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4379 PL_colors[4],PL_colors[5],s);
4382 else { /* longjumped back */
4385 /* If the cause for the longjmp was other than changing to utf8, pop
4386 * our own setjmp, and longjmp to the correct handler */
4387 if (jump_ret != UTF8_LONGJMP) {
4389 JMPENV_JUMP(jump_ret);
4394 /* It's possible to write a regexp in ascii that represents Unicode
4395 codepoints outside of the byte range, such as via \x{100}. If we
4396 detect such a sequence we have to convert the entire pattern to utf8
4397 and then recompile, as our sizing calculation will have been based
4398 on 1 byte == 1 character, but we will need to use utf8 to encode
4399 at least some part of the pattern, and therefore must convert the whole
4402 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4403 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4404 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4406 RExC_orig_utf8 = RExC_utf8 = 1;
4410 #ifdef TRIE_STUDY_OPT
4415 RExC_flags = pm_flags;
4419 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4420 RExC_seen_evals = 0;
4423 /* First pass: determine size, legality. */
4431 RExC_emit = &PL_regdummy;
4432 RExC_whilem_seen = 0;
4433 RExC_open_parens = NULL;
4434 RExC_close_parens = NULL;
4436 RExC_paren_names = NULL;
4438 RExC_paren_name_list = NULL;
4440 RExC_recurse = NULL;
4441 RExC_recurse_count = 0;
4443 #if 0 /* REGC() is (currently) a NOP at the first pass.
4444 * Clever compilers notice this and complain. --jhi */
4445 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4447 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4448 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4449 RExC_precomp = NULL;
4453 /* Here, finished first pass. Get rid of our setjmp, which we added for
4454 * efficiency only if the passed-in string wasn't in utf8, as shown by
4455 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4456 * 1 here even though the original string wasn't utf8, but in this case
4457 * there will have been a long jump */
4458 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4462 PerlIO_printf(Perl_debug_log,
4463 "Required size %"IVdf" nodes\n"
4464 "Starting second pass (creation)\n",
4467 RExC_lastparse=NULL;
4469 /* Small enough for pointer-storage convention?
4470 If extralen==0, this means that we will not need long jumps. */
4471 if (RExC_size >= 0x10000L && RExC_extralen)
4472 RExC_size += RExC_extralen;
4475 if (RExC_whilem_seen > 15)
4476 RExC_whilem_seen = 15;
4478 /* Allocate space and zero-initialize. Note, the two step process
4479 of zeroing when in debug mode, thus anything assigned has to
4480 happen after that */
4481 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4482 r = (struct regexp*)SvANY(rx);
4483 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4484 char, regexp_internal);
4485 if ( r == NULL || ri == NULL )
4486 FAIL("Regexp out of space");
4488 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4489 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4491 /* bulk initialize base fields with 0. */
4492 Zero(ri, sizeof(regexp_internal), char);
4495 /* non-zero initialization begins here */
4497 r->engine= RE_ENGINE_PTR;
4498 r->extflags = pm_flags;
4500 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4501 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4503 /* The caret is output if there are any defaults: if not all the STD
4504 * flags are set, or if no character set specifier is needed */
4506 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4508 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4509 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4510 >> RXf_PMf_STD_PMMOD_SHIFT);
4511 const char *fptr = STD_PAT_MODS; /*"msix"*/
4513 /* Allocate for the worst case, which is all the std flags are turned
4514 * on. If more precision is desired, we could do a population count of
4515 * the flags set. This could be done with a small lookup table, or by
4516 * shifting, masking and adding, or even, when available, assembly
4517 * language for a machine-language population count.
4518 * We never output a minus, as all those are defaults, so are
4519 * covered by the caret */
4520 const STRLEN wraplen = plen + has_p + has_runon
4521 + has_default /* If needs a caret */
4522 + has_charset /* If needs a character set specifier */
4523 + (sizeof(STD_PAT_MODS) - 1)
4524 + (sizeof("(?:)") - 1);
4526 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4528 SvFLAGS(rx) |= SvUTF8(pattern);
4531 /* If a default, cover it using the caret */
4533 *p++= DEFAULT_PAT_MOD;
4536 if (r->extflags & RXf_PMf_LOCALE) {
4537 *p++ = LOCALE_PAT_MOD;
4539 *p++ = UNICODE_PAT_MOD;
4543 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4546 while((ch = *fptr++)) {
4554 Copy(RExC_precomp, p, plen, char);
4555 assert ((RX_WRAPPED(rx) - p) < 16);
4556 r->pre_prefix = p - RX_WRAPPED(rx);
4562 SvCUR_set(rx, p - SvPVX_const(rx));
4566 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4568 if (RExC_seen & REG_SEEN_RECURSE) {
4569 Newxz(RExC_open_parens, RExC_npar,regnode *);
4570 SAVEFREEPV(RExC_open_parens);
4571 Newxz(RExC_close_parens,RExC_npar,regnode *);
4572 SAVEFREEPV(RExC_close_parens);
4575 /* Useful during FAIL. */
4576 #ifdef RE_TRACK_PATTERN_OFFSETS
4577 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4578 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4579 "%s %"UVuf" bytes for offset annotations.\n",
4580 ri->u.offsets ? "Got" : "Couldn't get",
4581 (UV)((2*RExC_size+1) * sizeof(U32))));
4583 SetProgLen(ri,RExC_size);
4587 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4589 /* Second pass: emit code. */
4590 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4595 RExC_emit_start = ri->program;
4596 RExC_emit = ri->program;
4597 RExC_emit_bound = ri->program + RExC_size + 1;
4599 /* Store the count of eval-groups for security checks: */
4600 RExC_rx->seen_evals = RExC_seen_evals;
4601 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4602 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4606 /* XXXX To minimize changes to RE engine we always allocate
4607 3-units-long substrs field. */
4608 Newx(r->substrs, 1, struct reg_substr_data);
4609 if (RExC_recurse_count) {
4610 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4611 SAVEFREEPV(RExC_recurse);
4615 r->minlen = minlen = sawplus = sawopen = 0;
4616 Zero(r->substrs, 1, struct reg_substr_data);
4618 #ifdef TRIE_STUDY_OPT
4620 StructCopy(&zero_scan_data, &data, scan_data_t);
4621 copyRExC_state = RExC_state;
4624 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4626 RExC_state = copyRExC_state;
4627 if (seen & REG_TOP_LEVEL_BRANCHES)
4628 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4630 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4631 if (data.last_found) {
4632 SvREFCNT_dec(data.longest_fixed);
4633 SvREFCNT_dec(data.longest_float);
4634 SvREFCNT_dec(data.last_found);
4636 StructCopy(&zero_scan_data, &data, scan_data_t);
4639 StructCopy(&zero_scan_data, &data, scan_data_t);
4642 /* Dig out information for optimizations. */
4643 r->extflags = RExC_flags; /* was pm_op */
4644 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4647 SvUTF8_on(rx); /* Unicode in it? */
4648 ri->regstclass = NULL;
4649 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4650 r->intflags |= PREGf_NAUGHTY;
4651 scan = ri->program + 1; /* First BRANCH. */
4653 /* testing for BRANCH here tells us whether there is "must appear"
4654 data in the pattern. If there is then we can use it for optimisations */
4655 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4657 STRLEN longest_float_length, longest_fixed_length;
4658 struct regnode_charclass_class ch_class; /* pointed to by data */
4660 I32 last_close = 0; /* pointed to by data */
4661 regnode *first= scan;
4662 regnode *first_next= regnext(first);
4665 * Skip introductions and multiplicators >= 1
4666 * so that we can extract the 'meat' of the pattern that must
4667 * match in the large if() sequence following.
4668 * NOTE that EXACT is NOT covered here, as it is normally
4669 * picked up by the optimiser separately.
4671 * This is unfortunate as the optimiser isnt handling lookahead
4672 * properly currently.
4675 while ((OP(first) == OPEN && (sawopen = 1)) ||
4676 /* An OR of *one* alternative - should not happen now. */
4677 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4678 /* for now we can't handle lookbehind IFMATCH*/
4679 (OP(first) == IFMATCH && !first->flags) ||
4680 (OP(first) == PLUS) ||
4681 (OP(first) == MINMOD) ||
4682 /* An {n,m} with n>0 */
4683 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4684 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4687 * the only op that could be a regnode is PLUS, all the rest
4688 * will be regnode_1 or regnode_2.
4691 if (OP(first) == PLUS)
4694 first += regarglen[OP(first)];
4696 first = NEXTOPER(first);
4697 first_next= regnext(first);
4700 /* Starting-point info. */
4702 DEBUG_PEEP("first:",first,0);
4703 /* Ignore EXACT as we deal with it later. */
4704 if (PL_regkind[OP(first)] == EXACT) {
4705 if (OP(first) == EXACT)
4706 NOOP; /* Empty, get anchored substr later. */
4707 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4708 ri->regstclass = first;
4711 else if (PL_regkind[OP(first)] == TRIE &&
4712 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4715 /* this can happen only on restudy */
4716 if ( OP(first) == TRIE ) {
4717 struct regnode_1 *trieop = (struct regnode_1 *)
4718 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4719 StructCopy(first,trieop,struct regnode_1);
4720 trie_op=(regnode *)trieop;
4722 struct regnode_charclass *trieop = (struct regnode_charclass *)
4723 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4724 StructCopy(first,trieop,struct regnode_charclass);
4725 trie_op=(regnode *)trieop;
4728 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4729 ri->regstclass = trie_op;
4732 else if (REGNODE_SIMPLE(OP(first)))
4733 ri->regstclass = first;
4734 else if (PL_regkind[OP(first)] == BOUND ||
4735 PL_regkind[OP(first)] == NBOUND)
4736 ri->regstclass = first;
4737 else if (PL_regkind[OP(first)] == BOL) {
4738 r->extflags |= (OP(first) == MBOL
4740 : (OP(first) == SBOL
4743 first = NEXTOPER(first);
4746 else if (OP(first) == GPOS) {
4747 r->extflags |= RXf_ANCH_GPOS;
4748 first = NEXTOPER(first);
4751 else if ((!sawopen || !RExC_sawback) &&
4752 (OP(first) == STAR &&
4753 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4754 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4756 /* turn .* into ^.* with an implied $*=1 */
4758 (OP(NEXTOPER(first)) == REG_ANY)
4761 r->extflags |= type;
4762 r->intflags |= PREGf_IMPLICIT;
4763 first = NEXTOPER(first);
4766 if (sawplus && (!sawopen || !RExC_sawback)
4767 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4768 /* x+ must match at the 1st pos of run of x's */
4769 r->intflags |= PREGf_SKIP;
4771 /* Scan is after the zeroth branch, first is atomic matcher. */
4772 #ifdef TRIE_STUDY_OPT
4775 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4776 (IV)(first - scan + 1))
4780 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4781 (IV)(first - scan + 1))
4787 * If there's something expensive in the r.e., find the
4788 * longest literal string that must appear and make it the
4789 * regmust. Resolve ties in favor of later strings, since
4790 * the regstart check works with the beginning of the r.e.
4791 * and avoiding duplication strengthens checking. Not a
4792 * strong reason, but sufficient in the absence of others.
4793 * [Now we resolve ties in favor of the earlier string if
4794 * it happens that c_offset_min has been invalidated, since the
4795 * earlier string may buy us something the later one won't.]
4798 data.longest_fixed = newSVpvs("");
4799 data.longest_float = newSVpvs("");
4800 data.last_found = newSVpvs("");
4801 data.longest = &(data.longest_fixed);
4803 if (!ri->regstclass) {
4804 cl_init(pRExC_state, &ch_class);
4805 data.start_class = &ch_class;
4806 stclass_flag = SCF_DO_STCLASS_AND;
4807 } else /* XXXX Check for BOUND? */
4809 data.last_closep = &last_close;
4811 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4812 &data, -1, NULL, NULL,
4813 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4819 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4820 && data.last_start_min == 0 && data.last_end > 0
4821 && !RExC_seen_zerolen
4822 && !(RExC_seen & REG_SEEN_VERBARG)
4823 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4824 r->extflags |= RXf_CHECK_ALL;
4825 scan_commit(pRExC_state, &data,&minlen,0);
4826 SvREFCNT_dec(data.last_found);
4828 /* Note that code very similar to this but for anchored string
4829 follows immediately below, changes may need to be made to both.
4832 longest_float_length = CHR_SVLEN(data.longest_float);
4833 if (longest_float_length
4834 || (data.flags & SF_FL_BEFORE_EOL
4835 && (!(data.flags & SF_FL_BEFORE_MEOL)
4836 || (RExC_flags & RXf_PMf_MULTILINE))))
4840 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4841 && data.offset_fixed == data.offset_float_min
4842 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4843 goto remove_float; /* As in (a)+. */
4845 /* copy the information about the longest float from the reg_scan_data
4846 over to the program. */
4847 if (SvUTF8(data.longest_float)) {
4848 r->float_utf8 = data.longest_float;
4849 r->float_substr = NULL;
4851 r->float_substr = data.longest_float;
4852 r->float_utf8 = NULL;
4854 /* float_end_shift is how many chars that must be matched that
4855 follow this item. We calculate it ahead of time as once the
4856 lookbehind offset is added in we lose the ability to correctly
4858 ml = data.minlen_float ? *(data.minlen_float)
4859 : (I32)longest_float_length;
4860 r->float_end_shift = ml - data.offset_float_min
4861 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4862 + data.lookbehind_float;
4863 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4864 r->float_max_offset = data.offset_float_max;
4865 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4866 r->float_max_offset -= data.lookbehind_float;
4868 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4869 && (!(data.flags & SF_FL_BEFORE_MEOL)
4870 || (RExC_flags & RXf_PMf_MULTILINE)));
4871 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4875 r->float_substr = r->float_utf8 = NULL;
4876 SvREFCNT_dec(data.longest_float);
4877 longest_float_length = 0;
4880 /* Note that code very similar to this but for floating string
4881 is immediately above, changes may need to be made to both.
4884 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4885 if (longest_fixed_length
4886 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4887 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4888 || (RExC_flags & RXf_PMf_MULTILINE))))
4892 /* copy the information about the longest fixed
4893 from the reg_scan_data over to the program. */
4894 if (SvUTF8(data.longest_fixed)) {
4895 r->anchored_utf8 = data.longest_fixed;
4896 r->anchored_substr = NULL;
4898 r->anchored_substr = data.longest_fixed;
4899 r->anchored_utf8 = NULL;
4901 /* fixed_end_shift is how many chars that must be matched that
4902 follow this item. We calculate it ahead of time as once the
4903 lookbehind offset is added in we lose the ability to correctly
4905 ml = data.minlen_fixed ? *(data.minlen_fixed)
4906 : (I32)longest_fixed_length;
4907 r->anchored_end_shift = ml - data.offset_fixed
4908 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4909 + data.lookbehind_fixed;
4910 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4912 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4913 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4914 || (RExC_flags & RXf_PMf_MULTILINE)));
4915 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4918 r->anchored_substr = r->anchored_utf8 = NULL;
4919 SvREFCNT_dec(data.longest_fixed);
4920 longest_fixed_length = 0;
4923 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4924 ri->regstclass = NULL;
4925 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4927 && !(data.start_class->flags & ANYOF_EOS)
4928 && !cl_is_anything(data.start_class))
4930 const U32 n = add_data(pRExC_state, 1, "f");
4932 Newx(RExC_rxi->data->data[n], 1,
4933 struct regnode_charclass_class);
4934 StructCopy(data.start_class,
4935 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4936 struct regnode_charclass_class);
4937 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4938 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4939 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4940 regprop(r, sv, (regnode*)data.start_class);
4941 PerlIO_printf(Perl_debug_log,
4942 "synthetic stclass \"%s\".\n",
4943 SvPVX_const(sv));});
4946 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4947 if (longest_fixed_length > longest_float_length) {
4948 r->check_end_shift = r->anchored_end_shift;
4949 r->check_substr = r->anchored_substr;
4950 r->check_utf8 = r->anchored_utf8;
4951 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4952 if (r->extflags & RXf_ANCH_SINGLE)
4953 r->extflags |= RXf_NOSCAN;
4956 r->check_end_shift = r->float_end_shift;
4957 r->check_substr = r->float_substr;
4958 r->check_utf8 = r->float_utf8;
4959 r->check_offset_min = r->float_min_offset;
4960 r->check_offset_max = r->float_max_offset;
4962 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4963 This should be changed ASAP! */
4964 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4965 r->extflags |= RXf_USE_INTUIT;
4966 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4967 r->extflags |= RXf_INTUIT_TAIL;
4969 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4970 if ( (STRLEN)minlen < longest_float_length )
4971 minlen= longest_float_length;
4972 if ( (STRLEN)minlen < longest_fixed_length )
4973 minlen= longest_fixed_length;
4977 /* Several toplevels. Best we can is to set minlen. */
4979 struct regnode_charclass_class ch_class;
4982 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4984 scan = ri->program + 1;
4985 cl_init(pRExC_state, &ch_class);
4986 data.start_class = &ch_class;
4987 data.last_closep = &last_close;
4990 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4991 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4995 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4996 = r->float_substr = r->float_utf8 = NULL;
4997 if (!(data.start_class->flags & ANYOF_EOS)
4998 && !cl_is_anything(data.start_class))
5000 const U32 n = add_data(pRExC_state, 1, "f");
5002 Newx(RExC_rxi->data->data[n], 1,
5003 struct regnode_charclass_class);
5004 StructCopy(data.start_class,
5005 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5006 struct regnode_charclass_class);
5007 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5008 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5009 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5010 regprop(r, sv, (regnode*)data.start_class);
5011 PerlIO_printf(Perl_debug_log,
5012 "synthetic stclass \"%s\".\n",
5013 SvPVX_const(sv));});
5017 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5018 the "real" pattern. */
5020 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5021 (IV)minlen, (IV)r->minlen);
5023 r->minlenret = minlen;
5024 if (r->minlen < minlen)
5027 if (RExC_seen & REG_SEEN_GPOS)
5028 r->extflags |= RXf_GPOS_SEEN;
5029 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5030 r->extflags |= RXf_LOOKBEHIND_SEEN;
5031 if (RExC_seen & REG_SEEN_EVAL)
5032 r->extflags |= RXf_EVAL_SEEN;
5033 if (RExC_seen & REG_SEEN_CANY)
5034 r->extflags |= RXf_CANY_SEEN;
5035 if (RExC_seen & REG_SEEN_VERBARG)
5036 r->intflags |= PREGf_VERBARG_SEEN;
5037 if (RExC_seen & REG_SEEN_CUTGROUP)
5038 r->intflags |= PREGf_CUTGROUP_SEEN;
5039 if (RExC_paren_names)
5040 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5042 RXp_PAREN_NAMES(r) = NULL;
5044 #ifdef STUPID_PATTERN_CHECKS
5045 if (RX_PRELEN(rx) == 0)
5046 r->extflags |= RXf_NULL;
5047 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5048 /* XXX: this should happen BEFORE we compile */
5049 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5050 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5051 r->extflags |= RXf_WHITE;
5052 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5053 r->extflags |= RXf_START_ONLY;
5055 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5056 /* XXX: this should happen BEFORE we compile */
5057 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5059 regnode *first = ri->program + 1;
5061 U8 nop = OP(NEXTOPER(first));
5063 if (PL_regkind[fop] == NOTHING && nop == END)
5064 r->extflags |= RXf_NULL;
5065 else if (PL_regkind[fop] == BOL && nop == END)
5066 r->extflags |= RXf_START_ONLY;
5067 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5068 r->extflags |= RXf_WHITE;
5072 if (RExC_paren_names) {
5073 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5074 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5077 ri->name_list_idx = 0;
5079 if (RExC_recurse_count) {
5080 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5081 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5082 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5085 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5086 /* assume we don't need to swap parens around before we match */
5089 PerlIO_printf(Perl_debug_log,"Final program:\n");
5092 #ifdef RE_TRACK_PATTERN_OFFSETS
5093 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5094 const U32 len = ri->u.offsets[0];
5096 GET_RE_DEBUG_FLAGS_DECL;
5097 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5098 for (i = 1; i <= len; i++) {
5099 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5100 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5101 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5103 PerlIO_printf(Perl_debug_log, "\n");
5109 #undef RE_ENGINE_PTR
5113 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5116 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5118 PERL_UNUSED_ARG(value);
5120 if (flags & RXapif_FETCH) {
5121 return reg_named_buff_fetch(rx, key, flags);
5122 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5123 Perl_croak_no_modify(aTHX);
5125 } else if (flags & RXapif_EXISTS) {
5126 return reg_named_buff_exists(rx, key, flags)
5129 } else if (flags & RXapif_REGNAMES) {
5130 return reg_named_buff_all(rx, flags);
5131 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5132 return reg_named_buff_scalar(rx, flags);
5134 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5140 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5143 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5144 PERL_UNUSED_ARG(lastkey);
5146 if (flags & RXapif_FIRSTKEY)
5147 return reg_named_buff_firstkey(rx, flags);
5148 else if (flags & RXapif_NEXTKEY)
5149 return reg_named_buff_nextkey(rx, flags);
5151 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5157 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5160 AV *retarray = NULL;
5162 struct regexp *const rx = (struct regexp *)SvANY(r);
5164 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5166 if (flags & RXapif_ALL)
5169 if (rx && RXp_PAREN_NAMES(rx)) {
5170 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5173 SV* sv_dat=HeVAL(he_str);
5174 I32 *nums=(I32*)SvPVX(sv_dat);
5175 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5176 if ((I32)(rx->nparens) >= nums[i]
5177 && rx->offs[nums[i]].start != -1
5178 && rx->offs[nums[i]].end != -1)
5181 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5185 ret = newSVsv(&PL_sv_undef);
5188 av_push(retarray, ret);
5191 return newRV_noinc(MUTABLE_SV(retarray));
5198 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5201 struct regexp *const rx = (struct regexp *)SvANY(r);
5203 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5205 if (rx && RXp_PAREN_NAMES(rx)) {
5206 if (flags & RXapif_ALL) {
5207 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5209 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5223 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5225 struct regexp *const rx = (struct regexp *)SvANY(r);
5227 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5229 if ( rx && RXp_PAREN_NAMES(rx) ) {
5230 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5232 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5239 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5241 struct regexp *const rx = (struct regexp *)SvANY(r);
5242 GET_RE_DEBUG_FLAGS_DECL;
5244 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5246 if (rx && RXp_PAREN_NAMES(rx)) {
5247 HV *hv = RXp_PAREN_NAMES(rx);
5249 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5252 SV* sv_dat = HeVAL(temphe);
5253 I32 *nums = (I32*)SvPVX(sv_dat);
5254 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5255 if ((I32)(rx->lastparen) >= nums[i] &&
5256 rx->offs[nums[i]].start != -1 &&
5257 rx->offs[nums[i]].end != -1)
5263 if (parno || flags & RXapif_ALL) {
5264 return newSVhek(HeKEY_hek(temphe));
5272 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5277 struct regexp *const rx = (struct regexp *)SvANY(r);
5279 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5281 if (rx && RXp_PAREN_NAMES(rx)) {
5282 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5283 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5284 } else if (flags & RXapif_ONE) {
5285 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5286 av = MUTABLE_AV(SvRV(ret));
5287 length = av_len(av);
5289 return newSViv(length + 1);
5291 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5295 return &PL_sv_undef;
5299 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5301 struct regexp *const rx = (struct regexp *)SvANY(r);
5304 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5306 if (rx && RXp_PAREN_NAMES(rx)) {
5307 HV *hv= RXp_PAREN_NAMES(rx);
5309 (void)hv_iterinit(hv);
5310 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5313 SV* sv_dat = HeVAL(temphe);
5314 I32 *nums = (I32*)SvPVX(sv_dat);
5315 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5316 if ((I32)(rx->lastparen) >= nums[i] &&
5317 rx->offs[nums[i]].start != -1 &&
5318 rx->offs[nums[i]].end != -1)
5324 if (parno || flags & RXapif_ALL) {
5325 av_push(av, newSVhek(HeKEY_hek(temphe)));
5330 return newRV_noinc(MUTABLE_SV(av));
5334 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5337 struct regexp *const rx = (struct regexp *)SvANY(r);
5342 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5345 sv_setsv(sv,&PL_sv_undef);
5349 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5351 i = rx->offs[0].start;
5355 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5357 s = rx->subbeg + rx->offs[0].end;
5358 i = rx->sublen - rx->offs[0].end;
5361 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5362 (s1 = rx->offs[paren].start) != -1 &&
5363 (t1 = rx->offs[paren].end) != -1)
5367 s = rx->subbeg + s1;
5369 sv_setsv(sv,&PL_sv_undef);
5372 assert(rx->sublen >= (s - rx->subbeg) + i );
5374 const int oldtainted = PL_tainted;
5376 sv_setpvn(sv, s, i);
5377 PL_tainted = oldtainted;
5378 if ( (rx->extflags & RXf_CANY_SEEN)
5379 ? (RXp_MATCH_UTF8(rx)
5380 && (!i || is_utf8_string((U8*)s, i)))
5381 : (RXp_MATCH_UTF8(rx)) )
5388 if (RXp_MATCH_TAINTED(rx)) {
5389 if (SvTYPE(sv) >= SVt_PVMG) {
5390 MAGIC* const mg = SvMAGIC(sv);
5393 SvMAGIC_set(sv, mg->mg_moremagic);
5395 if ((mgt = SvMAGIC(sv))) {
5396 mg->mg_moremagic = mgt;
5397 SvMAGIC_set(sv, mg);
5407 sv_setsv(sv,&PL_sv_undef);
5413 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5414 SV const * const value)
5416 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5418 PERL_UNUSED_ARG(rx);
5419 PERL_UNUSED_ARG(paren);
5420 PERL_UNUSED_ARG(value);
5423 Perl_croak_no_modify(aTHX);
5427 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5430 struct regexp *const rx = (struct regexp *)SvANY(r);
5434 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5436 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5438 /* $` / ${^PREMATCH} */
5439 case RX_BUFF_IDX_PREMATCH:
5440 if (rx->offs[0].start != -1) {
5441 i = rx->offs[0].start;
5449 /* $' / ${^POSTMATCH} */
5450 case RX_BUFF_IDX_POSTMATCH:
5451 if (rx->offs[0].end != -1) {
5452 i = rx->sublen - rx->offs[0].end;
5454 s1 = rx->offs[0].end;
5460 /* $& / ${^MATCH}, $1, $2, ... */
5462 if (paren <= (I32)rx->nparens &&
5463 (s1 = rx->offs[paren].start) != -1 &&
5464 (t1 = rx->offs[paren].end) != -1)
5469 if (ckWARN(WARN_UNINITIALIZED))
5470 report_uninit((const SV *)sv);
5475 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5476 const char * const s = rx->subbeg + s1;
5481 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5488 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5490 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5491 PERL_UNUSED_ARG(rx);
5495 return newSVpvs("Regexp");
5498 /* Scans the name of a named buffer from the pattern.
5499 * If flags is REG_RSN_RETURN_NULL returns null.
5500 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5501 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5502 * to the parsed name as looked up in the RExC_paren_names hash.
5503 * If there is an error throws a vFAIL().. type exception.
5506 #define REG_RSN_RETURN_NULL 0
5507 #define REG_RSN_RETURN_NAME 1
5508 #define REG_RSN_RETURN_DATA 2
5511 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5513 char *name_start = RExC_parse;
5515 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5517 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5518 /* skip IDFIRST by using do...while */
5521 RExC_parse += UTF8SKIP(RExC_parse);
5522 } while (isALNUM_utf8((U8*)RExC_parse));
5526 } while (isALNUM(*RExC_parse));
5531 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5532 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5533 if ( flags == REG_RSN_RETURN_NAME)
5535 else if (flags==REG_RSN_RETURN_DATA) {
5538 if ( ! sv_name ) /* should not happen*/
5539 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5540 if (RExC_paren_names)
5541 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5543 sv_dat = HeVAL(he_str);
5545 vFAIL("Reference to nonexistent named group");
5549 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5556 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5557 int rem=(int)(RExC_end - RExC_parse); \
5566 if (RExC_lastparse!=RExC_parse) \
5567 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5570 iscut ? "..." : "<" \
5573 PerlIO_printf(Perl_debug_log,"%16s",""); \
5576 num = RExC_size + 1; \
5578 num=REG_NODE_NUM(RExC_emit); \
5579 if (RExC_lastnum!=num) \
5580 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5582 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5583 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5584 (int)((depth*2)), "", \
5588 RExC_lastparse=RExC_parse; \
5593 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5594 DEBUG_PARSE_MSG((funcname)); \
5595 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5597 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5598 DEBUG_PARSE_MSG((funcname)); \
5599 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5602 - reg - regular expression, i.e. main body or parenthesized thing
5604 * Caller must absorb opening parenthesis.
5606 * Combining parenthesis handling with the base level of regular expression
5607 * is a trifle forced, but the need to tie the tails of the branches to what
5608 * follows makes it hard to avoid.
5610 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5612 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5614 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5618 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5619 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5622 register regnode *ret; /* Will be the head of the group. */
5623 register regnode *br;
5624 register regnode *lastbr;
5625 register regnode *ender = NULL;
5626 register I32 parno = 0;
5628 U32 oregflags = RExC_flags;
5629 bool have_branch = 0;
5631 I32 freeze_paren = 0;
5632 I32 after_freeze = 0;
5634 /* for (?g), (?gc), and (?o) warnings; warning
5635 about (?c) will warn about (?g) -- japhy */
5637 #define WASTED_O 0x01
5638 #define WASTED_G 0x02
5639 #define WASTED_C 0x04
5640 #define WASTED_GC (0x02|0x04)
5641 I32 wastedflags = 0x00;
5643 char * parse_start = RExC_parse; /* MJD */
5644 char * const oregcomp_parse = RExC_parse;
5646 GET_RE_DEBUG_FLAGS_DECL;
5648 PERL_ARGS_ASSERT_REG;
5649 DEBUG_PARSE("reg ");
5651 *flagp = 0; /* Tentatively. */
5654 /* Make an OPEN node, if parenthesized. */
5656 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5657 char *start_verb = RExC_parse;
5658 STRLEN verb_len = 0;
5659 char *start_arg = NULL;
5660 unsigned char op = 0;
5662 int internal_argval = 0; /* internal_argval is only useful if !argok */
5663 while ( *RExC_parse && *RExC_parse != ')' ) {
5664 if ( *RExC_parse == ':' ) {
5665 start_arg = RExC_parse + 1;
5671 verb_len = RExC_parse - start_verb;
5674 while ( *RExC_parse && *RExC_parse != ')' )
5676 if ( *RExC_parse != ')' )
5677 vFAIL("Unterminated verb pattern argument");
5678 if ( RExC_parse == start_arg )
5681 if ( *RExC_parse != ')' )
5682 vFAIL("Unterminated verb pattern");
5685 switch ( *start_verb ) {
5686 case 'A': /* (*ACCEPT) */
5687 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5689 internal_argval = RExC_nestroot;
5692 case 'C': /* (*COMMIT) */
5693 if ( memEQs(start_verb,verb_len,"COMMIT") )
5696 case 'F': /* (*FAIL) */
5697 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5702 case ':': /* (*:NAME) */
5703 case 'M': /* (*MARK:NAME) */
5704 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5709 case 'P': /* (*PRUNE) */
5710 if ( memEQs(start_verb,verb_len,"PRUNE") )
5713 case 'S': /* (*SKIP) */
5714 if ( memEQs(start_verb,verb_len,"SKIP") )
5717 case 'T': /* (*THEN) */
5718 /* [19:06] <TimToady> :: is then */
5719 if ( memEQs(start_verb,verb_len,"THEN") ) {
5721 RExC_seen |= REG_SEEN_CUTGROUP;
5727 vFAIL3("Unknown verb pattern '%.*s'",
5728 verb_len, start_verb);
5731 if ( start_arg && internal_argval ) {
5732 vFAIL3("Verb pattern '%.*s' may not have an argument",
5733 verb_len, start_verb);
5734 } else if ( argok < 0 && !start_arg ) {
5735 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5736 verb_len, start_verb);
5738 ret = reganode(pRExC_state, op, internal_argval);
5739 if ( ! internal_argval && ! SIZE_ONLY ) {
5741 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5742 ARG(ret) = add_data( pRExC_state, 1, "S" );
5743 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5750 if (!internal_argval)
5751 RExC_seen |= REG_SEEN_VERBARG;
5752 } else if ( start_arg ) {
5753 vFAIL3("Verb pattern '%.*s' may not have an argument",
5754 verb_len, start_verb);
5756 ret = reg_node(pRExC_state, op);
5758 nextchar(pRExC_state);
5761 if (*RExC_parse == '?') { /* (?...) */
5762 bool is_logical = 0;
5763 const char * const seqstart = RExC_parse;
5764 bool has_use_defaults = FALSE;
5767 paren = *RExC_parse++;
5768 ret = NULL; /* For look-ahead/behind. */
5771 case 'P': /* (?P...) variants for those used to PCRE/Python */
5772 paren = *RExC_parse++;
5773 if ( paren == '<') /* (?P<...>) named capture */
5775 else if (paren == '>') { /* (?P>name) named recursion */
5776 goto named_recursion;
5778 else if (paren == '=') { /* (?P=...) named backref */
5779 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5780 you change this make sure you change that */
5781 char* name_start = RExC_parse;
5783 SV *sv_dat = reg_scan_name(pRExC_state,
5784 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5785 if (RExC_parse == name_start || *RExC_parse != ')')
5786 vFAIL2("Sequence %.3s... not terminated",parse_start);
5789 num = add_data( pRExC_state, 1, "S" );
5790 RExC_rxi->data->data[num]=(void*)sv_dat;
5791 SvREFCNT_inc_simple_void(sv_dat);
5794 ret = reganode(pRExC_state,
5795 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5799 Set_Node_Offset(ret, parse_start+1);
5800 Set_Node_Cur_Length(ret); /* MJD */
5802 nextchar(pRExC_state);
5806 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5808 case '<': /* (?<...) */
5809 if (*RExC_parse == '!')
5811 else if (*RExC_parse != '=')
5817 case '\'': /* (?'...') */
5818 name_start= RExC_parse;
5819 svname = reg_scan_name(pRExC_state,
5820 SIZE_ONLY ? /* reverse test from the others */
5821 REG_RSN_RETURN_NAME :
5822 REG_RSN_RETURN_NULL);
5823 if (RExC_parse == name_start) {
5825 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5828 if (*RExC_parse != paren)
5829 vFAIL2("Sequence (?%c... not terminated",
5830 paren=='>' ? '<' : paren);
5834 if (!svname) /* shouldnt happen */
5836 "panic: reg_scan_name returned NULL");
5837 if (!RExC_paren_names) {
5838 RExC_paren_names= newHV();
5839 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5841 RExC_paren_name_list= newAV();
5842 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5845 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5847 sv_dat = HeVAL(he_str);
5849 /* croak baby croak */
5851 "panic: paren_name hash element allocation failed");
5852 } else if ( SvPOK(sv_dat) ) {
5853 /* (?|...) can mean we have dupes so scan to check
5854 its already been stored. Maybe a flag indicating
5855 we are inside such a construct would be useful,
5856 but the arrays are likely to be quite small, so
5857 for now we punt -- dmq */
5858 IV count = SvIV(sv_dat);
5859 I32 *pv = (I32*)SvPVX(sv_dat);
5861 for ( i = 0 ; i < count ; i++ ) {
5862 if ( pv[i] == RExC_npar ) {
5868 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5869 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5870 pv[count] = RExC_npar;
5871 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5874 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5875 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5877 SvIV_set(sv_dat, 1);
5880 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5881 SvREFCNT_dec(svname);
5884 /*sv_dump(sv_dat);*/
5886 nextchar(pRExC_state);
5888 goto capturing_parens;
5890 RExC_seen |= REG_SEEN_LOOKBEHIND;
5892 case '=': /* (?=...) */
5893 RExC_seen_zerolen++;
5895 case '!': /* (?!...) */
5896 RExC_seen_zerolen++;
5897 if (*RExC_parse == ')') {
5898 ret=reg_node(pRExC_state, OPFAIL);
5899 nextchar(pRExC_state);
5903 case '|': /* (?|...) */
5904 /* branch reset, behave like a (?:...) except that
5905 buffers in alternations share the same numbers */
5907 after_freeze = freeze_paren = RExC_npar;
5909 case ':': /* (?:...) */
5910 case '>': /* (?>...) */
5912 case '$': /* (?$...) */
5913 case '@': /* (?@...) */
5914 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5916 case '#': /* (?#...) */
5917 while (*RExC_parse && *RExC_parse != ')')
5919 if (*RExC_parse != ')')
5920 FAIL("Sequence (?#... not terminated");
5921 nextchar(pRExC_state);
5924 case '0' : /* (?0) */
5925 case 'R' : /* (?R) */
5926 if (*RExC_parse != ')')
5927 FAIL("Sequence (?R) not terminated");
5928 ret = reg_node(pRExC_state, GOSTART);
5929 *flagp |= POSTPONED;
5930 nextchar(pRExC_state);
5933 { /* named and numeric backreferences */
5935 case '&': /* (?&NAME) */
5936 parse_start = RExC_parse - 1;
5939 SV *sv_dat = reg_scan_name(pRExC_state,
5940 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5941 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5943 goto gen_recurse_regop;
5946 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5948 vFAIL("Illegal pattern");
5950 goto parse_recursion;
5952 case '-': /* (?-1) */
5953 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5954 RExC_parse--; /* rewind to let it be handled later */
5958 case '1': case '2': case '3': case '4': /* (?1) */
5959 case '5': case '6': case '7': case '8': case '9':
5962 num = atoi(RExC_parse);
5963 parse_start = RExC_parse - 1; /* MJD */
5964 if (*RExC_parse == '-')
5966 while (isDIGIT(*RExC_parse))
5968 if (*RExC_parse!=')')
5969 vFAIL("Expecting close bracket");
5972 if ( paren == '-' ) {
5974 Diagram of capture buffer numbering.
5975 Top line is the normal capture buffer numbers
5976 Botton line is the negative indexing as from
5980 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5984 num = RExC_npar + num;
5987 vFAIL("Reference to nonexistent group");
5989 } else if ( paren == '+' ) {
5990 num = RExC_npar + num - 1;
5993 ret = reganode(pRExC_state, GOSUB, num);
5995 if (num > (I32)RExC_rx->nparens) {
5997 vFAIL("Reference to nonexistent group");
5999 ARG2L_SET( ret, RExC_recurse_count++);
6001 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6002 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6006 RExC_seen |= REG_SEEN_RECURSE;
6007 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6008 Set_Node_Offset(ret, parse_start); /* MJD */
6010 *flagp |= POSTPONED;
6011 nextchar(pRExC_state);
6013 } /* named and numeric backreferences */
6016 case '?': /* (??...) */
6018 if (*RExC_parse != '{') {
6020 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6023 *flagp |= POSTPONED;
6024 paren = *RExC_parse++;
6026 case '{': /* (?{...}) */
6031 char *s = RExC_parse;
6033 RExC_seen_zerolen++;
6034 RExC_seen |= REG_SEEN_EVAL;
6035 while (count && (c = *RExC_parse)) {
6046 if (*RExC_parse != ')') {
6048 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6052 OP_4tree *sop, *rop;
6053 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6056 Perl_save_re_context(aTHX);
6057 rop = sv_compile_2op(sv, &sop, "re", &pad);
6058 sop->op_private |= OPpREFCOUNTED;
6059 /* re_dup will OpREFCNT_inc */
6060 OpREFCNT_set(sop, 1);
6063 n = add_data(pRExC_state, 3, "nop");
6064 RExC_rxi->data->data[n] = (void*)rop;
6065 RExC_rxi->data->data[n+1] = (void*)sop;
6066 RExC_rxi->data->data[n+2] = (void*)pad;
6069 else { /* First pass */
6070 if (PL_reginterp_cnt < ++RExC_seen_evals
6072 /* No compiled RE interpolated, has runtime
6073 components ===> unsafe. */
6074 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6075 if (PL_tainting && PL_tainted)
6076 FAIL("Eval-group in insecure regular expression");
6077 #if PERL_VERSION > 8
6078 if (IN_PERL_COMPILETIME)
6083 nextchar(pRExC_state);
6085 ret = reg_node(pRExC_state, LOGICAL);
6088 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6089 /* deal with the length of this later - MJD */
6092 ret = reganode(pRExC_state, EVAL, n);
6093 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6094 Set_Node_Offset(ret, parse_start);
6097 case '(': /* (?(?{...})...) and (?(?=...)...) */
6100 if (RExC_parse[0] == '?') { /* (?(?...)) */
6101 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6102 || RExC_parse[1] == '<'
6103 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6106 ret = reg_node(pRExC_state, LOGICAL);
6109 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6113 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6114 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6116 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6117 char *name_start= RExC_parse++;
6119 SV *sv_dat=reg_scan_name(pRExC_state,
6120 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6121 if (RExC_parse == name_start || *RExC_parse != ch)
6122 vFAIL2("Sequence (?(%c... not terminated",
6123 (ch == '>' ? '<' : ch));
6126 num = add_data( pRExC_state, 1, "S" );
6127 RExC_rxi->data->data[num]=(void*)sv_dat;
6128 SvREFCNT_inc_simple_void(sv_dat);
6130 ret = reganode(pRExC_state,NGROUPP,num);
6131 goto insert_if_check_paren;
6133 else if (RExC_parse[0] == 'D' &&
6134 RExC_parse[1] == 'E' &&
6135 RExC_parse[2] == 'F' &&
6136 RExC_parse[3] == 'I' &&
6137 RExC_parse[4] == 'N' &&
6138 RExC_parse[5] == 'E')
6140 ret = reganode(pRExC_state,DEFINEP,0);
6143 goto insert_if_check_paren;
6145 else if (RExC_parse[0] == 'R') {
6148 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6149 parno = atoi(RExC_parse++);
6150 while (isDIGIT(*RExC_parse))
6152 } else if (RExC_parse[0] == '&') {
6155 sv_dat = reg_scan_name(pRExC_state,
6156 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6157 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6159 ret = reganode(pRExC_state,INSUBP,parno);
6160 goto insert_if_check_paren;
6162 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6165 parno = atoi(RExC_parse++);
6167 while (isDIGIT(*RExC_parse))
6169 ret = reganode(pRExC_state, GROUPP, parno);
6171 insert_if_check_paren:
6172 if ((c = *nextchar(pRExC_state)) != ')')
6173 vFAIL("Switch condition not recognized");
6175 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6176 br = regbranch(pRExC_state, &flags, 1,depth+1);
6178 br = reganode(pRExC_state, LONGJMP, 0);
6180 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6181 c = *nextchar(pRExC_state);
6186 vFAIL("(?(DEFINE)....) does not allow branches");
6187 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6188 regbranch(pRExC_state, &flags, 1,depth+1);
6189 REGTAIL(pRExC_state, ret, lastbr);
6192 c = *nextchar(pRExC_state);
6197 vFAIL("Switch (?(condition)... contains too many branches");
6198 ender = reg_node(pRExC_state, TAIL);
6199 REGTAIL(pRExC_state, br, ender);
6201 REGTAIL(pRExC_state, lastbr, ender);
6202 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6205 REGTAIL(pRExC_state, ret, ender);
6206 RExC_size++; /* XXX WHY do we need this?!!
6207 For large programs it seems to be required
6208 but I can't figure out why. -- dmq*/
6212 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6216 RExC_parse--; /* for vFAIL to print correctly */
6217 vFAIL("Sequence (? incomplete");
6219 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6221 has_use_defaults = TRUE;
6222 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6223 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6227 parse_flags: /* (?i) */
6229 U32 posflags = 0, negflags = 0;
6230 U32 *flagsp = &posflags;
6231 bool has_charset_modifier = 0;
6233 while (*RExC_parse) {
6234 /* && strchr("iogcmsx", *RExC_parse) */
6235 /* (?g), (?gc) and (?o) are useless here
6236 and must be globally applied -- japhy */
6237 switch (*RExC_parse) {
6238 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6239 case LOCALE_PAT_MOD:
6240 if (has_charset_modifier || flagsp == &negflags) {
6241 goto fail_modifiers;
6243 *flagsp &= ~RXf_PMf_UNICODE;
6244 *flagsp |= RXf_PMf_LOCALE;
6245 has_charset_modifier = 1;
6247 case UNICODE_PAT_MOD:
6248 if (has_charset_modifier || flagsp == &negflags) {
6249 goto fail_modifiers;
6251 *flagsp &= ~RXf_PMf_LOCALE;
6252 *flagsp |= RXf_PMf_UNICODE;
6253 has_charset_modifier = 1;
6256 if (has_use_defaults
6257 || has_charset_modifier
6258 || flagsp == &negflags)
6260 goto fail_modifiers;
6262 *flagsp &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6263 has_charset_modifier = 1;
6265 case ONCE_PAT_MOD: /* 'o' */
6266 case GLOBAL_PAT_MOD: /* 'g' */
6267 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6268 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6269 if (! (wastedflags & wflagbit) ) {
6270 wastedflags |= wflagbit;
6273 "Useless (%s%c) - %suse /%c modifier",
6274 flagsp == &negflags ? "?-" : "?",
6276 flagsp == &negflags ? "don't " : "",
6283 case CONTINUE_PAT_MOD: /* 'c' */
6284 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6285 if (! (wastedflags & WASTED_C) ) {
6286 wastedflags |= WASTED_GC;
6289 "Useless (%sc) - %suse /gc modifier",
6290 flagsp == &negflags ? "?-" : "?",
6291 flagsp == &negflags ? "don't " : ""
6296 case KEEPCOPY_PAT_MOD: /* 'p' */
6297 if (flagsp == &negflags) {
6299 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6301 *flagsp |= RXf_PMf_KEEPCOPY;
6305 /* A flag is a default iff it is following a minus, so
6306 * if there is a minus, it means will be trying to
6307 * re-specify a default which is an error */
6308 if (has_use_defaults || flagsp == &negflags) {
6311 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6315 wastedflags = 0; /* reset so (?g-c) warns twice */
6321 RExC_flags |= posflags;
6322 RExC_flags &= ~negflags;
6324 oregflags |= posflags;
6325 oregflags &= ~negflags;
6327 nextchar(pRExC_state);
6338 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6343 }} /* one for the default block, one for the switch */
6350 ret = reganode(pRExC_state, OPEN, parno);
6353 RExC_nestroot = parno;
6354 if (RExC_seen & REG_SEEN_RECURSE
6355 && !RExC_open_parens[parno-1])
6357 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6358 "Setting open paren #%"IVdf" to %d\n",
6359 (IV)parno, REG_NODE_NUM(ret)));
6360 RExC_open_parens[parno-1]= ret;
6363 Set_Node_Length(ret, 1); /* MJD */
6364 Set_Node_Offset(ret, RExC_parse); /* MJD */
6372 /* Pick up the branches, linking them together. */
6373 parse_start = RExC_parse; /* MJD */
6374 br = regbranch(pRExC_state, &flags, 1,depth+1);
6377 if (RExC_npar > after_freeze)
6378 after_freeze = RExC_npar;
6379 RExC_npar = freeze_paren;
6382 /* branch_len = (paren != 0); */
6386 if (*RExC_parse == '|') {
6387 if (!SIZE_ONLY && RExC_extralen) {
6388 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6391 reginsert(pRExC_state, BRANCH, br, depth+1);
6392 Set_Node_Length(br, paren != 0);
6393 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6397 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6399 else if (paren == ':') {
6400 *flagp |= flags&SIMPLE;
6402 if (is_open) { /* Starts with OPEN. */
6403 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6405 else if (paren != '?') /* Not Conditional */
6407 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6409 while (*RExC_parse == '|') {
6410 if (!SIZE_ONLY && RExC_extralen) {
6411 ender = reganode(pRExC_state, LONGJMP,0);
6412 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6415 RExC_extralen += 2; /* Account for LONGJMP. */
6416 nextchar(pRExC_state);
6418 if (RExC_npar > after_freeze)
6419 after_freeze = RExC_npar;
6420 RExC_npar = freeze_paren;
6422 br = regbranch(pRExC_state, &flags, 0, depth+1);
6426 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6428 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6431 if (have_branch || paren != ':') {
6432 /* Make a closing node, and hook it on the end. */
6435 ender = reg_node(pRExC_state, TAIL);
6438 ender = reganode(pRExC_state, CLOSE, parno);
6439 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6440 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6441 "Setting close paren #%"IVdf" to %d\n",
6442 (IV)parno, REG_NODE_NUM(ender)));
6443 RExC_close_parens[parno-1]= ender;
6444 if (RExC_nestroot == parno)
6447 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6448 Set_Node_Length(ender,1); /* MJD */
6454 *flagp &= ~HASWIDTH;
6457 ender = reg_node(pRExC_state, SUCCEED);
6460 ender = reg_node(pRExC_state, END);
6462 assert(!RExC_opend); /* there can only be one! */
6467 REGTAIL(pRExC_state, lastbr, ender);
6469 if (have_branch && !SIZE_ONLY) {
6471 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6473 /* Hook the tails of the branches to the closing node. */
6474 for (br = ret; br; br = regnext(br)) {
6475 const U8 op = PL_regkind[OP(br)];
6477 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6479 else if (op == BRANCHJ) {
6480 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6488 static const char parens[] = "=!<,>";
6490 if (paren && (p = strchr(parens, paren))) {
6491 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6492 int flag = (p - parens) > 1;
6495 node = SUSPEND, flag = 0;
6496 reginsert(pRExC_state, node,ret, depth+1);
6497 Set_Node_Cur_Length(ret);
6498 Set_Node_Offset(ret, parse_start + 1);
6500 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6504 /* Check for proper termination. */
6506 RExC_flags = oregflags;
6507 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6508 RExC_parse = oregcomp_parse;
6509 vFAIL("Unmatched (");
6512 else if (!paren && RExC_parse < RExC_end) {
6513 if (*RExC_parse == ')') {
6515 vFAIL("Unmatched )");
6518 FAIL("Junk on end of regexp"); /* "Can't happen". */
6522 RExC_npar = after_freeze;
6527 - regbranch - one alternative of an | operator
6529 * Implements the concatenation operator.
6532 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6535 register regnode *ret;
6536 register regnode *chain = NULL;
6537 register regnode *latest;
6538 I32 flags = 0, c = 0;
6539 GET_RE_DEBUG_FLAGS_DECL;
6541 PERL_ARGS_ASSERT_REGBRANCH;
6543 DEBUG_PARSE("brnc");
6548 if (!SIZE_ONLY && RExC_extralen)
6549 ret = reganode(pRExC_state, BRANCHJ,0);
6551 ret = reg_node(pRExC_state, BRANCH);
6552 Set_Node_Length(ret, 1);
6556 if (!first && SIZE_ONLY)
6557 RExC_extralen += 1; /* BRANCHJ */
6559 *flagp = WORST; /* Tentatively. */
6562 nextchar(pRExC_state);
6563 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6565 latest = regpiece(pRExC_state, &flags,depth+1);
6566 if (latest == NULL) {
6567 if (flags & TRYAGAIN)
6571 else if (ret == NULL)
6573 *flagp |= flags&(HASWIDTH|POSTPONED);
6574 if (chain == NULL) /* First piece. */
6575 *flagp |= flags&SPSTART;
6578 REGTAIL(pRExC_state, chain, latest);
6583 if (chain == NULL) { /* Loop ran zero times. */
6584 chain = reg_node(pRExC_state, NOTHING);
6589 *flagp |= flags&SIMPLE;
6596 - regpiece - something followed by possible [*+?]
6598 * Note that the branching code sequences used for ? and the general cases
6599 * of * and + are somewhat optimized: they use the same NOTHING node as
6600 * both the endmarker for their branch list and the body of the last branch.
6601 * It might seem that this node could be dispensed with entirely, but the
6602 * endmarker role is not redundant.
6605 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6608 register regnode *ret;
6610 register char *next;
6612 const char * const origparse = RExC_parse;
6614 I32 max = REG_INFTY;
6616 const char *maxpos = NULL;
6617 GET_RE_DEBUG_FLAGS_DECL;
6619 PERL_ARGS_ASSERT_REGPIECE;
6621 DEBUG_PARSE("piec");
6623 ret = regatom(pRExC_state, &flags,depth+1);
6625 if (flags & TRYAGAIN)
6632 if (op == '{' && regcurly(RExC_parse)) {
6634 parse_start = RExC_parse; /* MJD */
6635 next = RExC_parse + 1;
6636 while (isDIGIT(*next) || *next == ',') {
6645 if (*next == '}') { /* got one */
6649 min = atoi(RExC_parse);
6653 maxpos = RExC_parse;
6655 if (!max && *maxpos != '0')
6656 max = REG_INFTY; /* meaning "infinity" */
6657 else if (max >= REG_INFTY)
6658 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6660 nextchar(pRExC_state);
6663 if ((flags&SIMPLE)) {
6664 RExC_naughty += 2 + RExC_naughty / 2;
6665 reginsert(pRExC_state, CURLY, ret, depth+1);
6666 Set_Node_Offset(ret, parse_start+1); /* MJD */
6667 Set_Node_Cur_Length(ret);
6670 regnode * const w = reg_node(pRExC_state, WHILEM);
6673 REGTAIL(pRExC_state, ret, w);
6674 if (!SIZE_ONLY && RExC_extralen) {
6675 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6676 reginsert(pRExC_state, NOTHING,ret, depth+1);
6677 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6679 reginsert(pRExC_state, CURLYX,ret, depth+1);
6681 Set_Node_Offset(ret, parse_start+1);
6682 Set_Node_Length(ret,
6683 op == '{' ? (RExC_parse - parse_start) : 1);
6685 if (!SIZE_ONLY && RExC_extralen)
6686 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6687 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6689 RExC_whilem_seen++, RExC_extralen += 3;
6690 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6699 vFAIL("Can't do {n,m} with n > m");
6701 ARG1_SET(ret, (U16)min);
6702 ARG2_SET(ret, (U16)max);
6714 #if 0 /* Now runtime fix should be reliable. */
6716 /* if this is reinstated, don't forget to put this back into perldiag:
6718 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6720 (F) The part of the regexp subject to either the * or + quantifier
6721 could match an empty string. The {#} shows in the regular
6722 expression about where the problem was discovered.
6726 if (!(flags&HASWIDTH) && op != '?')
6727 vFAIL("Regexp *+ operand could be empty");
6730 parse_start = RExC_parse;
6731 nextchar(pRExC_state);
6733 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6735 if (op == '*' && (flags&SIMPLE)) {
6736 reginsert(pRExC_state, STAR, ret, depth+1);
6740 else if (op == '*') {
6744 else if (op == '+' && (flags&SIMPLE)) {
6745 reginsert(pRExC_state, PLUS, ret, depth+1);
6749 else if (op == '+') {
6753 else if (op == '?') {
6758 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6759 ckWARN3reg(RExC_parse,
6760 "%.*s matches null string many times",
6761 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6765 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6766 nextchar(pRExC_state);
6767 reginsert(pRExC_state, MINMOD, ret, depth+1);
6768 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6770 #ifndef REG_ALLOW_MINMOD_SUSPEND
6773 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6775 nextchar(pRExC_state);
6776 ender = reg_node(pRExC_state, SUCCEED);
6777 REGTAIL(pRExC_state, ret, ender);
6778 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6780 ender = reg_node(pRExC_state, TAIL);
6781 REGTAIL(pRExC_state, ret, ender);
6785 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6787 vFAIL("Nested quantifiers");
6794 /* reg_namedseq(pRExC_state,UVp)
6796 This is expected to be called by a parser routine that has
6797 recognized '\N' and needs to handle the rest. RExC_parse is
6798 expected to point at the first char following the N at the time
6801 The \N may be inside (indicated by valuep not being NULL) or outside a
6804 \N may begin either a named sequence, or if outside a character class, mean
6805 to match a non-newline. For non single-quoted regexes, the tokenizer has
6806 attempted to decide which, and in the case of a named sequence converted it
6807 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6808 where c1... are the characters in the sequence. For single-quoted regexes,
6809 the tokenizer passes the \N sequence through unchanged; this code will not
6810 attempt to determine this nor expand those. The net effect is that if the
6811 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6812 signals that this \N occurrence means to match a non-newline.
6814 Only the \N{U+...} form should occur in a character class, for the same
6815 reason that '.' inside a character class means to just match a period: it
6816 just doesn't make sense.
6818 If valuep is non-null then it is assumed that we are parsing inside
6819 of a charclass definition and the first codepoint in the resolved
6820 string is returned via *valuep and the routine will return NULL.
6821 In this mode if a multichar string is returned from the charnames
6822 handler, a warning will be issued, and only the first char in the
6823 sequence will be examined. If the string returned is zero length
6824 then the value of *valuep is undefined and NON-NULL will
6825 be returned to indicate failure. (This will NOT be a valid pointer
6828 If valuep is null then it is assumed that we are parsing normal text and a
6829 new EXACT node is inserted into the program containing the resolved string,
6830 and a pointer to the new node is returned. But if the string is zero length
6831 a NOTHING node is emitted instead.
6833 On success RExC_parse is set to the char following the endbrace.
6834 Parsing failures will generate a fatal error via vFAIL(...)
6837 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6839 char * endbrace; /* '}' following the name */
6840 regnode *ret = NULL;
6842 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6846 GET_RE_DEBUG_FLAGS_DECL;
6848 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6852 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6853 * modifier. The other meaning does not */
6854 p = (RExC_flags & RXf_PMf_EXTENDED)
6855 ? regwhite( pRExC_state, RExC_parse )
6858 /* Disambiguate between \N meaning a named character versus \N meaning
6859 * [^\n]. The former is assumed when it can't be the latter. */
6860 if (*p != '{' || regcurly(p)) {
6863 /* no bare \N in a charclass */
6864 vFAIL("\\N in a character class must be a named character: \\N{...}");
6866 nextchar(pRExC_state);
6867 ret = reg_node(pRExC_state, REG_ANY);
6868 *flagp |= HASWIDTH|SIMPLE;
6871 Set_Node_Length(ret, 1); /* MJD */
6875 /* Here, we have decided it should be a named sequence */
6877 /* The test above made sure that the next real character is a '{', but
6878 * under the /x modifier, it could be separated by space (or a comment and
6879 * \n) and this is not allowed (for consistency with \x{...} and the
6880 * tokenizer handling of \N{NAME}). */
6881 if (*RExC_parse != '{') {
6882 vFAIL("Missing braces on \\N{}");
6885 RExC_parse++; /* Skip past the '{' */
6887 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6888 || ! (endbrace == RExC_parse /* nothing between the {} */
6889 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6890 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6892 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6893 vFAIL("\\N{NAME} must be resolved by the lexer");
6896 if (endbrace == RExC_parse) { /* empty: \N{} */
6898 RExC_parse = endbrace + 1;
6899 return reg_node(pRExC_state,NOTHING);
6903 ckWARNreg(RExC_parse,
6904 "Ignoring zero length \\N{} in character class"
6906 RExC_parse = endbrace + 1;
6909 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6912 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
6913 RExC_parse += 2; /* Skip past the 'U+' */
6915 if (valuep) { /* In a bracketed char class */
6916 /* We only pay attention to the first char of
6917 multichar strings being returned. I kinda wonder
6918 if this makes sense as it does change the behaviour
6919 from earlier versions, OTOH that behaviour was broken
6920 as well. XXX Solution is to recharacterize as
6921 [rest-of-class]|multi1|multi2... */
6923 STRLEN length_of_hex;
6924 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6925 | PERL_SCAN_DISALLOW_PREFIX
6926 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6928 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6929 if (endchar < endbrace) {
6930 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6933 length_of_hex = (STRLEN)(endchar - RExC_parse);
6934 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6936 /* The tokenizer should have guaranteed validity, but it's possible to
6937 * bypass it by using single quoting, so check */
6938 if (length_of_hex == 0
6939 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6941 RExC_parse += length_of_hex; /* Includes all the valid */
6942 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6943 ? UTF8SKIP(RExC_parse)
6945 /* Guard against malformed utf8 */
6946 if (RExC_parse >= endchar) RExC_parse = endchar;
6947 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6950 RExC_parse = endbrace + 1;
6951 if (endchar == endbrace) return NULL;
6953 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6955 else { /* Not a char class */
6956 char *s; /* String to put in generated EXACT node */
6957 STRLEN len = 0; /* Its current byte length */
6958 char *endchar; /* Points to '.' or '}' ending cur char in the input
6961 ret = reg_node(pRExC_state,
6962 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6965 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6966 * the input which is of the form now 'c1.c2.c3...}' until find the
6967 * ending brace or exceed length 255. The characters that exceed this
6968 * limit are dropped. The limit could be relaxed should it become
6969 * desirable by reparsing this as (?:\N{NAME}), so could generate
6970 * multiple EXACT nodes, as is done for just regular input. But this
6971 * is primarily a named character, and not intended to be a huge long
6972 * string, so 255 bytes should be good enough */
6974 STRLEN length_of_hex;
6975 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6976 | PERL_SCAN_DISALLOW_PREFIX
6977 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6978 UV cp; /* Ord of current character */
6980 /* Code points are separated by dots. If none, there is only one
6981 * code point, and is terminated by the brace */
6982 endchar = RExC_parse + strcspn(RExC_parse, ".}");
6984 /* The values are Unicode even on EBCDIC machines */
6985 length_of_hex = (STRLEN)(endchar - RExC_parse);
6986 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6987 if ( length_of_hex == 0
6988 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6990 RExC_parse += length_of_hex; /* Includes all the valid */
6991 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6992 ? UTF8SKIP(RExC_parse)
6994 /* Guard against malformed utf8 */
6995 if (RExC_parse >= endchar) RExC_parse = endchar;
6996 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6999 if (! FOLD) { /* Not folding, just append to the string */
7002 /* Quit before adding this character if would exceed limit */
7003 if (len + UNISKIP(cp) > U8_MAX) break;
7005 unilen = reguni(pRExC_state, cp, s);
7010 } else { /* Folding, output the folded equivalent */
7011 STRLEN foldlen,numlen;
7012 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7013 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7015 /* Quit before exceeding size limit */
7016 if (len + foldlen > U8_MAX) break;
7018 for (foldbuf = tmpbuf;
7022 cp = utf8_to_uvchr(foldbuf, &numlen);
7024 const STRLEN unilen = reguni(pRExC_state, cp, s);
7027 /* In EBCDIC the numlen and unilen can differ. */
7029 if (numlen >= foldlen)
7033 break; /* "Can't happen." */
7037 /* Point to the beginning of the next character in the sequence. */
7038 RExC_parse = endchar + 1;
7040 /* Quit if no more characters */
7041 if (RExC_parse >= endbrace) break;
7046 if (RExC_parse < endbrace) {
7047 ckWARNreg(RExC_parse - 1,
7048 "Using just the first characters returned by \\N{}");
7051 RExC_size += STR_SZ(len);
7054 RExC_emit += STR_SZ(len);
7057 RExC_parse = endbrace + 1;
7059 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7060 with malformed in t/re/pat_advanced.t */
7062 Set_Node_Cur_Length(ret); /* MJD */
7063 nextchar(pRExC_state);
7073 * It returns the code point in utf8 for the value in *encp.
7074 * value: a code value in the source encoding
7075 * encp: a pointer to an Encode object
7077 * If the result from Encode is not a single character,
7078 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7081 S_reg_recode(pTHX_ const char value, SV **encp)
7084 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7085 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7086 const STRLEN newlen = SvCUR(sv);
7087 UV uv = UNICODE_REPLACEMENT;
7089 PERL_ARGS_ASSERT_REG_RECODE;
7093 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7096 if (!newlen || numlen != newlen) {
7097 uv = UNICODE_REPLACEMENT;
7105 - regatom - the lowest level
7107 Try to identify anything special at the start of the pattern. If there
7108 is, then handle it as required. This may involve generating a single regop,
7109 such as for an assertion; or it may involve recursing, such as to
7110 handle a () structure.
7112 If the string doesn't start with something special then we gobble up
7113 as much literal text as we can.
7115 Once we have been able to handle whatever type of thing started the
7116 sequence, we return.
7118 Note: we have to be careful with escapes, as they can be both literal
7119 and special, and in the case of \10 and friends can either, depending
7120 on context. Specifically there are two seperate switches for handling
7121 escape sequences, with the one for handling literal escapes requiring
7122 a dummy entry for all of the special escapes that are actually handled
7127 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7130 register regnode *ret = NULL;
7132 char *parse_start = RExC_parse;
7133 GET_RE_DEBUG_FLAGS_DECL;
7134 DEBUG_PARSE("atom");
7135 *flagp = WORST; /* Tentatively. */
7137 PERL_ARGS_ASSERT_REGATOM;
7140 switch ((U8)*RExC_parse) {
7142 RExC_seen_zerolen++;
7143 nextchar(pRExC_state);
7144 if (RExC_flags & RXf_PMf_MULTILINE)
7145 ret = reg_node(pRExC_state, MBOL);
7146 else if (RExC_flags & RXf_PMf_SINGLELINE)
7147 ret = reg_node(pRExC_state, SBOL);
7149 ret = reg_node(pRExC_state, BOL);
7150 Set_Node_Length(ret, 1); /* MJD */
7153 nextchar(pRExC_state);
7155 RExC_seen_zerolen++;
7156 if (RExC_flags & RXf_PMf_MULTILINE)
7157 ret = reg_node(pRExC_state, MEOL);
7158 else if (RExC_flags & RXf_PMf_SINGLELINE)
7159 ret = reg_node(pRExC_state, SEOL);
7161 ret = reg_node(pRExC_state, EOL);
7162 Set_Node_Length(ret, 1); /* MJD */
7165 nextchar(pRExC_state);
7166 if (RExC_flags & RXf_PMf_SINGLELINE)
7167 ret = reg_node(pRExC_state, SANY);
7169 ret = reg_node(pRExC_state, REG_ANY);
7170 *flagp |= HASWIDTH|SIMPLE;
7172 Set_Node_Length(ret, 1); /* MJD */
7176 char * const oregcomp_parse = ++RExC_parse;
7177 ret = regclass(pRExC_state,depth+1);
7178 if (*RExC_parse != ']') {
7179 RExC_parse = oregcomp_parse;
7180 vFAIL("Unmatched [");
7182 nextchar(pRExC_state);
7183 *flagp |= HASWIDTH|SIMPLE;
7184 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7188 nextchar(pRExC_state);
7189 ret = reg(pRExC_state, 1, &flags,depth+1);
7191 if (flags & TRYAGAIN) {
7192 if (RExC_parse == RExC_end) {
7193 /* Make parent create an empty node if needed. */
7201 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7205 if (flags & TRYAGAIN) {
7209 vFAIL("Internal urp");
7210 /* Supposed to be caught earlier. */
7213 if (!regcurly(RExC_parse)) {
7222 vFAIL("Quantifier follows nothing");
7230 len=0; /* silence a spurious compiler warning */
7231 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7232 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7233 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7234 ret = reganode(pRExC_state, FOLDCHAR, cp);
7235 Set_Node_Length(ret, 1); /* MJD */
7236 nextchar(pRExC_state); /* kill whitespace under /x */
7244 This switch handles escape sequences that resolve to some kind
7245 of special regop and not to literal text. Escape sequnces that
7246 resolve to literal text are handled below in the switch marked
7249 Every entry in this switch *must* have a corresponding entry
7250 in the literal escape switch. However, the opposite is not
7251 required, as the default for this switch is to jump to the
7252 literal text handling code.
7254 switch ((U8)*++RExC_parse) {
7259 /* Special Escapes */
7261 RExC_seen_zerolen++;
7262 ret = reg_node(pRExC_state, SBOL);
7264 goto finish_meta_pat;
7266 ret = reg_node(pRExC_state, GPOS);
7267 RExC_seen |= REG_SEEN_GPOS;
7269 goto finish_meta_pat;
7271 RExC_seen_zerolen++;
7272 ret = reg_node(pRExC_state, KEEPS);
7274 /* XXX:dmq : disabling in-place substitution seems to
7275 * be necessary here to avoid cases of memory corruption, as
7276 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7278 RExC_seen |= REG_SEEN_LOOKBEHIND;
7279 goto finish_meta_pat;
7281 ret = reg_node(pRExC_state, SEOL);
7283 RExC_seen_zerolen++; /* Do not optimize RE away */
7284 goto finish_meta_pat;
7286 ret = reg_node(pRExC_state, EOS);
7288 RExC_seen_zerolen++; /* Do not optimize RE away */
7289 goto finish_meta_pat;
7291 ret = reg_node(pRExC_state, CANY);
7292 RExC_seen |= REG_SEEN_CANY;
7293 *flagp |= HASWIDTH|SIMPLE;
7294 goto finish_meta_pat;
7296 ret = reg_node(pRExC_state, CLUMP);
7298 goto finish_meta_pat;
7301 ret = reg_node(pRExC_state, (U8)(ALNUML));
7303 ret = reg_node(pRExC_state, (U8)(ALNUM));
7304 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7306 *flagp |= HASWIDTH|SIMPLE;
7307 goto finish_meta_pat;
7310 ret = reg_node(pRExC_state, (U8)(NALNUML));
7312 ret = reg_node(pRExC_state, (U8)(NALNUM));
7313 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7315 *flagp |= HASWIDTH|SIMPLE;
7316 goto finish_meta_pat;
7318 RExC_seen_zerolen++;
7319 RExC_seen |= REG_SEEN_LOOKBEHIND;
7321 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7323 ret = reg_node(pRExC_state, (U8)(BOUND));
7324 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7327 goto finish_meta_pat;
7329 RExC_seen_zerolen++;
7330 RExC_seen |= REG_SEEN_LOOKBEHIND;
7332 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7334 ret = reg_node(pRExC_state, (U8)(NBOUND));
7335 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7338 goto finish_meta_pat;
7341 ret = reg_node(pRExC_state, (U8)(SPACEL));
7343 ret = reg_node(pRExC_state, (U8)(SPACE));
7344 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7346 *flagp |= HASWIDTH|SIMPLE;
7347 goto finish_meta_pat;
7350 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7352 ret = reg_node(pRExC_state, (U8)(NSPACE));
7353 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7355 *flagp |= HASWIDTH|SIMPLE;
7356 goto finish_meta_pat;
7358 ret = reg_node(pRExC_state, DIGIT);
7359 *flagp |= HASWIDTH|SIMPLE;
7360 goto finish_meta_pat;
7362 ret = reg_node(pRExC_state, NDIGIT);
7363 *flagp |= HASWIDTH|SIMPLE;
7364 goto finish_meta_pat;
7366 ret = reg_node(pRExC_state, LNBREAK);
7367 *flagp |= HASWIDTH|SIMPLE;
7368 goto finish_meta_pat;
7370 ret = reg_node(pRExC_state, HORIZWS);
7371 *flagp |= HASWIDTH|SIMPLE;
7372 goto finish_meta_pat;
7374 ret = reg_node(pRExC_state, NHORIZWS);
7375 *flagp |= HASWIDTH|SIMPLE;
7376 goto finish_meta_pat;
7378 ret = reg_node(pRExC_state, VERTWS);
7379 *flagp |= HASWIDTH|SIMPLE;
7380 goto finish_meta_pat;
7382 ret = reg_node(pRExC_state, NVERTWS);
7383 *flagp |= HASWIDTH|SIMPLE;
7385 nextchar(pRExC_state);
7386 Set_Node_Length(ret, 2); /* MJD */
7391 char* const oldregxend = RExC_end;
7393 char* parse_start = RExC_parse - 2;
7396 if (RExC_parse[1] == '{') {
7397 /* a lovely hack--pretend we saw [\pX] instead */
7398 RExC_end = strchr(RExC_parse, '}');
7400 const U8 c = (U8)*RExC_parse;
7402 RExC_end = oldregxend;
7403 vFAIL2("Missing right brace on \\%c{}", c);
7408 RExC_end = RExC_parse + 2;
7409 if (RExC_end > oldregxend)
7410 RExC_end = oldregxend;
7414 ret = regclass(pRExC_state,depth+1);
7416 RExC_end = oldregxend;
7419 Set_Node_Offset(ret, parse_start + 2);
7420 Set_Node_Cur_Length(ret);
7421 nextchar(pRExC_state);
7422 *flagp |= HASWIDTH|SIMPLE;
7426 /* Handle \N and \N{NAME} here and not below because it can be
7427 multicharacter. join_exact() will join them up later on.
7428 Also this makes sure that things like /\N{BLAH}+/ and
7429 \N{BLAH} being multi char Just Happen. dmq*/
7431 ret= reg_namedseq(pRExC_state, NULL, flagp);
7433 case 'k': /* Handle \k<NAME> and \k'NAME' */
7436 char ch= RExC_parse[1];
7437 if (ch != '<' && ch != '\'' && ch != '{') {
7439 vFAIL2("Sequence %.2s... not terminated",parse_start);
7441 /* this pretty much dupes the code for (?P=...) in reg(), if
7442 you change this make sure you change that */
7443 char* name_start = (RExC_parse += 2);
7445 SV *sv_dat = reg_scan_name(pRExC_state,
7446 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7447 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7448 if (RExC_parse == name_start || *RExC_parse != ch)
7449 vFAIL2("Sequence %.3s... not terminated",parse_start);
7452 num = add_data( pRExC_state, 1, "S" );
7453 RExC_rxi->data->data[num]=(void*)sv_dat;
7454 SvREFCNT_inc_simple_void(sv_dat);
7458 ret = reganode(pRExC_state,
7459 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7463 /* override incorrect value set in reganode MJD */
7464 Set_Node_Offset(ret, parse_start+1);
7465 Set_Node_Cur_Length(ret); /* MJD */
7466 nextchar(pRExC_state);
7472 case '1': case '2': case '3': case '4':
7473 case '5': case '6': case '7': case '8': case '9':
7476 bool isg = *RExC_parse == 'g';
7481 if (*RExC_parse == '{') {
7485 if (*RExC_parse == '-') {
7489 if (hasbrace && !isDIGIT(*RExC_parse)) {
7490 if (isrel) RExC_parse--;
7492 goto parse_named_seq;
7494 num = atoi(RExC_parse);
7495 if (isg && num == 0)
7496 vFAIL("Reference to invalid group 0");
7498 num = RExC_npar - num;
7500 vFAIL("Reference to nonexistent or unclosed group");
7502 if (!isg && num > 9 && num >= RExC_npar)
7505 char * const parse_start = RExC_parse - 1; /* MJD */
7506 while (isDIGIT(*RExC_parse))
7508 if (parse_start == RExC_parse - 1)
7509 vFAIL("Unterminated \\g... pattern");
7511 if (*RExC_parse != '}')
7512 vFAIL("Unterminated \\g{...} pattern");
7516 if (num > (I32)RExC_rx->nparens)
7517 vFAIL("Reference to nonexistent group");
7520 ret = reganode(pRExC_state,
7521 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7525 /* override incorrect value set in reganode MJD */
7526 Set_Node_Offset(ret, parse_start+1);
7527 Set_Node_Cur_Length(ret); /* MJD */
7529 nextchar(pRExC_state);
7534 if (RExC_parse >= RExC_end)
7535 FAIL("Trailing \\");
7538 /* Do not generate "unrecognized" warnings here, we fall
7539 back into the quick-grab loop below */
7546 if (RExC_flags & RXf_PMf_EXTENDED) {
7547 if ( reg_skipcomment( pRExC_state ) )
7554 register STRLEN len;
7559 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7561 parse_start = RExC_parse - 1;
7567 ret = reg_node(pRExC_state,
7568 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7570 for (len = 0, p = RExC_parse - 1;
7571 len < 127 && p < RExC_end;
7574 char * const oldp = p;
7576 if (RExC_flags & RXf_PMf_EXTENDED)
7577 p = regwhite( pRExC_state, p );
7582 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7583 goto normal_default;
7593 /* Literal Escapes Switch
7595 This switch is meant to handle escape sequences that
7596 resolve to a literal character.
7598 Every escape sequence that represents something
7599 else, like an assertion or a char class, is handled
7600 in the switch marked 'Special Escapes' above in this
7601 routine, but also has an entry here as anything that
7602 isn't explicitly mentioned here will be treated as
7603 an unescaped equivalent literal.
7607 /* These are all the special escapes. */
7611 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7612 goto normal_default;
7613 case 'A': /* Start assertion */
7614 case 'b': case 'B': /* Word-boundary assertion*/
7615 case 'C': /* Single char !DANGEROUS! */
7616 case 'd': case 'D': /* digit class */
7617 case 'g': case 'G': /* generic-backref, pos assertion */
7618 case 'h': case 'H': /* HORIZWS */
7619 case 'k': case 'K': /* named backref, keep marker */
7620 case 'N': /* named char sequence */
7621 case 'p': case 'P': /* Unicode property */
7622 case 'R': /* LNBREAK */
7623 case 's': case 'S': /* space class */
7624 case 'v': case 'V': /* VERTWS */
7625 case 'w': case 'W': /* word class */
7626 case 'X': /* eXtended Unicode "combining character sequence" */
7627 case 'z': case 'Z': /* End of line/string assertion */
7631 /* Anything after here is an escape that resolves to a
7632 literal. (Except digits, which may or may not)
7651 ender = ASCII_TO_NATIVE('\033');
7655 ender = ASCII_TO_NATIVE('\007');
7660 STRLEN brace_len = len;
7662 const char* error_msg;
7664 bool valid = grok_bslash_o(p,
7671 RExC_parse = p; /* going to die anyway; point
7672 to exact spot of failure */
7679 if (PL_encoding && ender < 0x100) {
7680 goto recode_encoding;
7689 char* const e = strchr(p, '}');
7693 vFAIL("Missing right brace on \\x{}");
7696 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7697 | PERL_SCAN_DISALLOW_PREFIX;
7698 STRLEN numlen = e - p - 1;
7699 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7706 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7708 ender = grok_hex(p, &numlen, &flags, NULL);
7711 if (PL_encoding && ender < 0x100)
7712 goto recode_encoding;
7716 ender = grok_bslash_c(*p++, SIZE_ONLY);
7718 case '0': case '1': case '2': case '3':case '4':
7719 case '5': case '6': case '7': case '8':case '9':
7721 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7723 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7725 ender = grok_oct(p, &numlen, &flags, NULL);
7735 if (PL_encoding && ender < 0x100)
7736 goto recode_encoding;
7740 SV* enc = PL_encoding;
7741 ender = reg_recode((const char)(U8)ender, &enc);
7742 if (!enc && SIZE_ONLY)
7743 ckWARNreg(p, "Invalid escape in the specified encoding");
7749 FAIL("Trailing \\");
7752 if (!SIZE_ONLY&& isALPHA(*p))
7753 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7754 goto normal_default;
7759 if (UTF8_IS_START(*p) && UTF) {
7761 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7762 &numlen, UTF8_ALLOW_DEFAULT);
7769 if ( RExC_flags & RXf_PMf_EXTENDED)
7770 p = regwhite( pRExC_state, p );
7772 /* Prime the casefolded buffer. */
7773 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7775 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7780 /* Emit all the Unicode characters. */
7782 for (foldbuf = tmpbuf;
7784 foldlen -= numlen) {
7785 ender = utf8_to_uvchr(foldbuf, &numlen);
7787 const STRLEN unilen = reguni(pRExC_state, ender, s);
7790 /* In EBCDIC the numlen
7791 * and unilen can differ. */
7793 if (numlen >= foldlen)
7797 break; /* "Can't happen." */
7801 const STRLEN unilen = reguni(pRExC_state, ender, s);
7810 REGC((char)ender, s++);
7816 /* Emit all the Unicode characters. */
7818 for (foldbuf = tmpbuf;
7820 foldlen -= numlen) {
7821 ender = utf8_to_uvchr(foldbuf, &numlen);
7823 const STRLEN unilen = reguni(pRExC_state, ender, s);
7826 /* In EBCDIC the numlen
7827 * and unilen can differ. */
7829 if (numlen >= foldlen)
7837 const STRLEN unilen = reguni(pRExC_state, ender, s);
7846 REGC((char)ender, s++);
7850 Set_Node_Cur_Length(ret); /* MJD */
7851 nextchar(pRExC_state);
7853 /* len is STRLEN which is unsigned, need to copy to signed */
7856 vFAIL("Internal disaster");
7860 if (len == 1 && UNI_IS_INVARIANT(ender))
7864 RExC_size += STR_SZ(len);
7867 RExC_emit += STR_SZ(len);
7877 S_regwhite( RExC_state_t *pRExC_state, char *p )
7879 const char *e = RExC_end;
7881 PERL_ARGS_ASSERT_REGWHITE;
7886 else if (*p == '#') {
7895 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7903 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7904 Character classes ([:foo:]) can also be negated ([:^foo:]).
7905 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7906 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7907 but trigger failures because they are currently unimplemented. */
7909 #define POSIXCC_DONE(c) ((c) == ':')
7910 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7911 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7914 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7917 I32 namedclass = OOB_NAMEDCLASS;
7919 PERL_ARGS_ASSERT_REGPPOSIXCC;
7921 if (value == '[' && RExC_parse + 1 < RExC_end &&
7922 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7923 POSIXCC(UCHARAT(RExC_parse))) {
7924 const char c = UCHARAT(RExC_parse);
7925 char* const s = RExC_parse++;
7927 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7929 if (RExC_parse == RExC_end)
7930 /* Grandfather lone [:, [=, [. */
7933 const char* const t = RExC_parse++; /* skip over the c */
7936 if (UCHARAT(RExC_parse) == ']') {
7937 const char *posixcc = s + 1;
7938 RExC_parse++; /* skip over the ending ] */
7941 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7942 const I32 skip = t - posixcc;
7944 /* Initially switch on the length of the name. */
7947 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7948 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7951 /* Names all of length 5. */
7952 /* alnum alpha ascii blank cntrl digit graph lower
7953 print punct space upper */
7954 /* Offset 4 gives the best switch position. */
7955 switch (posixcc[4]) {
7957 if (memEQ(posixcc, "alph", 4)) /* alpha */
7958 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7961 if (memEQ(posixcc, "spac", 4)) /* space */
7962 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7965 if (memEQ(posixcc, "grap", 4)) /* graph */
7966 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7969 if (memEQ(posixcc, "asci", 4)) /* ascii */
7970 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7973 if (memEQ(posixcc, "blan", 4)) /* blank */
7974 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7977 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7978 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7981 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7982 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7985 if (memEQ(posixcc, "lowe", 4)) /* lower */
7986 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7987 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7988 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7991 if (memEQ(posixcc, "digi", 4)) /* digit */
7992 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7993 else if (memEQ(posixcc, "prin", 4)) /* print */
7994 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7995 else if (memEQ(posixcc, "punc", 4)) /* punct */
7996 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8001 if (memEQ(posixcc, "xdigit", 6))
8002 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8006 if (namedclass == OOB_NAMEDCLASS)
8007 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8009 assert (posixcc[skip] == ':');
8010 assert (posixcc[skip+1] == ']');
8011 } else if (!SIZE_ONLY) {
8012 /* [[=foo=]] and [[.foo.]] are still future. */
8014 /* adjust RExC_parse so the warning shows after
8016 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8018 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8021 /* Maternal grandfather:
8022 * "[:" ending in ":" but not in ":]" */
8032 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8036 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8038 if (POSIXCC(UCHARAT(RExC_parse))) {
8039 const char *s = RExC_parse;
8040 const char c = *s++;
8044 if (*s && c == *s && s[1] == ']') {
8046 "POSIX syntax [%c %c] belongs inside character classes",
8049 /* [[=foo=]] and [[.foo.]] are still future. */
8050 if (POSIXCC_NOTYET(c)) {
8051 /* adjust RExC_parse so the error shows after
8053 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8055 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8062 #define _C_C_T_(NAME,TEST,WORD) \
8065 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8067 for (value = 0; value < 256; value++) \
8069 ANYOF_BITMAP_SET(ret, value); \
8074 case ANYOF_N##NAME: \
8076 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8078 for (value = 0; value < 256; value++) \
8080 ANYOF_BITMAP_SET(ret, value); \
8086 /* Like above, but no locale test */
8087 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8089 for (value = 0; value < 256; value++) \
8091 ANYOF_BITMAP_SET(ret, value); \
8095 case ANYOF_N##NAME: \
8096 for (value = 0; value < 256; value++) \
8098 ANYOF_BITMAP_SET(ret, value); \
8103 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8104 * there are two tests passed in, to use depending on that. There aren't any
8105 * cases where the label is different from the name, so no need for that
8107 #define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD) \
8109 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8110 else if (UNI_SEMANTICS) { \
8111 for (value = 0; value < 256; value++) { \
8112 if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
8116 for (value = 0; value < 256; value++) { \
8117 if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
8123 case ANYOF_N##NAME: \
8124 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8125 else if (UNI_SEMANTICS) { \
8126 for (value = 0; value < 256; value++) { \
8127 if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
8131 for (value = 0; value < 256; value++) { \
8132 if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
8140 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8141 so that it is possible to override the option here without having to
8142 rebuild the entire core. as we are required to do if we change regcomp.h
8143 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8145 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8146 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8149 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8150 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8152 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8156 parse a class specification and produce either an ANYOF node that
8157 matches the pattern or if the pattern matches a single char only and
8158 that char is < 256 and we are case insensitive then we produce an
8163 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8166 register UV nextvalue;
8167 register IV prevvalue = OOB_UNICODE;
8168 register IV range = 0;
8169 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8170 register regnode *ret;
8173 char *rangebegin = NULL;
8174 bool need_class = 0;
8177 bool optimize_invert = TRUE;
8178 AV* unicode_alternate = NULL;
8180 UV literal_endpoint = 0;
8182 UV stored = 0; /* number of chars stored in the class */
8184 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8185 case we need to change the emitted regop to an EXACT. */
8186 const char * orig_parse = RExC_parse;
8187 GET_RE_DEBUG_FLAGS_DECL;
8189 PERL_ARGS_ASSERT_REGCLASS;
8191 PERL_UNUSED_ARG(depth);
8194 DEBUG_PARSE("clas");
8196 /* Assume we are going to generate an ANYOF node. */
8197 ret = reganode(pRExC_state, ANYOF, 0);
8200 ANYOF_FLAGS(ret) = 0;
8202 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
8206 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8210 RExC_size += ANYOF_SKIP;
8211 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8214 RExC_emit += ANYOF_SKIP;
8216 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8218 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8219 ANYOF_BITMAP_ZERO(ret);
8220 listsv = newSVpvs("# comment\n");
8223 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8225 if (!SIZE_ONLY && POSIXCC(nextvalue))
8226 checkposixcc(pRExC_state);
8228 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8229 if (UCHARAT(RExC_parse) == ']')
8233 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8237 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8240 rangebegin = RExC_parse;
8242 value = utf8n_to_uvchr((U8*)RExC_parse,
8243 RExC_end - RExC_parse,
8244 &numlen, UTF8_ALLOW_DEFAULT);
8245 RExC_parse += numlen;
8248 value = UCHARAT(RExC_parse++);
8250 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8251 if (value == '[' && POSIXCC(nextvalue))
8252 namedclass = regpposixcc(pRExC_state, value);
8253 else if (value == '\\') {
8255 value = utf8n_to_uvchr((U8*)RExC_parse,
8256 RExC_end - RExC_parse,
8257 &numlen, UTF8_ALLOW_DEFAULT);
8258 RExC_parse += numlen;
8261 value = UCHARAT(RExC_parse++);
8262 /* Some compilers cannot handle switching on 64-bit integer
8263 * values, therefore value cannot be an UV. Yes, this will
8264 * be a problem later if we want switch on Unicode.
8265 * A similar issue a little bit later when switching on
8266 * namedclass. --jhi */
8267 switch ((I32)value) {
8268 case 'w': namedclass = ANYOF_ALNUM; break;
8269 case 'W': namedclass = ANYOF_NALNUM; break;
8270 case 's': namedclass = ANYOF_SPACE; break;
8271 case 'S': namedclass = ANYOF_NSPACE; break;
8272 case 'd': namedclass = ANYOF_DIGIT; break;
8273 case 'D': namedclass = ANYOF_NDIGIT; break;
8274 case 'v': namedclass = ANYOF_VERTWS; break;
8275 case 'V': namedclass = ANYOF_NVERTWS; break;
8276 case 'h': namedclass = ANYOF_HORIZWS; break;
8277 case 'H': namedclass = ANYOF_NHORIZWS; break;
8278 case 'N': /* Handle \N{NAME} in class */
8280 /* We only pay attention to the first char of
8281 multichar strings being returned. I kinda wonder
8282 if this makes sense as it does change the behaviour
8283 from earlier versions, OTOH that behaviour was broken
8285 UV v; /* value is register so we cant & it /grrr */
8286 if (reg_namedseq(pRExC_state, &v, NULL)) {
8296 if (RExC_parse >= RExC_end)
8297 vFAIL2("Empty \\%c{}", (U8)value);
8298 if (*RExC_parse == '{') {
8299 const U8 c = (U8)value;
8300 e = strchr(RExC_parse++, '}');
8302 vFAIL2("Missing right brace on \\%c{}", c);
8303 while (isSPACE(UCHARAT(RExC_parse)))
8305 if (e == RExC_parse)
8306 vFAIL2("Empty \\%c{}", c);
8308 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8316 if (UCHARAT(RExC_parse) == '^') {
8319 value = value == 'p' ? 'P' : 'p'; /* toggle */
8320 while (isSPACE(UCHARAT(RExC_parse))) {
8325 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8326 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8329 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8330 namedclass = ANYOF_MAX; /* no official name, but it's named */
8333 case 'n': value = '\n'; break;
8334 case 'r': value = '\r'; break;
8335 case 't': value = '\t'; break;
8336 case 'f': value = '\f'; break;
8337 case 'b': value = '\b'; break;
8338 case 'e': value = ASCII_TO_NATIVE('\033');break;
8339 case 'a': value = ASCII_TO_NATIVE('\007');break;
8341 RExC_parse--; /* function expects to be pointed at the 'o' */
8343 const char* error_msg;
8344 bool valid = grok_bslash_o(RExC_parse,
8349 RExC_parse += numlen;
8354 if (PL_encoding && value < 0x100) {
8355 goto recode_encoding;
8359 if (*RExC_parse == '{') {
8360 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8361 | PERL_SCAN_DISALLOW_PREFIX;
8362 char * const e = strchr(RExC_parse++, '}');
8364 vFAIL("Missing right brace on \\x{}");
8366 numlen = e - RExC_parse;
8367 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8371 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8373 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8374 RExC_parse += numlen;
8376 if (PL_encoding && value < 0x100)
8377 goto recode_encoding;
8380 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8382 case '0': case '1': case '2': case '3': case '4':
8383 case '5': case '6': case '7':
8385 /* Take 1-3 octal digits */
8386 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8388 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8389 RExC_parse += numlen;
8390 if (PL_encoding && value < 0x100)
8391 goto recode_encoding;
8396 SV* enc = PL_encoding;
8397 value = reg_recode((const char)(U8)value, &enc);
8398 if (!enc && SIZE_ONLY)
8399 ckWARNreg(RExC_parse,
8400 "Invalid escape in the specified encoding");
8404 /* Allow \_ to not give an error */
8405 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8406 ckWARN2reg(RExC_parse,
8407 "Unrecognized escape \\%c in character class passed through",
8412 } /* end of \blah */
8418 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8420 if (!SIZE_ONLY && !need_class)
8421 ANYOF_CLASS_ZERO(ret);
8425 /* a bad range like a-\d, a-[:digit:] ? */
8429 RExC_parse >= rangebegin ?
8430 RExC_parse - rangebegin : 0;
8431 ckWARN4reg(RExC_parse,
8432 "False [] range \"%*.*s\"",
8435 if (prevvalue < 256) {
8436 ANYOF_BITMAP_SET(ret, prevvalue);
8437 ANYOF_BITMAP_SET(ret, '-');
8440 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8441 Perl_sv_catpvf(aTHX_ listsv,
8442 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8446 range = 0; /* this was not a true range */
8452 const char *what = NULL;
8455 if (namedclass > OOB_NAMEDCLASS)
8456 optimize_invert = FALSE;
8457 /* Possible truncation here but in some 64-bit environments
8458 * the compiler gets heartburn about switch on 64-bit values.
8459 * A similar issue a little earlier when switching on value.
8461 switch ((I32)namedclass) {
8463 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8464 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8465 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8466 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8467 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8468 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8469 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8470 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8471 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8472 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8473 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8474 /* \s, \w match all unicode if utf8. */
8475 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8476 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8478 /* \s, \w match ascii and locale only */
8479 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8480 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8482 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8483 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8484 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8487 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8490 for (value = 0; value < 128; value++)
8491 ANYOF_BITMAP_SET(ret, value);
8493 for (value = 0; value < 256; value++) {
8495 ANYOF_BITMAP_SET(ret, value);
8504 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8507 for (value = 128; value < 256; value++)
8508 ANYOF_BITMAP_SET(ret, value);
8510 for (value = 0; value < 256; value++) {
8511 if (!isASCII(value))
8512 ANYOF_BITMAP_SET(ret, value);
8521 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8523 /* consecutive digits assumed */
8524 for (value = '0'; value <= '9'; value++)
8525 ANYOF_BITMAP_SET(ret, value);
8528 what = POSIX_CC_UNI_NAME("Digit");
8532 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8534 /* consecutive digits assumed */
8535 for (value = 0; value < '0'; value++)
8536 ANYOF_BITMAP_SET(ret, value);
8537 for (value = '9' + 1; value < 256; value++)
8538 ANYOF_BITMAP_SET(ret, value);
8541 what = POSIX_CC_UNI_NAME("Digit");
8544 /* this is to handle \p and \P */
8547 vFAIL("Invalid [::] class");
8551 /* Strings such as "+utf8::isWord\n" */
8552 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8555 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8558 } /* end of namedclass \blah */
8561 if (prevvalue > (IV)value) /* b-a */ {
8562 const int w = RExC_parse - rangebegin;
8563 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8564 range = 0; /* not a valid range */
8568 prevvalue = value; /* save the beginning of the range */
8569 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8570 RExC_parse[1] != ']') {
8573 /* a bad range like \w-, [:word:]- ? */
8574 if (namedclass > OOB_NAMEDCLASS) {
8575 if (ckWARN(WARN_REGEXP)) {
8577 RExC_parse >= rangebegin ?
8578 RExC_parse - rangebegin : 0;
8580 "False [] range \"%*.*s\"",
8584 ANYOF_BITMAP_SET(ret, '-');
8586 range = 1; /* yeah, it's a range! */
8587 continue; /* but do it the next time */
8591 /* now is the next time */
8592 /*stored += (value - prevvalue + 1);*/
8594 if (prevvalue < 256) {
8595 const IV ceilvalue = value < 256 ? value : 255;
8598 /* In EBCDIC [\x89-\x91] should include
8599 * the \x8e but [i-j] should not. */
8600 if (literal_endpoint == 2 &&
8601 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8602 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8604 if (isLOWER(prevvalue)) {
8605 for (i = prevvalue; i <= ceilvalue; i++)
8606 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8608 ANYOF_BITMAP_SET(ret, i);
8611 for (i = prevvalue; i <= ceilvalue; i++)
8612 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8614 ANYOF_BITMAP_SET(ret, i);
8620 for (i = prevvalue; i <= ceilvalue; i++) {
8621 if (!ANYOF_BITMAP_TEST(ret,i)) {
8623 ANYOF_BITMAP_SET(ret, i);
8627 if (value > 255 || UTF) {
8628 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8629 const UV natvalue = NATIVE_TO_UNI(value);
8630 stored+=2; /* can't optimize this class */
8631 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8632 if (prevnatvalue < natvalue) { /* what about > ? */
8633 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8634 prevnatvalue, natvalue);
8636 else if (prevnatvalue == natvalue) {
8637 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8639 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8641 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8643 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8644 if (RExC_precomp[0] == ':' &&
8645 RExC_precomp[1] == '[' &&
8646 (f == 0xDF || f == 0x92)) {
8647 f = NATIVE_TO_UNI(f);
8650 /* If folding and foldable and a single
8651 * character, insert also the folded version
8652 * to the charclass. */
8654 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8655 if ((RExC_precomp[0] == ':' &&
8656 RExC_precomp[1] == '[' &&
8658 (value == 0xFB05 || value == 0xFB06))) ?
8659 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8660 foldlen == (STRLEN)UNISKIP(f) )
8662 if (foldlen == (STRLEN)UNISKIP(f))
8664 Perl_sv_catpvf(aTHX_ listsv,
8667 /* Any multicharacter foldings
8668 * require the following transform:
8669 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8670 * where E folds into "pq" and F folds
8671 * into "rst", all other characters
8672 * fold to single characters. We save
8673 * away these multicharacter foldings,
8674 * to be later saved as part of the
8675 * additional "s" data. */
8678 if (!unicode_alternate)
8679 unicode_alternate = newAV();
8680 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8682 av_push(unicode_alternate, sv);
8686 /* If folding and the value is one of the Greek
8687 * sigmas insert a few more sigmas to make the
8688 * folding rules of the sigmas to work right.
8689 * Note that not all the possible combinations
8690 * are handled here: some of them are handled
8691 * by the standard folding rules, and some of
8692 * them (literal or EXACTF cases) are handled
8693 * during runtime in regexec.c:S_find_byclass(). */
8694 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8695 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8696 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8697 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8698 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8700 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8701 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8702 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8707 literal_endpoint = 0;
8711 range = 0; /* this range (if it was one) is done now */
8715 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8717 RExC_size += ANYOF_CLASS_ADD_SKIP;
8719 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8725 /****** !SIZE_ONLY AFTER HERE *********/
8727 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8728 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8730 /* optimize single char class to an EXACT node
8731 but *only* when its not a UTF/high char */
8732 const char * cur_parse= RExC_parse;
8733 RExC_emit = (regnode *)orig_emit;
8734 RExC_parse = (char *)orig_parse;
8735 ret = reg_node(pRExC_state,
8736 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8737 RExC_parse = (char *)cur_parse;
8738 *STRING(ret)= (char)value;
8740 RExC_emit += STR_SZ(1);
8741 SvREFCNT_dec(listsv);
8744 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8745 if ( /* If the only flag is folding (plus possibly inversion). */
8746 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8748 for (value = 0; value < 256; ++value) {
8749 if (ANYOF_BITMAP_TEST(ret, value)) {
8750 UV fold = PL_fold[value];
8753 ANYOF_BITMAP_SET(ret, fold);
8756 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8759 /* optimize inverted simple patterns (e.g. [^a-z]) */
8760 if (optimize_invert &&
8761 /* If the only flag is inversion. */
8762 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8763 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8764 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8765 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8768 AV * const av = newAV();
8770 /* The 0th element stores the character class description
8771 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8772 * to initialize the appropriate swash (which gets stored in
8773 * the 1st element), and also useful for dumping the regnode.
8774 * The 2nd element stores the multicharacter foldings,
8775 * used later (regexec.c:S_reginclass()). */
8776 av_store(av, 0, listsv);
8777 av_store(av, 1, NULL);
8778 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8779 rv = newRV_noinc(MUTABLE_SV(av));
8780 n = add_data(pRExC_state, 1, "s");
8781 RExC_rxi->data->data[n] = (void*)rv;
8789 /* reg_skipcomment()
8791 Absorbs an /x style # comments from the input stream.
8792 Returns true if there is more text remaining in the stream.
8793 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8794 terminates the pattern without including a newline.
8796 Note its the callers responsibility to ensure that we are
8802 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8806 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8808 while (RExC_parse < RExC_end)
8809 if (*RExC_parse++ == '\n') {
8814 /* we ran off the end of the pattern without ending
8815 the comment, so we have to add an \n when wrapping */
8816 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8824 Advance that parse position, and optionall absorbs
8825 "whitespace" from the inputstream.
8827 Without /x "whitespace" means (?#...) style comments only,
8828 with /x this means (?#...) and # comments and whitespace proper.
8830 Returns the RExC_parse point from BEFORE the scan occurs.
8832 This is the /x friendly way of saying RExC_parse++.
8836 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8838 char* const retval = RExC_parse++;
8840 PERL_ARGS_ASSERT_NEXTCHAR;
8843 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8844 RExC_parse[2] == '#') {
8845 while (*RExC_parse != ')') {
8846 if (RExC_parse == RExC_end)
8847 FAIL("Sequence (?#... not terminated");
8853 if (RExC_flags & RXf_PMf_EXTENDED) {
8854 if (isSPACE(*RExC_parse)) {
8858 else if (*RExC_parse == '#') {
8859 if ( reg_skipcomment( pRExC_state ) )
8868 - reg_node - emit a node
8870 STATIC regnode * /* Location. */
8871 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8874 register regnode *ptr;
8875 regnode * const ret = RExC_emit;
8876 GET_RE_DEBUG_FLAGS_DECL;
8878 PERL_ARGS_ASSERT_REG_NODE;
8881 SIZE_ALIGN(RExC_size);
8885 if (RExC_emit >= RExC_emit_bound)
8886 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8888 NODE_ALIGN_FILL(ret);
8890 FILL_ADVANCE_NODE(ptr, op);
8891 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8892 #ifdef RE_TRACK_PATTERN_OFFSETS
8893 if (RExC_offsets) { /* MJD */
8894 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8895 "reg_node", __LINE__,
8897 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8898 ? "Overwriting end of array!\n" : "OK",
8899 (UV)(RExC_emit - RExC_emit_start),
8900 (UV)(RExC_parse - RExC_start),
8901 (UV)RExC_offsets[0]));
8902 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8910 - reganode - emit a node with an argument
8912 STATIC regnode * /* Location. */
8913 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8916 register regnode *ptr;
8917 regnode * const ret = RExC_emit;
8918 GET_RE_DEBUG_FLAGS_DECL;
8920 PERL_ARGS_ASSERT_REGANODE;
8923 SIZE_ALIGN(RExC_size);
8928 assert(2==regarglen[op]+1);
8930 Anything larger than this has to allocate the extra amount.
8931 If we changed this to be:
8933 RExC_size += (1 + regarglen[op]);
8935 then it wouldn't matter. Its not clear what side effect
8936 might come from that so its not done so far.
8941 if (RExC_emit >= RExC_emit_bound)
8942 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8944 NODE_ALIGN_FILL(ret);
8946 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8947 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8948 #ifdef RE_TRACK_PATTERN_OFFSETS
8949 if (RExC_offsets) { /* MJD */
8950 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8954 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8955 "Overwriting end of array!\n" : "OK",
8956 (UV)(RExC_emit - RExC_emit_start),
8957 (UV)(RExC_parse - RExC_start),
8958 (UV)RExC_offsets[0]));
8959 Set_Cur_Node_Offset;
8967 - reguni - emit (if appropriate) a Unicode character
8970 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8974 PERL_ARGS_ASSERT_REGUNI;
8976 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8980 - reginsert - insert an operator in front of already-emitted operand
8982 * Means relocating the operand.
8985 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8988 register regnode *src;
8989 register regnode *dst;
8990 register regnode *place;
8991 const int offset = regarglen[(U8)op];
8992 const int size = NODE_STEP_REGNODE + offset;
8993 GET_RE_DEBUG_FLAGS_DECL;
8995 PERL_ARGS_ASSERT_REGINSERT;
8996 PERL_UNUSED_ARG(depth);
8997 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8998 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9007 if (RExC_open_parens) {
9009 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9010 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9011 if ( RExC_open_parens[paren] >= opnd ) {
9012 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9013 RExC_open_parens[paren] += size;
9015 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9017 if ( RExC_close_parens[paren] >= opnd ) {
9018 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9019 RExC_close_parens[paren] += size;
9021 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9026 while (src > opnd) {
9027 StructCopy(--src, --dst, regnode);
9028 #ifdef RE_TRACK_PATTERN_OFFSETS
9029 if (RExC_offsets) { /* MJD 20010112 */
9030 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9034 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9035 ? "Overwriting end of array!\n" : "OK",
9036 (UV)(src - RExC_emit_start),
9037 (UV)(dst - RExC_emit_start),
9038 (UV)RExC_offsets[0]));
9039 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9040 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9046 place = opnd; /* Op node, where operand used to be. */
9047 #ifdef RE_TRACK_PATTERN_OFFSETS
9048 if (RExC_offsets) { /* MJD */
9049 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9053 (UV)(place - RExC_emit_start) > RExC_offsets[0]
9054 ? "Overwriting end of array!\n" : "OK",
9055 (UV)(place - RExC_emit_start),
9056 (UV)(RExC_parse - RExC_start),
9057 (UV)RExC_offsets[0]));
9058 Set_Node_Offset(place, RExC_parse);
9059 Set_Node_Length(place, 1);
9062 src = NEXTOPER(place);
9063 FILL_ADVANCE_NODE(place, op);
9064 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
9065 Zero(src, offset, regnode);
9069 - regtail - set the next-pointer at the end of a node chain of p to val.
9070 - SEE ALSO: regtail_study
9072 /* TODO: All three parms should be const */
9074 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9077 register regnode *scan;
9078 GET_RE_DEBUG_FLAGS_DECL;
9080 PERL_ARGS_ASSERT_REGTAIL;
9082 PERL_UNUSED_ARG(depth);
9088 /* Find last node. */
9091 regnode * const temp = regnext(scan);
9093 SV * const mysv=sv_newmortal();
9094 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9095 regprop(RExC_rx, mysv, scan);
9096 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9097 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9098 (temp == NULL ? "->" : ""),
9099 (temp == NULL ? PL_reg_name[OP(val)] : "")
9107 if (reg_off_by_arg[OP(scan)]) {
9108 ARG_SET(scan, val - scan);
9111 NEXT_OFF(scan) = val - scan;
9117 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9118 - Look for optimizable sequences at the same time.
9119 - currently only looks for EXACT chains.
9121 This is expermental code. The idea is to use this routine to perform
9122 in place optimizations on branches and groups as they are constructed,
9123 with the long term intention of removing optimization from study_chunk so
9124 that it is purely analytical.
9126 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9127 to control which is which.
9130 /* TODO: All four parms should be const */
9133 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9136 register regnode *scan;
9138 #ifdef EXPERIMENTAL_INPLACESCAN
9141 GET_RE_DEBUG_FLAGS_DECL;
9143 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9149 /* Find last node. */
9153 regnode * const temp = regnext(scan);
9154 #ifdef EXPERIMENTAL_INPLACESCAN
9155 if (PL_regkind[OP(scan)] == EXACT)
9156 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9164 if( exact == PSEUDO )
9166 else if ( exact != OP(scan) )
9175 SV * const mysv=sv_newmortal();
9176 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9177 regprop(RExC_rx, mysv, scan);
9178 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9179 SvPV_nolen_const(mysv),
9181 PL_reg_name[exact]);
9188 SV * const mysv_val=sv_newmortal();
9189 DEBUG_PARSE_MSG("");
9190 regprop(RExC_rx, mysv_val, val);
9191 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9192 SvPV_nolen_const(mysv_val),
9193 (IV)REG_NODE_NUM(val),
9197 if (reg_off_by_arg[OP(scan)]) {
9198 ARG_SET(scan, val - scan);
9201 NEXT_OFF(scan) = val - scan;
9209 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9213 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9218 for (bit=0; bit<32; bit++) {
9219 if (flags & (1<<bit)) {
9221 PerlIO_printf(Perl_debug_log, "%s",lead);
9222 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9227 PerlIO_printf(Perl_debug_log, "\n");
9229 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9235 Perl_regdump(pTHX_ const regexp *r)
9239 SV * const sv = sv_newmortal();
9240 SV *dsv= sv_newmortal();
9242 GET_RE_DEBUG_FLAGS_DECL;
9244 PERL_ARGS_ASSERT_REGDUMP;
9246 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9248 /* Header fields of interest. */
9249 if (r->anchored_substr) {
9250 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9251 RE_SV_DUMPLEN(r->anchored_substr), 30);
9252 PerlIO_printf(Perl_debug_log,
9253 "anchored %s%s at %"IVdf" ",
9254 s, RE_SV_TAIL(r->anchored_substr),
9255 (IV)r->anchored_offset);
9256 } else if (r->anchored_utf8) {
9257 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9258 RE_SV_DUMPLEN(r->anchored_utf8), 30);
9259 PerlIO_printf(Perl_debug_log,
9260 "anchored utf8 %s%s at %"IVdf" ",
9261 s, RE_SV_TAIL(r->anchored_utf8),
9262 (IV)r->anchored_offset);
9264 if (r->float_substr) {
9265 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9266 RE_SV_DUMPLEN(r->float_substr), 30);
9267 PerlIO_printf(Perl_debug_log,
9268 "floating %s%s at %"IVdf"..%"UVuf" ",
9269 s, RE_SV_TAIL(r->float_substr),
9270 (IV)r->float_min_offset, (UV)r->float_max_offset);
9271 } else if (r->float_utf8) {
9272 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9273 RE_SV_DUMPLEN(r->float_utf8), 30);
9274 PerlIO_printf(Perl_debug_log,
9275 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9276 s, RE_SV_TAIL(r->float_utf8),
9277 (IV)r->float_min_offset, (UV)r->float_max_offset);
9279 if (r->check_substr || r->check_utf8)
9280 PerlIO_printf(Perl_debug_log,
9282 (r->check_substr == r->float_substr
9283 && r->check_utf8 == r->float_utf8
9284 ? "(checking floating" : "(checking anchored"));
9285 if (r->extflags & RXf_NOSCAN)
9286 PerlIO_printf(Perl_debug_log, " noscan");
9287 if (r->extflags & RXf_CHECK_ALL)
9288 PerlIO_printf(Perl_debug_log, " isall");
9289 if (r->check_substr || r->check_utf8)
9290 PerlIO_printf(Perl_debug_log, ") ");
9292 if (ri->regstclass) {
9293 regprop(r, sv, ri->regstclass);
9294 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9296 if (r->extflags & RXf_ANCH) {
9297 PerlIO_printf(Perl_debug_log, "anchored");
9298 if (r->extflags & RXf_ANCH_BOL)
9299 PerlIO_printf(Perl_debug_log, "(BOL)");
9300 if (r->extflags & RXf_ANCH_MBOL)
9301 PerlIO_printf(Perl_debug_log, "(MBOL)");
9302 if (r->extflags & RXf_ANCH_SBOL)
9303 PerlIO_printf(Perl_debug_log, "(SBOL)");
9304 if (r->extflags & RXf_ANCH_GPOS)
9305 PerlIO_printf(Perl_debug_log, "(GPOS)");
9306 PerlIO_putc(Perl_debug_log, ' ');
9308 if (r->extflags & RXf_GPOS_SEEN)
9309 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9310 if (r->intflags & PREGf_SKIP)
9311 PerlIO_printf(Perl_debug_log, "plus ");
9312 if (r->intflags & PREGf_IMPLICIT)
9313 PerlIO_printf(Perl_debug_log, "implicit ");
9314 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9315 if (r->extflags & RXf_EVAL_SEEN)
9316 PerlIO_printf(Perl_debug_log, "with eval ");
9317 PerlIO_printf(Perl_debug_log, "\n");
9318 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9320 PERL_ARGS_ASSERT_REGDUMP;
9321 PERL_UNUSED_CONTEXT;
9323 #endif /* DEBUGGING */
9327 - regprop - printable representation of opcode
9329 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9332 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9333 if (flags & ANYOF_INVERT) \
9334 /*make sure the invert info is in each */ \
9335 sv_catpvs(sv, "^"); \
9341 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9346 RXi_GET_DECL(prog,progi);
9347 GET_RE_DEBUG_FLAGS_DECL;
9349 PERL_ARGS_ASSERT_REGPROP;
9353 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9354 /* It would be nice to FAIL() here, but this may be called from
9355 regexec.c, and it would be hard to supply pRExC_state. */
9356 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9357 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9359 k = PL_regkind[OP(o)];
9363 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9364 * is a crude hack but it may be the best for now since
9365 * we have no flag "this EXACTish node was UTF-8"
9367 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9368 PERL_PV_ESCAPE_UNI_DETECT |
9369 PERL_PV_PRETTY_ELLIPSES |
9370 PERL_PV_PRETTY_LTGT |
9371 PERL_PV_PRETTY_NOCLEAR
9373 } else if (k == TRIE) {
9374 /* print the details of the trie in dumpuntil instead, as
9375 * progi->data isn't available here */
9376 const char op = OP(o);
9377 const U32 n = ARG(o);
9378 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9379 (reg_ac_data *)progi->data->data[n] :
9381 const reg_trie_data * const trie
9382 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9384 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9385 DEBUG_TRIE_COMPILE_r(
9386 Perl_sv_catpvf(aTHX_ sv,
9387 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9388 (UV)trie->startstate,
9389 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9390 (UV)trie->wordcount,
9393 (UV)TRIE_CHARCOUNT(trie),
9394 (UV)trie->uniquecharcount
9397 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9399 int rangestart = -1;
9400 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9402 for (i = 0; i <= 256; i++) {
9403 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9404 if (rangestart == -1)
9406 } else if (rangestart != -1) {
9407 if (i <= rangestart + 3)
9408 for (; rangestart < i; rangestart++)
9409 put_byte(sv, rangestart);
9411 put_byte(sv, rangestart);
9413 put_byte(sv, i - 1);
9421 } else if (k == CURLY) {
9422 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9423 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9424 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9426 else if (k == WHILEM && o->flags) /* Ordinal/of */
9427 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9428 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9429 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9430 if ( RXp_PAREN_NAMES(prog) ) {
9431 if ( k != REF || OP(o) < NREF) {
9432 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9433 SV **name= av_fetch(list, ARG(o), 0 );
9435 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9438 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9439 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9440 I32 *nums=(I32*)SvPVX(sv_dat);
9441 SV **name= av_fetch(list, nums[0], 0 );
9444 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9445 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9446 (n ? "," : ""), (IV)nums[n]);
9448 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9452 } else if (k == GOSUB)
9453 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9454 else if (k == VERB) {
9456 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9457 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9458 } else if (k == LOGICAL)
9459 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9460 else if (k == FOLDCHAR)
9461 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9462 else if (k == ANYOF) {
9463 int i, rangestart = -1;
9464 const U8 flags = ANYOF_FLAGS(o);
9467 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9468 static const char * const anyofs[] = {
9501 if (flags & ANYOF_LOCALE)
9502 sv_catpvs(sv, "{loc}");
9503 if (flags & ANYOF_FOLD)
9504 sv_catpvs(sv, "{i}");
9505 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9506 if (flags & ANYOF_INVERT)
9509 /* output what the standard cp 0-255 bitmap matches */
9510 for (i = 0; i <= 256; i++) {
9511 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9512 if (rangestart == -1)
9514 } else if (rangestart != -1) {
9515 if (i <= rangestart + 3)
9516 for (; rangestart < i; rangestart++)
9517 put_byte(sv, rangestart);
9519 put_byte(sv, rangestart);
9521 put_byte(sv, i - 1);
9528 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9529 /* output any special charclass tests (used mostly under use locale) */
9530 if (o->flags & ANYOF_CLASS)
9531 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9532 if (ANYOF_CLASS_TEST(o,i)) {
9533 sv_catpv(sv, anyofs[i]);
9537 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9539 /* output information about the unicode matching */
9540 if (flags & ANYOF_UNICODE)
9541 sv_catpvs(sv, "{unicode}");
9542 else if (flags & ANYOF_UNICODE_ALL)
9543 sv_catpvs(sv, "{unicode_all}");
9547 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9551 U8 s[UTF8_MAXBYTES_CASE+1];
9553 for (i = 0; i <= 256; i++) { /* just the first 256 */
9554 uvchr_to_utf8(s, i);
9556 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9557 if (rangestart == -1)
9559 } else if (rangestart != -1) {
9560 if (i <= rangestart + 3)
9561 for (; rangestart < i; rangestart++) {
9562 const U8 * const e = uvchr_to_utf8(s,rangestart);
9564 for(p = s; p < e; p++)
9568 const U8 *e = uvchr_to_utf8(s,rangestart);
9570 for (p = s; p < e; p++)
9573 e = uvchr_to_utf8(s, i-1);
9574 for (p = s; p < e; p++)
9581 sv_catpvs(sv, "..."); /* et cetera */
9585 char *s = savesvpv(lv);
9586 char * const origs = s;
9588 while (*s && *s != '\n')
9592 const char * const t = ++s;
9610 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9612 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9613 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9615 PERL_UNUSED_CONTEXT;
9616 PERL_UNUSED_ARG(sv);
9618 PERL_UNUSED_ARG(prog);
9619 #endif /* DEBUGGING */
9623 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9624 { /* Assume that RE_INTUIT is set */
9626 struct regexp *const prog = (struct regexp *)SvANY(r);
9627 GET_RE_DEBUG_FLAGS_DECL;
9629 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9630 PERL_UNUSED_CONTEXT;
9634 const char * const s = SvPV_nolen_const(prog->check_substr
9635 ? prog->check_substr : prog->check_utf8);
9637 if (!PL_colorset) reginitcolors();
9638 PerlIO_printf(Perl_debug_log,
9639 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9641 prog->check_substr ? "" : "utf8 ",
9642 PL_colors[5],PL_colors[0],
9645 (strlen(s) > 60 ? "..." : ""));
9648 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9654 handles refcounting and freeing the perl core regexp structure. When
9655 it is necessary to actually free the structure the first thing it
9656 does is call the 'free' method of the regexp_engine associated to to
9657 the regexp, allowing the handling of the void *pprivate; member
9658 first. (This routine is not overridable by extensions, which is why
9659 the extensions free is called first.)
9661 See regdupe and regdupe_internal if you change anything here.
9663 #ifndef PERL_IN_XSUB_RE
9665 Perl_pregfree(pTHX_ REGEXP *r)
9671 Perl_pregfree2(pTHX_ REGEXP *rx)
9674 struct regexp *const r = (struct regexp *)SvANY(rx);
9675 GET_RE_DEBUG_FLAGS_DECL;
9677 PERL_ARGS_ASSERT_PREGFREE2;
9680 ReREFCNT_dec(r->mother_re);
9682 CALLREGFREE_PVT(rx); /* free the private data */
9683 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9686 SvREFCNT_dec(r->anchored_substr);
9687 SvREFCNT_dec(r->anchored_utf8);
9688 SvREFCNT_dec(r->float_substr);
9689 SvREFCNT_dec(r->float_utf8);
9690 Safefree(r->substrs);
9692 RX_MATCH_COPY_FREE(rx);
9693 #ifdef PERL_OLD_COPY_ON_WRITE
9694 SvREFCNT_dec(r->saved_copy);
9701 This is a hacky workaround to the structural issue of match results
9702 being stored in the regexp structure which is in turn stored in
9703 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9704 could be PL_curpm in multiple contexts, and could require multiple
9705 result sets being associated with the pattern simultaneously, such
9706 as when doing a recursive match with (??{$qr})
9708 The solution is to make a lightweight copy of the regexp structure
9709 when a qr// is returned from the code executed by (??{$qr}) this
9710 lightweight copy doesnt actually own any of its data except for
9711 the starp/end and the actual regexp structure itself.
9717 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9720 struct regexp *const r = (struct regexp *)SvANY(rx);
9721 register const I32 npar = r->nparens+1;
9723 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9726 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9727 ret = (struct regexp *)SvANY(ret_x);
9729 (void)ReREFCNT_inc(rx);
9730 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9731 by pointing directly at the buffer, but flagging that the allocated
9732 space in the copy is zero. As we've just done a struct copy, it's now
9733 a case of zero-ing that, rather than copying the current length. */
9734 SvPV_set(ret_x, RX_WRAPPED(rx));
9735 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9736 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9737 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9738 SvLEN_set(ret_x, 0);
9739 SvSTASH_set(ret_x, NULL);
9740 SvMAGIC_set(ret_x, NULL);
9741 Newx(ret->offs, npar, regexp_paren_pair);
9742 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9744 Newx(ret->substrs, 1, struct reg_substr_data);
9745 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9747 SvREFCNT_inc_void(ret->anchored_substr);
9748 SvREFCNT_inc_void(ret->anchored_utf8);
9749 SvREFCNT_inc_void(ret->float_substr);
9750 SvREFCNT_inc_void(ret->float_utf8);
9752 /* check_substr and check_utf8, if non-NULL, point to either their
9753 anchored or float namesakes, and don't hold a second reference. */
9755 RX_MATCH_COPIED_off(ret_x);
9756 #ifdef PERL_OLD_COPY_ON_WRITE
9757 ret->saved_copy = NULL;
9759 ret->mother_re = rx;
9765 /* regfree_internal()
9767 Free the private data in a regexp. This is overloadable by
9768 extensions. Perl takes care of the regexp structure in pregfree(),
9769 this covers the *pprivate pointer which technically perldoesnt
9770 know about, however of course we have to handle the
9771 regexp_internal structure when no extension is in use.
9773 Note this is called before freeing anything in the regexp
9778 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9781 struct regexp *const r = (struct regexp *)SvANY(rx);
9783 GET_RE_DEBUG_FLAGS_DECL;
9785 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9791 SV *dsv= sv_newmortal();
9792 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9793 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9794 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9795 PL_colors[4],PL_colors[5],s);
9798 #ifdef RE_TRACK_PATTERN_OFFSETS
9800 Safefree(ri->u.offsets); /* 20010421 MJD */
9803 int n = ri->data->count;
9804 PAD* new_comppad = NULL;
9809 /* If you add a ->what type here, update the comment in regcomp.h */
9810 switch (ri->data->what[n]) {
9815 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9818 Safefree(ri->data->data[n]);
9821 new_comppad = MUTABLE_AV(ri->data->data[n]);
9824 if (new_comppad == NULL)
9825 Perl_croak(aTHX_ "panic: pregfree comppad");
9826 PAD_SAVE_LOCAL(old_comppad,
9827 /* Watch out for global destruction's random ordering. */
9828 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9831 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9834 op_free((OP_4tree*)ri->data->data[n]);
9836 PAD_RESTORE_LOCAL(old_comppad);
9837 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9843 { /* Aho Corasick add-on structure for a trie node.
9844 Used in stclass optimization only */
9846 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9848 refcount = --aho->refcount;
9851 PerlMemShared_free(aho->states);
9852 PerlMemShared_free(aho->fail);
9853 /* do this last!!!! */
9854 PerlMemShared_free(ri->data->data[n]);
9855 PerlMemShared_free(ri->regstclass);
9861 /* trie structure. */
9863 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9865 refcount = --trie->refcount;
9868 PerlMemShared_free(trie->charmap);
9869 PerlMemShared_free(trie->states);
9870 PerlMemShared_free(trie->trans);
9872 PerlMemShared_free(trie->bitmap);
9874 PerlMemShared_free(trie->jump);
9875 PerlMemShared_free(trie->wordinfo);
9876 /* do this last!!!! */
9877 PerlMemShared_free(ri->data->data[n]);
9882 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9885 Safefree(ri->data->what);
9892 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9893 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9894 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9897 re_dup - duplicate a regexp.
9899 This routine is expected to clone a given regexp structure. It is only
9900 compiled under USE_ITHREADS.
9902 After all of the core data stored in struct regexp is duplicated
9903 the regexp_engine.dupe method is used to copy any private data
9904 stored in the *pprivate pointer. This allows extensions to handle
9905 any duplication it needs to do.
9907 See pregfree() and regfree_internal() if you change anything here.
9909 #if defined(USE_ITHREADS)
9910 #ifndef PERL_IN_XSUB_RE
9912 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9916 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9917 struct regexp *ret = (struct regexp *)SvANY(dstr);
9919 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9921 npar = r->nparens+1;
9922 Newx(ret->offs, npar, regexp_paren_pair);
9923 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9925 /* no need to copy these */
9926 Newx(ret->swap, npar, regexp_paren_pair);
9930 /* Do it this way to avoid reading from *r after the StructCopy().
9931 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9932 cache, it doesn't matter. */
9933 const bool anchored = r->check_substr
9934 ? r->check_substr == r->anchored_substr
9935 : r->check_utf8 == r->anchored_utf8;
9936 Newx(ret->substrs, 1, struct reg_substr_data);
9937 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9939 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9940 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9941 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9942 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9944 /* check_substr and check_utf8, if non-NULL, point to either their
9945 anchored or float namesakes, and don't hold a second reference. */
9947 if (ret->check_substr) {
9949 assert(r->check_utf8 == r->anchored_utf8);
9950 ret->check_substr = ret->anchored_substr;
9951 ret->check_utf8 = ret->anchored_utf8;
9953 assert(r->check_substr == r->float_substr);
9954 assert(r->check_utf8 == r->float_utf8);
9955 ret->check_substr = ret->float_substr;
9956 ret->check_utf8 = ret->float_utf8;
9958 } else if (ret->check_utf8) {
9960 ret->check_utf8 = ret->anchored_utf8;
9962 ret->check_utf8 = ret->float_utf8;
9967 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9970 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9972 if (RX_MATCH_COPIED(dstr))
9973 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9976 #ifdef PERL_OLD_COPY_ON_WRITE
9977 ret->saved_copy = NULL;
9980 if (ret->mother_re) {
9981 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9982 /* Our storage points directly to our mother regexp, but that's
9983 1: a buffer in a different thread
9984 2: something we no longer hold a reference on
9985 so we need to copy it locally. */
9986 /* Note we need to sue SvCUR() on our mother_re, because it, in
9987 turn, may well be pointing to its own mother_re. */
9988 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9989 SvCUR(ret->mother_re)+1));
9990 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9992 ret->mother_re = NULL;
9996 #endif /* PERL_IN_XSUB_RE */
10001 This is the internal complement to regdupe() which is used to copy
10002 the structure pointed to by the *pprivate pointer in the regexp.
10003 This is the core version of the extension overridable cloning hook.
10004 The regexp structure being duplicated will be copied by perl prior
10005 to this and will be provided as the regexp *r argument, however
10006 with the /old/ structures pprivate pointer value. Thus this routine
10007 may override any copying normally done by perl.
10009 It returns a pointer to the new regexp_internal structure.
10013 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10016 struct regexp *const r = (struct regexp *)SvANY(rx);
10017 regexp_internal *reti;
10019 RXi_GET_DECL(r,ri);
10021 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10023 npar = r->nparens+1;
10026 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10027 Copy(ri->program, reti->program, len+1, regnode);
10030 reti->regstclass = NULL;
10033 struct reg_data *d;
10034 const int count = ri->data->count;
10037 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10038 char, struct reg_data);
10039 Newx(d->what, count, U8);
10042 for (i = 0; i < count; i++) {
10043 d->what[i] = ri->data->what[i];
10044 switch (d->what[i]) {
10045 /* legal options are one of: sSfpontTua
10046 see also regcomp.h and pregfree() */
10047 case 'a': /* actually an AV, but the dup function is identical. */
10050 case 'p': /* actually an AV, but the dup function is identical. */
10051 case 'u': /* actually an HV, but the dup function is identical. */
10052 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10055 /* This is cheating. */
10056 Newx(d->data[i], 1, struct regnode_charclass_class);
10057 StructCopy(ri->data->data[i], d->data[i],
10058 struct regnode_charclass_class);
10059 reti->regstclass = (regnode*)d->data[i];
10062 /* Compiled op trees are readonly and in shared memory,
10063 and can thus be shared without duplication. */
10065 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10069 /* Trie stclasses are readonly and can thus be shared
10070 * without duplication. We free the stclass in pregfree
10071 * when the corresponding reg_ac_data struct is freed.
10073 reti->regstclass= ri->regstclass;
10077 ((reg_trie_data*)ri->data->data[i])->refcount++;
10081 d->data[i] = ri->data->data[i];
10084 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10093 reti->name_list_idx = ri->name_list_idx;
10095 #ifdef RE_TRACK_PATTERN_OFFSETS
10096 if (ri->u.offsets) {
10097 Newx(reti->u.offsets, 2*len+1, U32);
10098 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10101 SetProgLen(reti,len);
10104 return (void*)reti;
10107 #endif /* USE_ITHREADS */
10109 #ifndef PERL_IN_XSUB_RE
10112 - regnext - dig the "next" pointer out of a node
10115 Perl_regnext(pTHX_ register regnode *p)
10118 register I32 offset;
10123 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10124 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10127 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10136 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10139 STRLEN l1 = strlen(pat1);
10140 STRLEN l2 = strlen(pat2);
10143 const char *message;
10145 PERL_ARGS_ASSERT_RE_CROAK2;
10151 Copy(pat1, buf, l1 , char);
10152 Copy(pat2, buf + l1, l2 , char);
10153 buf[l1 + l2] = '\n';
10154 buf[l1 + l2 + 1] = '\0';
10156 /* ANSI variant takes additional second argument */
10157 va_start(args, pat2);
10161 msv = vmess(buf, &args);
10163 message = SvPV_const(msv,l1);
10166 Copy(message, buf, l1 , char);
10167 buf[l1-1] = '\0'; /* Overwrite \n */
10168 Perl_croak(aTHX_ "%s", buf);
10171 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10173 #ifndef PERL_IN_XSUB_RE
10175 Perl_save_re_context(pTHX)
10179 struct re_save_state *state;
10181 SAVEVPTR(PL_curcop);
10182 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10184 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10185 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10186 SSPUSHUV(SAVEt_RE_STATE);
10188 Copy(&PL_reg_state, state, 1, struct re_save_state);
10190 PL_reg_start_tmp = 0;
10191 PL_reg_start_tmpl = 0;
10192 PL_reg_oldsaved = NULL;
10193 PL_reg_oldsavedlen = 0;
10194 PL_reg_maxiter = 0;
10195 PL_reg_leftiter = 0;
10196 PL_reg_poscache = NULL;
10197 PL_reg_poscache_size = 0;
10198 #ifdef PERL_OLD_COPY_ON_WRITE
10202 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10204 const REGEXP * const rx = PM_GETRE(PL_curpm);
10207 for (i = 1; i <= RX_NPARENS(rx); i++) {
10208 char digits[TYPE_CHARS(long)];
10209 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10210 GV *const *const gvp
10211 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10214 GV * const gv = *gvp;
10215 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10225 clear_re(pTHX_ void *r)
10228 ReREFCNT_dec((REGEXP *)r);
10234 S_put_byte(pTHX_ SV *sv, int c)
10236 PERL_ARGS_ASSERT_PUT_BYTE;
10238 /* Our definition of isPRINT() ignores locales, so only bytes that are
10239 not part of UTF-8 are considered printable. I assume that the same
10240 holds for UTF-EBCDIC.
10241 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10242 which Wikipedia says:
10244 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10245 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10246 identical, to the ASCII delete (DEL) or rubout control character.
10247 ) So the old condition can be simplified to !isPRINT(c) */
10249 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10251 const char string = c;
10252 if (c == '-' || c == ']' || c == '\\' || c == '^')
10253 sv_catpvs(sv, "\\");
10254 sv_catpvn(sv, &string, 1);
10259 #define CLEAR_OPTSTART \
10260 if (optstart) STMT_START { \
10261 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10265 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10267 STATIC const regnode *
10268 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10269 const regnode *last, const regnode *plast,
10270 SV* sv, I32 indent, U32 depth)
10273 register U8 op = PSEUDO; /* Arbitrary non-END op. */
10274 register const regnode *next;
10275 const regnode *optstart= NULL;
10277 RXi_GET_DECL(r,ri);
10278 GET_RE_DEBUG_FLAGS_DECL;
10280 PERL_ARGS_ASSERT_DUMPUNTIL;
10282 #ifdef DEBUG_DUMPUNTIL
10283 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10284 last ? last-start : 0,plast ? plast-start : 0);
10287 if (plast && plast < last)
10290 while (PL_regkind[op] != END && (!last || node < last)) {
10291 /* While that wasn't END last time... */
10294 if (op == CLOSE || op == WHILEM)
10296 next = regnext((regnode *)node);
10299 if (OP(node) == OPTIMIZED) {
10300 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10307 regprop(r, sv, node);
10308 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10309 (int)(2*indent + 1), "", SvPVX_const(sv));
10311 if (OP(node) != OPTIMIZED) {
10312 if (next == NULL) /* Next ptr. */
10313 PerlIO_printf(Perl_debug_log, " (0)");
10314 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10315 PerlIO_printf(Perl_debug_log, " (FAIL)");
10317 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10318 (void)PerlIO_putc(Perl_debug_log, '\n');
10322 if (PL_regkind[(U8)op] == BRANCHJ) {
10325 register const regnode *nnode = (OP(next) == LONGJMP
10326 ? regnext((regnode *)next)
10328 if (last && nnode > last)
10330 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10333 else if (PL_regkind[(U8)op] == BRANCH) {
10335 DUMPUNTIL(NEXTOPER(node), next);
10337 else if ( PL_regkind[(U8)op] == TRIE ) {
10338 const regnode *this_trie = node;
10339 const char op = OP(node);
10340 const U32 n = ARG(node);
10341 const reg_ac_data * const ac = op>=AHOCORASICK ?
10342 (reg_ac_data *)ri->data->data[n] :
10344 const reg_trie_data * const trie =
10345 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10347 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10349 const regnode *nextbranch= NULL;
10352 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10353 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10355 PerlIO_printf(Perl_debug_log, "%*s%s ",
10356 (int)(2*(indent+3)), "",
10357 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10358 PL_colors[0], PL_colors[1],
10359 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10360 PERL_PV_PRETTY_ELLIPSES |
10361 PERL_PV_PRETTY_LTGT
10366 U16 dist= trie->jump[word_idx+1];
10367 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10368 (UV)((dist ? this_trie + dist : next) - start));
10371 nextbranch= this_trie + trie->jump[0];
10372 DUMPUNTIL(this_trie + dist, nextbranch);
10374 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10375 nextbranch= regnext((regnode *)nextbranch);
10377 PerlIO_printf(Perl_debug_log, "\n");
10380 if (last && next > last)
10385 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10386 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10387 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10389 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10391 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10393 else if ( op == PLUS || op == STAR) {
10394 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10396 else if (op == ANYOF) {
10397 /* arglen 1 + class block */
10398 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10399 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10400 node = NEXTOPER(node);
10402 else if (PL_regkind[(U8)op] == EXACT) {
10403 /* Literal string, where present. */
10404 node += NODE_SZ_STR(node) - 1;
10405 node = NEXTOPER(node);
10408 node = NEXTOPER(node);
10409 node += regarglen[(U8)op];
10411 if (op == CURLYX || op == OPEN)
10415 #ifdef DEBUG_DUMPUNTIL
10416 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10421 #endif /* DEBUGGING */
10425 * c-indentation-style: bsd
10426 * c-basic-offset: 4
10427 * indent-tabs-mode: t
10430 * ex: set ts=8 sts=4 sw=4 noet: