5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *charnames; /* cache of named sequences */
136 HV *paren_names; /* Paren names */
138 regnode **recurse; /* Recurse regops */
139 I32 recurse_count; /* Number of recurse regops */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
145 const char *lastparse;
147 AV *paren_name_list; /* idx -> name */
148 #define RExC_lastparse (pRExC_state->lastparse)
149 #define RExC_lastnum (pRExC_state->lastnum)
150 #define RExC_paren_name_list (pRExC_state->paren_name_list)
154 #define RExC_flags (pRExC_state->flags)
155 #define RExC_precomp (pRExC_state->precomp)
156 #define RExC_rx_sv (pRExC_state->rx_sv)
157 #define RExC_rx (pRExC_state->rx)
158 #define RExC_rxi (pRExC_state->rxi)
159 #define RExC_start (pRExC_state->start)
160 #define RExC_end (pRExC_state->end)
161 #define RExC_parse (pRExC_state->parse)
162 #define RExC_whilem_seen (pRExC_state->whilem_seen)
163 #ifdef RE_TRACK_PATTERN_OFFSETS
164 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166 #define RExC_emit (pRExC_state->emit)
167 #define RExC_emit_start (pRExC_state->emit_start)
168 #define RExC_emit_bound (pRExC_state->emit_bound)
169 #define RExC_naughty (pRExC_state->naughty)
170 #define RExC_sawback (pRExC_state->sawback)
171 #define RExC_seen (pRExC_state->seen)
172 #define RExC_size (pRExC_state->size)
173 #define RExC_npar (pRExC_state->npar)
174 #define RExC_nestroot (pRExC_state->nestroot)
175 #define RExC_extralen (pRExC_state->extralen)
176 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
177 #define RExC_seen_evals (pRExC_state->seen_evals)
178 #define RExC_utf8 (pRExC_state->utf8)
179 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
180 #define RExC_charnames (pRExC_state->charnames)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
201 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
202 #define SPSTART 0x04 /* Starts with * or +. */
203 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
204 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
206 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
208 /* whether trie related optimizations are enabled */
209 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
210 #define TRIE_STUDY_OPT
211 #define FULL_TRIE_STUDY
217 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
218 #define PBITVAL(paren) (1 << ((paren) & 7))
219 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
220 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
221 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
224 /* About scan_data_t.
226 During optimisation we recurse through the regexp program performing
227 various inplace (keyhole style) optimisations. In addition study_chunk
228 and scan_commit populate this data structure with information about
229 what strings MUST appear in the pattern. We look for the longest
230 string that must appear for at a fixed location, and we look for the
231 longest string that may appear at a floating location. So for instance
236 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
237 strings (because they follow a .* construct). study_chunk will identify
238 both FOO and BAR as being the longest fixed and floating strings respectively.
240 The strings can be composites, for instance
244 will result in a composite fixed substring 'foo'.
246 For each string some basic information is maintained:
248 - offset or min_offset
249 This is the position the string must appear at, or not before.
250 It also implicitly (when combined with minlenp) tells us how many
251 character must match before the string we are searching.
252 Likewise when combined with minlenp and the length of the string
253 tells us how many characters must appear after the string we have
257 Only used for floating strings. This is the rightmost point that
258 the string can appear at. Ifset to I32 max it indicates that the
259 string can occur infinitely far to the right.
262 A pointer to the minimum length of the pattern that the string
263 was found inside. This is important as in the case of positive
264 lookahead or positive lookbehind we can have multiple patterns
269 The minimum length of the pattern overall is 3, the minimum length
270 of the lookahead part is 3, but the minimum length of the part that
271 will actually match is 1. So 'FOO's minimum length is 3, but the
272 minimum length for the F is 1. This is important as the minimum length
273 is used to determine offsets in front of and behind the string being
274 looked for. Since strings can be composites this is the length of the
275 pattern at the time it was commited with a scan_commit. Note that
276 the length is calculated by study_chunk, so that the minimum lengths
277 are not known until the full pattern has been compiled, thus the
278 pointer to the value.
282 In the case of lookbehind the string being searched for can be
283 offset past the start point of the final matching string.
284 If this value was just blithely removed from the min_offset it would
285 invalidate some of the calculations for how many chars must match
286 before or after (as they are derived from min_offset and minlen and
287 the length of the string being searched for).
288 When the final pattern is compiled and the data is moved from the
289 scan_data_t structure into the regexp structure the information
290 about lookbehind is factored in, with the information that would
291 have been lost precalculated in the end_shift field for the
294 The fields pos_min and pos_delta are used to store the minimum offset
295 and the delta to the maximum offset at the current point in the pattern.
299 typedef struct scan_data_t {
300 /*I32 len_min; unused */
301 /*I32 len_delta; unused */
305 I32 last_end; /* min value, <0 unless valid. */
308 SV **longest; /* Either &l_fixed, or &l_float. */
309 SV *longest_fixed; /* longest fixed string found in pattern */
310 I32 offset_fixed; /* offset where it starts */
311 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
313 SV *longest_float; /* longest floating string found in pattern */
314 I32 offset_float_min; /* earliest point in string it can appear */
315 I32 offset_float_max; /* latest point in string it can appear */
316 I32 *minlen_float; /* pointer to the minlen relevent to the string */
317 I32 lookbehind_float; /* is the position of the string modified by LB */
321 struct regnode_charclass_class *start_class;
325 * Forward declarations for pregcomp()'s friends.
328 static const scan_data_t zero_scan_data =
329 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
331 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
332 #define SF_BEFORE_SEOL 0x0001
333 #define SF_BEFORE_MEOL 0x0002
334 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
335 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
338 # define SF_FIX_SHIFT_EOL (0+2)
339 # define SF_FL_SHIFT_EOL (0+4)
341 # define SF_FIX_SHIFT_EOL (+2)
342 # define SF_FL_SHIFT_EOL (+4)
345 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
348 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
349 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
350 #define SF_IS_INF 0x0040
351 #define SF_HAS_PAR 0x0080
352 #define SF_IN_PAR 0x0100
353 #define SF_HAS_EVAL 0x0200
354 #define SCF_DO_SUBSTR 0x0400
355 #define SCF_DO_STCLASS_AND 0x0800
356 #define SCF_DO_STCLASS_OR 0x1000
357 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
358 #define SCF_WHILEM_VISITED_POS 0x2000
360 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
361 #define SCF_SEEN_ACCEPT 0x8000
363 #define UTF (RExC_utf8 != 0)
364 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
365 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
367 #define OOB_UNICODE 12345678
368 #define OOB_NAMEDCLASS -1
370 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
371 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
374 /* length of regex to show in messages that don't mark a position within */
375 #define RegexLengthToShowInErrorMessages 127
378 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
379 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
380 * op/pragma/warn/regcomp.
382 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
383 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
385 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
388 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
389 * arg. Show regex, up to a maximum length. If it's too long, chop and add
392 #define _FAIL(code) STMT_START { \
393 const char *ellipses = ""; \
394 IV len = RExC_end - RExC_precomp; \
397 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
398 if (len > RegexLengthToShowInErrorMessages) { \
399 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
400 len = RegexLengthToShowInErrorMessages - 10; \
406 #define FAIL(msg) _FAIL( \
407 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
408 msg, (int)len, RExC_precomp, ellipses))
410 #define FAIL2(msg,arg) _FAIL( \
411 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
412 arg, (int)len, RExC_precomp, ellipses))
415 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
417 #define Simple_vFAIL(m) STMT_START { \
418 const IV offset = RExC_parse - RExC_precomp; \
419 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
420 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
426 #define vFAIL(m) STMT_START { \
428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
433 * Like Simple_vFAIL(), but accepts two arguments.
435 #define Simple_vFAIL2(m,a1) STMT_START { \
436 const IV offset = RExC_parse - RExC_precomp; \
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
444 #define vFAIL2(m,a1) STMT_START { \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
447 Simple_vFAIL2(m, a1); \
452 * Like Simple_vFAIL(), but accepts three arguments.
454 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
455 const IV offset = RExC_parse - RExC_precomp; \
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
463 #define vFAIL3(m,a1,a2) STMT_START { \
465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
466 Simple_vFAIL3(m, a1, a2); \
470 * Like Simple_vFAIL(), but accepts four arguments.
472 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
475 (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define ckWARNreg(loc,m) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define ckWARNregdep(loc,m) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
491 #define ckWARN2reg(loc, m, a1) STMT_START { \
492 const IV offset = loc - RExC_precomp; \
493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
497 #define vWARN3(loc, m, a1, a2) STMT_START { \
498 const IV offset = loc - RExC_precomp; \
499 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
509 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
510 const IV offset = loc - RExC_precomp; \
511 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
512 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
515 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
516 const IV offset = loc - RExC_precomp; \
517 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
518 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
528 /* Allow for side effects in s */
529 #define REGC(c,s) STMT_START { \
530 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
533 /* Macros for recording node offsets. 20001227 mjd@plover.com
534 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
535 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
536 * Element 0 holds the number n.
537 * Position is 1 indexed.
539 #ifndef RE_TRACK_PATTERN_OFFSETS
540 #define Set_Node_Offset_To_R(node,byte)
541 #define Set_Node_Offset(node,byte)
542 #define Set_Cur_Node_Offset
543 #define Set_Node_Length_To_R(node,len)
544 #define Set_Node_Length(node,len)
545 #define Set_Node_Cur_Length(node)
546 #define Node_Offset(n)
547 #define Node_Length(n)
548 #define Set_Node_Offset_Length(node,offset,len)
549 #define ProgLen(ri) ri->u.proglen
550 #define SetProgLen(ri,x) ri->u.proglen = x
552 #define ProgLen(ri) ri->u.offsets[0]
553 #define SetProgLen(ri,x) ri->u.offsets[0] = x
554 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(byte))); \
559 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
561 RExC_offsets[2*(node)-1] = (byte); \
566 #define Set_Node_Offset(node,byte) \
567 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
568 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
570 #define Set_Node_Length_To_R(node,len) STMT_START { \
572 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
573 __LINE__, (int)(node), (int)(len))); \
575 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
577 RExC_offsets[2*(node)] = (len); \
582 #define Set_Node_Length(node,len) \
583 Set_Node_Length_To_R((node)-RExC_emit_start, len)
584 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
585 #define Set_Node_Cur_Length(node) \
586 Set_Node_Length(node, RExC_parse - parse_start)
588 /* Get offsets and lengths */
589 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
590 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
592 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
593 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
594 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
598 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
599 #define EXPERIMENTAL_INPLACESCAN
600 #endif /*RE_TRACK_PATTERN_OFFSETS*/
602 #define DEBUG_STUDYDATA(str,data,depth) \
603 DEBUG_OPTIMISE_MORE_r(if(data){ \
604 PerlIO_printf(Perl_debug_log, \
605 "%*s" str "Pos:%"IVdf"/%"IVdf \
606 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
607 (int)(depth)*2, "", \
608 (IV)((data)->pos_min), \
609 (IV)((data)->pos_delta), \
610 (UV)((data)->flags), \
611 (IV)((data)->whilem_c), \
612 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
613 is_inf ? "INF " : "" \
615 if ((data)->last_found) \
616 PerlIO_printf(Perl_debug_log, \
617 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
618 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
619 SvPVX_const((data)->last_found), \
620 (IV)((data)->last_end), \
621 (IV)((data)->last_start_min), \
622 (IV)((data)->last_start_max), \
623 ((data)->longest && \
624 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
625 SvPVX_const((data)->longest_fixed), \
626 (IV)((data)->offset_fixed), \
627 ((data)->longest && \
628 (data)->longest==&((data)->longest_float)) ? "*" : "", \
629 SvPVX_const((data)->longest_float), \
630 (IV)((data)->offset_float_min), \
631 (IV)((data)->offset_float_max) \
633 PerlIO_printf(Perl_debug_log,"\n"); \
636 static void clear_re(pTHX_ void *r);
638 /* Mark that we cannot extend a found fixed substring at this point.
639 Update the longest found anchored substring and the longest found
640 floating substrings if needed. */
643 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
645 const STRLEN l = CHR_SVLEN(data->last_found);
646 const STRLEN old_l = CHR_SVLEN(*data->longest);
647 GET_RE_DEBUG_FLAGS_DECL;
649 PERL_ARGS_ASSERT_SCAN_COMMIT;
651 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
652 SvSetMagicSV(*data->longest, data->last_found);
653 if (*data->longest == data->longest_fixed) {
654 data->offset_fixed = l ? data->last_start_min : data->pos_min;
655 if (data->flags & SF_BEFORE_EOL)
657 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
659 data->flags &= ~SF_FIX_BEFORE_EOL;
660 data->minlen_fixed=minlenp;
661 data->lookbehind_fixed=0;
663 else { /* *data->longest == data->longest_float */
664 data->offset_float_min = l ? data->last_start_min : data->pos_min;
665 data->offset_float_max = (l
666 ? data->last_start_max
667 : data->pos_min + data->pos_delta);
668 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
669 data->offset_float_max = I32_MAX;
670 if (data->flags & SF_BEFORE_EOL)
672 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
674 data->flags &= ~SF_FL_BEFORE_EOL;
675 data->minlen_float=minlenp;
676 data->lookbehind_float=0;
679 SvCUR_set(data->last_found, 0);
681 SV * const sv = data->last_found;
682 if (SvUTF8(sv) && SvMAGICAL(sv)) {
683 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
689 data->flags &= ~SF_BEFORE_EOL;
690 DEBUG_STUDYDATA("commit: ",data,0);
693 /* Can match anything (initialization) */
695 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
697 PERL_ARGS_ASSERT_CL_ANYTHING;
699 ANYOF_CLASS_ZERO(cl);
700 ANYOF_BITMAP_SETALL(cl);
701 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
703 cl->flags |= ANYOF_LOCALE;
706 /* Can match anything (initialization) */
708 S_cl_is_anything(const struct regnode_charclass_class *cl)
712 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
714 for (value = 0; value <= ANYOF_MAX; value += 2)
715 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
717 if (!(cl->flags & ANYOF_UNICODE_ALL))
719 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
724 /* Can match anything (initialization) */
726 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
728 PERL_ARGS_ASSERT_CL_INIT;
730 Zero(cl, 1, struct regnode_charclass_class);
732 cl_anything(pRExC_state, cl);
736 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
738 PERL_ARGS_ASSERT_CL_INIT_ZERO;
740 Zero(cl, 1, struct regnode_charclass_class);
742 cl_anything(pRExC_state, cl);
744 cl->flags |= ANYOF_LOCALE;
747 /* 'And' a given class with another one. Can create false positives */
748 /* We assume that cl is not inverted */
750 S_cl_and(struct regnode_charclass_class *cl,
751 const struct regnode_charclass_class *and_with)
753 PERL_ARGS_ASSERT_CL_AND;
755 assert(and_with->type == ANYOF);
756 if (!(and_with->flags & ANYOF_CLASS)
757 && !(cl->flags & ANYOF_CLASS)
758 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
759 && !(and_with->flags & ANYOF_FOLD)
760 && !(cl->flags & ANYOF_FOLD)) {
763 if (and_with->flags & ANYOF_INVERT)
764 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
765 cl->bitmap[i] &= ~and_with->bitmap[i];
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] &= and_with->bitmap[i];
769 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
770 if (!(and_with->flags & ANYOF_EOS))
771 cl->flags &= ~ANYOF_EOS;
773 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
774 !(and_with->flags & ANYOF_INVERT)) {
775 cl->flags &= ~ANYOF_UNICODE_ALL;
776 cl->flags |= ANYOF_UNICODE;
777 ARG_SET(cl, ARG(and_with));
779 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
780 !(and_with->flags & ANYOF_INVERT))
781 cl->flags &= ~ANYOF_UNICODE_ALL;
782 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
783 !(and_with->flags & ANYOF_INVERT))
784 cl->flags &= ~ANYOF_UNICODE;
787 /* 'OR' a given class with another one. Can create false positives */
788 /* We assume that cl is not inverted */
790 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
792 PERL_ARGS_ASSERT_CL_OR;
794 if (or_with->flags & ANYOF_INVERT) {
796 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
797 * <= (B1 | !B2) | (CL1 | !CL2)
798 * which is wasteful if CL2 is small, but we ignore CL2:
799 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
800 * XXXX Can we handle case-fold? Unclear:
801 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
802 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(or_with->flags & ANYOF_FOLD)
806 && !(cl->flags & ANYOF_FOLD) ) {
809 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
810 cl->bitmap[i] |= ~or_with->bitmap[i];
811 } /* XXXX: logic is complicated otherwise */
813 cl_anything(pRExC_state, cl);
816 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
817 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
818 && (!(or_with->flags & ANYOF_FOLD)
819 || (cl->flags & ANYOF_FOLD)) ) {
822 /* OR char bitmap and class bitmap separately */
823 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
824 cl->bitmap[i] |= or_with->bitmap[i];
825 if (or_with->flags & ANYOF_CLASS) {
826 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
827 cl->classflags[i] |= or_with->classflags[i];
828 cl->flags |= ANYOF_CLASS;
831 else { /* XXXX: logic is complicated, leave it along for a moment. */
832 cl_anything(pRExC_state, cl);
835 if (or_with->flags & ANYOF_EOS)
836 cl->flags |= ANYOF_EOS;
838 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
839 ARG(cl) != ARG(or_with)) {
840 cl->flags |= ANYOF_UNICODE_ALL;
841 cl->flags &= ~ANYOF_UNICODE;
843 if (or_with->flags & ANYOF_UNICODE_ALL) {
844 cl->flags |= ANYOF_UNICODE_ALL;
845 cl->flags &= ~ANYOF_UNICODE;
849 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
850 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
851 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
852 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
857 dump_trie(trie,widecharmap,revcharmap)
858 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
859 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
861 These routines dump out a trie in a somewhat readable format.
862 The _interim_ variants are used for debugging the interim
863 tables that are used to generate the final compressed
864 representation which is what dump_trie expects.
866 Part of the reason for their existance is to provide a form
867 of documentation as to how the different representations function.
872 Dumps the final compressed table form of the trie to Perl_debug_log.
873 Used for debugging make_trie().
877 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
878 AV *revcharmap, U32 depth)
881 SV *sv=sv_newmortal();
882 int colwidth= widecharmap ? 6 : 4;
883 GET_RE_DEBUG_FLAGS_DECL;
885 PERL_ARGS_ASSERT_DUMP_TRIE;
887 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
888 (int)depth * 2 + 2,"",
889 "Match","Base","Ofs" );
891 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
892 SV ** const tmp = av_fetch( revcharmap, state, 0);
894 PerlIO_printf( Perl_debug_log, "%*s",
896 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
897 PL_colors[0], PL_colors[1],
898 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
899 PERL_PV_ESCAPE_FIRSTCHAR
904 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
905 (int)depth * 2 + 2,"");
907 for( state = 0 ; state < trie->uniquecharcount ; state++ )
908 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
909 PerlIO_printf( Perl_debug_log, "\n");
911 for( state = 1 ; state < trie->statecount ; state++ ) {
912 const U32 base = trie->states[ state ].trans.base;
914 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
916 if ( trie->states[ state ].wordnum ) {
917 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
919 PerlIO_printf( Perl_debug_log, "%6s", "" );
922 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
927 while( ( base + ofs < trie->uniquecharcount ) ||
928 ( base + ofs - trie->uniquecharcount < trie->lasttrans
929 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
932 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
934 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
935 if ( ( base + ofs >= trie->uniquecharcount ) &&
936 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
937 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
939 PerlIO_printf( Perl_debug_log, "%*"UVXf,
941 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
943 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
947 PerlIO_printf( Perl_debug_log, "]");
950 PerlIO_printf( Perl_debug_log, "\n" );
954 Dumps a fully constructed but uncompressed trie in list form.
955 List tries normally only are used for construction when the number of
956 possible chars (trie->uniquecharcount) is very high.
957 Used for debugging make_trie().
960 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
965 SV *sv=sv_newmortal();
966 int colwidth= widecharmap ? 6 : 4;
967 GET_RE_DEBUG_FLAGS_DECL;
969 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
971 /* print out the table precompression. */
972 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
973 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
974 "------:-----+-----------------\n" );
976 for( state=1 ; state < next_alloc ; state ++ ) {
979 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
980 (int)depth * 2 + 2,"", (UV)state );
981 if ( ! trie->states[ state ].wordnum ) {
982 PerlIO_printf( Perl_debug_log, "%5s| ","");
984 PerlIO_printf( Perl_debug_log, "W%4x| ",
985 trie->states[ state ].wordnum
988 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
989 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
991 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
993 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
994 PL_colors[0], PL_colors[1],
995 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
996 PERL_PV_ESCAPE_FIRSTCHAR
998 TRIE_LIST_ITEM(state,charid).forid,
999 (UV)TRIE_LIST_ITEM(state,charid).newstate
1002 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1003 (int)((depth * 2) + 14), "");
1006 PerlIO_printf( Perl_debug_log, "\n");
1011 Dumps a fully constructed but uncompressed trie in table form.
1012 This is the normal DFA style state transition table, with a few
1013 twists to facilitate compression later.
1014 Used for debugging make_trie().
1017 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1018 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1023 SV *sv=sv_newmortal();
1024 int colwidth= widecharmap ? 6 : 4;
1025 GET_RE_DEBUG_FLAGS_DECL;
1027 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1030 print out the table precompression so that we can do a visual check
1031 that they are identical.
1034 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1036 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1037 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1039 PerlIO_printf( Perl_debug_log, "%*s",
1041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1042 PL_colors[0], PL_colors[1],
1043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044 PERL_PV_ESCAPE_FIRSTCHAR
1050 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1052 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1053 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1056 PerlIO_printf( Perl_debug_log, "\n" );
1058 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1060 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1061 (int)depth * 2 + 2,"",
1062 (UV)TRIE_NODENUM( state ) );
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1065 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1067 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1069 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1071 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1074 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1075 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1082 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1083 startbranch: the first branch in the whole branch sequence
1084 first : start branch of sequence of branch-exact nodes.
1085 May be the same as startbranch
1086 last : Thing following the last branch.
1087 May be the same as tail.
1088 tail : item following the branch sequence
1089 count : words in the sequence
1090 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1091 depth : indent depth
1093 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1095 A trie is an N'ary tree where the branches are determined by digital
1096 decomposition of the key. IE, at the root node you look up the 1st character and
1097 follow that branch repeat until you find the end of the branches. Nodes can be
1098 marked as "accepting" meaning they represent a complete word. Eg:
1102 would convert into the following structure. Numbers represent states, letters
1103 following numbers represent valid transitions on the letter from that state, if
1104 the number is in square brackets it represents an accepting state, otherwise it
1105 will be in parenthesis.
1107 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1111 (1) +-i->(6)-+-s->[7]
1113 +-s->(3)-+-h->(4)-+-e->[5]
1115 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1117 This shows that when matching against the string 'hers' we will begin at state 1
1118 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1119 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1120 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1121 single traverse. We store a mapping from accepting to state to which word was
1122 matched, and then when we have multiple possibilities we try to complete the
1123 rest of the regex in the order in which they occured in the alternation.
1125 The only prior NFA like behaviour that would be changed by the TRIE support is
1126 the silent ignoring of duplicate alternations which are of the form:
1128 / (DUPE|DUPE) X? (?{ ... }) Y /x
1130 Thus EVAL blocks follwing a trie may be called a different number of times with
1131 and without the optimisation. With the optimisations dupes will be silently
1132 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1133 the following demonstrates:
1135 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1137 which prints out 'word' three times, but
1139 'words'=~/(word|word|word)(?{ print $1 })S/
1141 which doesnt print it out at all. This is due to other optimisations kicking in.
1143 Example of what happens on a structural level:
1145 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1147 1: CURLYM[1] {1,32767}(18)
1158 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1159 and should turn into:
1161 1: CURLYM[1] {1,32767}(18)
1163 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1171 Cases where tail != last would be like /(?foo|bar)baz/:
1181 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1182 and would end up looking like:
1185 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1192 d = uvuni_to_utf8_flags(d, uv, 0);
1194 is the recommended Unicode-aware way of saying
1199 #define TRIE_STORE_REVCHAR \
1202 SV *zlopp = newSV(2); \
1203 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1204 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1205 SvCUR_set(zlopp, kapow - flrbbbbb); \
1208 av_push(revcharmap, zlopp); \
1210 char ooooff = (char)uvc; \
1211 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1215 #define TRIE_READ_CHAR STMT_START { \
1219 if ( foldlen > 0 ) { \
1220 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1225 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1226 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1227 foldlen -= UNISKIP( uvc ); \
1228 scan = foldbuf + UNISKIP( uvc ); \
1231 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1241 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1242 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1243 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1244 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1246 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1247 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1248 TRIE_LIST_CUR( state )++; \
1251 #define TRIE_LIST_NEW(state) STMT_START { \
1252 Newxz( trie->states[ state ].trans.list, \
1253 4, reg_trie_trans_le ); \
1254 TRIE_LIST_CUR( state ) = 1; \
1255 TRIE_LIST_LEN( state ) = 4; \
1258 #define TRIE_HANDLE_WORD(state) STMT_START { \
1259 U16 dupe= trie->states[ state ].wordnum; \
1260 regnode * const noper_next = regnext( noper ); \
1262 if (trie->wordlen) \
1263 trie->wordlen[ curword ] = wordlen; \
1265 /* store the word for dumping */ \
1267 if (OP(noper) != NOTHING) \
1268 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1270 tmp = newSVpvn_utf8( "", 0, UTF ); \
1271 av_push( trie_words, tmp ); \
1276 if ( noper_next < tail ) { \
1278 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1279 trie->jump[curword] = (U16)(noper_next - convert); \
1281 jumper = noper_next; \
1283 nextbranch= regnext(cur); \
1287 /* So it's a dupe. This means we need to maintain a */\
1288 /* linked-list from the first to the next. */\
1289 /* we only allocate the nextword buffer when there */\
1290 /* a dupe, so first time we have to do the allocation */\
1291 if (!trie->nextword) \
1292 trie->nextword = (U16 *) \
1293 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1294 while ( trie->nextword[dupe] ) \
1295 dupe= trie->nextword[dupe]; \
1296 trie->nextword[dupe]= curword; \
1298 /* we haven't inserted this word yet. */ \
1299 trie->states[ state ].wordnum = curword; \
1304 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1305 ( ( base + charid >= ucharcount \
1306 && base + charid < ubound \
1307 && state == trie->trans[ base - ucharcount + charid ].check \
1308 && trie->trans[ base - ucharcount + charid ].next ) \
1309 ? trie->trans[ base - ucharcount + charid ].next \
1310 : ( state==1 ? special : 0 ) \
1314 #define MADE_JUMP_TRIE 2
1315 #define MADE_EXACT_TRIE 4
1318 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1321 /* first pass, loop through and scan words */
1322 reg_trie_data *trie;
1323 HV *widecharmap = NULL;
1324 AV *revcharmap = newAV();
1326 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1331 regnode *jumper = NULL;
1332 regnode *nextbranch = NULL;
1333 regnode *convert = NULL;
1334 /* we just use folder as a flag in utf8 */
1335 const U8 * const folder = ( flags == EXACTF
1337 : ( flags == EXACTFL
1344 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1345 AV *trie_words = NULL;
1346 /* along with revcharmap, this only used during construction but both are
1347 * useful during debugging so we store them in the struct when debugging.
1350 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1351 STRLEN trie_charcount=0;
1353 SV *re_trie_maxbuff;
1354 GET_RE_DEBUG_FLAGS_DECL;
1356 PERL_ARGS_ASSERT_MAKE_TRIE;
1358 PERL_UNUSED_ARG(depth);
1361 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1363 trie->startstate = 1;
1364 trie->wordcount = word_count;
1365 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1366 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1367 if (!(UTF && folder))
1368 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1370 trie_words = newAV();
1373 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1374 if (!SvIOK(re_trie_maxbuff)) {
1375 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1378 PerlIO_printf( Perl_debug_log,
1379 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1380 (int)depth * 2 + 2, "",
1381 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1382 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1386 /* Find the node we are going to overwrite */
1387 if ( first == startbranch && OP( last ) != BRANCH ) {
1388 /* whole branch chain */
1391 /* branch sub-chain */
1392 convert = NEXTOPER( first );
1395 /* -- First loop and Setup --
1397 We first traverse the branches and scan each word to determine if it
1398 contains widechars, and how many unique chars there are, this is
1399 important as we have to build a table with at least as many columns as we
1402 We use an array of integers to represent the character codes 0..255
1403 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1404 native representation of the character value as the key and IV's for the
1407 *TODO* If we keep track of how many times each character is used we can
1408 remap the columns so that the table compression later on is more
1409 efficient in terms of memory by ensuring most common value is in the
1410 middle and the least common are on the outside. IMO this would be better
1411 than a most to least common mapping as theres a decent chance the most
1412 common letter will share a node with the least common, meaning the node
1413 will not be compressable. With a middle is most common approach the worst
1414 case is when we have the least common nodes twice.
1418 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1419 regnode * const noper = NEXTOPER( cur );
1420 const U8 *uc = (U8*)STRING( noper );
1421 const U8 * const e = uc + STR_LEN( noper );
1423 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1424 const U8 *scan = (U8*)NULL;
1425 U32 wordlen = 0; /* required init */
1427 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1429 if (OP(noper) == NOTHING) {
1433 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1434 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1435 regardless of encoding */
1437 for ( ; uc < e ; uc += len ) {
1438 TRIE_CHARCOUNT(trie)++;
1442 if ( !trie->charmap[ uvc ] ) {
1443 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1445 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1449 /* store the codepoint in the bitmap, and if its ascii
1450 also store its folded equivelent. */
1451 TRIE_BITMAP_SET(trie,uvc);
1453 /* store the folded codepoint */
1454 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1457 /* store first byte of utf8 representation of
1458 codepoints in the 127 < uvc < 256 range */
1459 if (127 < uvc && uvc < 192) {
1460 TRIE_BITMAP_SET(trie,194);
1461 } else if (191 < uvc ) {
1462 TRIE_BITMAP_SET(trie,195);
1463 /* && uvc < 256 -- we know uvc is < 256 already */
1466 set_bit = 0; /* We've done our bit :-) */
1471 widecharmap = newHV();
1473 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1476 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1478 if ( !SvTRUE( *svpp ) ) {
1479 sv_setiv( *svpp, ++trie->uniquecharcount );
1484 if( cur == first ) {
1487 } else if (chars < trie->minlen) {
1489 } else if (chars > trie->maxlen) {
1493 } /* end first pass */
1494 DEBUG_TRIE_COMPILE_r(
1495 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1496 (int)depth * 2 + 2,"",
1497 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1498 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1499 (int)trie->minlen, (int)trie->maxlen )
1501 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1504 We now know what we are dealing with in terms of unique chars and
1505 string sizes so we can calculate how much memory a naive
1506 representation using a flat table will take. If it's over a reasonable
1507 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1508 conservative but potentially much slower representation using an array
1511 At the end we convert both representations into the same compressed
1512 form that will be used in regexec.c for matching with. The latter
1513 is a form that cannot be used to construct with but has memory
1514 properties similar to the list form and access properties similar
1515 to the table form making it both suitable for fast searches and
1516 small enough that its feasable to store for the duration of a program.
1518 See the comment in the code where the compressed table is produced
1519 inplace from the flat tabe representation for an explanation of how
1520 the compression works.
1525 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1527 Second Pass -- Array Of Lists Representation
1529 Each state will be represented by a list of charid:state records
1530 (reg_trie_trans_le) the first such element holds the CUR and LEN
1531 points of the allocated array. (See defines above).
1533 We build the initial structure using the lists, and then convert
1534 it into the compressed table form which allows faster lookups
1535 (but cant be modified once converted).
1538 STRLEN transcount = 1;
1540 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1541 "%*sCompiling trie using list compiler\n",
1542 (int)depth * 2 + 2, ""));
1544 trie->states = (reg_trie_state *)
1545 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1546 sizeof(reg_trie_state) );
1550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1552 regnode * const noper = NEXTOPER( cur );
1553 U8 *uc = (U8*)STRING( noper );
1554 const U8 * const e = uc + STR_LEN( noper );
1555 U32 state = 1; /* required init */
1556 U16 charid = 0; /* sanity init */
1557 U8 *scan = (U8*)NULL; /* sanity init */
1558 STRLEN foldlen = 0; /* required init */
1559 U32 wordlen = 0; /* required init */
1560 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1562 if (OP(noper) != NOTHING) {
1563 for ( ; uc < e ; uc += len ) {
1568 charid = trie->charmap[ uvc ];
1570 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1574 charid=(U16)SvIV( *svpp );
1577 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1584 if ( !trie->states[ state ].trans.list ) {
1585 TRIE_LIST_NEW( state );
1587 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1588 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1589 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1594 newstate = next_alloc++;
1595 TRIE_LIST_PUSH( state, charid, newstate );
1600 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1604 TRIE_HANDLE_WORD(state);
1606 } /* end second pass */
1608 /* next alloc is the NEXT state to be allocated */
1609 trie->statecount = next_alloc;
1610 trie->states = (reg_trie_state *)
1611 PerlMemShared_realloc( trie->states,
1613 * sizeof(reg_trie_state) );
1615 /* and now dump it out before we compress it */
1616 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1617 revcharmap, next_alloc,
1621 trie->trans = (reg_trie_trans *)
1622 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1629 for( state=1 ; state < next_alloc ; state ++ ) {
1633 DEBUG_TRIE_COMPILE_MORE_r(
1634 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1638 if (trie->states[state].trans.list) {
1639 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1643 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1644 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1645 if ( forid < minid ) {
1647 } else if ( forid > maxid ) {
1651 if ( transcount < tp + maxid - minid + 1) {
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_realloc( trie->trans,
1656 * sizeof(reg_trie_trans) );
1657 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1659 base = trie->uniquecharcount + tp - minid;
1660 if ( maxid == minid ) {
1662 for ( ; zp < tp ; zp++ ) {
1663 if ( ! trie->trans[ zp ].next ) {
1664 base = trie->uniquecharcount + zp - minid;
1665 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1666 trie->trans[ zp ].check = state;
1672 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1673 trie->trans[ tp ].check = state;
1678 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1679 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1680 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1681 trie->trans[ tid ].check = state;
1683 tp += ( maxid - minid + 1 );
1685 Safefree(trie->states[ state ].trans.list);
1688 DEBUG_TRIE_COMPILE_MORE_r(
1689 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1692 trie->states[ state ].trans.base=base;
1694 trie->lasttrans = tp + 1;
1698 Second Pass -- Flat Table Representation.
1700 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1701 We know that we will need Charcount+1 trans at most to store the data
1702 (one row per char at worst case) So we preallocate both structures
1703 assuming worst case.
1705 We then construct the trie using only the .next slots of the entry
1708 We use the .check field of the first entry of the node temporarily to
1709 make compression both faster and easier by keeping track of how many non
1710 zero fields are in the node.
1712 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1715 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1716 number representing the first entry of the node, and state as a
1717 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1718 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1719 are 2 entrys per node. eg:
1727 The table is internally in the right hand, idx form. However as we also
1728 have to deal with the states array which is indexed by nodenum we have to
1729 use TRIE_NODENUM() to convert.
1732 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1733 "%*sCompiling trie using table compiler\n",
1734 (int)depth * 2 + 2, ""));
1736 trie->trans = (reg_trie_trans *)
1737 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1738 * trie->uniquecharcount + 1,
1739 sizeof(reg_trie_trans) );
1740 trie->states = (reg_trie_state *)
1741 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1742 sizeof(reg_trie_state) );
1743 next_alloc = trie->uniquecharcount + 1;
1746 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1748 regnode * const noper = NEXTOPER( cur );
1749 const U8 *uc = (U8*)STRING( noper );
1750 const U8 * const e = uc + STR_LEN( noper );
1752 U32 state = 1; /* required init */
1754 U16 charid = 0; /* sanity init */
1755 U32 accept_state = 0; /* sanity init */
1756 U8 *scan = (U8*)NULL; /* sanity init */
1758 STRLEN foldlen = 0; /* required init */
1759 U32 wordlen = 0; /* required init */
1760 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762 if ( OP(noper) != NOTHING ) {
1763 for ( ; uc < e ; uc += len ) {
1768 charid = trie->charmap[ uvc ];
1770 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1771 charid = svpp ? (U16)SvIV(*svpp) : 0;
1775 if ( !trie->trans[ state + charid ].next ) {
1776 trie->trans[ state + charid ].next = next_alloc;
1777 trie->trans[ state ].check++;
1778 next_alloc += trie->uniquecharcount;
1780 state = trie->trans[ state + charid ].next;
1782 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1784 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1787 accept_state = TRIE_NODENUM( state );
1788 TRIE_HANDLE_WORD(accept_state);
1790 } /* end second pass */
1792 /* and now dump it out before we compress it */
1793 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1795 next_alloc, depth+1));
1799 * Inplace compress the table.*
1801 For sparse data sets the table constructed by the trie algorithm will
1802 be mostly 0/FAIL transitions or to put it another way mostly empty.
1803 (Note that leaf nodes will not contain any transitions.)
1805 This algorithm compresses the tables by eliminating most such
1806 transitions, at the cost of a modest bit of extra work during lookup:
1808 - Each states[] entry contains a .base field which indicates the
1809 index in the state[] array wheres its transition data is stored.
1811 - If .base is 0 there are no valid transitions from that node.
1813 - If .base is nonzero then charid is added to it to find an entry in
1816 -If trans[states[state].base+charid].check!=state then the
1817 transition is taken to be a 0/Fail transition. Thus if there are fail
1818 transitions at the front of the node then the .base offset will point
1819 somewhere inside the previous nodes data (or maybe even into a node
1820 even earlier), but the .check field determines if the transition is
1824 The following process inplace converts the table to the compressed
1825 table: We first do not compress the root node 1,and mark its all its
1826 .check pointers as 1 and set its .base pointer as 1 as well. This
1827 allows to do a DFA construction from the compressed table later, and
1828 ensures that any .base pointers we calculate later are greater than
1831 - We set 'pos' to indicate the first entry of the second node.
1833 - We then iterate over the columns of the node, finding the first and
1834 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1835 and set the .check pointers accordingly, and advance pos
1836 appropriately and repreat for the next node. Note that when we copy
1837 the next pointers we have to convert them from the original
1838 NODEIDX form to NODENUM form as the former is not valid post
1841 - If a node has no transitions used we mark its base as 0 and do not
1842 advance the pos pointer.
1844 - If a node only has one transition we use a second pointer into the
1845 structure to fill in allocated fail transitions from other states.
1846 This pointer is independent of the main pointer and scans forward
1847 looking for null transitions that are allocated to a state. When it
1848 finds one it writes the single transition into the "hole". If the
1849 pointer doesnt find one the single transition is appended as normal.
1851 - Once compressed we can Renew/realloc the structures to release the
1854 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1855 specifically Fig 3.47 and the associated pseudocode.
1859 const U32 laststate = TRIE_NODENUM( next_alloc );
1862 trie->statecount = laststate;
1864 for ( state = 1 ; state < laststate ; state++ ) {
1866 const U32 stateidx = TRIE_NODEIDX( state );
1867 const U32 o_used = trie->trans[ stateidx ].check;
1868 U32 used = trie->trans[ stateidx ].check;
1869 trie->trans[ stateidx ].check = 0;
1871 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1872 if ( flag || trie->trans[ stateidx + charid ].next ) {
1873 if ( trie->trans[ stateidx + charid ].next ) {
1875 for ( ; zp < pos ; zp++ ) {
1876 if ( ! trie->trans[ zp ].next ) {
1880 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1881 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882 trie->trans[ zp ].check = state;
1883 if ( ++zp > pos ) pos = zp;
1890 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1892 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1893 trie->trans[ pos ].check = state;
1898 trie->lasttrans = pos + 1;
1899 trie->states = (reg_trie_state *)
1900 PerlMemShared_realloc( trie->states, laststate
1901 * sizeof(reg_trie_state) );
1902 DEBUG_TRIE_COMPILE_MORE_r(
1903 PerlIO_printf( Perl_debug_log,
1904 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1905 (int)depth * 2 + 2,"",
1906 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1909 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1912 } /* end table compress */
1914 DEBUG_TRIE_COMPILE_MORE_r(
1915 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1916 (int)depth * 2 + 2, "",
1917 (UV)trie->statecount,
1918 (UV)trie->lasttrans)
1920 /* resize the trans array to remove unused space */
1921 trie->trans = (reg_trie_trans *)
1922 PerlMemShared_realloc( trie->trans, trie->lasttrans
1923 * sizeof(reg_trie_trans) );
1925 /* and now dump out the compressed format */
1926 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1928 { /* Modify the program and insert the new TRIE node*/
1929 U8 nodetype =(U8)(flags & 0xFF);
1933 regnode *optimize = NULL;
1934 #ifdef RE_TRACK_PATTERN_OFFSETS
1937 U32 mjd_nodelen = 0;
1938 #endif /* RE_TRACK_PATTERN_OFFSETS */
1939 #endif /* DEBUGGING */
1941 This means we convert either the first branch or the first Exact,
1942 depending on whether the thing following (in 'last') is a branch
1943 or not and whther first is the startbranch (ie is it a sub part of
1944 the alternation or is it the whole thing.)
1945 Assuming its a sub part we conver the EXACT otherwise we convert
1946 the whole branch sequence, including the first.
1948 /* Find the node we are going to overwrite */
1949 if ( first != startbranch || OP( last ) == BRANCH ) {
1950 /* branch sub-chain */
1951 NEXT_OFF( first ) = (U16)(last - first);
1952 #ifdef RE_TRACK_PATTERN_OFFSETS
1954 mjd_offset= Node_Offset((convert));
1955 mjd_nodelen= Node_Length((convert));
1958 /* whole branch chain */
1960 #ifdef RE_TRACK_PATTERN_OFFSETS
1963 const regnode *nop = NEXTOPER( convert );
1964 mjd_offset= Node_Offset((nop));
1965 mjd_nodelen= Node_Length((nop));
1969 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1970 (int)depth * 2 + 2, "",
1971 (UV)mjd_offset, (UV)mjd_nodelen)
1974 /* But first we check to see if there is a common prefix we can
1975 split out as an EXACT and put in front of the TRIE node. */
1976 trie->startstate= 1;
1977 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1979 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1983 const U32 base = trie->states[ state ].trans.base;
1985 if ( trie->states[state].wordnum )
1988 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1989 if ( ( base + ofs >= trie->uniquecharcount ) &&
1990 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1991 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1993 if ( ++count > 1 ) {
1994 SV **tmp = av_fetch( revcharmap, ofs, 0);
1995 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1996 if ( state == 1 ) break;
1998 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2000 PerlIO_printf(Perl_debug_log,
2001 "%*sNew Start State=%"UVuf" Class: [",
2002 (int)depth * 2 + 2, "",
2005 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2006 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2008 TRIE_BITMAP_SET(trie,*ch);
2010 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2012 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2016 TRIE_BITMAP_SET(trie,*ch);
2018 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2019 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2025 SV **tmp = av_fetch( revcharmap, idx, 0);
2027 char *ch = SvPV( *tmp, len );
2029 SV *sv=sv_newmortal();
2030 PerlIO_printf( Perl_debug_log,
2031 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2032 (int)depth * 2 + 2, "",
2034 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2035 PL_colors[0], PL_colors[1],
2036 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2037 PERL_PV_ESCAPE_FIRSTCHAR
2042 OP( convert ) = nodetype;
2043 str=STRING(convert);
2046 STR_LEN(convert) += len;
2052 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2058 regnode *n = convert+NODE_SZ_STR(convert);
2059 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2060 trie->startstate = state;
2061 trie->minlen -= (state - 1);
2062 trie->maxlen -= (state - 1);
2064 /* At least the UNICOS C compiler choked on this
2065 * being argument to DEBUG_r(), so let's just have
2068 #ifdef PERL_EXT_RE_BUILD
2074 regnode *fix = convert;
2075 U32 word = trie->wordcount;
2077 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2078 while( ++fix < n ) {
2079 Set_Node_Offset_Length(fix, 0, 0);
2082 SV ** const tmp = av_fetch( trie_words, word, 0 );
2084 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2087 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2095 NEXT_OFF(convert) = (U16)(tail - convert);
2096 DEBUG_r(optimize= n);
2102 if ( trie->maxlen ) {
2103 NEXT_OFF( convert ) = (U16)(tail - convert);
2104 ARG_SET( convert, data_slot );
2105 /* Store the offset to the first unabsorbed branch in
2106 jump[0], which is otherwise unused by the jump logic.
2107 We use this when dumping a trie and during optimisation. */
2109 trie->jump[0] = (U16)(nextbranch - convert);
2112 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2113 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2115 OP( convert ) = TRIEC;
2116 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2117 PerlMemShared_free(trie->bitmap);
2120 OP( convert ) = TRIE;
2122 /* store the type in the flags */
2123 convert->flags = nodetype;
2127 + regarglen[ OP( convert ) ];
2129 /* XXX We really should free up the resource in trie now,
2130 as we won't use them - (which resources?) dmq */
2132 /* needed for dumping*/
2133 DEBUG_r(if (optimize) {
2134 regnode *opt = convert;
2136 while ( ++opt < optimize) {
2137 Set_Node_Offset_Length(opt,0,0);
2140 Try to clean up some of the debris left after the
2143 while( optimize < jumper ) {
2144 mjd_nodelen += Node_Length((optimize));
2145 OP( optimize ) = OPTIMIZED;
2146 Set_Node_Offset_Length(optimize,0,0);
2149 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2151 } /* end node insert */
2152 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2153 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2155 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2156 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2158 SvREFCNT_dec(revcharmap);
2162 : trie->startstate>1
2168 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2170 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2172 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2173 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2176 We find the fail state for each state in the trie, this state is the longest proper
2177 suffix of the current states 'word' that is also a proper prefix of another word in our
2178 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2179 the DFA not to have to restart after its tried and failed a word at a given point, it
2180 simply continues as though it had been matching the other word in the first place.
2182 'abcdgu'=~/abcdefg|cdgu/
2183 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2184 fail, which would bring use to the state representing 'd' in the second word where we would
2185 try 'g' and succeed, prodceding to match 'cdgu'.
2187 /* add a fail transition */
2188 const U32 trie_offset = ARG(source);
2189 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2191 const U32 ucharcount = trie->uniquecharcount;
2192 const U32 numstates = trie->statecount;
2193 const U32 ubound = trie->lasttrans + ucharcount;
2197 U32 base = trie->states[ 1 ].trans.base;
2200 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2201 GET_RE_DEBUG_FLAGS_DECL;
2203 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2205 PERL_UNUSED_ARG(depth);
2209 ARG_SET( stclass, data_slot );
2210 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2211 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2212 aho->trie=trie_offset;
2213 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2214 Copy( trie->states, aho->states, numstates, reg_trie_state );
2215 Newxz( q, numstates, U32);
2216 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2219 /* initialize fail[0..1] to be 1 so that we always have
2220 a valid final fail state */
2221 fail[ 0 ] = fail[ 1 ] = 1;
2223 for ( charid = 0; charid < ucharcount ; charid++ ) {
2224 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2226 q[ q_write ] = newstate;
2227 /* set to point at the root */
2228 fail[ q[ q_write++ ] ]=1;
2231 while ( q_read < q_write) {
2232 const U32 cur = q[ q_read++ % numstates ];
2233 base = trie->states[ cur ].trans.base;
2235 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2236 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2238 U32 fail_state = cur;
2241 fail_state = fail[ fail_state ];
2242 fail_base = aho->states[ fail_state ].trans.base;
2243 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2245 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2246 fail[ ch_state ] = fail_state;
2247 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2249 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2251 q[ q_write++ % numstates] = ch_state;
2255 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2256 when we fail in state 1, this allows us to use the
2257 charclass scan to find a valid start char. This is based on the principle
2258 that theres a good chance the string being searched contains lots of stuff
2259 that cant be a start char.
2261 fail[ 0 ] = fail[ 1 ] = 0;
2262 DEBUG_TRIE_COMPILE_r({
2263 PerlIO_printf(Perl_debug_log,
2264 "%*sStclass Failtable (%"UVuf" states): 0",
2265 (int)(depth * 2), "", (UV)numstates
2267 for( q_read=1; q_read<numstates; q_read++ ) {
2268 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2270 PerlIO_printf(Perl_debug_log, "\n");
2273 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2278 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2279 * These need to be revisited when a newer toolchain becomes available.
2281 #if defined(__sparc64__) && defined(__GNUC__)
2282 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2283 # undef SPARC64_GCC_WORKAROUND
2284 # define SPARC64_GCC_WORKAROUND 1
2288 #define DEBUG_PEEP(str,scan,depth) \
2289 DEBUG_OPTIMISE_r({if (scan){ \
2290 SV * const mysv=sv_newmortal(); \
2291 regnode *Next = regnext(scan); \
2292 regprop(RExC_rx, mysv, scan); \
2293 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2294 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2295 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2302 #define JOIN_EXACT(scan,min,flags) \
2303 if (PL_regkind[OP(scan)] == EXACT) \
2304 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2307 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2308 /* Merge several consecutive EXACTish nodes into one. */
2309 regnode *n = regnext(scan);
2311 regnode *next = scan + NODE_SZ_STR(scan);
2315 regnode *stop = scan;
2316 GET_RE_DEBUG_FLAGS_DECL;
2318 PERL_UNUSED_ARG(depth);
2321 PERL_ARGS_ASSERT_JOIN_EXACT;
2322 #ifndef EXPERIMENTAL_INPLACESCAN
2323 PERL_UNUSED_ARG(flags);
2324 PERL_UNUSED_ARG(val);
2326 DEBUG_PEEP("join",scan,depth);
2328 /* Skip NOTHING, merge EXACT*. */
2330 ( PL_regkind[OP(n)] == NOTHING ||
2331 (stringok && (OP(n) == OP(scan))))
2333 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2335 if (OP(n) == TAIL || n > next)
2337 if (PL_regkind[OP(n)] == NOTHING) {
2338 DEBUG_PEEP("skip:",n,depth);
2339 NEXT_OFF(scan) += NEXT_OFF(n);
2340 next = n + NODE_STEP_REGNODE;
2347 else if (stringok) {
2348 const unsigned int oldl = STR_LEN(scan);
2349 regnode * const nnext = regnext(n);
2351 DEBUG_PEEP("merg",n,depth);
2354 if (oldl + STR_LEN(n) > U8_MAX)
2356 NEXT_OFF(scan) += NEXT_OFF(n);
2357 STR_LEN(scan) += STR_LEN(n);
2358 next = n + NODE_SZ_STR(n);
2359 /* Now we can overwrite *n : */
2360 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2368 #ifdef EXPERIMENTAL_INPLACESCAN
2369 if (flags && !NEXT_OFF(n)) {
2370 DEBUG_PEEP("atch", val, depth);
2371 if (reg_off_by_arg[OP(n)]) {
2372 ARG_SET(n, val - n);
2375 NEXT_OFF(n) = val - n;
2382 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2384 Two problematic code points in Unicode casefolding of EXACT nodes:
2386 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2387 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2393 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2394 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2396 This means that in case-insensitive matching (or "loose matching",
2397 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2398 length of the above casefolded versions) can match a target string
2399 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2400 This would rather mess up the minimum length computation.
2402 What we'll do is to look for the tail four bytes, and then peek
2403 at the preceding two bytes to see whether we need to decrease
2404 the minimum length by four (six minus two).
2406 Thanks to the design of UTF-8, there cannot be false matches:
2407 A sequence of valid UTF-8 bytes cannot be a subsequence of
2408 another valid sequence of UTF-8 bytes.
2411 char * const s0 = STRING(scan), *s, *t;
2412 char * const s1 = s0 + STR_LEN(scan) - 1;
2413 char * const s2 = s1 - 4;
2414 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2415 const char t0[] = "\xaf\x49\xaf\x42";
2417 const char t0[] = "\xcc\x88\xcc\x81";
2419 const char * const t1 = t0 + 3;
2422 s < s2 && (t = ninstr(s, s1, t0, t1));
2425 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2426 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2428 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2429 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2437 n = scan + NODE_SZ_STR(scan);
2439 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2446 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2450 /* REx optimizer. Converts nodes into quickier variants "in place".
2451 Finds fixed substrings. */
2453 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2454 to the position after last scanned or to NULL. */
2456 #define INIT_AND_WITHP \
2457 assert(!and_withp); \
2458 Newx(and_withp,1,struct regnode_charclass_class); \
2459 SAVEFREEPV(and_withp)
2461 /* this is a chain of data about sub patterns we are processing that
2462 need to be handled seperately/specially in study_chunk. Its so
2463 we can simulate recursion without losing state. */
2465 typedef struct scan_frame {
2466 regnode *last; /* last node to process in this frame */
2467 regnode *next; /* next node to process when last is reached */
2468 struct scan_frame *prev; /*previous frame*/
2469 I32 stop; /* what stopparen do we use */
2473 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2475 #define CASE_SYNST_FNC(nAmE) \
2477 if (flags & SCF_DO_STCLASS_AND) { \
2478 for (value = 0; value < 256; value++) \
2479 if (!is_ ## nAmE ## _cp(value)) \
2480 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2483 for (value = 0; value < 256; value++) \
2484 if (is_ ## nAmE ## _cp(value)) \
2485 ANYOF_BITMAP_SET(data->start_class, value); \
2489 if (flags & SCF_DO_STCLASS_AND) { \
2490 for (value = 0; value < 256; value++) \
2491 if (is_ ## nAmE ## _cp(value)) \
2492 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2495 for (value = 0; value < 256; value++) \
2496 if (!is_ ## nAmE ## _cp(value)) \
2497 ANYOF_BITMAP_SET(data->start_class, value); \
2504 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2505 I32 *minlenp, I32 *deltap,
2510 struct regnode_charclass_class *and_withp,
2511 U32 flags, U32 depth)
2512 /* scanp: Start here (read-write). */
2513 /* deltap: Write maxlen-minlen here. */
2514 /* last: Stop before this one. */
2515 /* data: string data about the pattern */
2516 /* stopparen: treat close N as END */
2517 /* recursed: which subroutines have we recursed into */
2518 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2521 I32 min = 0, pars = 0, code;
2522 regnode *scan = *scanp, *next;
2524 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2525 int is_inf_internal = 0; /* The studied chunk is infinite */
2526 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2527 scan_data_t data_fake;
2528 SV *re_trie_maxbuff = NULL;
2529 regnode *first_non_open = scan;
2530 I32 stopmin = I32_MAX;
2531 scan_frame *frame = NULL;
2532 GET_RE_DEBUG_FLAGS_DECL;
2534 PERL_ARGS_ASSERT_STUDY_CHUNK;
2537 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2541 while (first_non_open && OP(first_non_open) == OPEN)
2542 first_non_open=regnext(first_non_open);
2547 while ( scan && OP(scan) != END && scan < last ){
2548 /* Peephole optimizer: */
2549 DEBUG_STUDYDATA("Peep:", data,depth);
2550 DEBUG_PEEP("Peep",scan,depth);
2551 JOIN_EXACT(scan,&min,0);
2553 /* Follow the next-chain of the current node and optimize
2554 away all the NOTHINGs from it. */
2555 if (OP(scan) != CURLYX) {
2556 const int max = (reg_off_by_arg[OP(scan)]
2558 /* I32 may be smaller than U16 on CRAYs! */
2559 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2560 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2564 /* Skip NOTHING and LONGJMP. */
2565 while ((n = regnext(n))
2566 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2567 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2568 && off + noff < max)
2570 if (reg_off_by_arg[OP(scan)])
2573 NEXT_OFF(scan) = off;
2578 /* The principal pseudo-switch. Cannot be a switch, since we
2579 look into several different things. */
2580 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2581 || OP(scan) == IFTHEN) {
2582 next = regnext(scan);
2584 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2586 if (OP(next) == code || code == IFTHEN) {
2587 /* NOTE - There is similar code to this block below for handling
2588 TRIE nodes on a re-study. If you change stuff here check there
2590 I32 max1 = 0, min1 = I32_MAX, num = 0;
2591 struct regnode_charclass_class accum;
2592 regnode * const startbranch=scan;
2594 if (flags & SCF_DO_SUBSTR)
2595 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2596 if (flags & SCF_DO_STCLASS)
2597 cl_init_zero(pRExC_state, &accum);
2599 while (OP(scan) == code) {
2600 I32 deltanext, minnext, f = 0, fake;
2601 struct regnode_charclass_class this_class;
2604 data_fake.flags = 0;
2606 data_fake.whilem_c = data->whilem_c;
2607 data_fake.last_closep = data->last_closep;
2610 data_fake.last_closep = &fake;
2612 data_fake.pos_delta = delta;
2613 next = regnext(scan);
2614 scan = NEXTOPER(scan);
2616 scan = NEXTOPER(scan);
2617 if (flags & SCF_DO_STCLASS) {
2618 cl_init(pRExC_state, &this_class);
2619 data_fake.start_class = &this_class;
2620 f = SCF_DO_STCLASS_AND;
2622 if (flags & SCF_WHILEM_VISITED_POS)
2623 f |= SCF_WHILEM_VISITED_POS;
2625 /* we suppose the run is continuous, last=next...*/
2626 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2628 stopparen, recursed, NULL, f,depth+1);
2631 if (max1 < minnext + deltanext)
2632 max1 = minnext + deltanext;
2633 if (deltanext == I32_MAX)
2634 is_inf = is_inf_internal = 1;
2636 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2638 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2639 if ( stopmin > minnext)
2640 stopmin = min + min1;
2641 flags &= ~SCF_DO_SUBSTR;
2643 data->flags |= SCF_SEEN_ACCEPT;
2646 if (data_fake.flags & SF_HAS_EVAL)
2647 data->flags |= SF_HAS_EVAL;
2648 data->whilem_c = data_fake.whilem_c;
2650 if (flags & SCF_DO_STCLASS)
2651 cl_or(pRExC_state, &accum, &this_class);
2653 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2655 if (flags & SCF_DO_SUBSTR) {
2656 data->pos_min += min1;
2657 data->pos_delta += max1 - min1;
2658 if (max1 != min1 || is_inf)
2659 data->longest = &(data->longest_float);
2662 delta += max1 - min1;
2663 if (flags & SCF_DO_STCLASS_OR) {
2664 cl_or(pRExC_state, data->start_class, &accum);
2666 cl_and(data->start_class, and_withp);
2667 flags &= ~SCF_DO_STCLASS;
2670 else if (flags & SCF_DO_STCLASS_AND) {
2672 cl_and(data->start_class, &accum);
2673 flags &= ~SCF_DO_STCLASS;
2676 /* Switch to OR mode: cache the old value of
2677 * data->start_class */
2679 StructCopy(data->start_class, and_withp,
2680 struct regnode_charclass_class);
2681 flags &= ~SCF_DO_STCLASS_AND;
2682 StructCopy(&accum, data->start_class,
2683 struct regnode_charclass_class);
2684 flags |= SCF_DO_STCLASS_OR;
2685 data->start_class->flags |= ANYOF_EOS;
2689 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2692 Assuming this was/is a branch we are dealing with: 'scan' now
2693 points at the item that follows the branch sequence, whatever
2694 it is. We now start at the beginning of the sequence and look
2701 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2703 If we can find such a subseqence we need to turn the first
2704 element into a trie and then add the subsequent branch exact
2705 strings to the trie.
2709 1. patterns where the whole set of branch can be converted.
2711 2. patterns where only a subset can be converted.
2713 In case 1 we can replace the whole set with a single regop
2714 for the trie. In case 2 we need to keep the start and end
2717 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2718 becomes BRANCH TRIE; BRANCH X;
2720 There is an additional case, that being where there is a
2721 common prefix, which gets split out into an EXACT like node
2722 preceding the TRIE node.
2724 If x(1..n)==tail then we can do a simple trie, if not we make
2725 a "jump" trie, such that when we match the appropriate word
2726 we "jump" to the appopriate tail node. Essentailly we turn
2727 a nested if into a case structure of sorts.
2732 if (!re_trie_maxbuff) {
2733 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2734 if (!SvIOK(re_trie_maxbuff))
2735 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2737 if ( SvIV(re_trie_maxbuff)>=0 ) {
2739 regnode *first = (regnode *)NULL;
2740 regnode *last = (regnode *)NULL;
2741 regnode *tail = scan;
2746 SV * const mysv = sv_newmortal(); /* for dumping */
2748 /* var tail is used because there may be a TAIL
2749 regop in the way. Ie, the exacts will point to the
2750 thing following the TAIL, but the last branch will
2751 point at the TAIL. So we advance tail. If we
2752 have nested (?:) we may have to move through several
2756 while ( OP( tail ) == TAIL ) {
2757 /* this is the TAIL generated by (?:) */
2758 tail = regnext( tail );
2763 regprop(RExC_rx, mysv, tail );
2764 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2765 (int)depth * 2 + 2, "",
2766 "Looking for TRIE'able sequences. Tail node is: ",
2767 SvPV_nolen_const( mysv )
2773 step through the branches, cur represents each
2774 branch, noper is the first thing to be matched
2775 as part of that branch and noper_next is the
2776 regnext() of that node. if noper is an EXACT
2777 and noper_next is the same as scan (our current
2778 position in the regex) then the EXACT branch is
2779 a possible optimization target. Once we have
2780 two or more consequetive such branches we can
2781 create a trie of the EXACT's contents and stich
2782 it in place. If the sequence represents all of
2783 the branches we eliminate the whole thing and
2784 replace it with a single TRIE. If it is a
2785 subsequence then we need to stitch it in. This
2786 means the first branch has to remain, and needs
2787 to be repointed at the item on the branch chain
2788 following the last branch optimized. This could
2789 be either a BRANCH, in which case the
2790 subsequence is internal, or it could be the
2791 item following the branch sequence in which
2792 case the subsequence is at the end.
2796 /* dont use tail as the end marker for this traverse */
2797 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2798 regnode * const noper = NEXTOPER( cur );
2799 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2800 regnode * const noper_next = regnext( noper );
2804 regprop(RExC_rx, mysv, cur);
2805 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2806 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2808 regprop(RExC_rx, mysv, noper);
2809 PerlIO_printf( Perl_debug_log, " -> %s",
2810 SvPV_nolen_const(mysv));
2813 regprop(RExC_rx, mysv, noper_next );
2814 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2815 SvPV_nolen_const(mysv));
2817 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2818 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2820 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2821 : PL_regkind[ OP( noper ) ] == EXACT )
2822 || OP(noper) == NOTHING )
2824 && noper_next == tail
2829 if ( !first || optype == NOTHING ) {
2830 if (!first) first = cur;
2831 optype = OP( noper );
2837 Currently we assume that the trie can handle unicode and ascii
2838 matches fold cased matches. If this proves true then the following
2839 define will prevent tries in this situation.
2841 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2843 #define TRIE_TYPE_IS_SAFE 1
2844 if ( last && TRIE_TYPE_IS_SAFE ) {
2845 make_trie( pRExC_state,
2846 startbranch, first, cur, tail, count,
2849 if ( PL_regkind[ OP( noper ) ] == EXACT
2851 && noper_next == tail
2856 optype = OP( noper );
2866 regprop(RExC_rx, mysv, cur);
2867 PerlIO_printf( Perl_debug_log,
2868 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2869 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2873 if ( last && TRIE_TYPE_IS_SAFE ) {
2874 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2875 #ifdef TRIE_STUDY_OPT
2876 if ( ((made == MADE_EXACT_TRIE &&
2877 startbranch == first)
2878 || ( first_non_open == first )) &&
2880 flags |= SCF_TRIE_RESTUDY;
2881 if ( startbranch == first
2884 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2894 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2895 scan = NEXTOPER(NEXTOPER(scan));
2896 } else /* single branch is optimized. */
2897 scan = NEXTOPER(scan);
2899 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2900 scan_frame *newframe = NULL;
2905 if (OP(scan) != SUSPEND) {
2906 /* set the pointer */
2907 if (OP(scan) == GOSUB) {
2909 RExC_recurse[ARG2L(scan)] = scan;
2910 start = RExC_open_parens[paren-1];
2911 end = RExC_close_parens[paren-1];
2914 start = RExC_rxi->program + 1;
2918 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2919 SAVEFREEPV(recursed);
2921 if (!PAREN_TEST(recursed,paren+1)) {
2922 PAREN_SET(recursed,paren+1);
2923 Newx(newframe,1,scan_frame);
2925 if (flags & SCF_DO_SUBSTR) {
2926 SCAN_COMMIT(pRExC_state,data,minlenp);
2927 data->longest = &(data->longest_float);
2929 is_inf = is_inf_internal = 1;
2930 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2931 cl_anything(pRExC_state, data->start_class);
2932 flags &= ~SCF_DO_STCLASS;
2935 Newx(newframe,1,scan_frame);
2938 end = regnext(scan);
2943 SAVEFREEPV(newframe);
2944 newframe->next = regnext(scan);
2945 newframe->last = last;
2946 newframe->stop = stopparen;
2947 newframe->prev = frame;
2957 else if (OP(scan) == EXACT) {
2958 I32 l = STR_LEN(scan);
2961 const U8 * const s = (U8*)STRING(scan);
2962 l = utf8_length(s, s + l);
2963 uc = utf8_to_uvchr(s, NULL);
2965 uc = *((U8*)STRING(scan));
2968 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2969 /* The code below prefers earlier match for fixed
2970 offset, later match for variable offset. */
2971 if (data->last_end == -1) { /* Update the start info. */
2972 data->last_start_min = data->pos_min;
2973 data->last_start_max = is_inf
2974 ? I32_MAX : data->pos_min + data->pos_delta;
2976 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2978 SvUTF8_on(data->last_found);
2980 SV * const sv = data->last_found;
2981 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2982 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2983 if (mg && mg->mg_len >= 0)
2984 mg->mg_len += utf8_length((U8*)STRING(scan),
2985 (U8*)STRING(scan)+STR_LEN(scan));
2987 data->last_end = data->pos_min + l;
2988 data->pos_min += l; /* As in the first entry. */
2989 data->flags &= ~SF_BEFORE_EOL;
2991 if (flags & SCF_DO_STCLASS_AND) {
2992 /* Check whether it is compatible with what we know already! */
2996 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2997 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2998 && (!(data->start_class->flags & ANYOF_FOLD)
2999 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3002 ANYOF_CLASS_ZERO(data->start_class);
3003 ANYOF_BITMAP_ZERO(data->start_class);
3005 ANYOF_BITMAP_SET(data->start_class, uc);
3006 data->start_class->flags &= ~ANYOF_EOS;
3008 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3010 else if (flags & SCF_DO_STCLASS_OR) {
3011 /* false positive possible if the class is case-folded */
3013 ANYOF_BITMAP_SET(data->start_class, uc);
3015 data->start_class->flags |= ANYOF_UNICODE_ALL;
3016 data->start_class->flags &= ~ANYOF_EOS;
3017 cl_and(data->start_class, and_withp);
3019 flags &= ~SCF_DO_STCLASS;
3021 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3022 I32 l = STR_LEN(scan);
3023 UV uc = *((U8*)STRING(scan));
3025 /* Search for fixed substrings supports EXACT only. */
3026 if (flags & SCF_DO_SUBSTR) {
3028 SCAN_COMMIT(pRExC_state, data, minlenp);
3031 const U8 * const s = (U8 *)STRING(scan);
3032 l = utf8_length(s, s + l);
3033 uc = utf8_to_uvchr(s, NULL);
3036 if (flags & SCF_DO_SUBSTR)
3038 if (flags & SCF_DO_STCLASS_AND) {
3039 /* Check whether it is compatible with what we know already! */
3043 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3044 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3045 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3047 ANYOF_CLASS_ZERO(data->start_class);
3048 ANYOF_BITMAP_ZERO(data->start_class);
3050 ANYOF_BITMAP_SET(data->start_class, uc);
3051 data->start_class->flags &= ~ANYOF_EOS;
3052 data->start_class->flags |= ANYOF_FOLD;
3053 if (OP(scan) == EXACTFL)
3054 data->start_class->flags |= ANYOF_LOCALE;
3057 else if (flags & SCF_DO_STCLASS_OR) {
3058 if (data->start_class->flags & ANYOF_FOLD) {
3059 /* false positive possible if the class is case-folded.
3060 Assume that the locale settings are the same... */
3062 ANYOF_BITMAP_SET(data->start_class, uc);
3063 data->start_class->flags &= ~ANYOF_EOS;
3065 cl_and(data->start_class, and_withp);
3067 flags &= ~SCF_DO_STCLASS;
3069 else if (strchr((const char*)PL_varies,OP(scan))) {
3070 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3071 I32 f = flags, pos_before = 0;
3072 regnode * const oscan = scan;
3073 struct regnode_charclass_class this_class;
3074 struct regnode_charclass_class *oclass = NULL;
3075 I32 next_is_eval = 0;
3077 switch (PL_regkind[OP(scan)]) {
3078 case WHILEM: /* End of (?:...)* . */
3079 scan = NEXTOPER(scan);
3082 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3083 next = NEXTOPER(scan);
3084 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3086 maxcount = REG_INFTY;
3087 next = regnext(scan);
3088 scan = NEXTOPER(scan);
3092 if (flags & SCF_DO_SUBSTR)
3097 if (flags & SCF_DO_STCLASS) {
3099 maxcount = REG_INFTY;
3100 next = regnext(scan);
3101 scan = NEXTOPER(scan);
3104 is_inf = is_inf_internal = 1;
3105 scan = regnext(scan);
3106 if (flags & SCF_DO_SUBSTR) {
3107 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3108 data->longest = &(data->longest_float);
3110 goto optimize_curly_tail;
3112 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3113 && (scan->flags == stopparen))
3118 mincount = ARG1(scan);
3119 maxcount = ARG2(scan);
3121 next = regnext(scan);
3122 if (OP(scan) == CURLYX) {
3123 I32 lp = (data ? *(data->last_closep) : 0);
3124 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3126 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3127 next_is_eval = (OP(scan) == EVAL);
3129 if (flags & SCF_DO_SUBSTR) {
3130 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3131 pos_before = data->pos_min;
3135 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3137 data->flags |= SF_IS_INF;
3139 if (flags & SCF_DO_STCLASS) {
3140 cl_init(pRExC_state, &this_class);
3141 oclass = data->start_class;
3142 data->start_class = &this_class;
3143 f |= SCF_DO_STCLASS_AND;
3144 f &= ~SCF_DO_STCLASS_OR;
3146 /* These are the cases when once a subexpression
3147 fails at a particular position, it cannot succeed
3148 even after backtracking at the enclosing scope.
3150 XXXX what if minimal match and we are at the
3151 initial run of {n,m}? */
3152 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3153 f &= ~SCF_WHILEM_VISITED_POS;
3155 /* This will finish on WHILEM, setting scan, or on NULL: */
3156 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3157 last, data, stopparen, recursed, NULL,
3159 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3161 if (flags & SCF_DO_STCLASS)
3162 data->start_class = oclass;
3163 if (mincount == 0 || minnext == 0) {
3164 if (flags & SCF_DO_STCLASS_OR) {
3165 cl_or(pRExC_state, data->start_class, &this_class);
3167 else if (flags & SCF_DO_STCLASS_AND) {
3168 /* Switch to OR mode: cache the old value of
3169 * data->start_class */
3171 StructCopy(data->start_class, and_withp,
3172 struct regnode_charclass_class);
3173 flags &= ~SCF_DO_STCLASS_AND;
3174 StructCopy(&this_class, data->start_class,
3175 struct regnode_charclass_class);
3176 flags |= SCF_DO_STCLASS_OR;
3177 data->start_class->flags |= ANYOF_EOS;
3179 } else { /* Non-zero len */
3180 if (flags & SCF_DO_STCLASS_OR) {
3181 cl_or(pRExC_state, data->start_class, &this_class);
3182 cl_and(data->start_class, and_withp);
3184 else if (flags & SCF_DO_STCLASS_AND)
3185 cl_and(data->start_class, &this_class);
3186 flags &= ~SCF_DO_STCLASS;
3188 if (!scan) /* It was not CURLYX, but CURLY. */
3190 if ( /* ? quantifier ok, except for (?{ ... }) */
3191 (next_is_eval || !(mincount == 0 && maxcount == 1))
3192 && (minnext == 0) && (deltanext == 0)
3193 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3194 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3196 ckWARNreg(RExC_parse,
3197 "Quantifier unexpected on zero-length expression");
3200 min += minnext * mincount;
3201 is_inf_internal |= ((maxcount == REG_INFTY
3202 && (minnext + deltanext) > 0)
3203 || deltanext == I32_MAX);
3204 is_inf |= is_inf_internal;
3205 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3207 /* Try powerful optimization CURLYX => CURLYN. */
3208 if ( OP(oscan) == CURLYX && data
3209 && data->flags & SF_IN_PAR
3210 && !(data->flags & SF_HAS_EVAL)
3211 && !deltanext && minnext == 1 ) {
3212 /* Try to optimize to CURLYN. */
3213 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3214 regnode * const nxt1 = nxt;
3221 if (!strchr((const char*)PL_simple,OP(nxt))
3222 && !(PL_regkind[OP(nxt)] == EXACT
3223 && STR_LEN(nxt) == 1))
3229 if (OP(nxt) != CLOSE)
3231 if (RExC_open_parens) {
3232 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3233 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3235 /* Now we know that nxt2 is the only contents: */
3236 oscan->flags = (U8)ARG(nxt);
3238 OP(nxt1) = NOTHING; /* was OPEN. */
3241 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3242 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3243 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3244 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3245 OP(nxt + 1) = OPTIMIZED; /* was count. */
3246 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3251 /* Try optimization CURLYX => CURLYM. */
3252 if ( OP(oscan) == CURLYX && data
3253 && !(data->flags & SF_HAS_PAR)
3254 && !(data->flags & SF_HAS_EVAL)
3255 && !deltanext /* atom is fixed width */
3256 && minnext != 0 /* CURLYM can't handle zero width */
3258 /* XXXX How to optimize if data == 0? */
3259 /* Optimize to a simpler form. */
3260 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3264 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3265 && (OP(nxt2) != WHILEM))
3267 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3268 /* Need to optimize away parenths. */
3269 if (data->flags & SF_IN_PAR) {
3270 /* Set the parenth number. */
3271 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3273 if (OP(nxt) != CLOSE)
3274 FAIL("Panic opt close");
3275 oscan->flags = (U8)ARG(nxt);
3276 if (RExC_open_parens) {
3277 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3278 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3280 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3281 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3284 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3285 OP(nxt + 1) = OPTIMIZED; /* was count. */
3286 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3287 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3290 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3291 regnode *nnxt = regnext(nxt1);
3294 if (reg_off_by_arg[OP(nxt1)])
3295 ARG_SET(nxt1, nxt2 - nxt1);
3296 else if (nxt2 - nxt1 < U16_MAX)
3297 NEXT_OFF(nxt1) = nxt2 - nxt1;
3299 OP(nxt) = NOTHING; /* Cannot beautify */
3304 /* Optimize again: */
3305 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3306 NULL, stopparen, recursed, NULL, 0,depth+1);
3311 else if ((OP(oscan) == CURLYX)
3312 && (flags & SCF_WHILEM_VISITED_POS)
3313 /* See the comment on a similar expression above.
3314 However, this time it not a subexpression
3315 we care about, but the expression itself. */
3316 && (maxcount == REG_INFTY)
3317 && data && ++data->whilem_c < 16) {
3318 /* This stays as CURLYX, we can put the count/of pair. */
3319 /* Find WHILEM (as in regexec.c) */
3320 regnode *nxt = oscan + NEXT_OFF(oscan);
3322 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3324 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3325 | (RExC_whilem_seen << 4)); /* On WHILEM */
3327 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3329 if (flags & SCF_DO_SUBSTR) {
3330 SV *last_str = NULL;
3331 int counted = mincount != 0;
3333 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3334 #if defined(SPARC64_GCC_WORKAROUND)
3337 const char *s = NULL;
3340 if (pos_before >= data->last_start_min)
3343 b = data->last_start_min;
3346 s = SvPV_const(data->last_found, l);
3347 old = b - data->last_start_min;
3350 I32 b = pos_before >= data->last_start_min
3351 ? pos_before : data->last_start_min;
3353 const char * const s = SvPV_const(data->last_found, l);
3354 I32 old = b - data->last_start_min;
3358 old = utf8_hop((U8*)s, old) - (U8*)s;
3361 /* Get the added string: */
3362 last_str = newSVpvn_utf8(s + old, l, UTF);
3363 if (deltanext == 0 && pos_before == b) {
3364 /* What was added is a constant string */
3366 SvGROW(last_str, (mincount * l) + 1);
3367 repeatcpy(SvPVX(last_str) + l,
3368 SvPVX_const(last_str), l, mincount - 1);
3369 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3370 /* Add additional parts. */
3371 SvCUR_set(data->last_found,
3372 SvCUR(data->last_found) - l);
3373 sv_catsv(data->last_found, last_str);
3375 SV * sv = data->last_found;
3377 SvUTF8(sv) && SvMAGICAL(sv) ?
3378 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3379 if (mg && mg->mg_len >= 0)
3380 mg->mg_len += CHR_SVLEN(last_str) - l;
3382 data->last_end += l * (mincount - 1);
3385 /* start offset must point into the last copy */
3386 data->last_start_min += minnext * (mincount - 1);
3387 data->last_start_max += is_inf ? I32_MAX
3388 : (maxcount - 1) * (minnext + data->pos_delta);
3391 /* It is counted once already... */
3392 data->pos_min += minnext * (mincount - counted);
3393 data->pos_delta += - counted * deltanext +
3394 (minnext + deltanext) * maxcount - minnext * mincount;
3395 if (mincount != maxcount) {
3396 /* Cannot extend fixed substrings found inside
3398 SCAN_COMMIT(pRExC_state,data,minlenp);
3399 if (mincount && last_str) {
3400 SV * const sv = data->last_found;
3401 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3402 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3406 sv_setsv(sv, last_str);
3407 data->last_end = data->pos_min;
3408 data->last_start_min =
3409 data->pos_min - CHR_SVLEN(last_str);
3410 data->last_start_max = is_inf
3412 : data->pos_min + data->pos_delta
3413 - CHR_SVLEN(last_str);
3415 data->longest = &(data->longest_float);
3417 SvREFCNT_dec(last_str);
3419 if (data && (fl & SF_HAS_EVAL))
3420 data->flags |= SF_HAS_EVAL;
3421 optimize_curly_tail:
3422 if (OP(oscan) != CURLYX) {
3423 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3425 NEXT_OFF(oscan) += NEXT_OFF(next);
3428 default: /* REF and CLUMP only? */
3429 if (flags & SCF_DO_SUBSTR) {
3430 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3431 data->longest = &(data->longest_float);
3433 is_inf = is_inf_internal = 1;
3434 if (flags & SCF_DO_STCLASS_OR)
3435 cl_anything(pRExC_state, data->start_class);
3436 flags &= ~SCF_DO_STCLASS;
3440 else if (OP(scan) == LNBREAK) {
3441 if (flags & SCF_DO_STCLASS) {
3443 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3444 if (flags & SCF_DO_STCLASS_AND) {
3445 for (value = 0; value < 256; value++)
3446 if (!is_VERTWS_cp(value))
3447 ANYOF_BITMAP_CLEAR(data->start_class, value);
3450 for (value = 0; value < 256; value++)
3451 if (is_VERTWS_cp(value))
3452 ANYOF_BITMAP_SET(data->start_class, value);
3454 if (flags & SCF_DO_STCLASS_OR)
3455 cl_and(data->start_class, and_withp);
3456 flags &= ~SCF_DO_STCLASS;
3460 if (flags & SCF_DO_SUBSTR) {
3461 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3463 data->pos_delta += 1;
3464 data->longest = &(data->longest_float);
3468 else if (OP(scan) == FOLDCHAR) {
3469 int d = ARG(scan)==0xDF ? 1 : 2;
3470 flags &= ~SCF_DO_STCLASS;
3473 if (flags & SCF_DO_SUBSTR) {
3474 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3476 data->pos_delta += d;
3477 data->longest = &(data->longest_float);
3480 else if (strchr((const char*)PL_simple,OP(scan))) {
3483 if (flags & SCF_DO_SUBSTR) {
3484 SCAN_COMMIT(pRExC_state,data,minlenp);
3488 if (flags & SCF_DO_STCLASS) {
3489 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3491 /* Some of the logic below assumes that switching
3492 locale on will only add false positives. */
3493 switch (PL_regkind[OP(scan)]) {
3497 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3498 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3499 cl_anything(pRExC_state, data->start_class);
3502 if (OP(scan) == SANY)
3504 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3505 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3506 || (data->start_class->flags & ANYOF_CLASS));
3507 cl_anything(pRExC_state, data->start_class);
3509 if (flags & SCF_DO_STCLASS_AND || !value)
3510 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3513 if (flags & SCF_DO_STCLASS_AND)
3514 cl_and(data->start_class,
3515 (struct regnode_charclass_class*)scan);
3517 cl_or(pRExC_state, data->start_class,
3518 (struct regnode_charclass_class*)scan);
3521 if (flags & SCF_DO_STCLASS_AND) {
3522 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3523 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3524 for (value = 0; value < 256; value++)
3525 if (!isALNUM(value))
3526 ANYOF_BITMAP_CLEAR(data->start_class, value);
3530 if (data->start_class->flags & ANYOF_LOCALE)
3531 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3533 for (value = 0; value < 256; value++)
3535 ANYOF_BITMAP_SET(data->start_class, value);
3540 if (flags & SCF_DO_STCLASS_AND) {
3541 if (data->start_class->flags & ANYOF_LOCALE)
3542 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3545 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3546 data->start_class->flags |= ANYOF_LOCALE;
3550 if (flags & SCF_DO_STCLASS_AND) {
3551 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3552 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3553 for (value = 0; value < 256; value++)
3555 ANYOF_BITMAP_CLEAR(data->start_class, value);
3559 if (data->start_class->flags & ANYOF_LOCALE)
3560 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3562 for (value = 0; value < 256; value++)
3563 if (!isALNUM(value))
3564 ANYOF_BITMAP_SET(data->start_class, value);
3569 if (flags & SCF_DO_STCLASS_AND) {
3570 if (data->start_class->flags & ANYOF_LOCALE)
3571 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3574 data->start_class->flags |= ANYOF_LOCALE;
3575 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3579 if (flags & SCF_DO_STCLASS_AND) {
3580 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3581 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3582 for (value = 0; value < 256; value++)
3583 if (!isSPACE(value))
3584 ANYOF_BITMAP_CLEAR(data->start_class, value);
3588 if (data->start_class->flags & ANYOF_LOCALE)
3589 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3591 for (value = 0; value < 256; value++)
3593 ANYOF_BITMAP_SET(data->start_class, value);
3598 if (flags & SCF_DO_STCLASS_AND) {
3599 if (data->start_class->flags & ANYOF_LOCALE)
3600 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3603 data->start_class->flags |= ANYOF_LOCALE;
3604 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3608 if (flags & SCF_DO_STCLASS_AND) {
3609 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3610 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3611 for (value = 0; value < 256; value++)
3613 ANYOF_BITMAP_CLEAR(data->start_class, value);
3617 if (data->start_class->flags & ANYOF_LOCALE)
3618 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3620 for (value = 0; value < 256; value++)
3621 if (!isSPACE(value))
3622 ANYOF_BITMAP_SET(data->start_class, value);
3627 if (flags & SCF_DO_STCLASS_AND) {
3628 if (data->start_class->flags & ANYOF_LOCALE) {
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3630 for (value = 0; value < 256; value++)
3631 if (!isSPACE(value))
3632 ANYOF_BITMAP_CLEAR(data->start_class, value);
3636 data->start_class->flags |= ANYOF_LOCALE;
3637 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3641 if (flags & SCF_DO_STCLASS_AND) {
3642 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3643 for (value = 0; value < 256; value++)
3644 if (!isDIGIT(value))
3645 ANYOF_BITMAP_CLEAR(data->start_class, value);
3648 if (data->start_class->flags & ANYOF_LOCALE)
3649 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3651 for (value = 0; value < 256; value++)
3653 ANYOF_BITMAP_SET(data->start_class, value);
3658 if (flags & SCF_DO_STCLASS_AND) {
3659 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3660 for (value = 0; value < 256; value++)
3662 ANYOF_BITMAP_CLEAR(data->start_class, value);
3665 if (data->start_class->flags & ANYOF_LOCALE)
3666 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3668 for (value = 0; value < 256; value++)
3669 if (!isDIGIT(value))
3670 ANYOF_BITMAP_SET(data->start_class, value);
3674 CASE_SYNST_FNC(VERTWS);
3675 CASE_SYNST_FNC(HORIZWS);
3678 if (flags & SCF_DO_STCLASS_OR)
3679 cl_and(data->start_class, and_withp);
3680 flags &= ~SCF_DO_STCLASS;
3683 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3684 data->flags |= (OP(scan) == MEOL
3688 else if ( PL_regkind[OP(scan)] == BRANCHJ
3689 /* Lookbehind, or need to calculate parens/evals/stclass: */
3690 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3691 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3692 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3693 || OP(scan) == UNLESSM )
3695 /* Negative Lookahead/lookbehind
3696 In this case we can't do fixed string optimisation.
3699 I32 deltanext, minnext, fake = 0;
3701 struct regnode_charclass_class intrnl;
3704 data_fake.flags = 0;
3706 data_fake.whilem_c = data->whilem_c;
3707 data_fake.last_closep = data->last_closep;
3710 data_fake.last_closep = &fake;
3711 data_fake.pos_delta = delta;
3712 if ( flags & SCF_DO_STCLASS && !scan->flags
3713 && OP(scan) == IFMATCH ) { /* Lookahead */
3714 cl_init(pRExC_state, &intrnl);
3715 data_fake.start_class = &intrnl;
3716 f |= SCF_DO_STCLASS_AND;
3718 if (flags & SCF_WHILEM_VISITED_POS)
3719 f |= SCF_WHILEM_VISITED_POS;
3720 next = regnext(scan);
3721 nscan = NEXTOPER(NEXTOPER(scan));
3722 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3723 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3726 FAIL("Variable length lookbehind not implemented");
3728 else if (minnext > (I32)U8_MAX) {
3729 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3731 scan->flags = (U8)minnext;
3734 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3736 if (data_fake.flags & SF_HAS_EVAL)
3737 data->flags |= SF_HAS_EVAL;
3738 data->whilem_c = data_fake.whilem_c;
3740 if (f & SCF_DO_STCLASS_AND) {
3741 if (flags & SCF_DO_STCLASS_OR) {
3742 /* OR before, AND after: ideally we would recurse with
3743 * data_fake to get the AND applied by study of the
3744 * remainder of the pattern, and then derecurse;
3745 * *** HACK *** for now just treat as "no information".
3746 * See [perl #56690].
3748 cl_init(pRExC_state, data->start_class);
3750 /* AND before and after: combine and continue */
3751 const int was = (data->start_class->flags & ANYOF_EOS);
3753 cl_and(data->start_class, &intrnl);
3755 data->start_class->flags |= ANYOF_EOS;
3759 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3761 /* Positive Lookahead/lookbehind
3762 In this case we can do fixed string optimisation,
3763 but we must be careful about it. Note in the case of
3764 lookbehind the positions will be offset by the minimum
3765 length of the pattern, something we won't know about
3766 until after the recurse.
3768 I32 deltanext, fake = 0;
3770 struct regnode_charclass_class intrnl;
3772 /* We use SAVEFREEPV so that when the full compile
3773 is finished perl will clean up the allocated
3774 minlens when its all done. This was we don't
3775 have to worry about freeing them when we know
3776 they wont be used, which would be a pain.
3779 Newx( minnextp, 1, I32 );
3780 SAVEFREEPV(minnextp);
3783 StructCopy(data, &data_fake, scan_data_t);
3784 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3787 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3788 data_fake.last_found=newSVsv(data->last_found);
3792 data_fake.last_closep = &fake;
3793 data_fake.flags = 0;
3794 data_fake.pos_delta = delta;
3796 data_fake.flags |= SF_IS_INF;
3797 if ( flags & SCF_DO_STCLASS && !scan->flags
3798 && OP(scan) == IFMATCH ) { /* Lookahead */
3799 cl_init(pRExC_state, &intrnl);
3800 data_fake.start_class = &intrnl;
3801 f |= SCF_DO_STCLASS_AND;
3803 if (flags & SCF_WHILEM_VISITED_POS)
3804 f |= SCF_WHILEM_VISITED_POS;
3805 next = regnext(scan);
3806 nscan = NEXTOPER(NEXTOPER(scan));
3808 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3809 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3812 FAIL("Variable length lookbehind not implemented");
3814 else if (*minnextp > (I32)U8_MAX) {
3815 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3817 scan->flags = (U8)*minnextp;
3822 if (f & SCF_DO_STCLASS_AND) {
3823 const int was = (data->start_class->flags & ANYOF_EOS);
3825 cl_and(data->start_class, &intrnl);
3827 data->start_class->flags |= ANYOF_EOS;
3830 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3832 if (data_fake.flags & SF_HAS_EVAL)
3833 data->flags |= SF_HAS_EVAL;
3834 data->whilem_c = data_fake.whilem_c;
3835 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3836 if (RExC_rx->minlen<*minnextp)
3837 RExC_rx->minlen=*minnextp;
3838 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3839 SvREFCNT_dec(data_fake.last_found);
3841 if ( data_fake.minlen_fixed != minlenp )
3843 data->offset_fixed= data_fake.offset_fixed;
3844 data->minlen_fixed= data_fake.minlen_fixed;
3845 data->lookbehind_fixed+= scan->flags;
3847 if ( data_fake.minlen_float != minlenp )
3849 data->minlen_float= data_fake.minlen_float;
3850 data->offset_float_min=data_fake.offset_float_min;
3851 data->offset_float_max=data_fake.offset_float_max;
3852 data->lookbehind_float+= scan->flags;
3861 else if (OP(scan) == OPEN) {
3862 if (stopparen != (I32)ARG(scan))
3865 else if (OP(scan) == CLOSE) {
3866 if (stopparen == (I32)ARG(scan)) {
3869 if ((I32)ARG(scan) == is_par) {
3870 next = regnext(scan);
3872 if ( next && (OP(next) != WHILEM) && next < last)
3873 is_par = 0; /* Disable optimization */
3876 *(data->last_closep) = ARG(scan);
3878 else if (OP(scan) == EVAL) {
3880 data->flags |= SF_HAS_EVAL;
3882 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3883 if (flags & SCF_DO_SUBSTR) {
3884 SCAN_COMMIT(pRExC_state,data,minlenp);
3885 flags &= ~SCF_DO_SUBSTR;
3887 if (data && OP(scan)==ACCEPT) {
3888 data->flags |= SCF_SEEN_ACCEPT;
3893 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3895 if (flags & SCF_DO_SUBSTR) {
3896 SCAN_COMMIT(pRExC_state,data,minlenp);
3897 data->longest = &(data->longest_float);
3899 is_inf = is_inf_internal = 1;
3900 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3901 cl_anything(pRExC_state, data->start_class);
3902 flags &= ~SCF_DO_STCLASS;
3904 else if (OP(scan) == GPOS) {
3905 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3906 !(delta || is_inf || (data && data->pos_delta)))
3908 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3909 RExC_rx->extflags |= RXf_ANCH_GPOS;
3910 if (RExC_rx->gofs < (U32)min)
3911 RExC_rx->gofs = min;
3913 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3917 #ifdef TRIE_STUDY_OPT
3918 #ifdef FULL_TRIE_STUDY
3919 else if (PL_regkind[OP(scan)] == TRIE) {
3920 /* NOTE - There is similar code to this block above for handling
3921 BRANCH nodes on the initial study. If you change stuff here
3923 regnode *trie_node= scan;
3924 regnode *tail= regnext(scan);
3925 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3926 I32 max1 = 0, min1 = I32_MAX;
3927 struct regnode_charclass_class accum;
3929 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3930 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3931 if (flags & SCF_DO_STCLASS)
3932 cl_init_zero(pRExC_state, &accum);
3938 const regnode *nextbranch= NULL;
3941 for ( word=1 ; word <= trie->wordcount ; word++)
3943 I32 deltanext=0, minnext=0, f = 0, fake;
3944 struct regnode_charclass_class this_class;
3946 data_fake.flags = 0;
3948 data_fake.whilem_c = data->whilem_c;
3949 data_fake.last_closep = data->last_closep;
3952 data_fake.last_closep = &fake;
3953 data_fake.pos_delta = delta;
3954 if (flags & SCF_DO_STCLASS) {
3955 cl_init(pRExC_state, &this_class);
3956 data_fake.start_class = &this_class;
3957 f = SCF_DO_STCLASS_AND;
3959 if (flags & SCF_WHILEM_VISITED_POS)
3960 f |= SCF_WHILEM_VISITED_POS;
3962 if (trie->jump[word]) {
3964 nextbranch = trie_node + trie->jump[0];
3965 scan= trie_node + trie->jump[word];
3966 /* We go from the jump point to the branch that follows
3967 it. Note this means we need the vestigal unused branches
3968 even though they arent otherwise used.
3970 minnext = study_chunk(pRExC_state, &scan, minlenp,
3971 &deltanext, (regnode *)nextbranch, &data_fake,
3972 stopparen, recursed, NULL, f,depth+1);
3974 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3975 nextbranch= regnext((regnode*)nextbranch);
3977 if (min1 > (I32)(minnext + trie->minlen))
3978 min1 = minnext + trie->minlen;
3979 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3980 max1 = minnext + deltanext + trie->maxlen;
3981 if (deltanext == I32_MAX)
3982 is_inf = is_inf_internal = 1;
3984 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3986 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3987 if ( stopmin > min + min1)
3988 stopmin = min + min1;
3989 flags &= ~SCF_DO_SUBSTR;
3991 data->flags |= SCF_SEEN_ACCEPT;
3994 if (data_fake.flags & SF_HAS_EVAL)
3995 data->flags |= SF_HAS_EVAL;
3996 data->whilem_c = data_fake.whilem_c;
3998 if (flags & SCF_DO_STCLASS)
3999 cl_or(pRExC_state, &accum, &this_class);
4002 if (flags & SCF_DO_SUBSTR) {
4003 data->pos_min += min1;
4004 data->pos_delta += max1 - min1;
4005 if (max1 != min1 || is_inf)
4006 data->longest = &(data->longest_float);
4009 delta += max1 - min1;
4010 if (flags & SCF_DO_STCLASS_OR) {
4011 cl_or(pRExC_state, data->start_class, &accum);
4013 cl_and(data->start_class, and_withp);
4014 flags &= ~SCF_DO_STCLASS;
4017 else if (flags & SCF_DO_STCLASS_AND) {
4019 cl_and(data->start_class, &accum);
4020 flags &= ~SCF_DO_STCLASS;
4023 /* Switch to OR mode: cache the old value of
4024 * data->start_class */
4026 StructCopy(data->start_class, and_withp,
4027 struct regnode_charclass_class);
4028 flags &= ~SCF_DO_STCLASS_AND;
4029 StructCopy(&accum, data->start_class,
4030 struct regnode_charclass_class);
4031 flags |= SCF_DO_STCLASS_OR;
4032 data->start_class->flags |= ANYOF_EOS;
4039 else if (PL_regkind[OP(scan)] == TRIE) {
4040 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4043 min += trie->minlen;
4044 delta += (trie->maxlen - trie->minlen);
4045 flags &= ~SCF_DO_STCLASS; /* xxx */
4046 if (flags & SCF_DO_SUBSTR) {
4047 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4048 data->pos_min += trie->minlen;
4049 data->pos_delta += (trie->maxlen - trie->minlen);
4050 if (trie->maxlen != trie->minlen)
4051 data->longest = &(data->longest_float);
4053 if (trie->jump) /* no more substrings -- for now /grr*/
4054 flags &= ~SCF_DO_SUBSTR;
4056 #endif /* old or new */
4057 #endif /* TRIE_STUDY_OPT */
4059 /* Else: zero-length, ignore. */
4060 scan = regnext(scan);
4065 stopparen = frame->stop;
4066 frame = frame->prev;
4067 goto fake_study_recurse;
4072 DEBUG_STUDYDATA("pre-fin:",data,depth);
4075 *deltap = is_inf_internal ? I32_MAX : delta;
4076 if (flags & SCF_DO_SUBSTR && is_inf)
4077 data->pos_delta = I32_MAX - data->pos_min;
4078 if (is_par > (I32)U8_MAX)
4080 if (is_par && pars==1 && data) {
4081 data->flags |= SF_IN_PAR;
4082 data->flags &= ~SF_HAS_PAR;
4084 else if (pars && data) {
4085 data->flags |= SF_HAS_PAR;
4086 data->flags &= ~SF_IN_PAR;
4088 if (flags & SCF_DO_STCLASS_OR)
4089 cl_and(data->start_class, and_withp);
4090 if (flags & SCF_TRIE_RESTUDY)
4091 data->flags |= SCF_TRIE_RESTUDY;
4093 DEBUG_STUDYDATA("post-fin:",data,depth);
4095 return min < stopmin ? min : stopmin;
4099 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4101 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4103 PERL_ARGS_ASSERT_ADD_DATA;
4105 Renewc(RExC_rxi->data,
4106 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4107 char, struct reg_data);
4109 Renew(RExC_rxi->data->what, count + n, U8);
4111 Newx(RExC_rxi->data->what, n, U8);
4112 RExC_rxi->data->count = count + n;
4113 Copy(s, RExC_rxi->data->what + count, n, U8);
4117 /*XXX: todo make this not included in a non debugging perl */
4118 #ifndef PERL_IN_XSUB_RE
4120 Perl_reginitcolors(pTHX)
4123 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4125 char *t = savepv(s);
4129 t = strchr(t, '\t');
4135 PL_colors[i] = t = (char *)"";
4140 PL_colors[i++] = (char *)"";
4147 #ifdef TRIE_STUDY_OPT
4148 #define CHECK_RESTUDY_GOTO \
4150 (data.flags & SCF_TRIE_RESTUDY) \
4154 #define CHECK_RESTUDY_GOTO
4158 - pregcomp - compile a regular expression into internal code
4160 * We can't allocate space until we know how big the compiled form will be,
4161 * but we can't compile it (and thus know how big it is) until we've got a
4162 * place to put the code. So we cheat: we compile it twice, once with code
4163 * generation turned off and size counting turned on, and once "for real".
4164 * This also means that we don't allocate space until we are sure that the
4165 * thing really will compile successfully, and we never have to move the
4166 * code and thus invalidate pointers into it. (Note that it has to be in
4167 * one piece because free() must be able to free it all.) [NB: not true in perl]
4169 * Beware that the optimization-preparation code in here knows about some
4170 * of the structure of the compiled regexp. [I'll say.]
4175 #ifndef PERL_IN_XSUB_RE
4176 #define RE_ENGINE_PTR &PL_core_reg_engine
4178 extern const struct regexp_engine my_reg_engine;
4179 #define RE_ENGINE_PTR &my_reg_engine
4182 #ifndef PERL_IN_XSUB_RE
4184 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4187 HV * const table = GvHV(PL_hintgv);
4189 PERL_ARGS_ASSERT_PREGCOMP;
4191 /* Dispatch a request to compile a regexp to correct
4194 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4195 GET_RE_DEBUG_FLAGS_DECL;
4196 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4197 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4199 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4202 return CALLREGCOMP_ENG(eng, pattern, flags);
4205 return Perl_re_compile(aTHX_ pattern, flags);
4210 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4215 register regexp_internal *ri;
4217 char *exp = SvPV(pattern, plen);
4218 char* xend = exp + plen;
4225 RExC_state_t RExC_state;
4226 RExC_state_t * const pRExC_state = &RExC_state;
4227 #ifdef TRIE_STUDY_OPT
4229 RExC_state_t copyRExC_state;
4231 GET_RE_DEBUG_FLAGS_DECL;
4233 PERL_ARGS_ASSERT_RE_COMPILE;
4235 DEBUG_r(if (!PL_colorset) reginitcolors());
4237 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4240 SV *dsv= sv_newmortal();
4241 RE_PV_QUOTED_DECL(s, RExC_utf8,
4242 dsv, exp, plen, 60);
4243 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4244 PL_colors[4],PL_colors[5],s);
4249 RExC_flags = pm_flags;
4253 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4254 RExC_seen_evals = 0;
4257 /* First pass: determine size, legality. */
4265 RExC_emit = &PL_regdummy;
4266 RExC_whilem_seen = 0;
4267 RExC_charnames = NULL;
4268 RExC_open_parens = NULL;
4269 RExC_close_parens = NULL;
4271 RExC_paren_names = NULL;
4273 RExC_paren_name_list = NULL;
4275 RExC_recurse = NULL;
4276 RExC_recurse_count = 0;
4278 #if 0 /* REGC() is (currently) a NOP at the first pass.
4279 * Clever compilers notice this and complain. --jhi */
4280 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4282 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4283 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4284 RExC_precomp = NULL;
4287 if (RExC_utf8 && !RExC_orig_utf8) {
4288 /* It's possible to write a regexp in ascii that represents Unicode
4289 codepoints outside of the byte range, such as via \x{100}. If we
4290 detect such a sequence we have to convert the entire pattern to utf8
4291 and then recompile, as our sizing calculation will have been based
4292 on 1 byte == 1 character, but we will need to use utf8 to encode
4293 at least some part of the pattern, and therefore must convert the whole
4295 XXX: somehow figure out how to make this less expensive...
4298 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4299 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4300 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4302 RExC_orig_utf8 = RExC_utf8;
4304 goto redo_first_pass;
4307 PerlIO_printf(Perl_debug_log,
4308 "Required size %"IVdf" nodes\n"
4309 "Starting second pass (creation)\n",
4312 RExC_lastparse=NULL;
4314 /* Small enough for pointer-storage convention?
4315 If extralen==0, this means that we will not need long jumps. */
4316 if (RExC_size >= 0x10000L && RExC_extralen)
4317 RExC_size += RExC_extralen;
4320 if (RExC_whilem_seen > 15)
4321 RExC_whilem_seen = 15;
4323 /* Allocate space and zero-initialize. Note, the two step process
4324 of zeroing when in debug mode, thus anything assigned has to
4325 happen after that */
4326 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4327 r = (struct regexp*)SvANY(rx);
4328 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4329 char, regexp_internal);
4330 if ( r == NULL || ri == NULL )
4331 FAIL("Regexp out of space");
4333 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4334 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4336 /* bulk initialize base fields with 0. */
4337 Zero(ri, sizeof(regexp_internal), char);
4340 /* non-zero initialization begins here */
4342 r->engine= RE_ENGINE_PTR;
4343 r->extflags = pm_flags;
4345 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4346 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4347 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4348 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4349 >> RXf_PMf_STD_PMMOD_SHIFT);
4350 const char *fptr = STD_PAT_MODS; /*"msix"*/
4352 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4353 + (sizeof(STD_PAT_MODS) - 1)
4354 + (sizeof("(?:)") - 1);
4356 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4357 SvCUR_set(rx, wraplen);
4359 SvFLAGS(rx) |= SvUTF8(pattern);
4362 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4364 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4365 char *colon = r + 1;
4368 while((ch = *fptr++)) {
4382 Copy(RExC_precomp, p, plen, char);
4383 assert ((RX_WRAPPED(rx) - p) < 16);
4384 r->pre_prefix = p - RX_WRAPPED(rx);
4393 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4395 if (RExC_seen & REG_SEEN_RECURSE) {
4396 Newxz(RExC_open_parens, RExC_npar,regnode *);
4397 SAVEFREEPV(RExC_open_parens);
4398 Newxz(RExC_close_parens,RExC_npar,regnode *);
4399 SAVEFREEPV(RExC_close_parens);
4402 /* Useful during FAIL. */
4403 #ifdef RE_TRACK_PATTERN_OFFSETS
4404 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4405 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4406 "%s %"UVuf" bytes for offset annotations.\n",
4407 ri->u.offsets ? "Got" : "Couldn't get",
4408 (UV)((2*RExC_size+1) * sizeof(U32))));
4410 SetProgLen(ri,RExC_size);
4414 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4416 /* Second pass: emit code. */
4417 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4422 RExC_emit_start = ri->program;
4423 RExC_emit = ri->program;
4424 RExC_emit_bound = ri->program + RExC_size + 1;
4426 /* Store the count of eval-groups for security checks: */
4427 RExC_rx->seen_evals = RExC_seen_evals;
4428 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4429 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4433 /* XXXX To minimize changes to RE engine we always allocate
4434 3-units-long substrs field. */
4435 Newx(r->substrs, 1, struct reg_substr_data);
4436 if (RExC_recurse_count) {
4437 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4438 SAVEFREEPV(RExC_recurse);
4442 r->minlen = minlen = sawplus = sawopen = 0;
4443 Zero(r->substrs, 1, struct reg_substr_data);
4445 #ifdef TRIE_STUDY_OPT
4447 StructCopy(&zero_scan_data, &data, scan_data_t);
4448 copyRExC_state = RExC_state;
4451 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4453 RExC_state = copyRExC_state;
4454 if (seen & REG_TOP_LEVEL_BRANCHES)
4455 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4457 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4458 if (data.last_found) {
4459 SvREFCNT_dec(data.longest_fixed);
4460 SvREFCNT_dec(data.longest_float);
4461 SvREFCNT_dec(data.last_found);
4463 StructCopy(&zero_scan_data, &data, scan_data_t);
4466 StructCopy(&zero_scan_data, &data, scan_data_t);
4469 /* Dig out information for optimizations. */
4470 r->extflags = RExC_flags; /* was pm_op */
4471 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4474 SvUTF8_on(rx); /* Unicode in it? */
4475 ri->regstclass = NULL;
4476 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4477 r->intflags |= PREGf_NAUGHTY;
4478 scan = ri->program + 1; /* First BRANCH. */
4480 /* testing for BRANCH here tells us whether there is "must appear"
4481 data in the pattern. If there is then we can use it for optimisations */
4482 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4484 STRLEN longest_float_length, longest_fixed_length;
4485 struct regnode_charclass_class ch_class; /* pointed to by data */
4487 I32 last_close = 0; /* pointed to by data */
4488 regnode *first= scan;
4489 regnode *first_next= regnext(first);
4492 * Skip introductions and multiplicators >= 1
4493 * so that we can extract the 'meat' of the pattern that must
4494 * match in the large if() sequence following.
4495 * NOTE that EXACT is NOT covered here, as it is normally
4496 * picked up by the optimiser separately.
4498 * This is unfortunate as the optimiser isnt handling lookahead
4499 * properly currently.
4502 while ((OP(first) == OPEN && (sawopen = 1)) ||
4503 /* An OR of *one* alternative - should not happen now. */
4504 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4505 /* for now we can't handle lookbehind IFMATCH*/
4506 (OP(first) == IFMATCH && !first->flags) ||
4507 (OP(first) == PLUS) ||
4508 (OP(first) == MINMOD) ||
4509 /* An {n,m} with n>0 */
4510 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4511 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4514 * the only op that could be a regnode is PLUS, all the rest
4515 * will be regnode_1 or regnode_2.
4518 if (OP(first) == PLUS)
4521 first += regarglen[OP(first)];
4523 first = NEXTOPER(first);
4524 first_next= regnext(first);
4527 /* Starting-point info. */
4529 DEBUG_PEEP("first:",first,0);
4530 /* Ignore EXACT as we deal with it later. */
4531 if (PL_regkind[OP(first)] == EXACT) {
4532 if (OP(first) == EXACT)
4533 NOOP; /* Empty, get anchored substr later. */
4534 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4535 ri->regstclass = first;
4538 else if (PL_regkind[OP(first)] == TRIE &&
4539 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4542 /* this can happen only on restudy */
4543 if ( OP(first) == TRIE ) {
4544 struct regnode_1 *trieop = (struct regnode_1 *)
4545 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4546 StructCopy(first,trieop,struct regnode_1);
4547 trie_op=(regnode *)trieop;
4549 struct regnode_charclass *trieop = (struct regnode_charclass *)
4550 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4551 StructCopy(first,trieop,struct regnode_charclass);
4552 trie_op=(regnode *)trieop;
4555 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4556 ri->regstclass = trie_op;
4559 else if (strchr((const char*)PL_simple,OP(first)))
4560 ri->regstclass = first;
4561 else if (PL_regkind[OP(first)] == BOUND ||
4562 PL_regkind[OP(first)] == NBOUND)
4563 ri->regstclass = first;
4564 else if (PL_regkind[OP(first)] == BOL) {
4565 r->extflags |= (OP(first) == MBOL
4567 : (OP(first) == SBOL
4570 first = NEXTOPER(first);
4573 else if (OP(first) == GPOS) {
4574 r->extflags |= RXf_ANCH_GPOS;
4575 first = NEXTOPER(first);
4578 else if ((!sawopen || !RExC_sawback) &&
4579 (OP(first) == STAR &&
4580 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4581 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4583 /* turn .* into ^.* with an implied $*=1 */
4585 (OP(NEXTOPER(first)) == REG_ANY)
4588 r->extflags |= type;
4589 r->intflags |= PREGf_IMPLICIT;
4590 first = NEXTOPER(first);
4593 if (sawplus && (!sawopen || !RExC_sawback)
4594 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4595 /* x+ must match at the 1st pos of run of x's */
4596 r->intflags |= PREGf_SKIP;
4598 /* Scan is after the zeroth branch, first is atomic matcher. */
4599 #ifdef TRIE_STUDY_OPT
4602 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4603 (IV)(first - scan + 1))
4607 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4608 (IV)(first - scan + 1))
4614 * If there's something expensive in the r.e., find the
4615 * longest literal string that must appear and make it the
4616 * regmust. Resolve ties in favor of later strings, since
4617 * the regstart check works with the beginning of the r.e.
4618 * and avoiding duplication strengthens checking. Not a
4619 * strong reason, but sufficient in the absence of others.
4620 * [Now we resolve ties in favor of the earlier string if
4621 * it happens that c_offset_min has been invalidated, since the
4622 * earlier string may buy us something the later one won't.]
4625 data.longest_fixed = newSVpvs("");
4626 data.longest_float = newSVpvs("");
4627 data.last_found = newSVpvs("");
4628 data.longest = &(data.longest_fixed);
4630 if (!ri->regstclass) {
4631 cl_init(pRExC_state, &ch_class);
4632 data.start_class = &ch_class;
4633 stclass_flag = SCF_DO_STCLASS_AND;
4634 } else /* XXXX Check for BOUND? */
4636 data.last_closep = &last_close;
4638 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4639 &data, -1, NULL, NULL,
4640 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4646 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4647 && data.last_start_min == 0 && data.last_end > 0
4648 && !RExC_seen_zerolen
4649 && !(RExC_seen & REG_SEEN_VERBARG)
4650 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4651 r->extflags |= RXf_CHECK_ALL;
4652 scan_commit(pRExC_state, &data,&minlen,0);
4653 SvREFCNT_dec(data.last_found);
4655 /* Note that code very similar to this but for anchored string
4656 follows immediately below, changes may need to be made to both.
4659 longest_float_length = CHR_SVLEN(data.longest_float);
4660 if (longest_float_length
4661 || (data.flags & SF_FL_BEFORE_EOL
4662 && (!(data.flags & SF_FL_BEFORE_MEOL)
4663 || (RExC_flags & RXf_PMf_MULTILINE))))
4667 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4668 && data.offset_fixed == data.offset_float_min
4669 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4670 goto remove_float; /* As in (a)+. */
4672 /* copy the information about the longest float from the reg_scan_data
4673 over to the program. */
4674 if (SvUTF8(data.longest_float)) {
4675 r->float_utf8 = data.longest_float;
4676 r->float_substr = NULL;
4678 r->float_substr = data.longest_float;
4679 r->float_utf8 = NULL;
4681 /* float_end_shift is how many chars that must be matched that
4682 follow this item. We calculate it ahead of time as once the
4683 lookbehind offset is added in we lose the ability to correctly
4685 ml = data.minlen_float ? *(data.minlen_float)
4686 : (I32)longest_float_length;
4687 r->float_end_shift = ml - data.offset_float_min
4688 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4689 + data.lookbehind_float;
4690 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4691 r->float_max_offset = data.offset_float_max;
4692 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4693 r->float_max_offset -= data.lookbehind_float;
4695 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4696 && (!(data.flags & SF_FL_BEFORE_MEOL)
4697 || (RExC_flags & RXf_PMf_MULTILINE)));
4698 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4702 r->float_substr = r->float_utf8 = NULL;
4703 SvREFCNT_dec(data.longest_float);
4704 longest_float_length = 0;
4707 /* Note that code very similar to this but for floating string
4708 is immediately above, changes may need to be made to both.
4711 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4712 if (longest_fixed_length
4713 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4714 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4715 || (RExC_flags & RXf_PMf_MULTILINE))))
4719 /* copy the information about the longest fixed
4720 from the reg_scan_data over to the program. */
4721 if (SvUTF8(data.longest_fixed)) {
4722 r->anchored_utf8 = data.longest_fixed;
4723 r->anchored_substr = NULL;
4725 r->anchored_substr = data.longest_fixed;
4726 r->anchored_utf8 = NULL;
4728 /* fixed_end_shift is how many chars that must be matched that
4729 follow this item. We calculate it ahead of time as once the
4730 lookbehind offset is added in we lose the ability to correctly
4732 ml = data.minlen_fixed ? *(data.minlen_fixed)
4733 : (I32)longest_fixed_length;
4734 r->anchored_end_shift = ml - data.offset_fixed
4735 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4736 + data.lookbehind_fixed;
4737 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4739 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4740 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4741 || (RExC_flags & RXf_PMf_MULTILINE)));
4742 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4745 r->anchored_substr = r->anchored_utf8 = NULL;
4746 SvREFCNT_dec(data.longest_fixed);
4747 longest_fixed_length = 0;
4750 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4751 ri->regstclass = NULL;
4752 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4754 && !(data.start_class->flags & ANYOF_EOS)
4755 && !cl_is_anything(data.start_class))
4757 const U32 n = add_data(pRExC_state, 1, "f");
4759 Newx(RExC_rxi->data->data[n], 1,
4760 struct regnode_charclass_class);
4761 StructCopy(data.start_class,
4762 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4763 struct regnode_charclass_class);
4764 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4765 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4766 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4767 regprop(r, sv, (regnode*)data.start_class);
4768 PerlIO_printf(Perl_debug_log,
4769 "synthetic stclass \"%s\".\n",
4770 SvPVX_const(sv));});
4773 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4774 if (longest_fixed_length > longest_float_length) {
4775 r->check_end_shift = r->anchored_end_shift;
4776 r->check_substr = r->anchored_substr;
4777 r->check_utf8 = r->anchored_utf8;
4778 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4779 if (r->extflags & RXf_ANCH_SINGLE)
4780 r->extflags |= RXf_NOSCAN;
4783 r->check_end_shift = r->float_end_shift;
4784 r->check_substr = r->float_substr;
4785 r->check_utf8 = r->float_utf8;
4786 r->check_offset_min = r->float_min_offset;
4787 r->check_offset_max = r->float_max_offset;
4789 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4790 This should be changed ASAP! */
4791 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4792 r->extflags |= RXf_USE_INTUIT;
4793 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4794 r->extflags |= RXf_INTUIT_TAIL;
4796 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4797 if ( (STRLEN)minlen < longest_float_length )
4798 minlen= longest_float_length;
4799 if ( (STRLEN)minlen < longest_fixed_length )
4800 minlen= longest_fixed_length;
4804 /* Several toplevels. Best we can is to set minlen. */
4806 struct regnode_charclass_class ch_class;
4809 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4811 scan = ri->program + 1;
4812 cl_init(pRExC_state, &ch_class);
4813 data.start_class = &ch_class;
4814 data.last_closep = &last_close;
4817 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4818 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4822 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4823 = r->float_substr = r->float_utf8 = NULL;
4824 if (!(data.start_class->flags & ANYOF_EOS)
4825 && !cl_is_anything(data.start_class))
4827 const U32 n = add_data(pRExC_state, 1, "f");
4829 Newx(RExC_rxi->data->data[n], 1,
4830 struct regnode_charclass_class);
4831 StructCopy(data.start_class,
4832 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4833 struct regnode_charclass_class);
4834 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4835 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4836 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4837 regprop(r, sv, (regnode*)data.start_class);
4838 PerlIO_printf(Perl_debug_log,
4839 "synthetic stclass \"%s\".\n",
4840 SvPVX_const(sv));});
4844 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4845 the "real" pattern. */
4847 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4848 (IV)minlen, (IV)r->minlen);
4850 r->minlenret = minlen;
4851 if (r->minlen < minlen)
4854 if (RExC_seen & REG_SEEN_GPOS)
4855 r->extflags |= RXf_GPOS_SEEN;
4856 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4857 r->extflags |= RXf_LOOKBEHIND_SEEN;
4858 if (RExC_seen & REG_SEEN_EVAL)
4859 r->extflags |= RXf_EVAL_SEEN;
4860 if (RExC_seen & REG_SEEN_CANY)
4861 r->extflags |= RXf_CANY_SEEN;
4862 if (RExC_seen & REG_SEEN_VERBARG)
4863 r->intflags |= PREGf_VERBARG_SEEN;
4864 if (RExC_seen & REG_SEEN_CUTGROUP)
4865 r->intflags |= PREGf_CUTGROUP_SEEN;
4866 if (RExC_paren_names)
4867 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4869 RXp_PAREN_NAMES(r) = NULL;
4871 #ifdef STUPID_PATTERN_CHECKS
4872 if (RX_PRELEN(rx) == 0)
4873 r->extflags |= RXf_NULL;
4874 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4875 /* XXX: this should happen BEFORE we compile */
4876 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4877 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4878 r->extflags |= RXf_WHITE;
4879 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4880 r->extflags |= RXf_START_ONLY;
4882 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4883 /* XXX: this should happen BEFORE we compile */
4884 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4886 regnode *first = ri->program + 1;
4888 U8 nop = OP(NEXTOPER(first));
4890 if (PL_regkind[fop] == NOTHING && nop == END)
4891 r->extflags |= RXf_NULL;
4892 else if (PL_regkind[fop] == BOL && nop == END)
4893 r->extflags |= RXf_START_ONLY;
4894 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4895 r->extflags |= RXf_WHITE;
4899 if (RExC_paren_names) {
4900 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4901 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4904 ri->name_list_idx = 0;
4906 if (RExC_recurse_count) {
4907 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4908 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4909 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4912 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4913 /* assume we don't need to swap parens around before we match */
4916 PerlIO_printf(Perl_debug_log,"Final program:\n");
4919 #ifdef RE_TRACK_PATTERN_OFFSETS
4920 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4921 const U32 len = ri->u.offsets[0];
4923 GET_RE_DEBUG_FLAGS_DECL;
4924 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4925 for (i = 1; i <= len; i++) {
4926 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4927 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4928 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4930 PerlIO_printf(Perl_debug_log, "\n");
4936 #undef RE_ENGINE_PTR
4940 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4943 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4945 PERL_UNUSED_ARG(value);
4947 if (flags & RXapif_FETCH) {
4948 return reg_named_buff_fetch(rx, key, flags);
4949 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4950 Perl_croak(aTHX_ "%s", PL_no_modify);
4952 } else if (flags & RXapif_EXISTS) {
4953 return reg_named_buff_exists(rx, key, flags)
4956 } else if (flags & RXapif_REGNAMES) {
4957 return reg_named_buff_all(rx, flags);
4958 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4959 return reg_named_buff_scalar(rx, flags);
4961 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4967 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4970 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4971 PERL_UNUSED_ARG(lastkey);
4973 if (flags & RXapif_FIRSTKEY)
4974 return reg_named_buff_firstkey(rx, flags);
4975 else if (flags & RXapif_NEXTKEY)
4976 return reg_named_buff_nextkey(rx, flags);
4978 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4984 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4987 AV *retarray = NULL;
4989 struct regexp *const rx = (struct regexp *)SvANY(r);
4991 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4993 if (flags & RXapif_ALL)
4996 if (rx && RXp_PAREN_NAMES(rx)) {
4997 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5000 SV* sv_dat=HeVAL(he_str);
5001 I32 *nums=(I32*)SvPVX(sv_dat);
5002 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5003 if ((I32)(rx->nparens) >= nums[i]
5004 && rx->offs[nums[i]].start != -1
5005 && rx->offs[nums[i]].end != -1)
5008 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5012 ret = newSVsv(&PL_sv_undef);
5015 av_push(retarray, ret);
5018 return newRV_noinc(MUTABLE_SV(retarray));
5025 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5028 struct regexp *const rx = (struct regexp *)SvANY(r);
5030 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5032 if (rx && RXp_PAREN_NAMES(rx)) {
5033 if (flags & RXapif_ALL) {
5034 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5036 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5050 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5052 struct regexp *const rx = (struct regexp *)SvANY(r);
5054 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5056 if ( rx && RXp_PAREN_NAMES(rx) ) {
5057 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5059 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5066 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5068 struct regexp *const rx = (struct regexp *)SvANY(r);
5069 GET_RE_DEBUG_FLAGS_DECL;
5071 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5073 if (rx && RXp_PAREN_NAMES(rx)) {
5074 HV *hv = RXp_PAREN_NAMES(rx);
5076 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5079 SV* sv_dat = HeVAL(temphe);
5080 I32 *nums = (I32*)SvPVX(sv_dat);
5081 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5082 if ((I32)(rx->lastparen) >= nums[i] &&
5083 rx->offs[nums[i]].start != -1 &&
5084 rx->offs[nums[i]].end != -1)
5090 if (parno || flags & RXapif_ALL) {
5091 return newSVhek(HeKEY_hek(temphe));
5099 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5104 struct regexp *const rx = (struct regexp *)SvANY(r);
5106 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5108 if (rx && RXp_PAREN_NAMES(rx)) {
5109 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5110 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5111 } else if (flags & RXapif_ONE) {
5112 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5113 av = MUTABLE_AV(SvRV(ret));
5114 length = av_len(av);
5116 return newSViv(length + 1);
5118 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5122 return &PL_sv_undef;
5126 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5128 struct regexp *const rx = (struct regexp *)SvANY(r);
5131 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5133 if (rx && RXp_PAREN_NAMES(rx)) {
5134 HV *hv= RXp_PAREN_NAMES(rx);
5136 (void)hv_iterinit(hv);
5137 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5140 SV* sv_dat = HeVAL(temphe);
5141 I32 *nums = (I32*)SvPVX(sv_dat);
5142 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5143 if ((I32)(rx->lastparen) >= nums[i] &&
5144 rx->offs[nums[i]].start != -1 &&
5145 rx->offs[nums[i]].end != -1)
5151 if (parno || flags & RXapif_ALL) {
5152 av_push(av, newSVhek(HeKEY_hek(temphe)));
5157 return newRV_noinc(MUTABLE_SV(av));
5161 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5164 struct regexp *const rx = (struct regexp *)SvANY(r);
5169 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5172 sv_setsv(sv,&PL_sv_undef);
5176 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5178 i = rx->offs[0].start;
5182 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5184 s = rx->subbeg + rx->offs[0].end;
5185 i = rx->sublen - rx->offs[0].end;
5188 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5189 (s1 = rx->offs[paren].start) != -1 &&
5190 (t1 = rx->offs[paren].end) != -1)
5194 s = rx->subbeg + s1;
5196 sv_setsv(sv,&PL_sv_undef);
5199 assert(rx->sublen >= (s - rx->subbeg) + i );
5201 const int oldtainted = PL_tainted;
5203 sv_setpvn(sv, s, i);
5204 PL_tainted = oldtainted;
5205 if ( (rx->extflags & RXf_CANY_SEEN)
5206 ? (RXp_MATCH_UTF8(rx)
5207 && (!i || is_utf8_string((U8*)s, i)))
5208 : (RXp_MATCH_UTF8(rx)) )
5215 if (RXp_MATCH_TAINTED(rx)) {
5216 if (SvTYPE(sv) >= SVt_PVMG) {
5217 MAGIC* const mg = SvMAGIC(sv);
5220 SvMAGIC_set(sv, mg->mg_moremagic);
5222 if ((mgt = SvMAGIC(sv))) {
5223 mg->mg_moremagic = mgt;
5224 SvMAGIC_set(sv, mg);
5234 sv_setsv(sv,&PL_sv_undef);
5240 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5241 SV const * const value)
5243 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5245 PERL_UNUSED_ARG(rx);
5246 PERL_UNUSED_ARG(paren);
5247 PERL_UNUSED_ARG(value);
5250 Perl_croak(aTHX_ "%s", PL_no_modify);
5254 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5257 struct regexp *const rx = (struct regexp *)SvANY(r);
5261 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5263 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5265 /* $` / ${^PREMATCH} */
5266 case RX_BUFF_IDX_PREMATCH:
5267 if (rx->offs[0].start != -1) {
5268 i = rx->offs[0].start;
5276 /* $' / ${^POSTMATCH} */
5277 case RX_BUFF_IDX_POSTMATCH:
5278 if (rx->offs[0].end != -1) {
5279 i = rx->sublen - rx->offs[0].end;
5281 s1 = rx->offs[0].end;
5287 /* $& / ${^MATCH}, $1, $2, ... */
5289 if (paren <= (I32)rx->nparens &&
5290 (s1 = rx->offs[paren].start) != -1 &&
5291 (t1 = rx->offs[paren].end) != -1)
5296 if (ckWARN(WARN_UNINITIALIZED))
5297 report_uninit((const SV *)sv);
5302 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5303 const char * const s = rx->subbeg + s1;
5308 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5315 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5317 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5318 PERL_UNUSED_ARG(rx);
5322 return newSVpvs("Regexp");
5325 /* Scans the name of a named buffer from the pattern.
5326 * If flags is REG_RSN_RETURN_NULL returns null.
5327 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5328 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5329 * to the parsed name as looked up in the RExC_paren_names hash.
5330 * If there is an error throws a vFAIL().. type exception.
5333 #define REG_RSN_RETURN_NULL 0
5334 #define REG_RSN_RETURN_NAME 1
5335 #define REG_RSN_RETURN_DATA 2
5338 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5340 char *name_start = RExC_parse;
5342 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5344 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5345 /* skip IDFIRST by using do...while */
5348 RExC_parse += UTF8SKIP(RExC_parse);
5349 } while (isALNUM_utf8((U8*)RExC_parse));
5353 } while (isALNUM(*RExC_parse));
5358 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5359 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5360 if ( flags == REG_RSN_RETURN_NAME)
5362 else if (flags==REG_RSN_RETURN_DATA) {
5365 if ( ! sv_name ) /* should not happen*/
5366 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5367 if (RExC_paren_names)
5368 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5370 sv_dat = HeVAL(he_str);
5372 vFAIL("Reference to nonexistent named group");
5376 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5383 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5384 int rem=(int)(RExC_end - RExC_parse); \
5393 if (RExC_lastparse!=RExC_parse) \
5394 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5397 iscut ? "..." : "<" \
5400 PerlIO_printf(Perl_debug_log,"%16s",""); \
5403 num = RExC_size + 1; \
5405 num=REG_NODE_NUM(RExC_emit); \
5406 if (RExC_lastnum!=num) \
5407 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5409 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5410 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5411 (int)((depth*2)), "", \
5415 RExC_lastparse=RExC_parse; \
5420 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5421 DEBUG_PARSE_MSG((funcname)); \
5422 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5424 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5425 DEBUG_PARSE_MSG((funcname)); \
5426 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5429 - reg - regular expression, i.e. main body or parenthesized thing
5431 * Caller must absorb opening parenthesis.
5433 * Combining parenthesis handling with the base level of regular expression
5434 * is a trifle forced, but the need to tie the tails of the branches to what
5435 * follows makes it hard to avoid.
5437 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5439 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5441 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5445 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5446 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5449 register regnode *ret; /* Will be the head of the group. */
5450 register regnode *br;
5451 register regnode *lastbr;
5452 register regnode *ender = NULL;
5453 register I32 parno = 0;
5455 U32 oregflags = RExC_flags;
5456 bool have_branch = 0;
5458 I32 freeze_paren = 0;
5459 I32 after_freeze = 0;
5461 /* for (?g), (?gc), and (?o) warnings; warning
5462 about (?c) will warn about (?g) -- japhy */
5464 #define WASTED_O 0x01
5465 #define WASTED_G 0x02
5466 #define WASTED_C 0x04
5467 #define WASTED_GC (0x02|0x04)
5468 I32 wastedflags = 0x00;
5470 char * parse_start = RExC_parse; /* MJD */
5471 char * const oregcomp_parse = RExC_parse;
5473 GET_RE_DEBUG_FLAGS_DECL;
5475 PERL_ARGS_ASSERT_REG;
5476 DEBUG_PARSE("reg ");
5478 *flagp = 0; /* Tentatively. */
5481 /* Make an OPEN node, if parenthesized. */
5483 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5484 char *start_verb = RExC_parse;
5485 STRLEN verb_len = 0;
5486 char *start_arg = NULL;
5487 unsigned char op = 0;
5489 int internal_argval = 0; /* internal_argval is only useful if !argok */
5490 while ( *RExC_parse && *RExC_parse != ')' ) {
5491 if ( *RExC_parse == ':' ) {
5492 start_arg = RExC_parse + 1;
5498 verb_len = RExC_parse - start_verb;
5501 while ( *RExC_parse && *RExC_parse != ')' )
5503 if ( *RExC_parse != ')' )
5504 vFAIL("Unterminated verb pattern argument");
5505 if ( RExC_parse == start_arg )
5508 if ( *RExC_parse != ')' )
5509 vFAIL("Unterminated verb pattern");
5512 switch ( *start_verb ) {
5513 case 'A': /* (*ACCEPT) */
5514 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5516 internal_argval = RExC_nestroot;
5519 case 'C': /* (*COMMIT) */
5520 if ( memEQs(start_verb,verb_len,"COMMIT") )
5523 case 'F': /* (*FAIL) */
5524 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5529 case ':': /* (*:NAME) */
5530 case 'M': /* (*MARK:NAME) */
5531 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5536 case 'P': /* (*PRUNE) */
5537 if ( memEQs(start_verb,verb_len,"PRUNE") )
5540 case 'S': /* (*SKIP) */
5541 if ( memEQs(start_verb,verb_len,"SKIP") )
5544 case 'T': /* (*THEN) */
5545 /* [19:06] <TimToady> :: is then */
5546 if ( memEQs(start_verb,verb_len,"THEN") ) {
5548 RExC_seen |= REG_SEEN_CUTGROUP;
5554 vFAIL3("Unknown verb pattern '%.*s'",
5555 verb_len, start_verb);
5558 if ( start_arg && internal_argval ) {
5559 vFAIL3("Verb pattern '%.*s' may not have an argument",
5560 verb_len, start_verb);
5561 } else if ( argok < 0 && !start_arg ) {
5562 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5563 verb_len, start_verb);
5565 ret = reganode(pRExC_state, op, internal_argval);
5566 if ( ! internal_argval && ! SIZE_ONLY ) {
5568 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5569 ARG(ret) = add_data( pRExC_state, 1, "S" );
5570 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5577 if (!internal_argval)
5578 RExC_seen |= REG_SEEN_VERBARG;
5579 } else if ( start_arg ) {
5580 vFAIL3("Verb pattern '%.*s' may not have an argument",
5581 verb_len, start_verb);
5583 ret = reg_node(pRExC_state, op);
5585 nextchar(pRExC_state);
5588 if (*RExC_parse == '?') { /* (?...) */
5589 bool is_logical = 0;
5590 const char * const seqstart = RExC_parse;
5593 paren = *RExC_parse++;
5594 ret = NULL; /* For look-ahead/behind. */
5597 case 'P': /* (?P...) variants for those used to PCRE/Python */
5598 paren = *RExC_parse++;
5599 if ( paren == '<') /* (?P<...>) named capture */
5601 else if (paren == '>') { /* (?P>name) named recursion */
5602 goto named_recursion;
5604 else if (paren == '=') { /* (?P=...) named backref */
5605 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5606 you change this make sure you change that */
5607 char* name_start = RExC_parse;
5609 SV *sv_dat = reg_scan_name(pRExC_state,
5610 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5611 if (RExC_parse == name_start || *RExC_parse != ')')
5612 vFAIL2("Sequence %.3s... not terminated",parse_start);
5615 num = add_data( pRExC_state, 1, "S" );
5616 RExC_rxi->data->data[num]=(void*)sv_dat;
5617 SvREFCNT_inc_simple_void(sv_dat);
5620 ret = reganode(pRExC_state,
5621 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5625 Set_Node_Offset(ret, parse_start+1);
5626 Set_Node_Cur_Length(ret); /* MJD */
5628 nextchar(pRExC_state);
5632 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5634 case '<': /* (?<...) */
5635 if (*RExC_parse == '!')
5637 else if (*RExC_parse != '=')
5643 case '\'': /* (?'...') */
5644 name_start= RExC_parse;
5645 svname = reg_scan_name(pRExC_state,
5646 SIZE_ONLY ? /* reverse test from the others */
5647 REG_RSN_RETURN_NAME :
5648 REG_RSN_RETURN_NULL);
5649 if (RExC_parse == name_start) {
5651 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5654 if (*RExC_parse != paren)
5655 vFAIL2("Sequence (?%c... not terminated",
5656 paren=='>' ? '<' : paren);
5660 if (!svname) /* shouldnt happen */
5662 "panic: reg_scan_name returned NULL");
5663 if (!RExC_paren_names) {
5664 RExC_paren_names= newHV();
5665 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5667 RExC_paren_name_list= newAV();
5668 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5671 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5673 sv_dat = HeVAL(he_str);
5675 /* croak baby croak */
5677 "panic: paren_name hash element allocation failed");
5678 } else if ( SvPOK(sv_dat) ) {
5679 /* (?|...) can mean we have dupes so scan to check
5680 its already been stored. Maybe a flag indicating
5681 we are inside such a construct would be useful,
5682 but the arrays are likely to be quite small, so
5683 for now we punt -- dmq */
5684 IV count = SvIV(sv_dat);
5685 I32 *pv = (I32*)SvPVX(sv_dat);
5687 for ( i = 0 ; i < count ; i++ ) {
5688 if ( pv[i] == RExC_npar ) {
5694 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5695 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5696 pv[count] = RExC_npar;
5697 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5700 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5701 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5703 SvIV_set(sv_dat, 1);
5706 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5707 SvREFCNT_dec(svname);
5710 /*sv_dump(sv_dat);*/
5712 nextchar(pRExC_state);
5714 goto capturing_parens;
5716 RExC_seen |= REG_SEEN_LOOKBEHIND;
5718 case '=': /* (?=...) */
5719 RExC_seen_zerolen++;
5721 case '!': /* (?!...) */
5722 RExC_seen_zerolen++;
5723 if (*RExC_parse == ')') {
5724 ret=reg_node(pRExC_state, OPFAIL);
5725 nextchar(pRExC_state);
5729 case '|': /* (?|...) */
5730 /* branch reset, behave like a (?:...) except that
5731 buffers in alternations share the same numbers */
5733 after_freeze = freeze_paren = RExC_npar;
5735 case ':': /* (?:...) */
5736 case '>': /* (?>...) */
5738 case '$': /* (?$...) */
5739 case '@': /* (?@...) */
5740 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5742 case '#': /* (?#...) */
5743 while (*RExC_parse && *RExC_parse != ')')
5745 if (*RExC_parse != ')')
5746 FAIL("Sequence (?#... not terminated");
5747 nextchar(pRExC_state);
5750 case '0' : /* (?0) */
5751 case 'R' : /* (?R) */
5752 if (*RExC_parse != ')')
5753 FAIL("Sequence (?R) not terminated");
5754 ret = reg_node(pRExC_state, GOSTART);
5755 *flagp |= POSTPONED;
5756 nextchar(pRExC_state);
5759 { /* named and numeric backreferences */
5761 case '&': /* (?&NAME) */
5762 parse_start = RExC_parse - 1;
5765 SV *sv_dat = reg_scan_name(pRExC_state,
5766 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5767 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5769 goto gen_recurse_regop;
5772 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5774 vFAIL("Illegal pattern");
5776 goto parse_recursion;
5778 case '-': /* (?-1) */
5779 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5780 RExC_parse--; /* rewind to let it be handled later */
5784 case '1': case '2': case '3': case '4': /* (?1) */
5785 case '5': case '6': case '7': case '8': case '9':
5788 num = atoi(RExC_parse);
5789 parse_start = RExC_parse - 1; /* MJD */
5790 if (*RExC_parse == '-')
5792 while (isDIGIT(*RExC_parse))
5794 if (*RExC_parse!=')')
5795 vFAIL("Expecting close bracket");
5798 if ( paren == '-' ) {
5800 Diagram of capture buffer numbering.
5801 Top line is the normal capture buffer numbers
5802 Botton line is the negative indexing as from
5806 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5810 num = RExC_npar + num;
5813 vFAIL("Reference to nonexistent group");
5815 } else if ( paren == '+' ) {
5816 num = RExC_npar + num - 1;
5819 ret = reganode(pRExC_state, GOSUB, num);
5821 if (num > (I32)RExC_rx->nparens) {
5823 vFAIL("Reference to nonexistent group");
5825 ARG2L_SET( ret, RExC_recurse_count++);
5827 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5828 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5832 RExC_seen |= REG_SEEN_RECURSE;
5833 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5834 Set_Node_Offset(ret, parse_start); /* MJD */
5836 *flagp |= POSTPONED;
5837 nextchar(pRExC_state);
5839 } /* named and numeric backreferences */
5842 case '?': /* (??...) */
5844 if (*RExC_parse != '{') {
5846 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5849 *flagp |= POSTPONED;
5850 paren = *RExC_parse++;
5852 case '{': /* (?{...}) */
5857 char *s = RExC_parse;
5859 RExC_seen_zerolen++;
5860 RExC_seen |= REG_SEEN_EVAL;
5861 while (count && (c = *RExC_parse)) {
5872 if (*RExC_parse != ')') {
5874 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5878 OP_4tree *sop, *rop;
5879 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5882 Perl_save_re_context(aTHX);
5883 rop = sv_compile_2op(sv, &sop, "re", &pad);
5884 sop->op_private |= OPpREFCOUNTED;
5885 /* re_dup will OpREFCNT_inc */
5886 OpREFCNT_set(sop, 1);
5889 n = add_data(pRExC_state, 3, "nop");
5890 RExC_rxi->data->data[n] = (void*)rop;
5891 RExC_rxi->data->data[n+1] = (void*)sop;
5892 RExC_rxi->data->data[n+2] = (void*)pad;
5895 else { /* First pass */
5896 if (PL_reginterp_cnt < ++RExC_seen_evals
5898 /* No compiled RE interpolated, has runtime
5899 components ===> unsafe. */
5900 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5901 if (PL_tainting && PL_tainted)
5902 FAIL("Eval-group in insecure regular expression");
5903 #if PERL_VERSION > 8
5904 if (IN_PERL_COMPILETIME)
5909 nextchar(pRExC_state);
5911 ret = reg_node(pRExC_state, LOGICAL);
5914 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5915 /* deal with the length of this later - MJD */
5918 ret = reganode(pRExC_state, EVAL, n);
5919 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5920 Set_Node_Offset(ret, parse_start);
5923 case '(': /* (?(?{...})...) and (?(?=...)...) */
5926 if (RExC_parse[0] == '?') { /* (?(?...)) */
5927 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5928 || RExC_parse[1] == '<'
5929 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5932 ret = reg_node(pRExC_state, LOGICAL);
5935 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5939 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5940 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5942 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5943 char *name_start= RExC_parse++;
5945 SV *sv_dat=reg_scan_name(pRExC_state,
5946 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5947 if (RExC_parse == name_start || *RExC_parse != ch)
5948 vFAIL2("Sequence (?(%c... not terminated",
5949 (ch == '>' ? '<' : ch));
5952 num = add_data( pRExC_state, 1, "S" );
5953 RExC_rxi->data->data[num]=(void*)sv_dat;
5954 SvREFCNT_inc_simple_void(sv_dat);
5956 ret = reganode(pRExC_state,NGROUPP,num);
5957 goto insert_if_check_paren;
5959 else if (RExC_parse[0] == 'D' &&
5960 RExC_parse[1] == 'E' &&
5961 RExC_parse[2] == 'F' &&
5962 RExC_parse[3] == 'I' &&
5963 RExC_parse[4] == 'N' &&
5964 RExC_parse[5] == 'E')
5966 ret = reganode(pRExC_state,DEFINEP,0);
5969 goto insert_if_check_paren;
5971 else if (RExC_parse[0] == 'R') {
5974 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5975 parno = atoi(RExC_parse++);
5976 while (isDIGIT(*RExC_parse))
5978 } else if (RExC_parse[0] == '&') {
5981 sv_dat = reg_scan_name(pRExC_state,
5982 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5983 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5985 ret = reganode(pRExC_state,INSUBP,parno);
5986 goto insert_if_check_paren;
5988 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5991 parno = atoi(RExC_parse++);
5993 while (isDIGIT(*RExC_parse))
5995 ret = reganode(pRExC_state, GROUPP, parno);
5997 insert_if_check_paren:
5998 if ((c = *nextchar(pRExC_state)) != ')')
5999 vFAIL("Switch condition not recognized");
6001 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6002 br = regbranch(pRExC_state, &flags, 1,depth+1);
6004 br = reganode(pRExC_state, LONGJMP, 0);
6006 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6007 c = *nextchar(pRExC_state);
6012 vFAIL("(?(DEFINE)....) does not allow branches");
6013 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6014 regbranch(pRExC_state, &flags, 1,depth+1);
6015 REGTAIL(pRExC_state, ret, lastbr);
6018 c = *nextchar(pRExC_state);
6023 vFAIL("Switch (?(condition)... contains too many branches");
6024 ender = reg_node(pRExC_state, TAIL);
6025 REGTAIL(pRExC_state, br, ender);
6027 REGTAIL(pRExC_state, lastbr, ender);
6028 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6031 REGTAIL(pRExC_state, ret, ender);
6032 RExC_size++; /* XXX WHY do we need this?!!
6033 For large programs it seems to be required
6034 but I can't figure out why. -- dmq*/
6038 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6042 RExC_parse--; /* for vFAIL to print correctly */
6043 vFAIL("Sequence (? incomplete");
6047 parse_flags: /* (?i) */
6049 U32 posflags = 0, negflags = 0;
6050 U32 *flagsp = &posflags;
6052 while (*RExC_parse) {
6053 /* && strchr("iogcmsx", *RExC_parse) */
6054 /* (?g), (?gc) and (?o) are useless here
6055 and must be globally applied -- japhy */
6056 switch (*RExC_parse) {
6057 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6058 case ONCE_PAT_MOD: /* 'o' */
6059 case GLOBAL_PAT_MOD: /* 'g' */
6060 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6061 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6062 if (! (wastedflags & wflagbit) ) {
6063 wastedflags |= wflagbit;
6066 "Useless (%s%c) - %suse /%c modifier",
6067 flagsp == &negflags ? "?-" : "?",
6069 flagsp == &negflags ? "don't " : "",
6076 case CONTINUE_PAT_MOD: /* 'c' */
6077 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6078 if (! (wastedflags & WASTED_C) ) {
6079 wastedflags |= WASTED_GC;
6082 "Useless (%sc) - %suse /gc modifier",
6083 flagsp == &negflags ? "?-" : "?",
6084 flagsp == &negflags ? "don't " : ""
6089 case KEEPCOPY_PAT_MOD: /* 'p' */
6090 if (flagsp == &negflags) {
6092 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6094 *flagsp |= RXf_PMf_KEEPCOPY;
6098 if (flagsp == &negflags) {
6100 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6104 wastedflags = 0; /* reset so (?g-c) warns twice */
6110 RExC_flags |= posflags;
6111 RExC_flags &= ~negflags;
6113 oregflags |= posflags;
6114 oregflags &= ~negflags;
6116 nextchar(pRExC_state);
6127 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6132 }} /* one for the default block, one for the switch */
6139 ret = reganode(pRExC_state, OPEN, parno);
6142 RExC_nestroot = parno;
6143 if (RExC_seen & REG_SEEN_RECURSE
6144 && !RExC_open_parens[parno-1])
6146 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6147 "Setting open paren #%"IVdf" to %d\n",
6148 (IV)parno, REG_NODE_NUM(ret)));
6149 RExC_open_parens[parno-1]= ret;
6152 Set_Node_Length(ret, 1); /* MJD */
6153 Set_Node_Offset(ret, RExC_parse); /* MJD */
6161 /* Pick up the branches, linking them together. */
6162 parse_start = RExC_parse; /* MJD */
6163 br = regbranch(pRExC_state, &flags, 1,depth+1);
6166 if (RExC_npar > after_freeze)
6167 after_freeze = RExC_npar;
6168 RExC_npar = freeze_paren;
6171 /* branch_len = (paren != 0); */
6175 if (*RExC_parse == '|') {
6176 if (!SIZE_ONLY && RExC_extralen) {
6177 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6180 reginsert(pRExC_state, BRANCH, br, depth+1);
6181 Set_Node_Length(br, paren != 0);
6182 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6186 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6188 else if (paren == ':') {
6189 *flagp |= flags&SIMPLE;
6191 if (is_open) { /* Starts with OPEN. */
6192 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6194 else if (paren != '?') /* Not Conditional */
6196 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6198 while (*RExC_parse == '|') {
6199 if (!SIZE_ONLY && RExC_extralen) {
6200 ender = reganode(pRExC_state, LONGJMP,0);
6201 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6204 RExC_extralen += 2; /* Account for LONGJMP. */
6205 nextchar(pRExC_state);
6207 if (RExC_npar > after_freeze)
6208 after_freeze = RExC_npar;
6209 RExC_npar = freeze_paren;
6211 br = regbranch(pRExC_state, &flags, 0, depth+1);
6215 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6217 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6220 if (have_branch || paren != ':') {
6221 /* Make a closing node, and hook it on the end. */
6224 ender = reg_node(pRExC_state, TAIL);
6227 ender = reganode(pRExC_state, CLOSE, parno);
6228 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6229 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6230 "Setting close paren #%"IVdf" to %d\n",
6231 (IV)parno, REG_NODE_NUM(ender)));
6232 RExC_close_parens[parno-1]= ender;
6233 if (RExC_nestroot == parno)
6236 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6237 Set_Node_Length(ender,1); /* MJD */
6243 *flagp &= ~HASWIDTH;
6246 ender = reg_node(pRExC_state, SUCCEED);
6249 ender = reg_node(pRExC_state, END);
6251 assert(!RExC_opend); /* there can only be one! */
6256 REGTAIL(pRExC_state, lastbr, ender);
6258 if (have_branch && !SIZE_ONLY) {
6260 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6262 /* Hook the tails of the branches to the closing node. */
6263 for (br = ret; br; br = regnext(br)) {
6264 const U8 op = PL_regkind[OP(br)];
6266 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6268 else if (op == BRANCHJ) {
6269 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6277 static const char parens[] = "=!<,>";
6279 if (paren && (p = strchr(parens, paren))) {
6280 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6281 int flag = (p - parens) > 1;
6284 node = SUSPEND, flag = 0;
6285 reginsert(pRExC_state, node,ret, depth+1);
6286 Set_Node_Cur_Length(ret);
6287 Set_Node_Offset(ret, parse_start + 1);
6289 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6293 /* Check for proper termination. */
6295 RExC_flags = oregflags;
6296 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6297 RExC_parse = oregcomp_parse;
6298 vFAIL("Unmatched (");
6301 else if (!paren && RExC_parse < RExC_end) {
6302 if (*RExC_parse == ')') {
6304 vFAIL("Unmatched )");
6307 FAIL("Junk on end of regexp"); /* "Can't happen". */
6311 RExC_npar = after_freeze;
6316 - regbranch - one alternative of an | operator
6318 * Implements the concatenation operator.
6321 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6324 register regnode *ret;
6325 register regnode *chain = NULL;
6326 register regnode *latest;
6327 I32 flags = 0, c = 0;
6328 GET_RE_DEBUG_FLAGS_DECL;
6330 PERL_ARGS_ASSERT_REGBRANCH;
6332 DEBUG_PARSE("brnc");
6337 if (!SIZE_ONLY && RExC_extralen)
6338 ret = reganode(pRExC_state, BRANCHJ,0);
6340 ret = reg_node(pRExC_state, BRANCH);
6341 Set_Node_Length(ret, 1);
6345 if (!first && SIZE_ONLY)
6346 RExC_extralen += 1; /* BRANCHJ */
6348 *flagp = WORST; /* Tentatively. */
6351 nextchar(pRExC_state);
6352 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6354 latest = regpiece(pRExC_state, &flags,depth+1);
6355 if (latest == NULL) {
6356 if (flags & TRYAGAIN)
6360 else if (ret == NULL)
6362 *flagp |= flags&(HASWIDTH|POSTPONED);
6363 if (chain == NULL) /* First piece. */
6364 *flagp |= flags&SPSTART;
6367 REGTAIL(pRExC_state, chain, latest);
6372 if (chain == NULL) { /* Loop ran zero times. */
6373 chain = reg_node(pRExC_state, NOTHING);
6378 *flagp |= flags&SIMPLE;
6385 - regpiece - something followed by possible [*+?]
6387 * Note that the branching code sequences used for ? and the general cases
6388 * of * and + are somewhat optimized: they use the same NOTHING node as
6389 * both the endmarker for their branch list and the body of the last branch.
6390 * It might seem that this node could be dispensed with entirely, but the
6391 * endmarker role is not redundant.
6394 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6397 register regnode *ret;
6399 register char *next;
6401 const char * const origparse = RExC_parse;
6403 I32 max = REG_INFTY;
6405 const char *maxpos = NULL;
6406 GET_RE_DEBUG_FLAGS_DECL;
6408 PERL_ARGS_ASSERT_REGPIECE;
6410 DEBUG_PARSE("piec");
6412 ret = regatom(pRExC_state, &flags,depth+1);
6414 if (flags & TRYAGAIN)
6421 if (op == '{' && regcurly(RExC_parse)) {
6423 parse_start = RExC_parse; /* MJD */
6424 next = RExC_parse + 1;
6425 while (isDIGIT(*next) || *next == ',') {
6434 if (*next == '}') { /* got one */
6438 min = atoi(RExC_parse);
6442 maxpos = RExC_parse;
6444 if (!max && *maxpos != '0')
6445 max = REG_INFTY; /* meaning "infinity" */
6446 else if (max >= REG_INFTY)
6447 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6449 nextchar(pRExC_state);
6452 if ((flags&SIMPLE)) {
6453 RExC_naughty += 2 + RExC_naughty / 2;
6454 reginsert(pRExC_state, CURLY, ret, depth+1);
6455 Set_Node_Offset(ret, parse_start+1); /* MJD */
6456 Set_Node_Cur_Length(ret);
6459 regnode * const w = reg_node(pRExC_state, WHILEM);
6462 REGTAIL(pRExC_state, ret, w);
6463 if (!SIZE_ONLY && RExC_extralen) {
6464 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6465 reginsert(pRExC_state, NOTHING,ret, depth+1);
6466 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6468 reginsert(pRExC_state, CURLYX,ret, depth+1);
6470 Set_Node_Offset(ret, parse_start+1);
6471 Set_Node_Length(ret,
6472 op == '{' ? (RExC_parse - parse_start) : 1);
6474 if (!SIZE_ONLY && RExC_extralen)
6475 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6476 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6478 RExC_whilem_seen++, RExC_extralen += 3;
6479 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6488 vFAIL("Can't do {n,m} with n > m");
6490 ARG1_SET(ret, (U16)min);
6491 ARG2_SET(ret, (U16)max);
6503 #if 0 /* Now runtime fix should be reliable. */
6505 /* if this is reinstated, don't forget to put this back into perldiag:
6507 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6509 (F) The part of the regexp subject to either the * or + quantifier
6510 could match an empty string. The {#} shows in the regular
6511 expression about where the problem was discovered.
6515 if (!(flags&HASWIDTH) && op != '?')
6516 vFAIL("Regexp *+ operand could be empty");
6519 parse_start = RExC_parse;
6520 nextchar(pRExC_state);
6522 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6524 if (op == '*' && (flags&SIMPLE)) {
6525 reginsert(pRExC_state, STAR, ret, depth+1);
6529 else if (op == '*') {
6533 else if (op == '+' && (flags&SIMPLE)) {
6534 reginsert(pRExC_state, PLUS, ret, depth+1);
6538 else if (op == '+') {
6542 else if (op == '?') {
6547 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6548 ckWARN3reg(RExC_parse,
6549 "%.*s matches null string many times",
6550 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6554 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6555 nextchar(pRExC_state);
6556 reginsert(pRExC_state, MINMOD, ret, depth+1);
6557 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6559 #ifndef REG_ALLOW_MINMOD_SUSPEND
6562 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6564 nextchar(pRExC_state);
6565 ender = reg_node(pRExC_state, SUCCEED);
6566 REGTAIL(pRExC_state, ret, ender);
6567 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6569 ender = reg_node(pRExC_state, TAIL);
6570 REGTAIL(pRExC_state, ret, ender);
6574 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6576 vFAIL("Nested quantifiers");
6583 /* reg_namedseq(pRExC_state,UVp)
6585 This is expected to be called by a parser routine that has
6586 recognized '\N' and needs to handle the rest. RExC_parse is
6587 expected to point at the first char following the N at the time
6590 If valuep is non-null then it is assumed that we are parsing inside
6591 of a charclass definition and the first codepoint in the resolved
6592 string is returned via *valuep and the routine will return NULL.
6593 In this mode if a multichar string is returned from the charnames
6594 handler a warning will be issued, and only the first char in the
6595 sequence will be examined. If the string returned is zero length
6596 then the value of *valuep is undefined and NON-NULL will
6597 be returned to indicate failure. (This will NOT be a valid pointer
6600 If valuep is null then it is assumed that we are parsing normal text
6601 and inserts a new EXACT node into the program containing the resolved
6602 string and returns a pointer to the new node. If the string is
6603 zerolength a NOTHING node is emitted.
6605 On success RExC_parse is set to the char following the endbrace.
6606 Parsing failures will generate a fatal errorvia vFAIL(...)
6608 NOTE: We cache all results from the charnames handler locally in
6609 the RExC_charnames hash (created on first use) to prevent a charnames
6610 handler from playing silly-buggers and returning a short string and
6611 then a long string for a given pattern. Since the regexp program
6612 size is calculated during an initial parse this would result
6613 in a buffer overrun so we cache to prevent the charname result from
6614 changing during the course of the parse.
6618 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6620 char * name; /* start of the content of the name */
6621 char * endbrace; /* endbrace following the name */
6624 STRLEN len; /* this has various purposes throughout the code */
6625 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6626 regnode *ret = NULL;
6628 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6630 if (*RExC_parse != '{' ||
6631 (*RExC_parse == '{' && RExC_parse[1]
6632 && strchr("0123456789", RExC_parse[1])))
6634 GET_RE_DEBUG_FLAGS_DECL;
6636 /* no bare \N in a charclass */
6637 vFAIL("Missing braces on \\N{}");
6639 nextchar(pRExC_state);
6640 ret = reg_node(pRExC_state, REG_ANY);
6641 *flagp |= HASWIDTH|SIMPLE;
6644 Set_Node_Length(ret, 1); /* MJD */
6647 name = RExC_parse+1;
6648 endbrace = strchr(RExC_parse, '}');
6651 vFAIL("Missing right brace on \\N{}");
6653 RExC_parse = endbrace + 1;
6656 /* RExC_parse points at the beginning brace,
6657 endbrace points at the last */
6658 if ( name[0]=='U' && name[1]=='+' ) {
6659 /* its a "Unicode hex" notation {U+89AB} */
6660 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6661 | PERL_SCAN_DISALLOW_PREFIX
6662 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6664 len = (STRLEN)(endbrace - name - 2);
6665 cp = grok_hex(name + 2, &len, &fl, NULL);
6666 if ( len != (STRLEN)(endbrace - name - 2) ) {
6670 if (cp > 0xff) RExC_utf8 = 1;
6675 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6676 * is going to be in utf8 and the representation changes under utf8. */
6677 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6678 U8 string[UTF8_MAXBYTES+1];
6681 tmps = uvuni_to_utf8(string, cp);
6682 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6683 } else { /* Otherwise, no need for utf8, can skip that step */
6686 sv_str= newSVpvn(&string, 1);
6689 /* fetch the charnames handler for this scope */
6690 HV * const table = GvHV(PL_hintgv);
6692 hv_fetchs(table, "charnames", FALSE) :
6694 SV *cv= cvp ? *cvp : NULL;
6697 /* create an SV with the name as argument */
6698 sv_name = newSVpvn(name, endbrace - name);
6700 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6701 vFAIL2("Constant(\\N{%" SVf "}) unknown: "
6702 "(possibly a missing \"use charnames ...\")",
6705 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6706 vFAIL2("Constant(\\N{%" SVf "}): "
6707 "$^H{charnames} is not defined", SVfARG(sv_name));
6712 if (!RExC_charnames) {
6713 /* make sure our cache is allocated */
6714 RExC_charnames = newHV();
6715 sv_2mortal(MUTABLE_SV(RExC_charnames));
6717 /* see if we have looked this one up before */
6718 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6720 sv_str = HeVAL(he_str);
6733 count= call_sv(cv, G_SCALAR);
6735 if (count == 1) { /* XXXX is this right? dmq */
6737 SvREFCNT_inc_simple_void(sv_str);
6745 if ( !sv_str || !SvOK(sv_str) ) {
6746 vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
6747 "did not return a defined value", SVfARG(sv_name));
6749 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6754 char *p = SvPV(sv_str, len);
6757 if ( SvUTF8(sv_str) ) {
6758 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6762 We have to turn on utf8 for high bit chars otherwise
6763 we get failures with
6765 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6766 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6768 This is different from what \x{} would do with the same
6769 codepoint, where the condition is > 0xFF.
6776 /* warn if we havent used the whole string? */
6778 if (numlen<len && SIZE_ONLY) {
6779 ckWARN2reg(RExC_parse,
6780 "Ignoring excess chars from \\N{%" SVf "} in character class",
6784 } else if (SIZE_ONLY) {
6785 ckWARN2reg(RExC_parse,
6786 "Ignoring zero length \\N{%" SVf "} in character class",
6791 SvREFCNT_dec(sv_name);
6793 SvREFCNT_dec(sv_str);
6794 return len ? NULL : (regnode *)&len;
6795 } else if(SvCUR(sv_str)) {
6801 char * parse_start = name-3; /* needed for the offsets */
6803 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6805 ret = reg_node(pRExC_state,
6806 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6809 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6810 sv_utf8_upgrade(sv_str);
6811 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6815 p = SvPV(sv_str, len);
6817 /* len is the length written, charlen is the size the char read */
6818 for ( len = 0; p < pend; p += charlen ) {
6820 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6822 STRLEN foldlen,numlen;
6823 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6824 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6825 /* Emit all the Unicode characters. */
6827 for (foldbuf = tmpbuf;
6831 uvc = utf8_to_uvchr(foldbuf, &numlen);
6833 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6836 /* In EBCDIC the numlen
6837 * and unilen can differ. */
6839 if (numlen >= foldlen)
6843 break; /* "Can't happen." */
6846 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6858 RExC_size += STR_SZ(len);
6861 RExC_emit += STR_SZ(len);
6863 Set_Node_Cur_Length(ret); /* MJD */
6865 nextchar(pRExC_state);
6866 } else { /* zero length */
6867 ret = reg_node(pRExC_state,NOTHING);
6870 SvREFCNT_dec(sv_str);
6873 SvREFCNT_dec(sv_name);
6883 * It returns the code point in utf8 for the value in *encp.
6884 * value: a code value in the source encoding
6885 * encp: a pointer to an Encode object
6887 * If the result from Encode is not a single character,
6888 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6891 S_reg_recode(pTHX_ const char value, SV **encp)
6894 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6895 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6896 const STRLEN newlen = SvCUR(sv);
6897 UV uv = UNICODE_REPLACEMENT;
6899 PERL_ARGS_ASSERT_REG_RECODE;
6903 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6906 if (!newlen || numlen != newlen) {
6907 uv = UNICODE_REPLACEMENT;
6915 - regatom - the lowest level
6917 Try to identify anything special at the start of the pattern. If there
6918 is, then handle it as required. This may involve generating a single regop,
6919 such as for an assertion; or it may involve recursing, such as to
6920 handle a () structure.
6922 If the string doesn't start with something special then we gobble up
6923 as much literal text as we can.
6925 Once we have been able to handle whatever type of thing started the
6926 sequence, we return.
6928 Note: we have to be careful with escapes, as they can be both literal
6929 and special, and in the case of \10 and friends can either, depending
6930 on context. Specifically there are two seperate switches for handling
6931 escape sequences, with the one for handling literal escapes requiring
6932 a dummy entry for all of the special escapes that are actually handled
6937 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6940 register regnode *ret = NULL;
6942 char *parse_start = RExC_parse;
6943 GET_RE_DEBUG_FLAGS_DECL;
6944 DEBUG_PARSE("atom");
6945 *flagp = WORST; /* Tentatively. */
6947 PERL_ARGS_ASSERT_REGATOM;
6950 switch ((U8)*RExC_parse) {
6952 RExC_seen_zerolen++;
6953 nextchar(pRExC_state);
6954 if (RExC_flags & RXf_PMf_MULTILINE)
6955 ret = reg_node(pRExC_state, MBOL);
6956 else if (RExC_flags & RXf_PMf_SINGLELINE)
6957 ret = reg_node(pRExC_state, SBOL);
6959 ret = reg_node(pRExC_state, BOL);
6960 Set_Node_Length(ret, 1); /* MJD */
6963 nextchar(pRExC_state);
6965 RExC_seen_zerolen++;
6966 if (RExC_flags & RXf_PMf_MULTILINE)
6967 ret = reg_node(pRExC_state, MEOL);
6968 else if (RExC_flags & RXf_PMf_SINGLELINE)
6969 ret = reg_node(pRExC_state, SEOL);
6971 ret = reg_node(pRExC_state, EOL);
6972 Set_Node_Length(ret, 1); /* MJD */
6975 nextchar(pRExC_state);
6976 if (RExC_flags & RXf_PMf_SINGLELINE)
6977 ret = reg_node(pRExC_state, SANY);
6979 ret = reg_node(pRExC_state, REG_ANY);
6980 *flagp |= HASWIDTH|SIMPLE;
6982 Set_Node_Length(ret, 1); /* MJD */
6986 char * const oregcomp_parse = ++RExC_parse;
6987 ret = regclass(pRExC_state,depth+1);
6988 if (*RExC_parse != ']') {
6989 RExC_parse = oregcomp_parse;
6990 vFAIL("Unmatched [");
6992 nextchar(pRExC_state);
6993 *flagp |= HASWIDTH|SIMPLE;
6994 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6998 nextchar(pRExC_state);
6999 ret = reg(pRExC_state, 1, &flags,depth+1);
7001 if (flags & TRYAGAIN) {
7002 if (RExC_parse == RExC_end) {
7003 /* Make parent create an empty node if needed. */
7011 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7015 if (flags & TRYAGAIN) {
7019 vFAIL("Internal urp");
7020 /* Supposed to be caught earlier. */
7023 if (!regcurly(RExC_parse)) {
7032 vFAIL("Quantifier follows nothing");
7040 len=0; /* silence a spurious compiler warning */
7041 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7042 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7043 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7044 ret = reganode(pRExC_state, FOLDCHAR, cp);
7045 Set_Node_Length(ret, 1); /* MJD */
7046 nextchar(pRExC_state); /* kill whitespace under /x */
7054 This switch handles escape sequences that resolve to some kind
7055 of special regop and not to literal text. Escape sequnces that
7056 resolve to literal text are handled below in the switch marked
7059 Every entry in this switch *must* have a corresponding entry
7060 in the literal escape switch. However, the opposite is not
7061 required, as the default for this switch is to jump to the
7062 literal text handling code.
7064 switch ((U8)*++RExC_parse) {
7069 /* Special Escapes */
7071 RExC_seen_zerolen++;
7072 ret = reg_node(pRExC_state, SBOL);
7074 goto finish_meta_pat;
7076 ret = reg_node(pRExC_state, GPOS);
7077 RExC_seen |= REG_SEEN_GPOS;
7079 goto finish_meta_pat;
7081 RExC_seen_zerolen++;
7082 ret = reg_node(pRExC_state, KEEPS);
7084 /* XXX:dmq : disabling in-place substitution seems to
7085 * be necessary here to avoid cases of memory corruption, as
7086 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7088 RExC_seen |= REG_SEEN_LOOKBEHIND;
7089 goto finish_meta_pat;
7091 ret = reg_node(pRExC_state, SEOL);
7093 RExC_seen_zerolen++; /* Do not optimize RE away */
7094 goto finish_meta_pat;
7096 ret = reg_node(pRExC_state, EOS);
7098 RExC_seen_zerolen++; /* Do not optimize RE away */
7099 goto finish_meta_pat;
7101 ret = reg_node(pRExC_state, CANY);
7102 RExC_seen |= REG_SEEN_CANY;
7103 *flagp |= HASWIDTH|SIMPLE;
7104 goto finish_meta_pat;
7106 ret = reg_node(pRExC_state, CLUMP);
7108 goto finish_meta_pat;
7110 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7111 *flagp |= HASWIDTH|SIMPLE;
7112 goto finish_meta_pat;
7114 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7115 *flagp |= HASWIDTH|SIMPLE;
7116 goto finish_meta_pat;
7118 RExC_seen_zerolen++;
7119 RExC_seen |= REG_SEEN_LOOKBEHIND;
7120 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7122 goto finish_meta_pat;
7124 RExC_seen_zerolen++;
7125 RExC_seen |= REG_SEEN_LOOKBEHIND;
7126 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7128 goto finish_meta_pat;
7130 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7131 *flagp |= HASWIDTH|SIMPLE;
7132 goto finish_meta_pat;
7134 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7135 *flagp |= HASWIDTH|SIMPLE;
7136 goto finish_meta_pat;
7138 ret = reg_node(pRExC_state, DIGIT);
7139 *flagp |= HASWIDTH|SIMPLE;
7140 goto finish_meta_pat;
7142 ret = reg_node(pRExC_state, NDIGIT);
7143 *flagp |= HASWIDTH|SIMPLE;
7144 goto finish_meta_pat;
7146 ret = reg_node(pRExC_state, LNBREAK);
7147 *flagp |= HASWIDTH|SIMPLE;
7148 goto finish_meta_pat;
7150 ret = reg_node(pRExC_state, HORIZWS);
7151 *flagp |= HASWIDTH|SIMPLE;
7152 goto finish_meta_pat;
7154 ret = reg_node(pRExC_state, NHORIZWS);
7155 *flagp |= HASWIDTH|SIMPLE;
7156 goto finish_meta_pat;
7158 ret = reg_node(pRExC_state, VERTWS);
7159 *flagp |= HASWIDTH|SIMPLE;
7160 goto finish_meta_pat;
7162 ret = reg_node(pRExC_state, NVERTWS);
7163 *flagp |= HASWIDTH|SIMPLE;
7165 nextchar(pRExC_state);
7166 Set_Node_Length(ret, 2); /* MJD */
7171 char* const oldregxend = RExC_end;
7173 char* parse_start = RExC_parse - 2;
7176 if (RExC_parse[1] == '{') {
7177 /* a lovely hack--pretend we saw [\pX] instead */
7178 RExC_end = strchr(RExC_parse, '}');
7180 const U8 c = (U8)*RExC_parse;
7182 RExC_end = oldregxend;
7183 vFAIL2("Missing right brace on \\%c{}", c);
7188 RExC_end = RExC_parse + 2;
7189 if (RExC_end > oldregxend)
7190 RExC_end = oldregxend;
7194 ret = regclass(pRExC_state,depth+1);
7196 RExC_end = oldregxend;
7199 Set_Node_Offset(ret, parse_start + 2);
7200 Set_Node_Cur_Length(ret);
7201 nextchar(pRExC_state);
7202 *flagp |= HASWIDTH|SIMPLE;
7206 /* Handle \N and \N{NAME} here and not below because it can be
7207 multicharacter. join_exact() will join them up later on.
7208 Also this makes sure that things like /\N{BLAH}+/ and
7209 \N{BLAH} being multi char Just Happen. dmq*/
7211 ret= reg_namedseq(pRExC_state, NULL, flagp);
7213 case 'k': /* Handle \k<NAME> and \k'NAME' */
7216 char ch= RExC_parse[1];
7217 if (ch != '<' && ch != '\'' && ch != '{') {
7219 vFAIL2("Sequence %.2s... not terminated",parse_start);
7221 /* this pretty much dupes the code for (?P=...) in reg(), if
7222 you change this make sure you change that */
7223 char* name_start = (RExC_parse += 2);
7225 SV *sv_dat = reg_scan_name(pRExC_state,
7226 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7227 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7228 if (RExC_parse == name_start || *RExC_parse != ch)
7229 vFAIL2("Sequence %.3s... not terminated",parse_start);
7232 num = add_data( pRExC_state, 1, "S" );
7233 RExC_rxi->data->data[num]=(void*)sv_dat;
7234 SvREFCNT_inc_simple_void(sv_dat);
7238 ret = reganode(pRExC_state,
7239 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7243 /* override incorrect value set in reganode MJD */
7244 Set_Node_Offset(ret, parse_start+1);
7245 Set_Node_Cur_Length(ret); /* MJD */
7246 nextchar(pRExC_state);
7252 case '1': case '2': case '3': case '4':
7253 case '5': case '6': case '7': case '8': case '9':
7256 bool isg = *RExC_parse == 'g';
7261 if (*RExC_parse == '{') {
7265 if (*RExC_parse == '-') {
7269 if (hasbrace && !isDIGIT(*RExC_parse)) {
7270 if (isrel) RExC_parse--;
7272 goto parse_named_seq;
7274 num = atoi(RExC_parse);
7275 if (isg && num == 0)
7276 vFAIL("Reference to invalid group 0");
7278 num = RExC_npar - num;
7280 vFAIL("Reference to nonexistent or unclosed group");
7282 if (!isg && num > 9 && num >= RExC_npar)
7285 char * const parse_start = RExC_parse - 1; /* MJD */
7286 while (isDIGIT(*RExC_parse))
7288 if (parse_start == RExC_parse - 1)
7289 vFAIL("Unterminated \\g... pattern");
7291 if (*RExC_parse != '}')
7292 vFAIL("Unterminated \\g{...} pattern");
7296 if (num > (I32)RExC_rx->nparens)
7297 vFAIL("Reference to nonexistent group");
7300 ret = reganode(pRExC_state,
7301 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7305 /* override incorrect value set in reganode MJD */
7306 Set_Node_Offset(ret, parse_start+1);
7307 Set_Node_Cur_Length(ret); /* MJD */
7309 nextchar(pRExC_state);
7314 if (RExC_parse >= RExC_end)
7315 FAIL("Trailing \\");
7318 /* Do not generate "unrecognized" warnings here, we fall
7319 back into the quick-grab loop below */
7326 if (RExC_flags & RXf_PMf_EXTENDED) {
7327 if ( reg_skipcomment( pRExC_state ) )
7334 register STRLEN len;
7339 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7341 parse_start = RExC_parse - 1;
7347 ret = reg_node(pRExC_state,
7348 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7350 for (len = 0, p = RExC_parse - 1;
7351 len < 127 && p < RExC_end;
7354 char * const oldp = p;
7356 if (RExC_flags & RXf_PMf_EXTENDED)
7357 p = regwhite( pRExC_state, p );
7362 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7363 goto normal_default;
7373 /* Literal Escapes Switch
7375 This switch is meant to handle escape sequences that
7376 resolve to a literal character.
7378 Every escape sequence that represents something
7379 else, like an assertion or a char class, is handled
7380 in the switch marked 'Special Escapes' above in this
7381 routine, but also has an entry here as anything that
7382 isn't explicitly mentioned here will be treated as
7383 an unescaped equivalent literal.
7387 /* These are all the special escapes. */
7391 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7392 goto normal_default;
7393 case 'A': /* Start assertion */
7394 case 'b': case 'B': /* Word-boundary assertion*/
7395 case 'C': /* Single char !DANGEROUS! */
7396 case 'd': case 'D': /* digit class */
7397 case 'g': case 'G': /* generic-backref, pos assertion */
7398 case 'h': case 'H': /* HORIZWS */
7399 case 'k': case 'K': /* named backref, keep marker */
7400 case 'N': /* named char sequence */
7401 case 'p': case 'P': /* Unicode property */
7402 case 'R': /* LNBREAK */
7403 case 's': case 'S': /* space class */
7404 case 'v': case 'V': /* VERTWS */
7405 case 'w': case 'W': /* word class */
7406 case 'X': /* eXtended Unicode "combining character sequence" */
7407 case 'z': case 'Z': /* End of line/string assertion */
7411 /* Anything after here is an escape that resolves to a
7412 literal. (Except digits, which may or may not)
7431 ender = ASCII_TO_NATIVE('\033');
7435 ender = ASCII_TO_NATIVE('\007');
7440 char* const e = strchr(p, '}');
7444 vFAIL("Missing right brace on \\x{}");
7447 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7448 | PERL_SCAN_DISALLOW_PREFIX;
7449 STRLEN numlen = e - p - 1;
7450 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7457 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7459 ender = grok_hex(p, &numlen, &flags, NULL);
7462 if (PL_encoding && ender < 0x100)
7463 goto recode_encoding;
7467 ender = UCHARAT(p++);
7468 ender = toCTRL(ender);
7470 case '0': case '1': case '2': case '3':case '4':
7471 case '5': case '6': case '7': case '8':case '9':
7473 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7476 ender = grok_oct(p, &numlen, &flags, NULL);
7478 /* An octal above 0xff is interpreted differently
7479 * depending on if the re is in utf8 or not. If it
7480 * is in utf8, the value will be itself, otherwise
7481 * it is interpreted as modulo 0x100. It has been
7482 * decided to discourage the use of octal above the
7483 * single-byte range. For now, warn only when
7484 * it ends up modulo */
7485 if (SIZE_ONLY && ender >= 0x100
7486 && ! UTF && ! PL_encoding) {
7487 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7495 if (PL_encoding && ender < 0x100)
7496 goto recode_encoding;
7500 SV* enc = PL_encoding;
7501 ender = reg_recode((const char)(U8)ender, &enc);
7502 if (!enc && SIZE_ONLY)
7503 ckWARNreg(p, "Invalid escape in the specified encoding");
7509 FAIL("Trailing \\");
7512 if (!SIZE_ONLY&& isALPHA(*p))
7513 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7514 goto normal_default;
7519 if (UTF8_IS_START(*p) && UTF) {
7521 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7522 &numlen, UTF8_ALLOW_DEFAULT);
7529 if ( RExC_flags & RXf_PMf_EXTENDED)
7530 p = regwhite( pRExC_state, p );
7532 /* Prime the casefolded buffer. */
7533 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7535 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7540 /* Emit all the Unicode characters. */
7542 for (foldbuf = tmpbuf;
7544 foldlen -= numlen) {
7545 ender = utf8_to_uvchr(foldbuf, &numlen);
7547 const STRLEN unilen = reguni(pRExC_state, ender, s);
7550 /* In EBCDIC the numlen
7551 * and unilen can differ. */
7553 if (numlen >= foldlen)
7557 break; /* "Can't happen." */
7561 const STRLEN unilen = reguni(pRExC_state, ender, s);
7570 REGC((char)ender, s++);
7576 /* Emit all the Unicode characters. */
7578 for (foldbuf = tmpbuf;
7580 foldlen -= numlen) {
7581 ender = utf8_to_uvchr(foldbuf, &numlen);
7583 const STRLEN unilen = reguni(pRExC_state, ender, s);
7586 /* In EBCDIC the numlen
7587 * and unilen can differ. */
7589 if (numlen >= foldlen)
7597 const STRLEN unilen = reguni(pRExC_state, ender, s);
7606 REGC((char)ender, s++);
7610 Set_Node_Cur_Length(ret); /* MJD */
7611 nextchar(pRExC_state);
7613 /* len is STRLEN which is unsigned, need to copy to signed */
7616 vFAIL("Internal disaster");
7620 if (len == 1 && UNI_IS_INVARIANT(ender))
7624 RExC_size += STR_SZ(len);
7627 RExC_emit += STR_SZ(len);
7637 S_regwhite( RExC_state_t *pRExC_state, char *p )
7639 const char *e = RExC_end;
7641 PERL_ARGS_ASSERT_REGWHITE;
7646 else if (*p == '#') {
7655 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7663 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7664 Character classes ([:foo:]) can also be negated ([:^foo:]).
7665 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7666 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7667 but trigger failures because they are currently unimplemented. */
7669 #define POSIXCC_DONE(c) ((c) == ':')
7670 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7671 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7674 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7677 I32 namedclass = OOB_NAMEDCLASS;
7679 PERL_ARGS_ASSERT_REGPPOSIXCC;
7681 if (value == '[' && RExC_parse + 1 < RExC_end &&
7682 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7683 POSIXCC(UCHARAT(RExC_parse))) {
7684 const char c = UCHARAT(RExC_parse);
7685 char* const s = RExC_parse++;
7687 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7689 if (RExC_parse == RExC_end)
7690 /* Grandfather lone [:, [=, [. */
7693 const char* const t = RExC_parse++; /* skip over the c */
7696 if (UCHARAT(RExC_parse) == ']') {
7697 const char *posixcc = s + 1;
7698 RExC_parse++; /* skip over the ending ] */
7701 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7702 const I32 skip = t - posixcc;
7704 /* Initially switch on the length of the name. */
7707 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7708 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7711 /* Names all of length 5. */
7712 /* alnum alpha ascii blank cntrl digit graph lower
7713 print punct space upper */
7714 /* Offset 4 gives the best switch position. */
7715 switch (posixcc[4]) {
7717 if (memEQ(posixcc, "alph", 4)) /* alpha */
7718 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7721 if (memEQ(posixcc, "spac", 4)) /* space */
7722 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7725 if (memEQ(posixcc, "grap", 4)) /* graph */
7726 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7729 if (memEQ(posixcc, "asci", 4)) /* ascii */
7730 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7733 if (memEQ(posixcc, "blan", 4)) /* blank */
7734 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7737 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7738 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7741 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7742 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7745 if (memEQ(posixcc, "lowe", 4)) /* lower */
7746 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7747 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7748 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7751 if (memEQ(posixcc, "digi", 4)) /* digit */
7752 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7753 else if (memEQ(posixcc, "prin", 4)) /* print */
7754 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7755 else if (memEQ(posixcc, "punc", 4)) /* punct */
7756 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7761 if (memEQ(posixcc, "xdigit", 6))
7762 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7766 if (namedclass == OOB_NAMEDCLASS)
7767 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7769 assert (posixcc[skip] == ':');
7770 assert (posixcc[skip+1] == ']');
7771 } else if (!SIZE_ONLY) {
7772 /* [[=foo=]] and [[.foo.]] are still future. */
7774 /* adjust RExC_parse so the warning shows after
7776 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7778 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7781 /* Maternal grandfather:
7782 * "[:" ending in ":" but not in ":]" */
7792 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7796 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7798 if (POSIXCC(UCHARAT(RExC_parse))) {
7799 const char *s = RExC_parse;
7800 const char c = *s++;
7804 if (*s && c == *s && s[1] == ']') {
7806 "POSIX syntax [%c %c] belongs inside character classes",
7809 /* [[=foo=]] and [[.foo.]] are still future. */
7810 if (POSIXCC_NOTYET(c)) {
7811 /* adjust RExC_parse so the error shows after
7813 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7815 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7822 #define _C_C_T_(NAME,TEST,WORD) \
7825 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7827 for (value = 0; value < 256; value++) \
7829 ANYOF_BITMAP_SET(ret, value); \
7834 case ANYOF_N##NAME: \
7836 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7838 for (value = 0; value < 256; value++) \
7840 ANYOF_BITMAP_SET(ret, value); \
7846 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7848 for (value = 0; value < 256; value++) \
7850 ANYOF_BITMAP_SET(ret, value); \
7854 case ANYOF_N##NAME: \
7855 for (value = 0; value < 256; value++) \
7857 ANYOF_BITMAP_SET(ret, value); \
7863 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7864 so that it is possible to override the option here without having to
7865 rebuild the entire core. as we are required to do if we change regcomp.h
7866 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7868 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7869 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7872 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7873 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7875 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7879 parse a class specification and produce either an ANYOF node that
7880 matches the pattern or if the pattern matches a single char only and
7881 that char is < 256 and we are case insensitive then we produce an
7886 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7889 register UV nextvalue;
7890 register IV prevvalue = OOB_UNICODE;
7891 register IV range = 0;
7892 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7893 register regnode *ret;
7896 char *rangebegin = NULL;
7897 bool need_class = 0;
7900 bool optimize_invert = TRUE;
7901 AV* unicode_alternate = NULL;
7903 UV literal_endpoint = 0;
7905 UV stored = 0; /* number of chars stored in the class */
7907 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7908 case we need to change the emitted regop to an EXACT. */
7909 const char * orig_parse = RExC_parse;
7910 GET_RE_DEBUG_FLAGS_DECL;
7912 PERL_ARGS_ASSERT_REGCLASS;
7914 PERL_UNUSED_ARG(depth);
7917 DEBUG_PARSE("clas");
7919 /* Assume we are going to generate an ANYOF node. */
7920 ret = reganode(pRExC_state, ANYOF, 0);
7923 ANYOF_FLAGS(ret) = 0;
7925 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7929 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7933 RExC_size += ANYOF_SKIP;
7934 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7937 RExC_emit += ANYOF_SKIP;
7939 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7941 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7942 ANYOF_BITMAP_ZERO(ret);
7943 listsv = newSVpvs("# comment\n");
7946 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7948 if (!SIZE_ONLY && POSIXCC(nextvalue))
7949 checkposixcc(pRExC_state);
7951 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7952 if (UCHARAT(RExC_parse) == ']')
7956 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7960 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7963 rangebegin = RExC_parse;
7965 value = utf8n_to_uvchr((U8*)RExC_parse,
7966 RExC_end - RExC_parse,
7967 &numlen, UTF8_ALLOW_DEFAULT);
7968 RExC_parse += numlen;
7971 value = UCHARAT(RExC_parse++);
7973 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7974 if (value == '[' && POSIXCC(nextvalue))
7975 namedclass = regpposixcc(pRExC_state, value);
7976 else if (value == '\\') {
7978 value = utf8n_to_uvchr((U8*)RExC_parse,
7979 RExC_end - RExC_parse,
7980 &numlen, UTF8_ALLOW_DEFAULT);
7981 RExC_parse += numlen;
7984 value = UCHARAT(RExC_parse++);
7985 /* Some compilers cannot handle switching on 64-bit integer
7986 * values, therefore value cannot be an UV. Yes, this will
7987 * be a problem later if we want switch on Unicode.
7988 * A similar issue a little bit later when switching on
7989 * namedclass. --jhi */
7990 switch ((I32)value) {
7991 case 'w': namedclass = ANYOF_ALNUM; break;
7992 case 'W': namedclass = ANYOF_NALNUM; break;
7993 case 's': namedclass = ANYOF_SPACE; break;
7994 case 'S': namedclass = ANYOF_NSPACE; break;
7995 case 'd': namedclass = ANYOF_DIGIT; break;
7996 case 'D': namedclass = ANYOF_NDIGIT; break;
7997 case 'v': namedclass = ANYOF_VERTWS; break;
7998 case 'V': namedclass = ANYOF_NVERTWS; break;
7999 case 'h': namedclass = ANYOF_HORIZWS; break;
8000 case 'H': namedclass = ANYOF_NHORIZWS; break;
8001 case 'N': /* Handle \N{NAME} in class */
8003 /* We only pay attention to the first char of
8004 multichar strings being returned. I kinda wonder
8005 if this makes sense as it does change the behaviour
8006 from earlier versions, OTOH that behaviour was broken
8008 UV v; /* value is register so we cant & it /grrr */
8009 if (reg_namedseq(pRExC_state, &v, NULL)) {
8019 if (RExC_parse >= RExC_end)
8020 vFAIL2("Empty \\%c{}", (U8)value);
8021 if (*RExC_parse == '{') {
8022 const U8 c = (U8)value;
8023 e = strchr(RExC_parse++, '}');
8025 vFAIL2("Missing right brace on \\%c{}", c);
8026 while (isSPACE(UCHARAT(RExC_parse)))
8028 if (e == RExC_parse)
8029 vFAIL2("Empty \\%c{}", c);
8031 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8039 if (UCHARAT(RExC_parse) == '^') {
8042 value = value == 'p' ? 'P' : 'p'; /* toggle */
8043 while (isSPACE(UCHARAT(RExC_parse))) {
8048 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8049 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8052 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8053 namedclass = ANYOF_MAX; /* no official name, but it's named */
8056 case 'n': value = '\n'; break;
8057 case 'r': value = '\r'; break;
8058 case 't': value = '\t'; break;
8059 case 'f': value = '\f'; break;
8060 case 'b': value = '\b'; break;
8061 case 'e': value = ASCII_TO_NATIVE('\033');break;
8062 case 'a': value = ASCII_TO_NATIVE('\007');break;
8064 if (*RExC_parse == '{') {
8065 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8066 | PERL_SCAN_DISALLOW_PREFIX;
8067 char * const e = strchr(RExC_parse++, '}');
8069 vFAIL("Missing right brace on \\x{}");
8071 numlen = e - RExC_parse;
8072 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8076 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8078 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8079 RExC_parse += numlen;
8081 if (PL_encoding && value < 0x100)
8082 goto recode_encoding;
8085 value = UCHARAT(RExC_parse++);
8086 value = toCTRL(value);
8088 case '0': case '1': case '2': case '3': case '4':
8089 case '5': case '6': case '7': case '8': case '9':
8093 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8094 RExC_parse += numlen;
8095 if (PL_encoding && value < 0x100)
8096 goto recode_encoding;
8101 SV* enc = PL_encoding;
8102 value = reg_recode((const char)(U8)value, &enc);
8103 if (!enc && SIZE_ONLY)
8104 ckWARNreg(RExC_parse,
8105 "Invalid escape in the specified encoding");
8109 if (!SIZE_ONLY && isALPHA(value))
8110 ckWARN2reg(RExC_parse,
8111 "Unrecognized escape \\%c in character class passed through",
8115 } /* end of \blah */
8121 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8123 if (!SIZE_ONLY && !need_class)
8124 ANYOF_CLASS_ZERO(ret);
8128 /* a bad range like a-\d, a-[:digit:] ? */
8132 RExC_parse >= rangebegin ?
8133 RExC_parse - rangebegin : 0;
8134 ckWARN4reg(RExC_parse,
8135 "False [] range \"%*.*s\"",
8138 if (prevvalue < 256) {
8139 ANYOF_BITMAP_SET(ret, prevvalue);
8140 ANYOF_BITMAP_SET(ret, '-');
8143 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8144 Perl_sv_catpvf(aTHX_ listsv,
8145 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8149 range = 0; /* this was not a true range */
8155 const char *what = NULL;
8158 if (namedclass > OOB_NAMEDCLASS)
8159 optimize_invert = FALSE;
8160 /* Possible truncation here but in some 64-bit environments
8161 * the compiler gets heartburn about switch on 64-bit values.
8162 * A similar issue a little earlier when switching on value.
8164 switch ((I32)namedclass) {
8166 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8167 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8168 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8169 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8170 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8171 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8172 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8173 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8174 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8175 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8176 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8177 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8178 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8180 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8181 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8183 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8184 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8185 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8188 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8191 for (value = 0; value < 128; value++)
8192 ANYOF_BITMAP_SET(ret, value);
8194 for (value = 0; value < 256; value++) {
8196 ANYOF_BITMAP_SET(ret, value);
8205 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8208 for (value = 128; value < 256; value++)
8209 ANYOF_BITMAP_SET(ret, value);
8211 for (value = 0; value < 256; value++) {
8212 if (!isASCII(value))
8213 ANYOF_BITMAP_SET(ret, value);
8222 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8224 /* consecutive digits assumed */
8225 for (value = '0'; value <= '9'; value++)
8226 ANYOF_BITMAP_SET(ret, value);
8229 what = POSIX_CC_UNI_NAME("Digit");
8233 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8235 /* consecutive digits assumed */
8236 for (value = 0; value < '0'; value++)
8237 ANYOF_BITMAP_SET(ret, value);
8238 for (value = '9' + 1; value < 256; value++)
8239 ANYOF_BITMAP_SET(ret, value);
8242 what = POSIX_CC_UNI_NAME("Digit");
8245 /* this is to handle \p and \P */
8248 vFAIL("Invalid [::] class");
8252 /* Strings such as "+utf8::isWord\n" */
8253 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8256 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8259 } /* end of namedclass \blah */
8262 if (prevvalue > (IV)value) /* b-a */ {
8263 const int w = RExC_parse - rangebegin;
8264 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8265 range = 0; /* not a valid range */
8269 prevvalue = value; /* save the beginning of the range */
8270 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8271 RExC_parse[1] != ']') {
8274 /* a bad range like \w-, [:word:]- ? */
8275 if (namedclass > OOB_NAMEDCLASS) {
8276 if (ckWARN(WARN_REGEXP)) {
8278 RExC_parse >= rangebegin ?
8279 RExC_parse - rangebegin : 0;
8281 "False [] range \"%*.*s\"",
8285 ANYOF_BITMAP_SET(ret, '-');
8287 range = 1; /* yeah, it's a range! */
8288 continue; /* but do it the next time */
8292 /* now is the next time */
8293 /*stored += (value - prevvalue + 1);*/
8295 if (prevvalue < 256) {
8296 const IV ceilvalue = value < 256 ? value : 255;
8299 /* In EBCDIC [\x89-\x91] should include
8300 * the \x8e but [i-j] should not. */
8301 if (literal_endpoint == 2 &&
8302 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8303 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8305 if (isLOWER(prevvalue)) {
8306 for (i = prevvalue; i <= ceilvalue; i++)
8307 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8309 ANYOF_BITMAP_SET(ret, i);
8312 for (i = prevvalue; i <= ceilvalue; i++)
8313 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8315 ANYOF_BITMAP_SET(ret, i);
8321 for (i = prevvalue; i <= ceilvalue; i++) {
8322 if (!ANYOF_BITMAP_TEST(ret,i)) {
8324 ANYOF_BITMAP_SET(ret, i);
8328 if (value > 255 || UTF) {
8329 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8330 const UV natvalue = NATIVE_TO_UNI(value);
8331 stored+=2; /* can't optimize this class */
8332 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8333 if (prevnatvalue < natvalue) { /* what about > ? */
8334 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8335 prevnatvalue, natvalue);
8337 else if (prevnatvalue == natvalue) {
8338 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8340 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8342 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8344 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8345 if (RExC_precomp[0] == ':' &&
8346 RExC_precomp[1] == '[' &&
8347 (f == 0xDF || f == 0x92)) {
8348 f = NATIVE_TO_UNI(f);
8351 /* If folding and foldable and a single
8352 * character, insert also the folded version
8353 * to the charclass. */
8355 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8356 if ((RExC_precomp[0] == ':' &&
8357 RExC_precomp[1] == '[' &&
8359 (value == 0xFB05 || value == 0xFB06))) ?
8360 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8361 foldlen == (STRLEN)UNISKIP(f) )
8363 if (foldlen == (STRLEN)UNISKIP(f))
8365 Perl_sv_catpvf(aTHX_ listsv,
8368 /* Any multicharacter foldings
8369 * require the following transform:
8370 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8371 * where E folds into "pq" and F folds
8372 * into "rst", all other characters
8373 * fold to single characters. We save
8374 * away these multicharacter foldings,
8375 * to be later saved as part of the
8376 * additional "s" data. */
8379 if (!unicode_alternate)
8380 unicode_alternate = newAV();
8381 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8383 av_push(unicode_alternate, sv);
8387 /* If folding and the value is one of the Greek
8388 * sigmas insert a few more sigmas to make the
8389 * folding rules of the sigmas to work right.
8390 * Note that not all the possible combinations
8391 * are handled here: some of them are handled
8392 * by the standard folding rules, and some of
8393 * them (literal or EXACTF cases) are handled
8394 * during runtime in regexec.c:S_find_byclass(). */
8395 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8396 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8397 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8398 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8399 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8401 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8402 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8403 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8408 literal_endpoint = 0;
8412 range = 0; /* this range (if it was one) is done now */
8416 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8418 RExC_size += ANYOF_CLASS_ADD_SKIP;
8420 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8426 /****** !SIZE_ONLY AFTER HERE *********/
8428 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8429 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8431 /* optimize single char class to an EXACT node
8432 but *only* when its not a UTF/high char */
8433 const char * cur_parse= RExC_parse;
8434 RExC_emit = (regnode *)orig_emit;
8435 RExC_parse = (char *)orig_parse;
8436 ret = reg_node(pRExC_state,
8437 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8438 RExC_parse = (char *)cur_parse;
8439 *STRING(ret)= (char)value;
8441 RExC_emit += STR_SZ(1);
8443 SvREFCNT_dec(listsv);
8447 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8448 if ( /* If the only flag is folding (plus possibly inversion). */
8449 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8451 for (value = 0; value < 256; ++value) {
8452 if (ANYOF_BITMAP_TEST(ret, value)) {
8453 UV fold = PL_fold[value];
8456 ANYOF_BITMAP_SET(ret, fold);
8459 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8462 /* optimize inverted simple patterns (e.g. [^a-z]) */
8463 if (optimize_invert &&
8464 /* If the only flag is inversion. */
8465 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8466 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8467 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8468 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8471 AV * const av = newAV();
8473 /* The 0th element stores the character class description
8474 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8475 * to initialize the appropriate swash (which gets stored in
8476 * the 1st element), and also useful for dumping the regnode.
8477 * The 2nd element stores the multicharacter foldings,
8478 * used later (regexec.c:S_reginclass()). */
8479 av_store(av, 0, listsv);
8480 av_store(av, 1, NULL);
8481 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8482 rv = newRV_noinc(MUTABLE_SV(av));
8483 n = add_data(pRExC_state, 1, "s");
8484 RExC_rxi->data->data[n] = (void*)rv;
8492 /* reg_skipcomment()
8494 Absorbs an /x style # comments from the input stream.
8495 Returns true if there is more text remaining in the stream.
8496 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8497 terminates the pattern without including a newline.
8499 Note its the callers responsibility to ensure that we are
8505 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8509 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8511 while (RExC_parse < RExC_end)
8512 if (*RExC_parse++ == '\n') {
8517 /* we ran off the end of the pattern without ending
8518 the comment, so we have to add an \n when wrapping */
8519 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8527 Advance that parse position, and optionall absorbs
8528 "whitespace" from the inputstream.
8530 Without /x "whitespace" means (?#...) style comments only,
8531 with /x this means (?#...) and # comments and whitespace proper.
8533 Returns the RExC_parse point from BEFORE the scan occurs.
8535 This is the /x friendly way of saying RExC_parse++.
8539 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8541 char* const retval = RExC_parse++;
8543 PERL_ARGS_ASSERT_NEXTCHAR;
8546 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8547 RExC_parse[2] == '#') {
8548 while (*RExC_parse != ')') {
8549 if (RExC_parse == RExC_end)
8550 FAIL("Sequence (?#... not terminated");
8556 if (RExC_flags & RXf_PMf_EXTENDED) {
8557 if (isSPACE(*RExC_parse)) {
8561 else if (*RExC_parse == '#') {
8562 if ( reg_skipcomment( pRExC_state ) )
8571 - reg_node - emit a node
8573 STATIC regnode * /* Location. */
8574 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8577 register regnode *ptr;
8578 regnode * const ret = RExC_emit;
8579 GET_RE_DEBUG_FLAGS_DECL;
8581 PERL_ARGS_ASSERT_REG_NODE;
8584 SIZE_ALIGN(RExC_size);
8588 if (RExC_emit >= RExC_emit_bound)
8589 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8591 NODE_ALIGN_FILL(ret);
8593 FILL_ADVANCE_NODE(ptr, op);
8594 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8595 #ifdef RE_TRACK_PATTERN_OFFSETS
8596 if (RExC_offsets) { /* MJD */
8597 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8598 "reg_node", __LINE__,
8600 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8601 ? "Overwriting end of array!\n" : "OK",
8602 (UV)(RExC_emit - RExC_emit_start),
8603 (UV)(RExC_parse - RExC_start),
8604 (UV)RExC_offsets[0]));
8605 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8613 - reganode - emit a node with an argument
8615 STATIC regnode * /* Location. */
8616 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8619 register regnode *ptr;
8620 regnode * const ret = RExC_emit;
8621 GET_RE_DEBUG_FLAGS_DECL;
8623 PERL_ARGS_ASSERT_REGANODE;
8626 SIZE_ALIGN(RExC_size);
8631 assert(2==regarglen[op]+1);
8633 Anything larger than this has to allocate the extra amount.
8634 If we changed this to be:
8636 RExC_size += (1 + regarglen[op]);
8638 then it wouldn't matter. Its not clear what side effect
8639 might come from that so its not done so far.
8644 if (RExC_emit >= RExC_emit_bound)
8645 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8647 NODE_ALIGN_FILL(ret);
8649 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8650 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8651 #ifdef RE_TRACK_PATTERN_OFFSETS
8652 if (RExC_offsets) { /* MJD */
8653 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8657 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8658 "Overwriting end of array!\n" : "OK",
8659 (UV)(RExC_emit - RExC_emit_start),
8660 (UV)(RExC_parse - RExC_start),
8661 (UV)RExC_offsets[0]));
8662 Set_Cur_Node_Offset;
8670 - reguni - emit (if appropriate) a Unicode character
8673 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8677 PERL_ARGS_ASSERT_REGUNI;
8679 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8683 - reginsert - insert an operator in front of already-emitted operand
8685 * Means relocating the operand.
8688 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8691 register regnode *src;
8692 register regnode *dst;
8693 register regnode *place;
8694 const int offset = regarglen[(U8)op];
8695 const int size = NODE_STEP_REGNODE + offset;
8696 GET_RE_DEBUG_FLAGS_DECL;
8698 PERL_ARGS_ASSERT_REGINSERT;
8699 PERL_UNUSED_ARG(depth);
8700 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8701 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8710 if (RExC_open_parens) {
8712 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8713 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8714 if ( RExC_open_parens[paren] >= opnd ) {
8715 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8716 RExC_open_parens[paren] += size;
8718 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8720 if ( RExC_close_parens[paren] >= opnd ) {
8721 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8722 RExC_close_parens[paren] += size;
8724 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8729 while (src > opnd) {
8730 StructCopy(--src, --dst, regnode);
8731 #ifdef RE_TRACK_PATTERN_OFFSETS
8732 if (RExC_offsets) { /* MJD 20010112 */
8733 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8737 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8738 ? "Overwriting end of array!\n" : "OK",
8739 (UV)(src - RExC_emit_start),
8740 (UV)(dst - RExC_emit_start),
8741 (UV)RExC_offsets[0]));
8742 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8743 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8749 place = opnd; /* Op node, where operand used to be. */
8750 #ifdef RE_TRACK_PATTERN_OFFSETS
8751 if (RExC_offsets) { /* MJD */
8752 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8756 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8757 ? "Overwriting end of array!\n" : "OK",
8758 (UV)(place - RExC_emit_start),
8759 (UV)(RExC_parse - RExC_start),
8760 (UV)RExC_offsets[0]));
8761 Set_Node_Offset(place, RExC_parse);
8762 Set_Node_Length(place, 1);
8765 src = NEXTOPER(place);
8766 FILL_ADVANCE_NODE(place, op);
8767 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8768 Zero(src, offset, regnode);
8772 - regtail - set the next-pointer at the end of a node chain of p to val.
8773 - SEE ALSO: regtail_study
8775 /* TODO: All three parms should be const */
8777 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8780 register regnode *scan;
8781 GET_RE_DEBUG_FLAGS_DECL;
8783 PERL_ARGS_ASSERT_REGTAIL;
8785 PERL_UNUSED_ARG(depth);
8791 /* Find last node. */
8794 regnode * const temp = regnext(scan);
8796 SV * const mysv=sv_newmortal();
8797 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8798 regprop(RExC_rx, mysv, scan);
8799 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8800 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8801 (temp == NULL ? "->" : ""),
8802 (temp == NULL ? PL_reg_name[OP(val)] : "")
8810 if (reg_off_by_arg[OP(scan)]) {
8811 ARG_SET(scan, val - scan);
8814 NEXT_OFF(scan) = val - scan;
8820 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8821 - Look for optimizable sequences at the same time.
8822 - currently only looks for EXACT chains.
8824 This is expermental code. The idea is to use this routine to perform
8825 in place optimizations on branches and groups as they are constructed,
8826 with the long term intention of removing optimization from study_chunk so
8827 that it is purely analytical.
8829 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8830 to control which is which.
8833 /* TODO: All four parms should be const */
8836 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8839 register regnode *scan;
8841 #ifdef EXPERIMENTAL_INPLACESCAN
8844 GET_RE_DEBUG_FLAGS_DECL;
8846 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8852 /* Find last node. */
8856 regnode * const temp = regnext(scan);
8857 #ifdef EXPERIMENTAL_INPLACESCAN
8858 if (PL_regkind[OP(scan)] == EXACT)
8859 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8867 if( exact == PSEUDO )
8869 else if ( exact != OP(scan) )
8878 SV * const mysv=sv_newmortal();
8879 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8880 regprop(RExC_rx, mysv, scan);
8881 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8882 SvPV_nolen_const(mysv),
8884 PL_reg_name[exact]);
8891 SV * const mysv_val=sv_newmortal();
8892 DEBUG_PARSE_MSG("");
8893 regprop(RExC_rx, mysv_val, val);
8894 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8895 SvPV_nolen_const(mysv_val),
8896 (IV)REG_NODE_NUM(val),
8900 if (reg_off_by_arg[OP(scan)]) {
8901 ARG_SET(scan, val - scan);
8904 NEXT_OFF(scan) = val - scan;
8912 - regcurly - a little FSA that accepts {\d+,?\d*}
8915 S_regcurly(register const char *s)
8917 PERL_ARGS_ASSERT_REGCURLY;
8936 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8940 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8945 for (bit=0; bit<32; bit++) {
8946 if (flags & (1<<bit)) {
8948 PerlIO_printf(Perl_debug_log, "%s",lead);
8949 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8954 PerlIO_printf(Perl_debug_log, "\n");
8956 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8962 Perl_regdump(pTHX_ const regexp *r)
8966 SV * const sv = sv_newmortal();
8967 SV *dsv= sv_newmortal();
8969 GET_RE_DEBUG_FLAGS_DECL;
8971 PERL_ARGS_ASSERT_REGDUMP;
8973 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8975 /* Header fields of interest. */
8976 if (r->anchored_substr) {
8977 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8978 RE_SV_DUMPLEN(r->anchored_substr), 30);
8979 PerlIO_printf(Perl_debug_log,
8980 "anchored %s%s at %"IVdf" ",
8981 s, RE_SV_TAIL(r->anchored_substr),
8982 (IV)r->anchored_offset);
8983 } else if (r->anchored_utf8) {
8984 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8985 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8986 PerlIO_printf(Perl_debug_log,
8987 "anchored utf8 %s%s at %"IVdf" ",
8988 s, RE_SV_TAIL(r->anchored_utf8),
8989 (IV)r->anchored_offset);
8991 if (r->float_substr) {
8992 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8993 RE_SV_DUMPLEN(r->float_substr), 30);
8994 PerlIO_printf(Perl_debug_log,
8995 "floating %s%s at %"IVdf"..%"UVuf" ",
8996 s, RE_SV_TAIL(r->float_substr),
8997 (IV)r->float_min_offset, (UV)r->float_max_offset);
8998 } else if (r->float_utf8) {
8999 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9000 RE_SV_DUMPLEN(r->float_utf8), 30);
9001 PerlIO_printf(Perl_debug_log,
9002 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9003 s, RE_SV_TAIL(r->float_utf8),
9004 (IV)r->float_min_offset, (UV)r->float_max_offset);
9006 if (r->check_substr || r->check_utf8)
9007 PerlIO_printf(Perl_debug_log,
9009 (r->check_substr == r->float_substr
9010 && r->check_utf8 == r->float_utf8
9011 ? "(checking floating" : "(checking anchored"));
9012 if (r->extflags & RXf_NOSCAN)
9013 PerlIO_printf(Perl_debug_log, " noscan");
9014 if (r->extflags & RXf_CHECK_ALL)
9015 PerlIO_printf(Perl_debug_log, " isall");
9016 if (r->check_substr || r->check_utf8)
9017 PerlIO_printf(Perl_debug_log, ") ");
9019 if (ri->regstclass) {
9020 regprop(r, sv, ri->regstclass);
9021 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9023 if (r->extflags & RXf_ANCH) {
9024 PerlIO_printf(Perl_debug_log, "anchored");
9025 if (r->extflags & RXf_ANCH_BOL)
9026 PerlIO_printf(Perl_debug_log, "(BOL)");
9027 if (r->extflags & RXf_ANCH_MBOL)
9028 PerlIO_printf(Perl_debug_log, "(MBOL)");
9029 if (r->extflags & RXf_ANCH_SBOL)
9030 PerlIO_printf(Perl_debug_log, "(SBOL)");
9031 if (r->extflags & RXf_ANCH_GPOS)
9032 PerlIO_printf(Perl_debug_log, "(GPOS)");
9033 PerlIO_putc(Perl_debug_log, ' ');
9035 if (r->extflags & RXf_GPOS_SEEN)
9036 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9037 if (r->intflags & PREGf_SKIP)
9038 PerlIO_printf(Perl_debug_log, "plus ");
9039 if (r->intflags & PREGf_IMPLICIT)
9040 PerlIO_printf(Perl_debug_log, "implicit ");
9041 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9042 if (r->extflags & RXf_EVAL_SEEN)
9043 PerlIO_printf(Perl_debug_log, "with eval ");
9044 PerlIO_printf(Perl_debug_log, "\n");
9045 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9047 PERL_ARGS_ASSERT_REGDUMP;
9048 PERL_UNUSED_CONTEXT;
9050 #endif /* DEBUGGING */
9054 - regprop - printable representation of opcode
9056 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9059 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9060 if (flags & ANYOF_INVERT) \
9061 /*make sure the invert info is in each */ \
9062 sv_catpvs(sv, "^"); \
9068 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9073 RXi_GET_DECL(prog,progi);
9074 GET_RE_DEBUG_FLAGS_DECL;
9076 PERL_ARGS_ASSERT_REGPROP;
9080 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9081 /* It would be nice to FAIL() here, but this may be called from
9082 regexec.c, and it would be hard to supply pRExC_state. */
9083 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9084 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9086 k = PL_regkind[OP(o)];
9090 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9091 * is a crude hack but it may be the best for now since
9092 * we have no flag "this EXACTish node was UTF-8"
9094 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9095 PERL_PV_ESCAPE_UNI_DETECT |
9096 PERL_PV_PRETTY_ELLIPSES |
9097 PERL_PV_PRETTY_LTGT |
9098 PERL_PV_PRETTY_NOCLEAR
9100 } else if (k == TRIE) {
9101 /* print the details of the trie in dumpuntil instead, as
9102 * progi->data isn't available here */
9103 const char op = OP(o);
9104 const U32 n = ARG(o);
9105 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9106 (reg_ac_data *)progi->data->data[n] :
9108 const reg_trie_data * const trie
9109 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9111 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9112 DEBUG_TRIE_COMPILE_r(
9113 Perl_sv_catpvf(aTHX_ sv,
9114 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9115 (UV)trie->startstate,
9116 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9117 (UV)trie->wordcount,
9120 (UV)TRIE_CHARCOUNT(trie),
9121 (UV)trie->uniquecharcount
9124 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9126 int rangestart = -1;
9127 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9129 for (i = 0; i <= 256; i++) {
9130 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9131 if (rangestart == -1)
9133 } else if (rangestart != -1) {
9134 if (i <= rangestart + 3)
9135 for (; rangestart < i; rangestart++)
9136 put_byte(sv, rangestart);
9138 put_byte(sv, rangestart);
9140 put_byte(sv, i - 1);
9148 } else if (k == CURLY) {
9149 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9150 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9151 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9153 else if (k == WHILEM && o->flags) /* Ordinal/of */
9154 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9155 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9156 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9157 if ( RXp_PAREN_NAMES(prog) ) {
9158 if ( k != REF || OP(o) < NREF) {
9159 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9160 SV **name= av_fetch(list, ARG(o), 0 );
9162 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9165 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9166 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9167 I32 *nums=(I32*)SvPVX(sv_dat);
9168 SV **name= av_fetch(list, nums[0], 0 );
9171 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9172 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9173 (n ? "," : ""), (IV)nums[n]);
9175 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9179 } else if (k == GOSUB)
9180 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9181 else if (k == VERB) {
9183 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9184 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9185 } else if (k == LOGICAL)
9186 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9187 else if (k == FOLDCHAR)
9188 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9189 else if (k == ANYOF) {
9190 int i, rangestart = -1;
9191 const U8 flags = ANYOF_FLAGS(o);
9194 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9195 static const char * const anyofs[] = {
9228 if (flags & ANYOF_LOCALE)
9229 sv_catpvs(sv, "{loc}");
9230 if (flags & ANYOF_FOLD)
9231 sv_catpvs(sv, "{i}");
9232 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9233 if (flags & ANYOF_INVERT)
9236 /* output what the standard cp 0-255 bitmap matches */
9237 for (i = 0; i <= 256; i++) {
9238 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9239 if (rangestart == -1)
9241 } else if (rangestart != -1) {
9242 if (i <= rangestart + 3)
9243 for (; rangestart < i; rangestart++)
9244 put_byte(sv, rangestart);
9246 put_byte(sv, rangestart);
9248 put_byte(sv, i - 1);
9255 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9256 /* output any special charclass tests (used mostly under use locale) */
9257 if (o->flags & ANYOF_CLASS)
9258 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9259 if (ANYOF_CLASS_TEST(o,i)) {
9260 sv_catpv(sv, anyofs[i]);
9264 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9266 /* output information about the unicode matching */
9267 if (flags & ANYOF_UNICODE)
9268 sv_catpvs(sv, "{unicode}");
9269 else if (flags & ANYOF_UNICODE_ALL)
9270 sv_catpvs(sv, "{unicode_all}");
9274 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9278 U8 s[UTF8_MAXBYTES_CASE+1];
9280 for (i = 0; i <= 256; i++) { /* just the first 256 */
9281 uvchr_to_utf8(s, i);
9283 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9284 if (rangestart == -1)
9286 } else if (rangestart != -1) {
9287 if (i <= rangestart + 3)
9288 for (; rangestart < i; rangestart++) {
9289 const U8 * const e = uvchr_to_utf8(s,rangestart);
9291 for(p = s; p < e; p++)
9295 const U8 *e = uvchr_to_utf8(s,rangestart);
9297 for (p = s; p < e; p++)
9300 e = uvchr_to_utf8(s, i-1);
9301 for (p = s; p < e; p++)
9308 sv_catpvs(sv, "..."); /* et cetera */
9312 char *s = savesvpv(lv);
9313 char * const origs = s;
9315 while (*s && *s != '\n')
9319 const char * const t = ++s;
9337 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9339 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9340 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9342 PERL_UNUSED_CONTEXT;
9343 PERL_UNUSED_ARG(sv);
9345 PERL_UNUSED_ARG(prog);
9346 #endif /* DEBUGGING */
9350 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9351 { /* Assume that RE_INTUIT is set */
9353 struct regexp *const prog = (struct regexp *)SvANY(r);
9354 GET_RE_DEBUG_FLAGS_DECL;
9356 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9357 PERL_UNUSED_CONTEXT;
9361 const char * const s = SvPV_nolen_const(prog->check_substr
9362 ? prog->check_substr : prog->check_utf8);
9364 if (!PL_colorset) reginitcolors();
9365 PerlIO_printf(Perl_debug_log,
9366 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9368 prog->check_substr ? "" : "utf8 ",
9369 PL_colors[5],PL_colors[0],
9372 (strlen(s) > 60 ? "..." : ""));
9375 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9381 handles refcounting and freeing the perl core regexp structure. When
9382 it is necessary to actually free the structure the first thing it
9383 does is call the 'free' method of the regexp_engine associated to to
9384 the regexp, allowing the handling of the void *pprivate; member
9385 first. (This routine is not overridable by extensions, which is why
9386 the extensions free is called first.)
9388 See regdupe and regdupe_internal if you change anything here.
9390 #ifndef PERL_IN_XSUB_RE
9392 Perl_pregfree(pTHX_ REGEXP *r)
9398 Perl_pregfree2(pTHX_ REGEXP *rx)
9401 struct regexp *const r = (struct regexp *)SvANY(rx);
9402 GET_RE_DEBUG_FLAGS_DECL;
9404 PERL_ARGS_ASSERT_PREGFREE2;
9407 ReREFCNT_dec(r->mother_re);
9409 CALLREGFREE_PVT(rx); /* free the private data */
9410 if (RXp_PAREN_NAMES(r))
9411 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9414 if (r->anchored_substr)
9415 SvREFCNT_dec(r->anchored_substr);
9416 if (r->anchored_utf8)
9417 SvREFCNT_dec(r->anchored_utf8);
9418 if (r->float_substr)
9419 SvREFCNT_dec(r->float_substr);
9421 SvREFCNT_dec(r->float_utf8);
9422 Safefree(r->substrs);
9424 RX_MATCH_COPY_FREE(rx);
9425 #ifdef PERL_OLD_COPY_ON_WRITE
9427 SvREFCNT_dec(r->saved_copy);
9434 This is a hacky workaround to the structural issue of match results
9435 being stored in the regexp structure which is in turn stored in
9436 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9437 could be PL_curpm in multiple contexts, and could require multiple
9438 result sets being associated with the pattern simultaneously, such
9439 as when doing a recursive match with (??{$qr})
9441 The solution is to make a lightweight copy of the regexp structure
9442 when a qr// is returned from the code executed by (??{$qr}) this
9443 lightweight copy doesnt actually own any of its data except for
9444 the starp/end and the actual regexp structure itself.
9450 Perl_reg_temp_copy (pTHX_ REGEXP *rx)
9452 REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9453 struct regexp *ret = (struct regexp *)SvANY(ret_x);
9454 struct regexp *const r = (struct regexp *)SvANY(rx);
9455 register const I32 npar = r->nparens+1;
9457 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9459 (void)ReREFCNT_inc(rx);
9460 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9461 by pointing directly at the buffer, but flagging that the allocated
9462 space in the copy is zero. As we've just done a struct copy, it's now
9463 a case of zero-ing that, rather than copying the current length. */
9464 SvPV_set(ret_x, RX_WRAPPED(rx));
9465 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9466 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9467 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9468 SvLEN_set(ret_x, 0);
9469 Newx(ret->offs, npar, regexp_paren_pair);
9470 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9472 Newx(ret->substrs, 1, struct reg_substr_data);
9473 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9475 SvREFCNT_inc_void(ret->anchored_substr);
9476 SvREFCNT_inc_void(ret->anchored_utf8);
9477 SvREFCNT_inc_void(ret->float_substr);
9478 SvREFCNT_inc_void(ret->float_utf8);
9480 /* check_substr and check_utf8, if non-NULL, point to either their
9481 anchored or float namesakes, and don't hold a second reference. */
9483 RX_MATCH_COPIED_off(ret_x);
9484 #ifdef PERL_OLD_COPY_ON_WRITE
9485 ret->saved_copy = NULL;
9487 ret->mother_re = rx;
9493 /* regfree_internal()
9495 Free the private data in a regexp. This is overloadable by
9496 extensions. Perl takes care of the regexp structure in pregfree(),
9497 this covers the *pprivate pointer which technically perldoesnt
9498 know about, however of course we have to handle the
9499 regexp_internal structure when no extension is in use.
9501 Note this is called before freeing anything in the regexp
9506 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9509 struct regexp *const r = (struct regexp *)SvANY(rx);
9511 GET_RE_DEBUG_FLAGS_DECL;
9513 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9519 SV *dsv= sv_newmortal();
9520 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9521 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9522 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9523 PL_colors[4],PL_colors[5],s);
9526 #ifdef RE_TRACK_PATTERN_OFFSETS
9528 Safefree(ri->u.offsets); /* 20010421 MJD */
9531 int n = ri->data->count;
9532 PAD* new_comppad = NULL;
9537 /* If you add a ->what type here, update the comment in regcomp.h */
9538 switch (ri->data->what[n]) {
9542 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9545 Safefree(ri->data->data[n]);
9548 new_comppad = MUTABLE_AV(ri->data->data[n]);
9551 if (new_comppad == NULL)
9552 Perl_croak(aTHX_ "panic: pregfree comppad");
9553 PAD_SAVE_LOCAL(old_comppad,
9554 /* Watch out for global destruction's random ordering. */
9555 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9558 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9561 op_free((OP_4tree*)ri->data->data[n]);
9563 PAD_RESTORE_LOCAL(old_comppad);
9564 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9570 { /* Aho Corasick add-on structure for a trie node.
9571 Used in stclass optimization only */
9573 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9575 refcount = --aho->refcount;
9578 PerlMemShared_free(aho->states);
9579 PerlMemShared_free(aho->fail);
9580 /* do this last!!!! */
9581 PerlMemShared_free(ri->data->data[n]);
9582 PerlMemShared_free(ri->regstclass);
9588 /* trie structure. */
9590 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9592 refcount = --trie->refcount;
9595 PerlMemShared_free(trie->charmap);
9596 PerlMemShared_free(trie->states);
9597 PerlMemShared_free(trie->trans);
9599 PerlMemShared_free(trie->bitmap);
9601 PerlMemShared_free(trie->wordlen);
9603 PerlMemShared_free(trie->jump);
9605 PerlMemShared_free(trie->nextword);
9606 /* do this last!!!! */
9607 PerlMemShared_free(ri->data->data[n]);
9612 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9615 Safefree(ri->data->what);
9622 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9623 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9624 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9625 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9628 re_dup - duplicate a regexp.
9630 This routine is expected to clone a given regexp structure. It is only
9631 compiled under USE_ITHREADS.
9633 After all of the core data stored in struct regexp is duplicated
9634 the regexp_engine.dupe method is used to copy any private data
9635 stored in the *pprivate pointer. This allows extensions to handle
9636 any duplication it needs to do.
9638 See pregfree() and regfree_internal() if you change anything here.
9640 #if defined(USE_ITHREADS)
9641 #ifndef PERL_IN_XSUB_RE
9643 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9647 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9648 struct regexp *ret = (struct regexp *)SvANY(dstr);
9650 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9652 npar = r->nparens+1;
9653 Newx(ret->offs, npar, regexp_paren_pair);
9654 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9656 /* no need to copy these */
9657 Newx(ret->swap, npar, regexp_paren_pair);
9661 /* Do it this way to avoid reading from *r after the StructCopy().
9662 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9663 cache, it doesn't matter. */
9664 const bool anchored = r->check_substr
9665 ? r->check_substr == r->anchored_substr
9666 : r->check_utf8 == r->anchored_utf8;
9667 Newx(ret->substrs, 1, struct reg_substr_data);
9668 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9670 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9671 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9672 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9673 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9675 /* check_substr and check_utf8, if non-NULL, point to either their
9676 anchored or float namesakes, and don't hold a second reference. */
9678 if (ret->check_substr) {
9680 assert(r->check_utf8 == r->anchored_utf8);
9681 ret->check_substr = ret->anchored_substr;
9682 ret->check_utf8 = ret->anchored_utf8;
9684 assert(r->check_substr == r->float_substr);
9685 assert(r->check_utf8 == r->float_utf8);
9686 ret->check_substr = ret->float_substr;
9687 ret->check_utf8 = ret->float_utf8;
9689 } else if (ret->check_utf8) {
9691 ret->check_utf8 = ret->anchored_utf8;
9693 ret->check_utf8 = ret->float_utf8;
9698 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9701 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9703 if (RX_MATCH_COPIED(dstr))
9704 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9707 #ifdef PERL_OLD_COPY_ON_WRITE
9708 ret->saved_copy = NULL;
9711 ret->mother_re = NULL;
9714 #endif /* PERL_IN_XSUB_RE */
9719 This is the internal complement to regdupe() which is used to copy
9720 the structure pointed to by the *pprivate pointer in the regexp.
9721 This is the core version of the extension overridable cloning hook.
9722 The regexp structure being duplicated will be copied by perl prior
9723 to this and will be provided as the regexp *r argument, however
9724 with the /old/ structures pprivate pointer value. Thus this routine
9725 may override any copying normally done by perl.
9727 It returns a pointer to the new regexp_internal structure.
9731 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9734 struct regexp *const r = (struct regexp *)SvANY(rx);
9735 regexp_internal *reti;
9739 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9741 npar = r->nparens+1;
9744 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9745 Copy(ri->program, reti->program, len+1, regnode);
9748 reti->regstclass = NULL;
9752 const int count = ri->data->count;
9755 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9756 char, struct reg_data);
9757 Newx(d->what, count, U8);
9760 for (i = 0; i < count; i++) {
9761 d->what[i] = ri->data->what[i];
9762 switch (d->what[i]) {
9763 /* legal options are one of: sSfpontTu
9764 see also regcomp.h and pregfree() */
9767 case 'p': /* actually an AV, but the dup function is identical. */
9768 case 'u': /* actually an HV, but the dup function is identical. */
9769 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9772 /* This is cheating. */
9773 Newx(d->data[i], 1, struct regnode_charclass_class);
9774 StructCopy(ri->data->data[i], d->data[i],
9775 struct regnode_charclass_class);
9776 reti->regstclass = (regnode*)d->data[i];
9779 /* Compiled op trees are readonly and in shared memory,
9780 and can thus be shared without duplication. */
9782 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9786 /* Trie stclasses are readonly and can thus be shared
9787 * without duplication. We free the stclass in pregfree
9788 * when the corresponding reg_ac_data struct is freed.
9790 reti->regstclass= ri->regstclass;
9794 ((reg_trie_data*)ri->data->data[i])->refcount++;
9798 d->data[i] = ri->data->data[i];
9801 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9810 reti->name_list_idx = ri->name_list_idx;
9812 #ifdef RE_TRACK_PATTERN_OFFSETS
9813 if (ri->u.offsets) {
9814 Newx(reti->u.offsets, 2*len+1, U32);
9815 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9818 SetProgLen(reti,len);
9824 #endif /* USE_ITHREADS */
9826 #ifndef PERL_IN_XSUB_RE
9829 - regnext - dig the "next" pointer out of a node
9832 Perl_regnext(pTHX_ register regnode *p)
9835 register I32 offset;
9840 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9849 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9852 STRLEN l1 = strlen(pat1);
9853 STRLEN l2 = strlen(pat2);
9856 const char *message;
9858 PERL_ARGS_ASSERT_RE_CROAK2;
9864 Copy(pat1, buf, l1 , char);
9865 Copy(pat2, buf + l1, l2 , char);
9866 buf[l1 + l2] = '\n';
9867 buf[l1 + l2 + 1] = '\0';
9869 /* ANSI variant takes additional second argument */
9870 va_start(args, pat2);
9874 msv = vmess(buf, &args);
9876 message = SvPV_const(msv,l1);
9879 Copy(message, buf, l1 , char);
9880 buf[l1-1] = '\0'; /* Overwrite \n */
9881 Perl_croak(aTHX_ "%s", buf);
9884 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9886 #ifndef PERL_IN_XSUB_RE
9888 Perl_save_re_context(pTHX)
9892 struct re_save_state *state;
9894 SAVEVPTR(PL_curcop);
9895 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9897 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9898 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9899 SSPUSHINT(SAVEt_RE_STATE);
9901 Copy(&PL_reg_state, state, 1, struct re_save_state);
9903 PL_reg_start_tmp = 0;
9904 PL_reg_start_tmpl = 0;
9905 PL_reg_oldsaved = NULL;
9906 PL_reg_oldsavedlen = 0;
9908 PL_reg_leftiter = 0;
9909 PL_reg_poscache = NULL;
9910 PL_reg_poscache_size = 0;
9911 #ifdef PERL_OLD_COPY_ON_WRITE
9915 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9917 const REGEXP * const rx = PM_GETRE(PL_curpm);
9920 for (i = 1; i <= RX_NPARENS(rx); i++) {
9921 char digits[TYPE_CHARS(long)];
9922 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9923 GV *const *const gvp
9924 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9927 GV * const gv = *gvp;
9928 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9938 clear_re(pTHX_ void *r)
9941 ReREFCNT_dec((REGEXP *)r);
9947 S_put_byte(pTHX_ SV *sv, int c)
9949 PERL_ARGS_ASSERT_PUT_BYTE;
9951 /* Our definition of isPRINT() ignores locales, so only bytes that are
9952 not part of UTF-8 are considered printable. I assume that the same
9953 holds for UTF-EBCDIC.
9954 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9955 which Wikipedia says:
9957 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9958 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9959 identical, to the ASCII delete (DEL) or rubout control character.
9960 ) So the old condition can be simplified to !isPRINT(c) */
9962 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9964 const char string = c;
9965 if (c == '-' || c == ']' || c == '\\' || c == '^')
9966 sv_catpvs(sv, "\\");
9967 sv_catpvn(sv, &string, 1);
9972 #define CLEAR_OPTSTART \
9973 if (optstart) STMT_START { \
9974 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9978 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9980 STATIC const regnode *
9981 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9982 const regnode *last, const regnode *plast,
9983 SV* sv, I32 indent, U32 depth)
9986 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9987 register const regnode *next;
9988 const regnode *optstart= NULL;
9991 GET_RE_DEBUG_FLAGS_DECL;
9993 PERL_ARGS_ASSERT_DUMPUNTIL;
9995 #ifdef DEBUG_DUMPUNTIL
9996 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9997 last ? last-start : 0,plast ? plast-start : 0);
10000 if (plast && plast < last)
10003 while (PL_regkind[op] != END && (!last || node < last)) {
10004 /* While that wasn't END last time... */
10007 if (op == CLOSE || op == WHILEM)
10009 next = regnext((regnode *)node);
10012 if (OP(node) == OPTIMIZED) {
10013 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10020 regprop(r, sv, node);
10021 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10022 (int)(2*indent + 1), "", SvPVX_const(sv));
10024 if (OP(node) != OPTIMIZED) {
10025 if (next == NULL) /* Next ptr. */
10026 PerlIO_printf(Perl_debug_log, " (0)");
10027 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10028 PerlIO_printf(Perl_debug_log, " (FAIL)");
10030 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10031 (void)PerlIO_putc(Perl_debug_log, '\n');
10035 if (PL_regkind[(U8)op] == BRANCHJ) {
10038 register const regnode *nnode = (OP(next) == LONGJMP
10039 ? regnext((regnode *)next)
10041 if (last && nnode > last)
10043 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10046 else if (PL_regkind[(U8)op] == BRANCH) {
10048 DUMPUNTIL(NEXTOPER(node), next);
10050 else if ( PL_regkind[(U8)op] == TRIE ) {
10051 const regnode *this_trie = node;
10052 const char op = OP(node);
10053 const U32 n = ARG(node);
10054 const reg_ac_data * const ac = op>=AHOCORASICK ?
10055 (reg_ac_data *)ri->data->data[n] :
10057 const reg_trie_data * const trie =
10058 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10060 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10062 const regnode *nextbranch= NULL;
10065 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10066 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10068 PerlIO_printf(Perl_debug_log, "%*s%s ",
10069 (int)(2*(indent+3)), "",
10070 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10071 PL_colors[0], PL_colors[1],
10072 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10073 PERL_PV_PRETTY_ELLIPSES |
10074 PERL_PV_PRETTY_LTGT
10079 U16 dist= trie->jump[word_idx+1];
10080 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10081 (UV)((dist ? this_trie + dist : next) - start));
10084 nextbranch= this_trie + trie->jump[0];
10085 DUMPUNTIL(this_trie + dist, nextbranch);
10087 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10088 nextbranch= regnext((regnode *)nextbranch);
10090 PerlIO_printf(Perl_debug_log, "\n");
10093 if (last && next > last)
10098 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10099 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10100 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10102 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10104 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10106 else if ( op == PLUS || op == STAR) {
10107 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10109 else if (op == ANYOF) {
10110 /* arglen 1 + class block */
10111 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10112 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10113 node = NEXTOPER(node);
10115 else if (PL_regkind[(U8)op] == EXACT) {
10116 /* Literal string, where present. */
10117 node += NODE_SZ_STR(node) - 1;
10118 node = NEXTOPER(node);
10121 node = NEXTOPER(node);
10122 node += regarglen[(U8)op];
10124 if (op == CURLYX || op == OPEN)
10128 #ifdef DEBUG_DUMPUNTIL
10129 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10134 #endif /* DEBUGGING */
10138 * c-indentation-style: bsd
10139 * c-basic-offset: 4
10140 * indent-tabs-mode: t
10143 * ex: set ts=8 sts=4 sw=4 noet: