5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *charnames; /* cache of named sequences */
136 HV *paren_names; /* Paren names */
138 regnode **recurse; /* Recurse regops */
139 I32 recurse_count; /* Number of recurse regops */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
145 const char *lastparse;
147 AV *paren_name_list; /* idx -> name */
148 #define RExC_lastparse (pRExC_state->lastparse)
149 #define RExC_lastnum (pRExC_state->lastnum)
150 #define RExC_paren_name_list (pRExC_state->paren_name_list)
154 #define RExC_flags (pRExC_state->flags)
155 #define RExC_precomp (pRExC_state->precomp)
156 #define RExC_rx_sv (pRExC_state->rx_sv)
157 #define RExC_rx (pRExC_state->rx)
158 #define RExC_rxi (pRExC_state->rxi)
159 #define RExC_start (pRExC_state->start)
160 #define RExC_end (pRExC_state->end)
161 #define RExC_parse (pRExC_state->parse)
162 #define RExC_whilem_seen (pRExC_state->whilem_seen)
163 #ifdef RE_TRACK_PATTERN_OFFSETS
164 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166 #define RExC_emit (pRExC_state->emit)
167 #define RExC_emit_start (pRExC_state->emit_start)
168 #define RExC_emit_bound (pRExC_state->emit_bound)
169 #define RExC_naughty (pRExC_state->naughty)
170 #define RExC_sawback (pRExC_state->sawback)
171 #define RExC_seen (pRExC_state->seen)
172 #define RExC_size (pRExC_state->size)
173 #define RExC_npar (pRExC_state->npar)
174 #define RExC_nestroot (pRExC_state->nestroot)
175 #define RExC_extralen (pRExC_state->extralen)
176 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
177 #define RExC_seen_evals (pRExC_state->seen_evals)
178 #define RExC_utf8 (pRExC_state->utf8)
179 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
180 #define RExC_charnames (pRExC_state->charnames)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
201 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
202 #define SPSTART 0x04 /* Starts with * or +. */
203 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
204 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
206 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
208 /* whether trie related optimizations are enabled */
209 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
210 #define TRIE_STUDY_OPT
211 #define FULL_TRIE_STUDY
217 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
218 #define PBITVAL(paren) (1 << ((paren) & 7))
219 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
220 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
221 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
224 /* About scan_data_t.
226 During optimisation we recurse through the regexp program performing
227 various inplace (keyhole style) optimisations. In addition study_chunk
228 and scan_commit populate this data structure with information about
229 what strings MUST appear in the pattern. We look for the longest
230 string that must appear for at a fixed location, and we look for the
231 longest string that may appear at a floating location. So for instance
236 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
237 strings (because they follow a .* construct). study_chunk will identify
238 both FOO and BAR as being the longest fixed and floating strings respectively.
240 The strings can be composites, for instance
244 will result in a composite fixed substring 'foo'.
246 For each string some basic information is maintained:
248 - offset or min_offset
249 This is the position the string must appear at, or not before.
250 It also implicitly (when combined with minlenp) tells us how many
251 character must match before the string we are searching.
252 Likewise when combined with minlenp and the length of the string
253 tells us how many characters must appear after the string we have
257 Only used for floating strings. This is the rightmost point that
258 the string can appear at. Ifset to I32 max it indicates that the
259 string can occur infinitely far to the right.
262 A pointer to the minimum length of the pattern that the string
263 was found inside. This is important as in the case of positive
264 lookahead or positive lookbehind we can have multiple patterns
269 The minimum length of the pattern overall is 3, the minimum length
270 of the lookahead part is 3, but the minimum length of the part that
271 will actually match is 1. So 'FOO's minimum length is 3, but the
272 minimum length for the F is 1. This is important as the minimum length
273 is used to determine offsets in front of and behind the string being
274 looked for. Since strings can be composites this is the length of the
275 pattern at the time it was commited with a scan_commit. Note that
276 the length is calculated by study_chunk, so that the minimum lengths
277 are not known until the full pattern has been compiled, thus the
278 pointer to the value.
282 In the case of lookbehind the string being searched for can be
283 offset past the start point of the final matching string.
284 If this value was just blithely removed from the min_offset it would
285 invalidate some of the calculations for how many chars must match
286 before or after (as they are derived from min_offset and minlen and
287 the length of the string being searched for).
288 When the final pattern is compiled and the data is moved from the
289 scan_data_t structure into the regexp structure the information
290 about lookbehind is factored in, with the information that would
291 have been lost precalculated in the end_shift field for the
294 The fields pos_min and pos_delta are used to store the minimum offset
295 and the delta to the maximum offset at the current point in the pattern.
299 typedef struct scan_data_t {
300 /*I32 len_min; unused */
301 /*I32 len_delta; unused */
305 I32 last_end; /* min value, <0 unless valid. */
308 SV **longest; /* Either &l_fixed, or &l_float. */
309 SV *longest_fixed; /* longest fixed string found in pattern */
310 I32 offset_fixed; /* offset where it starts */
311 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
313 SV *longest_float; /* longest floating string found in pattern */
314 I32 offset_float_min; /* earliest point in string it can appear */
315 I32 offset_float_max; /* latest point in string it can appear */
316 I32 *minlen_float; /* pointer to the minlen relevent to the string */
317 I32 lookbehind_float; /* is the position of the string modified by LB */
321 struct regnode_charclass_class *start_class;
325 * Forward declarations for pregcomp()'s friends.
328 static const scan_data_t zero_scan_data =
329 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
331 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
332 #define SF_BEFORE_SEOL 0x0001
333 #define SF_BEFORE_MEOL 0x0002
334 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
335 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
338 # define SF_FIX_SHIFT_EOL (0+2)
339 # define SF_FL_SHIFT_EOL (0+4)
341 # define SF_FIX_SHIFT_EOL (+2)
342 # define SF_FL_SHIFT_EOL (+4)
345 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
348 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
349 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
350 #define SF_IS_INF 0x0040
351 #define SF_HAS_PAR 0x0080
352 #define SF_IN_PAR 0x0100
353 #define SF_HAS_EVAL 0x0200
354 #define SCF_DO_SUBSTR 0x0400
355 #define SCF_DO_STCLASS_AND 0x0800
356 #define SCF_DO_STCLASS_OR 0x1000
357 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
358 #define SCF_WHILEM_VISITED_POS 0x2000
360 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
361 #define SCF_SEEN_ACCEPT 0x8000
363 #define UTF (RExC_utf8 != 0)
364 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
365 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
367 #define OOB_UNICODE 12345678
368 #define OOB_NAMEDCLASS -1
370 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
371 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
374 /* length of regex to show in messages that don't mark a position within */
375 #define RegexLengthToShowInErrorMessages 127
378 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
379 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
380 * op/pragma/warn/regcomp.
382 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
383 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
385 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
388 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
389 * arg. Show regex, up to a maximum length. If it's too long, chop and add
392 #define _FAIL(code) STMT_START { \
393 const char *ellipses = ""; \
394 IV len = RExC_end - RExC_precomp; \
397 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
398 if (len > RegexLengthToShowInErrorMessages) { \
399 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
400 len = RegexLengthToShowInErrorMessages - 10; \
406 #define FAIL(msg) _FAIL( \
407 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
408 msg, (int)len, RExC_precomp, ellipses))
410 #define FAIL2(msg,arg) _FAIL( \
411 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
412 arg, (int)len, RExC_precomp, ellipses))
415 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
417 #define Simple_vFAIL(m) STMT_START { \
418 const IV offset = RExC_parse - RExC_precomp; \
419 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
420 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
426 #define vFAIL(m) STMT_START { \
428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
433 * Like Simple_vFAIL(), but accepts two arguments.
435 #define Simple_vFAIL2(m,a1) STMT_START { \
436 const IV offset = RExC_parse - RExC_precomp; \
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
444 #define vFAIL2(m,a1) STMT_START { \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
447 Simple_vFAIL2(m, a1); \
452 * Like Simple_vFAIL(), but accepts three arguments.
454 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
455 const IV offset = RExC_parse - RExC_precomp; \
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
463 #define vFAIL3(m,a1,a2) STMT_START { \
465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
466 Simple_vFAIL3(m, a1, a2); \
470 * Like Simple_vFAIL(), but accepts four arguments.
472 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
475 (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define ckWARNreg(loc,m) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define ckWARNregdep(loc,m) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
491 #define ckWARN2reg(loc, m, a1) STMT_START { \
492 const IV offset = loc - RExC_precomp; \
493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
497 #define vWARN3(loc, m, a1, a2) STMT_START { \
498 const IV offset = loc - RExC_precomp; \
499 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
509 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
510 const IV offset = loc - RExC_precomp; \
511 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
512 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
515 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
516 const IV offset = loc - RExC_precomp; \
517 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
518 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
528 /* Allow for side effects in s */
529 #define REGC(c,s) STMT_START { \
530 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
533 /* Macros for recording node offsets. 20001227 mjd@plover.com
534 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
535 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
536 * Element 0 holds the number n.
537 * Position is 1 indexed.
539 #ifndef RE_TRACK_PATTERN_OFFSETS
540 #define Set_Node_Offset_To_R(node,byte)
541 #define Set_Node_Offset(node,byte)
542 #define Set_Cur_Node_Offset
543 #define Set_Node_Length_To_R(node,len)
544 #define Set_Node_Length(node,len)
545 #define Set_Node_Cur_Length(node)
546 #define Node_Offset(n)
547 #define Node_Length(n)
548 #define Set_Node_Offset_Length(node,offset,len)
549 #define ProgLen(ri) ri->u.proglen
550 #define SetProgLen(ri,x) ri->u.proglen = x
552 #define ProgLen(ri) ri->u.offsets[0]
553 #define SetProgLen(ri,x) ri->u.offsets[0] = x
554 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(byte))); \
559 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
561 RExC_offsets[2*(node)-1] = (byte); \
566 #define Set_Node_Offset(node,byte) \
567 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
568 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
570 #define Set_Node_Length_To_R(node,len) STMT_START { \
572 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
573 __LINE__, (int)(node), (int)(len))); \
575 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
577 RExC_offsets[2*(node)] = (len); \
582 #define Set_Node_Length(node,len) \
583 Set_Node_Length_To_R((node)-RExC_emit_start, len)
584 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
585 #define Set_Node_Cur_Length(node) \
586 Set_Node_Length(node, RExC_parse - parse_start)
588 /* Get offsets and lengths */
589 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
590 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
592 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
593 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
594 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
598 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
599 #define EXPERIMENTAL_INPLACESCAN
600 #endif /*RE_TRACK_PATTERN_OFFSETS*/
602 #define DEBUG_STUDYDATA(str,data,depth) \
603 DEBUG_OPTIMISE_MORE_r(if(data){ \
604 PerlIO_printf(Perl_debug_log, \
605 "%*s" str "Pos:%"IVdf"/%"IVdf \
606 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
607 (int)(depth)*2, "", \
608 (IV)((data)->pos_min), \
609 (IV)((data)->pos_delta), \
610 (UV)((data)->flags), \
611 (IV)((data)->whilem_c), \
612 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
613 is_inf ? "INF " : "" \
615 if ((data)->last_found) \
616 PerlIO_printf(Perl_debug_log, \
617 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
618 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
619 SvPVX_const((data)->last_found), \
620 (IV)((data)->last_end), \
621 (IV)((data)->last_start_min), \
622 (IV)((data)->last_start_max), \
623 ((data)->longest && \
624 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
625 SvPVX_const((data)->longest_fixed), \
626 (IV)((data)->offset_fixed), \
627 ((data)->longest && \
628 (data)->longest==&((data)->longest_float)) ? "*" : "", \
629 SvPVX_const((data)->longest_float), \
630 (IV)((data)->offset_float_min), \
631 (IV)((data)->offset_float_max) \
633 PerlIO_printf(Perl_debug_log,"\n"); \
636 static void clear_re(pTHX_ void *r);
638 /* Mark that we cannot extend a found fixed substring at this point.
639 Update the longest found anchored substring and the longest found
640 floating substrings if needed. */
643 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
645 const STRLEN l = CHR_SVLEN(data->last_found);
646 const STRLEN old_l = CHR_SVLEN(*data->longest);
647 GET_RE_DEBUG_FLAGS_DECL;
649 PERL_ARGS_ASSERT_SCAN_COMMIT;
651 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
652 SvSetMagicSV(*data->longest, data->last_found);
653 if (*data->longest == data->longest_fixed) {
654 data->offset_fixed = l ? data->last_start_min : data->pos_min;
655 if (data->flags & SF_BEFORE_EOL)
657 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
659 data->flags &= ~SF_FIX_BEFORE_EOL;
660 data->minlen_fixed=minlenp;
661 data->lookbehind_fixed=0;
663 else { /* *data->longest == data->longest_float */
664 data->offset_float_min = l ? data->last_start_min : data->pos_min;
665 data->offset_float_max = (l
666 ? data->last_start_max
667 : data->pos_min + data->pos_delta);
668 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
669 data->offset_float_max = I32_MAX;
670 if (data->flags & SF_BEFORE_EOL)
672 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
674 data->flags &= ~SF_FL_BEFORE_EOL;
675 data->minlen_float=minlenp;
676 data->lookbehind_float=0;
679 SvCUR_set(data->last_found, 0);
681 SV * const sv = data->last_found;
682 if (SvUTF8(sv) && SvMAGICAL(sv)) {
683 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
689 data->flags &= ~SF_BEFORE_EOL;
690 DEBUG_STUDYDATA("commit: ",data,0);
693 /* Can match anything (initialization) */
695 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
697 PERL_ARGS_ASSERT_CL_ANYTHING;
699 ANYOF_CLASS_ZERO(cl);
700 ANYOF_BITMAP_SETALL(cl);
701 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
703 cl->flags |= ANYOF_LOCALE;
706 /* Can match anything (initialization) */
708 S_cl_is_anything(const struct regnode_charclass_class *cl)
712 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
714 for (value = 0; value <= ANYOF_MAX; value += 2)
715 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
717 if (!(cl->flags & ANYOF_UNICODE_ALL))
719 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
724 /* Can match anything (initialization) */
726 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
728 PERL_ARGS_ASSERT_CL_INIT;
730 Zero(cl, 1, struct regnode_charclass_class);
732 cl_anything(pRExC_state, cl);
736 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
738 PERL_ARGS_ASSERT_CL_INIT_ZERO;
740 Zero(cl, 1, struct regnode_charclass_class);
742 cl_anything(pRExC_state, cl);
744 cl->flags |= ANYOF_LOCALE;
747 /* 'And' a given class with another one. Can create false positives */
748 /* We assume that cl is not inverted */
750 S_cl_and(struct regnode_charclass_class *cl,
751 const struct regnode_charclass_class *and_with)
753 PERL_ARGS_ASSERT_CL_AND;
755 assert(and_with->type == ANYOF);
756 if (!(and_with->flags & ANYOF_CLASS)
757 && !(cl->flags & ANYOF_CLASS)
758 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
759 && !(and_with->flags & ANYOF_FOLD)
760 && !(cl->flags & ANYOF_FOLD)) {
763 if (and_with->flags & ANYOF_INVERT)
764 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
765 cl->bitmap[i] &= ~and_with->bitmap[i];
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] &= and_with->bitmap[i];
769 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
770 if (!(and_with->flags & ANYOF_EOS))
771 cl->flags &= ~ANYOF_EOS;
773 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
774 !(and_with->flags & ANYOF_INVERT)) {
775 cl->flags &= ~ANYOF_UNICODE_ALL;
776 cl->flags |= ANYOF_UNICODE;
777 ARG_SET(cl, ARG(and_with));
779 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
780 !(and_with->flags & ANYOF_INVERT))
781 cl->flags &= ~ANYOF_UNICODE_ALL;
782 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
783 !(and_with->flags & ANYOF_INVERT))
784 cl->flags &= ~ANYOF_UNICODE;
787 /* 'OR' a given class with another one. Can create false positives */
788 /* We assume that cl is not inverted */
790 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
792 PERL_ARGS_ASSERT_CL_OR;
794 if (or_with->flags & ANYOF_INVERT) {
796 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
797 * <= (B1 | !B2) | (CL1 | !CL2)
798 * which is wasteful if CL2 is small, but we ignore CL2:
799 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
800 * XXXX Can we handle case-fold? Unclear:
801 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
802 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(or_with->flags & ANYOF_FOLD)
806 && !(cl->flags & ANYOF_FOLD) ) {
809 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
810 cl->bitmap[i] |= ~or_with->bitmap[i];
811 } /* XXXX: logic is complicated otherwise */
813 cl_anything(pRExC_state, cl);
816 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
817 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
818 && (!(or_with->flags & ANYOF_FOLD)
819 || (cl->flags & ANYOF_FOLD)) ) {
822 /* OR char bitmap and class bitmap separately */
823 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
824 cl->bitmap[i] |= or_with->bitmap[i];
825 if (or_with->flags & ANYOF_CLASS) {
826 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
827 cl->classflags[i] |= or_with->classflags[i];
828 cl->flags |= ANYOF_CLASS;
831 else { /* XXXX: logic is complicated, leave it along for a moment. */
832 cl_anything(pRExC_state, cl);
835 if (or_with->flags & ANYOF_EOS)
836 cl->flags |= ANYOF_EOS;
838 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
839 ARG(cl) != ARG(or_with)) {
840 cl->flags |= ANYOF_UNICODE_ALL;
841 cl->flags &= ~ANYOF_UNICODE;
843 if (or_with->flags & ANYOF_UNICODE_ALL) {
844 cl->flags |= ANYOF_UNICODE_ALL;
845 cl->flags &= ~ANYOF_UNICODE;
849 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
850 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
851 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
852 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
857 dump_trie(trie,widecharmap,revcharmap)
858 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
859 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
861 These routines dump out a trie in a somewhat readable format.
862 The _interim_ variants are used for debugging the interim
863 tables that are used to generate the final compressed
864 representation which is what dump_trie expects.
866 Part of the reason for their existance is to provide a form
867 of documentation as to how the different representations function.
872 Dumps the final compressed table form of the trie to Perl_debug_log.
873 Used for debugging make_trie().
877 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
878 AV *revcharmap, U32 depth)
881 SV *sv=sv_newmortal();
882 int colwidth= widecharmap ? 6 : 4;
883 GET_RE_DEBUG_FLAGS_DECL;
885 PERL_ARGS_ASSERT_DUMP_TRIE;
887 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
888 (int)depth * 2 + 2,"",
889 "Match","Base","Ofs" );
891 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
892 SV ** const tmp = av_fetch( revcharmap, state, 0);
894 PerlIO_printf( Perl_debug_log, "%*s",
896 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
897 PL_colors[0], PL_colors[1],
898 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
899 PERL_PV_ESCAPE_FIRSTCHAR
904 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
905 (int)depth * 2 + 2,"");
907 for( state = 0 ; state < trie->uniquecharcount ; state++ )
908 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
909 PerlIO_printf( Perl_debug_log, "\n");
911 for( state = 1 ; state < trie->statecount ; state++ ) {
912 const U32 base = trie->states[ state ].trans.base;
914 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
916 if ( trie->states[ state ].wordnum ) {
917 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
919 PerlIO_printf( Perl_debug_log, "%6s", "" );
922 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
927 while( ( base + ofs < trie->uniquecharcount ) ||
928 ( base + ofs - trie->uniquecharcount < trie->lasttrans
929 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
932 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
934 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
935 if ( ( base + ofs >= trie->uniquecharcount ) &&
936 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
937 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
939 PerlIO_printf( Perl_debug_log, "%*"UVXf,
941 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
943 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
947 PerlIO_printf( Perl_debug_log, "]");
950 PerlIO_printf( Perl_debug_log, "\n" );
954 Dumps a fully constructed but uncompressed trie in list form.
955 List tries normally only are used for construction when the number of
956 possible chars (trie->uniquecharcount) is very high.
957 Used for debugging make_trie().
960 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
965 SV *sv=sv_newmortal();
966 int colwidth= widecharmap ? 6 : 4;
967 GET_RE_DEBUG_FLAGS_DECL;
969 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
971 /* print out the table precompression. */
972 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
973 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
974 "------:-----+-----------------\n" );
976 for( state=1 ; state < next_alloc ; state ++ ) {
979 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
980 (int)depth * 2 + 2,"", (UV)state );
981 if ( ! trie->states[ state ].wordnum ) {
982 PerlIO_printf( Perl_debug_log, "%5s| ","");
984 PerlIO_printf( Perl_debug_log, "W%4x| ",
985 trie->states[ state ].wordnum
988 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
989 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
991 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
993 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
994 PL_colors[0], PL_colors[1],
995 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
996 PERL_PV_ESCAPE_FIRSTCHAR
998 TRIE_LIST_ITEM(state,charid).forid,
999 (UV)TRIE_LIST_ITEM(state,charid).newstate
1002 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1003 (int)((depth * 2) + 14), "");
1006 PerlIO_printf( Perl_debug_log, "\n");
1011 Dumps a fully constructed but uncompressed trie in table form.
1012 This is the normal DFA style state transition table, with a few
1013 twists to facilitate compression later.
1014 Used for debugging make_trie().
1017 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1018 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1023 SV *sv=sv_newmortal();
1024 int colwidth= widecharmap ? 6 : 4;
1025 GET_RE_DEBUG_FLAGS_DECL;
1027 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1030 print out the table precompression so that we can do a visual check
1031 that they are identical.
1034 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1036 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1037 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1039 PerlIO_printf( Perl_debug_log, "%*s",
1041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1042 PL_colors[0], PL_colors[1],
1043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044 PERL_PV_ESCAPE_FIRSTCHAR
1050 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1052 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1053 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1056 PerlIO_printf( Perl_debug_log, "\n" );
1058 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1060 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1061 (int)depth * 2 + 2,"",
1062 (UV)TRIE_NODENUM( state ) );
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1065 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1067 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1069 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1071 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1074 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1075 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1082 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1083 startbranch: the first branch in the whole branch sequence
1084 first : start branch of sequence of branch-exact nodes.
1085 May be the same as startbranch
1086 last : Thing following the last branch.
1087 May be the same as tail.
1088 tail : item following the branch sequence
1089 count : words in the sequence
1090 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1091 depth : indent depth
1093 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1095 A trie is an N'ary tree where the branches are determined by digital
1096 decomposition of the key. IE, at the root node you look up the 1st character and
1097 follow that branch repeat until you find the end of the branches. Nodes can be
1098 marked as "accepting" meaning they represent a complete word. Eg:
1102 would convert into the following structure. Numbers represent states, letters
1103 following numbers represent valid transitions on the letter from that state, if
1104 the number is in square brackets it represents an accepting state, otherwise it
1105 will be in parenthesis.
1107 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1111 (1) +-i->(6)-+-s->[7]
1113 +-s->(3)-+-h->(4)-+-e->[5]
1115 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1117 This shows that when matching against the string 'hers' we will begin at state 1
1118 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1119 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1120 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1121 single traverse. We store a mapping from accepting to state to which word was
1122 matched, and then when we have multiple possibilities we try to complete the
1123 rest of the regex in the order in which they occured in the alternation.
1125 The only prior NFA like behaviour that would be changed by the TRIE support is
1126 the silent ignoring of duplicate alternations which are of the form:
1128 / (DUPE|DUPE) X? (?{ ... }) Y /x
1130 Thus EVAL blocks follwing a trie may be called a different number of times with
1131 and without the optimisation. With the optimisations dupes will be silently
1132 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1133 the following demonstrates:
1135 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1137 which prints out 'word' three times, but
1139 'words'=~/(word|word|word)(?{ print $1 })S/
1141 which doesnt print it out at all. This is due to other optimisations kicking in.
1143 Example of what happens on a structural level:
1145 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1147 1: CURLYM[1] {1,32767}(18)
1158 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1159 and should turn into:
1161 1: CURLYM[1] {1,32767}(18)
1163 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1171 Cases where tail != last would be like /(?foo|bar)baz/:
1181 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1182 and would end up looking like:
1185 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1192 d = uvuni_to_utf8_flags(d, uv, 0);
1194 is the recommended Unicode-aware way of saying
1199 #define TRIE_STORE_REVCHAR \
1202 SV *zlopp = newSV(2); \
1203 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1204 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1205 SvCUR_set(zlopp, kapow - flrbbbbb); \
1208 av_push(revcharmap, zlopp); \
1210 char ooooff = (char)uvc; \
1211 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1215 #define TRIE_READ_CHAR STMT_START { \
1219 if ( foldlen > 0 ) { \
1220 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1225 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1226 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1227 foldlen -= UNISKIP( uvc ); \
1228 scan = foldbuf + UNISKIP( uvc ); \
1231 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1241 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1242 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1243 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1244 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1246 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1247 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1248 TRIE_LIST_CUR( state )++; \
1251 #define TRIE_LIST_NEW(state) STMT_START { \
1252 Newxz( trie->states[ state ].trans.list, \
1253 4, reg_trie_trans_le ); \
1254 TRIE_LIST_CUR( state ) = 1; \
1255 TRIE_LIST_LEN( state ) = 4; \
1258 #define TRIE_HANDLE_WORD(state) STMT_START { \
1259 U16 dupe= trie->states[ state ].wordnum; \
1260 regnode * const noper_next = regnext( noper ); \
1262 if (trie->wordlen) \
1263 trie->wordlen[ curword ] = wordlen; \
1265 /* store the word for dumping */ \
1267 if (OP(noper) != NOTHING) \
1268 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1270 tmp = newSVpvn_utf8( "", 0, UTF ); \
1271 av_push( trie_words, tmp ); \
1276 if ( noper_next < tail ) { \
1278 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1279 trie->jump[curword] = (U16)(noper_next - convert); \
1281 jumper = noper_next; \
1283 nextbranch= regnext(cur); \
1287 /* So it's a dupe. This means we need to maintain a */\
1288 /* linked-list from the first to the next. */\
1289 /* we only allocate the nextword buffer when there */\
1290 /* a dupe, so first time we have to do the allocation */\
1291 if (!trie->nextword) \
1292 trie->nextword = (U16 *) \
1293 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1294 while ( trie->nextword[dupe] ) \
1295 dupe= trie->nextword[dupe]; \
1296 trie->nextword[dupe]= curword; \
1298 /* we haven't inserted this word yet. */ \
1299 trie->states[ state ].wordnum = curword; \
1304 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1305 ( ( base + charid >= ucharcount \
1306 && base + charid < ubound \
1307 && state == trie->trans[ base - ucharcount + charid ].check \
1308 && trie->trans[ base - ucharcount + charid ].next ) \
1309 ? trie->trans[ base - ucharcount + charid ].next \
1310 : ( state==1 ? special : 0 ) \
1314 #define MADE_JUMP_TRIE 2
1315 #define MADE_EXACT_TRIE 4
1318 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1321 /* first pass, loop through and scan words */
1322 reg_trie_data *trie;
1323 HV *widecharmap = NULL;
1324 AV *revcharmap = newAV();
1326 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1331 regnode *jumper = NULL;
1332 regnode *nextbranch = NULL;
1333 regnode *convert = NULL;
1334 /* we just use folder as a flag in utf8 */
1335 const U8 * const folder = ( flags == EXACTF
1337 : ( flags == EXACTFL
1344 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1345 AV *trie_words = NULL;
1346 /* along with revcharmap, this only used during construction but both are
1347 * useful during debugging so we store them in the struct when debugging.
1350 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1351 STRLEN trie_charcount=0;
1353 SV *re_trie_maxbuff;
1354 GET_RE_DEBUG_FLAGS_DECL;
1356 PERL_ARGS_ASSERT_MAKE_TRIE;
1358 PERL_UNUSED_ARG(depth);
1361 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1363 trie->startstate = 1;
1364 trie->wordcount = word_count;
1365 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1366 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1367 if (!(UTF && folder))
1368 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1370 trie_words = newAV();
1373 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1374 if (!SvIOK(re_trie_maxbuff)) {
1375 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1378 PerlIO_printf( Perl_debug_log,
1379 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1380 (int)depth * 2 + 2, "",
1381 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1382 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1386 /* Find the node we are going to overwrite */
1387 if ( first == startbranch && OP( last ) != BRANCH ) {
1388 /* whole branch chain */
1391 /* branch sub-chain */
1392 convert = NEXTOPER( first );
1395 /* -- First loop and Setup --
1397 We first traverse the branches and scan each word to determine if it
1398 contains widechars, and how many unique chars there are, this is
1399 important as we have to build a table with at least as many columns as we
1402 We use an array of integers to represent the character codes 0..255
1403 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1404 native representation of the character value as the key and IV's for the
1407 *TODO* If we keep track of how many times each character is used we can
1408 remap the columns so that the table compression later on is more
1409 efficient in terms of memory by ensuring most common value is in the
1410 middle and the least common are on the outside. IMO this would be better
1411 than a most to least common mapping as theres a decent chance the most
1412 common letter will share a node with the least common, meaning the node
1413 will not be compressable. With a middle is most common approach the worst
1414 case is when we have the least common nodes twice.
1418 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1419 regnode * const noper = NEXTOPER( cur );
1420 const U8 *uc = (U8*)STRING( noper );
1421 const U8 * const e = uc + STR_LEN( noper );
1423 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1424 const U8 *scan = (U8*)NULL;
1425 U32 wordlen = 0; /* required init */
1427 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1429 if (OP(noper) == NOTHING) {
1433 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1434 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1435 regardless of encoding */
1437 for ( ; uc < e ; uc += len ) {
1438 TRIE_CHARCOUNT(trie)++;
1442 if ( !trie->charmap[ uvc ] ) {
1443 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1445 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1449 /* store the codepoint in the bitmap, and if its ascii
1450 also store its folded equivelent. */
1451 TRIE_BITMAP_SET(trie,uvc);
1453 /* store the folded codepoint */
1454 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1457 /* store first byte of utf8 representation of
1458 codepoints in the 127 < uvc < 256 range */
1459 if (127 < uvc && uvc < 192) {
1460 TRIE_BITMAP_SET(trie,194);
1461 } else if (191 < uvc ) {
1462 TRIE_BITMAP_SET(trie,195);
1463 /* && uvc < 256 -- we know uvc is < 256 already */
1466 set_bit = 0; /* We've done our bit :-) */
1471 widecharmap = newHV();
1473 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1476 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1478 if ( !SvTRUE( *svpp ) ) {
1479 sv_setiv( *svpp, ++trie->uniquecharcount );
1484 if( cur == first ) {
1487 } else if (chars < trie->minlen) {
1489 } else if (chars > trie->maxlen) {
1493 } /* end first pass */
1494 DEBUG_TRIE_COMPILE_r(
1495 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1496 (int)depth * 2 + 2,"",
1497 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1498 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1499 (int)trie->minlen, (int)trie->maxlen )
1501 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1504 We now know what we are dealing with in terms of unique chars and
1505 string sizes so we can calculate how much memory a naive
1506 representation using a flat table will take. If it's over a reasonable
1507 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1508 conservative but potentially much slower representation using an array
1511 At the end we convert both representations into the same compressed
1512 form that will be used in regexec.c for matching with. The latter
1513 is a form that cannot be used to construct with but has memory
1514 properties similar to the list form and access properties similar
1515 to the table form making it both suitable for fast searches and
1516 small enough that its feasable to store for the duration of a program.
1518 See the comment in the code where the compressed table is produced
1519 inplace from the flat tabe representation for an explanation of how
1520 the compression works.
1525 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1527 Second Pass -- Array Of Lists Representation
1529 Each state will be represented by a list of charid:state records
1530 (reg_trie_trans_le) the first such element holds the CUR and LEN
1531 points of the allocated array. (See defines above).
1533 We build the initial structure using the lists, and then convert
1534 it into the compressed table form which allows faster lookups
1535 (but cant be modified once converted).
1538 STRLEN transcount = 1;
1540 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1541 "%*sCompiling trie using list compiler\n",
1542 (int)depth * 2 + 2, ""));
1544 trie->states = (reg_trie_state *)
1545 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1546 sizeof(reg_trie_state) );
1550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1552 regnode * const noper = NEXTOPER( cur );
1553 U8 *uc = (U8*)STRING( noper );
1554 const U8 * const e = uc + STR_LEN( noper );
1555 U32 state = 1; /* required init */
1556 U16 charid = 0; /* sanity init */
1557 U8 *scan = (U8*)NULL; /* sanity init */
1558 STRLEN foldlen = 0; /* required init */
1559 U32 wordlen = 0; /* required init */
1560 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1562 if (OP(noper) != NOTHING) {
1563 for ( ; uc < e ; uc += len ) {
1568 charid = trie->charmap[ uvc ];
1570 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1574 charid=(U16)SvIV( *svpp );
1577 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1584 if ( !trie->states[ state ].trans.list ) {
1585 TRIE_LIST_NEW( state );
1587 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1588 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1589 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1594 newstate = next_alloc++;
1595 TRIE_LIST_PUSH( state, charid, newstate );
1600 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1604 TRIE_HANDLE_WORD(state);
1606 } /* end second pass */
1608 /* next alloc is the NEXT state to be allocated */
1609 trie->statecount = next_alloc;
1610 trie->states = (reg_trie_state *)
1611 PerlMemShared_realloc( trie->states,
1613 * sizeof(reg_trie_state) );
1615 /* and now dump it out before we compress it */
1616 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1617 revcharmap, next_alloc,
1621 trie->trans = (reg_trie_trans *)
1622 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1629 for( state=1 ; state < next_alloc ; state ++ ) {
1633 DEBUG_TRIE_COMPILE_MORE_r(
1634 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1638 if (trie->states[state].trans.list) {
1639 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1643 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1644 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1645 if ( forid < minid ) {
1647 } else if ( forid > maxid ) {
1651 if ( transcount < tp + maxid - minid + 1) {
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_realloc( trie->trans,
1656 * sizeof(reg_trie_trans) );
1657 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1659 base = trie->uniquecharcount + tp - minid;
1660 if ( maxid == minid ) {
1662 for ( ; zp < tp ; zp++ ) {
1663 if ( ! trie->trans[ zp ].next ) {
1664 base = trie->uniquecharcount + zp - minid;
1665 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1666 trie->trans[ zp ].check = state;
1672 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1673 trie->trans[ tp ].check = state;
1678 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1679 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1680 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1681 trie->trans[ tid ].check = state;
1683 tp += ( maxid - minid + 1 );
1685 Safefree(trie->states[ state ].trans.list);
1688 DEBUG_TRIE_COMPILE_MORE_r(
1689 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1692 trie->states[ state ].trans.base=base;
1694 trie->lasttrans = tp + 1;
1698 Second Pass -- Flat Table Representation.
1700 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1701 We know that we will need Charcount+1 trans at most to store the data
1702 (one row per char at worst case) So we preallocate both structures
1703 assuming worst case.
1705 We then construct the trie using only the .next slots of the entry
1708 We use the .check field of the first entry of the node temporarily to
1709 make compression both faster and easier by keeping track of how many non
1710 zero fields are in the node.
1712 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1715 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1716 number representing the first entry of the node, and state as a
1717 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1718 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1719 are 2 entrys per node. eg:
1727 The table is internally in the right hand, idx form. However as we also
1728 have to deal with the states array which is indexed by nodenum we have to
1729 use TRIE_NODENUM() to convert.
1732 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1733 "%*sCompiling trie using table compiler\n",
1734 (int)depth * 2 + 2, ""));
1736 trie->trans = (reg_trie_trans *)
1737 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1738 * trie->uniquecharcount + 1,
1739 sizeof(reg_trie_trans) );
1740 trie->states = (reg_trie_state *)
1741 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1742 sizeof(reg_trie_state) );
1743 next_alloc = trie->uniquecharcount + 1;
1746 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1748 regnode * const noper = NEXTOPER( cur );
1749 const U8 *uc = (U8*)STRING( noper );
1750 const U8 * const e = uc + STR_LEN( noper );
1752 U32 state = 1; /* required init */
1754 U16 charid = 0; /* sanity init */
1755 U32 accept_state = 0; /* sanity init */
1756 U8 *scan = (U8*)NULL; /* sanity init */
1758 STRLEN foldlen = 0; /* required init */
1759 U32 wordlen = 0; /* required init */
1760 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762 if ( OP(noper) != NOTHING ) {
1763 for ( ; uc < e ; uc += len ) {
1768 charid = trie->charmap[ uvc ];
1770 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1771 charid = svpp ? (U16)SvIV(*svpp) : 0;
1775 if ( !trie->trans[ state + charid ].next ) {
1776 trie->trans[ state + charid ].next = next_alloc;
1777 trie->trans[ state ].check++;
1778 next_alloc += trie->uniquecharcount;
1780 state = trie->trans[ state + charid ].next;
1782 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1784 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1787 accept_state = TRIE_NODENUM( state );
1788 TRIE_HANDLE_WORD(accept_state);
1790 } /* end second pass */
1792 /* and now dump it out before we compress it */
1793 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1795 next_alloc, depth+1));
1799 * Inplace compress the table.*
1801 For sparse data sets the table constructed by the trie algorithm will
1802 be mostly 0/FAIL transitions or to put it another way mostly empty.
1803 (Note that leaf nodes will not contain any transitions.)
1805 This algorithm compresses the tables by eliminating most such
1806 transitions, at the cost of a modest bit of extra work during lookup:
1808 - Each states[] entry contains a .base field which indicates the
1809 index in the state[] array wheres its transition data is stored.
1811 - If .base is 0 there are no valid transitions from that node.
1813 - If .base is nonzero then charid is added to it to find an entry in
1816 -If trans[states[state].base+charid].check!=state then the
1817 transition is taken to be a 0/Fail transition. Thus if there are fail
1818 transitions at the front of the node then the .base offset will point
1819 somewhere inside the previous nodes data (or maybe even into a node
1820 even earlier), but the .check field determines if the transition is
1824 The following process inplace converts the table to the compressed
1825 table: We first do not compress the root node 1,and mark its all its
1826 .check pointers as 1 and set its .base pointer as 1 as well. This
1827 allows to do a DFA construction from the compressed table later, and
1828 ensures that any .base pointers we calculate later are greater than
1831 - We set 'pos' to indicate the first entry of the second node.
1833 - We then iterate over the columns of the node, finding the first and
1834 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1835 and set the .check pointers accordingly, and advance pos
1836 appropriately and repreat for the next node. Note that when we copy
1837 the next pointers we have to convert them from the original
1838 NODEIDX form to NODENUM form as the former is not valid post
1841 - If a node has no transitions used we mark its base as 0 and do not
1842 advance the pos pointer.
1844 - If a node only has one transition we use a second pointer into the
1845 structure to fill in allocated fail transitions from other states.
1846 This pointer is independent of the main pointer and scans forward
1847 looking for null transitions that are allocated to a state. When it
1848 finds one it writes the single transition into the "hole". If the
1849 pointer doesnt find one the single transition is appended as normal.
1851 - Once compressed we can Renew/realloc the structures to release the
1854 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1855 specifically Fig 3.47 and the associated pseudocode.
1859 const U32 laststate = TRIE_NODENUM( next_alloc );
1862 trie->statecount = laststate;
1864 for ( state = 1 ; state < laststate ; state++ ) {
1866 const U32 stateidx = TRIE_NODEIDX( state );
1867 const U32 o_used = trie->trans[ stateidx ].check;
1868 U32 used = trie->trans[ stateidx ].check;
1869 trie->trans[ stateidx ].check = 0;
1871 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1872 if ( flag || trie->trans[ stateidx + charid ].next ) {
1873 if ( trie->trans[ stateidx + charid ].next ) {
1875 for ( ; zp < pos ; zp++ ) {
1876 if ( ! trie->trans[ zp ].next ) {
1880 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1881 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882 trie->trans[ zp ].check = state;
1883 if ( ++zp > pos ) pos = zp;
1890 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1892 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1893 trie->trans[ pos ].check = state;
1898 trie->lasttrans = pos + 1;
1899 trie->states = (reg_trie_state *)
1900 PerlMemShared_realloc( trie->states, laststate
1901 * sizeof(reg_trie_state) );
1902 DEBUG_TRIE_COMPILE_MORE_r(
1903 PerlIO_printf( Perl_debug_log,
1904 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1905 (int)depth * 2 + 2,"",
1906 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1909 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1912 } /* end table compress */
1914 DEBUG_TRIE_COMPILE_MORE_r(
1915 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1916 (int)depth * 2 + 2, "",
1917 (UV)trie->statecount,
1918 (UV)trie->lasttrans)
1920 /* resize the trans array to remove unused space */
1921 trie->trans = (reg_trie_trans *)
1922 PerlMemShared_realloc( trie->trans, trie->lasttrans
1923 * sizeof(reg_trie_trans) );
1925 /* and now dump out the compressed format */
1926 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1928 { /* Modify the program and insert the new TRIE node*/
1929 U8 nodetype =(U8)(flags & 0xFF);
1933 regnode *optimize = NULL;
1934 #ifdef RE_TRACK_PATTERN_OFFSETS
1937 U32 mjd_nodelen = 0;
1938 #endif /* RE_TRACK_PATTERN_OFFSETS */
1939 #endif /* DEBUGGING */
1941 This means we convert either the first branch or the first Exact,
1942 depending on whether the thing following (in 'last') is a branch
1943 or not and whther first is the startbranch (ie is it a sub part of
1944 the alternation or is it the whole thing.)
1945 Assuming its a sub part we conver the EXACT otherwise we convert
1946 the whole branch sequence, including the first.
1948 /* Find the node we are going to overwrite */
1949 if ( first != startbranch || OP( last ) == BRANCH ) {
1950 /* branch sub-chain */
1951 NEXT_OFF( first ) = (U16)(last - first);
1952 #ifdef RE_TRACK_PATTERN_OFFSETS
1954 mjd_offset= Node_Offset((convert));
1955 mjd_nodelen= Node_Length((convert));
1958 /* whole branch chain */
1960 #ifdef RE_TRACK_PATTERN_OFFSETS
1963 const regnode *nop = NEXTOPER( convert );
1964 mjd_offset= Node_Offset((nop));
1965 mjd_nodelen= Node_Length((nop));
1969 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1970 (int)depth * 2 + 2, "",
1971 (UV)mjd_offset, (UV)mjd_nodelen)
1974 /* But first we check to see if there is a common prefix we can
1975 split out as an EXACT and put in front of the TRIE node. */
1976 trie->startstate= 1;
1977 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1979 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1983 const U32 base = trie->states[ state ].trans.base;
1985 if ( trie->states[state].wordnum )
1988 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1989 if ( ( base + ofs >= trie->uniquecharcount ) &&
1990 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1991 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1993 if ( ++count > 1 ) {
1994 SV **tmp = av_fetch( revcharmap, ofs, 0);
1995 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1996 if ( state == 1 ) break;
1998 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2000 PerlIO_printf(Perl_debug_log,
2001 "%*sNew Start State=%"UVuf" Class: [",
2002 (int)depth * 2 + 2, "",
2005 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2006 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2008 TRIE_BITMAP_SET(trie,*ch);
2010 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2012 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2016 TRIE_BITMAP_SET(trie,*ch);
2018 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2019 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2025 SV **tmp = av_fetch( revcharmap, idx, 0);
2027 char *ch = SvPV( *tmp, len );
2029 SV *sv=sv_newmortal();
2030 PerlIO_printf( Perl_debug_log,
2031 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2032 (int)depth * 2 + 2, "",
2034 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2035 PL_colors[0], PL_colors[1],
2036 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2037 PERL_PV_ESCAPE_FIRSTCHAR
2042 OP( convert ) = nodetype;
2043 str=STRING(convert);
2046 STR_LEN(convert) += len;
2052 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2058 regnode *n = convert+NODE_SZ_STR(convert);
2059 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2060 trie->startstate = state;
2061 trie->minlen -= (state - 1);
2062 trie->maxlen -= (state - 1);
2064 /* At least the UNICOS C compiler choked on this
2065 * being argument to DEBUG_r(), so let's just have
2068 #ifdef PERL_EXT_RE_BUILD
2074 regnode *fix = convert;
2075 U32 word = trie->wordcount;
2077 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2078 while( ++fix < n ) {
2079 Set_Node_Offset_Length(fix, 0, 0);
2082 SV ** const tmp = av_fetch( trie_words, word, 0 );
2084 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2087 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2095 NEXT_OFF(convert) = (U16)(tail - convert);
2096 DEBUG_r(optimize= n);
2102 if ( trie->maxlen ) {
2103 NEXT_OFF( convert ) = (U16)(tail - convert);
2104 ARG_SET( convert, data_slot );
2105 /* Store the offset to the first unabsorbed branch in
2106 jump[0], which is otherwise unused by the jump logic.
2107 We use this when dumping a trie and during optimisation. */
2109 trie->jump[0] = (U16)(nextbranch - convert);
2112 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2113 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2115 OP( convert ) = TRIEC;
2116 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2117 PerlMemShared_free(trie->bitmap);
2120 OP( convert ) = TRIE;
2122 /* store the type in the flags */
2123 convert->flags = nodetype;
2127 + regarglen[ OP( convert ) ];
2129 /* XXX We really should free up the resource in trie now,
2130 as we won't use them - (which resources?) dmq */
2132 /* needed for dumping*/
2133 DEBUG_r(if (optimize) {
2134 regnode *opt = convert;
2136 while ( ++opt < optimize) {
2137 Set_Node_Offset_Length(opt,0,0);
2140 Try to clean up some of the debris left after the
2143 while( optimize < jumper ) {
2144 mjd_nodelen += Node_Length((optimize));
2145 OP( optimize ) = OPTIMIZED;
2146 Set_Node_Offset_Length(optimize,0,0);
2149 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2151 } /* end node insert */
2152 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2153 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2155 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2156 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2158 SvREFCNT_dec(revcharmap);
2162 : trie->startstate>1
2168 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2170 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2172 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2173 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2176 We find the fail state for each state in the trie, this state is the longest proper
2177 suffix of the current states 'word' that is also a proper prefix of another word in our
2178 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2179 the DFA not to have to restart after its tried and failed a word at a given point, it
2180 simply continues as though it had been matching the other word in the first place.
2182 'abcdgu'=~/abcdefg|cdgu/
2183 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2184 fail, which would bring use to the state representing 'd' in the second word where we would
2185 try 'g' and succeed, prodceding to match 'cdgu'.
2187 /* add a fail transition */
2188 const U32 trie_offset = ARG(source);
2189 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2191 const U32 ucharcount = trie->uniquecharcount;
2192 const U32 numstates = trie->statecount;
2193 const U32 ubound = trie->lasttrans + ucharcount;
2197 U32 base = trie->states[ 1 ].trans.base;
2200 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2201 GET_RE_DEBUG_FLAGS_DECL;
2203 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2205 PERL_UNUSED_ARG(depth);
2209 ARG_SET( stclass, data_slot );
2210 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2211 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2212 aho->trie=trie_offset;
2213 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2214 Copy( trie->states, aho->states, numstates, reg_trie_state );
2215 Newxz( q, numstates, U32);
2216 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2219 /* initialize fail[0..1] to be 1 so that we always have
2220 a valid final fail state */
2221 fail[ 0 ] = fail[ 1 ] = 1;
2223 for ( charid = 0; charid < ucharcount ; charid++ ) {
2224 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2226 q[ q_write ] = newstate;
2227 /* set to point at the root */
2228 fail[ q[ q_write++ ] ]=1;
2231 while ( q_read < q_write) {
2232 const U32 cur = q[ q_read++ % numstates ];
2233 base = trie->states[ cur ].trans.base;
2235 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2236 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2238 U32 fail_state = cur;
2241 fail_state = fail[ fail_state ];
2242 fail_base = aho->states[ fail_state ].trans.base;
2243 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2245 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2246 fail[ ch_state ] = fail_state;
2247 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2249 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2251 q[ q_write++ % numstates] = ch_state;
2255 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2256 when we fail in state 1, this allows us to use the
2257 charclass scan to find a valid start char. This is based on the principle
2258 that theres a good chance the string being searched contains lots of stuff
2259 that cant be a start char.
2261 fail[ 0 ] = fail[ 1 ] = 0;
2262 DEBUG_TRIE_COMPILE_r({
2263 PerlIO_printf(Perl_debug_log,
2264 "%*sStclass Failtable (%"UVuf" states): 0",
2265 (int)(depth * 2), "", (UV)numstates
2267 for( q_read=1; q_read<numstates; q_read++ ) {
2268 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2270 PerlIO_printf(Perl_debug_log, "\n");
2273 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2278 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2279 * These need to be revisited when a newer toolchain becomes available.
2281 #if defined(__sparc64__) && defined(__GNUC__)
2282 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2283 # undef SPARC64_GCC_WORKAROUND
2284 # define SPARC64_GCC_WORKAROUND 1
2288 #define DEBUG_PEEP(str,scan,depth) \
2289 DEBUG_OPTIMISE_r({if (scan){ \
2290 SV * const mysv=sv_newmortal(); \
2291 regnode *Next = regnext(scan); \
2292 regprop(RExC_rx, mysv, scan); \
2293 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2294 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2295 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2302 #define JOIN_EXACT(scan,min,flags) \
2303 if (PL_regkind[OP(scan)] == EXACT) \
2304 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2307 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2308 /* Merge several consecutive EXACTish nodes into one. */
2309 regnode *n = regnext(scan);
2311 regnode *next = scan + NODE_SZ_STR(scan);
2315 regnode *stop = scan;
2316 GET_RE_DEBUG_FLAGS_DECL;
2318 PERL_UNUSED_ARG(depth);
2321 PERL_ARGS_ASSERT_JOIN_EXACT;
2322 #ifndef EXPERIMENTAL_INPLACESCAN
2323 PERL_UNUSED_ARG(flags);
2324 PERL_UNUSED_ARG(val);
2326 DEBUG_PEEP("join",scan,depth);
2328 /* Skip NOTHING, merge EXACT*. */
2330 ( PL_regkind[OP(n)] == NOTHING ||
2331 (stringok && (OP(n) == OP(scan))))
2333 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2335 if (OP(n) == TAIL || n > next)
2337 if (PL_regkind[OP(n)] == NOTHING) {
2338 DEBUG_PEEP("skip:",n,depth);
2339 NEXT_OFF(scan) += NEXT_OFF(n);
2340 next = n + NODE_STEP_REGNODE;
2347 else if (stringok) {
2348 const unsigned int oldl = STR_LEN(scan);
2349 regnode * const nnext = regnext(n);
2351 DEBUG_PEEP("merg",n,depth);
2354 if (oldl + STR_LEN(n) > U8_MAX)
2356 NEXT_OFF(scan) += NEXT_OFF(n);
2357 STR_LEN(scan) += STR_LEN(n);
2358 next = n + NODE_SZ_STR(n);
2359 /* Now we can overwrite *n : */
2360 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2368 #ifdef EXPERIMENTAL_INPLACESCAN
2369 if (flags && !NEXT_OFF(n)) {
2370 DEBUG_PEEP("atch", val, depth);
2371 if (reg_off_by_arg[OP(n)]) {
2372 ARG_SET(n, val - n);
2375 NEXT_OFF(n) = val - n;
2382 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2384 Two problematic code points in Unicode casefolding of EXACT nodes:
2386 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2387 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2393 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2394 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2396 This means that in case-insensitive matching (or "loose matching",
2397 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2398 length of the above casefolded versions) can match a target string
2399 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2400 This would rather mess up the minimum length computation.
2402 What we'll do is to look for the tail four bytes, and then peek
2403 at the preceding two bytes to see whether we need to decrease
2404 the minimum length by four (six minus two).
2406 Thanks to the design of UTF-8, there cannot be false matches:
2407 A sequence of valid UTF-8 bytes cannot be a subsequence of
2408 another valid sequence of UTF-8 bytes.
2411 char * const s0 = STRING(scan), *s, *t;
2412 char * const s1 = s0 + STR_LEN(scan) - 1;
2413 char * const s2 = s1 - 4;
2414 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2415 const char t0[] = "\xaf\x49\xaf\x42";
2417 const char t0[] = "\xcc\x88\xcc\x81";
2419 const char * const t1 = t0 + 3;
2422 s < s2 && (t = ninstr(s, s1, t0, t1));
2425 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2426 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2428 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2429 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2437 n = scan + NODE_SZ_STR(scan);
2439 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2446 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2450 /* REx optimizer. Converts nodes into quickier variants "in place".
2451 Finds fixed substrings. */
2453 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2454 to the position after last scanned or to NULL. */
2456 #define INIT_AND_WITHP \
2457 assert(!and_withp); \
2458 Newx(and_withp,1,struct regnode_charclass_class); \
2459 SAVEFREEPV(and_withp)
2461 /* this is a chain of data about sub patterns we are processing that
2462 need to be handled seperately/specially in study_chunk. Its so
2463 we can simulate recursion without losing state. */
2465 typedef struct scan_frame {
2466 regnode *last; /* last node to process in this frame */
2467 regnode *next; /* next node to process when last is reached */
2468 struct scan_frame *prev; /*previous frame*/
2469 I32 stop; /* what stopparen do we use */
2473 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2475 #define CASE_SYNST_FNC(nAmE) \
2477 if (flags & SCF_DO_STCLASS_AND) { \
2478 for (value = 0; value < 256; value++) \
2479 if (!is_ ## nAmE ## _cp(value)) \
2480 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2483 for (value = 0; value < 256; value++) \
2484 if (is_ ## nAmE ## _cp(value)) \
2485 ANYOF_BITMAP_SET(data->start_class, value); \
2489 if (flags & SCF_DO_STCLASS_AND) { \
2490 for (value = 0; value < 256; value++) \
2491 if (is_ ## nAmE ## _cp(value)) \
2492 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2495 for (value = 0; value < 256; value++) \
2496 if (!is_ ## nAmE ## _cp(value)) \
2497 ANYOF_BITMAP_SET(data->start_class, value); \
2504 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2505 I32 *minlenp, I32 *deltap,
2510 struct regnode_charclass_class *and_withp,
2511 U32 flags, U32 depth)
2512 /* scanp: Start here (read-write). */
2513 /* deltap: Write maxlen-minlen here. */
2514 /* last: Stop before this one. */
2515 /* data: string data about the pattern */
2516 /* stopparen: treat close N as END */
2517 /* recursed: which subroutines have we recursed into */
2518 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2521 I32 min = 0, pars = 0, code;
2522 regnode *scan = *scanp, *next;
2524 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2525 int is_inf_internal = 0; /* The studied chunk is infinite */
2526 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2527 scan_data_t data_fake;
2528 SV *re_trie_maxbuff = NULL;
2529 regnode *first_non_open = scan;
2530 I32 stopmin = I32_MAX;
2531 scan_frame *frame = NULL;
2532 GET_RE_DEBUG_FLAGS_DECL;
2534 PERL_ARGS_ASSERT_STUDY_CHUNK;
2537 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2541 while (first_non_open && OP(first_non_open) == OPEN)
2542 first_non_open=regnext(first_non_open);
2547 while ( scan && OP(scan) != END && scan < last ){
2548 /* Peephole optimizer: */
2549 DEBUG_STUDYDATA("Peep:", data,depth);
2550 DEBUG_PEEP("Peep",scan,depth);
2551 JOIN_EXACT(scan,&min,0);
2553 /* Follow the next-chain of the current node and optimize
2554 away all the NOTHINGs from it. */
2555 if (OP(scan) != CURLYX) {
2556 const int max = (reg_off_by_arg[OP(scan)]
2558 /* I32 may be smaller than U16 on CRAYs! */
2559 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2560 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2564 /* Skip NOTHING and LONGJMP. */
2565 while ((n = regnext(n))
2566 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2567 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2568 && off + noff < max)
2570 if (reg_off_by_arg[OP(scan)])
2573 NEXT_OFF(scan) = off;
2578 /* The principal pseudo-switch. Cannot be a switch, since we
2579 look into several different things. */
2580 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2581 || OP(scan) == IFTHEN) {
2582 next = regnext(scan);
2584 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2586 if (OP(next) == code || code == IFTHEN) {
2587 /* NOTE - There is similar code to this block below for handling
2588 TRIE nodes on a re-study. If you change stuff here check there
2590 I32 max1 = 0, min1 = I32_MAX, num = 0;
2591 struct regnode_charclass_class accum;
2592 regnode * const startbranch=scan;
2594 if (flags & SCF_DO_SUBSTR)
2595 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2596 if (flags & SCF_DO_STCLASS)
2597 cl_init_zero(pRExC_state, &accum);
2599 while (OP(scan) == code) {
2600 I32 deltanext, minnext, f = 0, fake;
2601 struct regnode_charclass_class this_class;
2604 data_fake.flags = 0;
2606 data_fake.whilem_c = data->whilem_c;
2607 data_fake.last_closep = data->last_closep;
2610 data_fake.last_closep = &fake;
2612 data_fake.pos_delta = delta;
2613 next = regnext(scan);
2614 scan = NEXTOPER(scan);
2616 scan = NEXTOPER(scan);
2617 if (flags & SCF_DO_STCLASS) {
2618 cl_init(pRExC_state, &this_class);
2619 data_fake.start_class = &this_class;
2620 f = SCF_DO_STCLASS_AND;
2622 if (flags & SCF_WHILEM_VISITED_POS)
2623 f |= SCF_WHILEM_VISITED_POS;
2625 /* we suppose the run is continuous, last=next...*/
2626 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2628 stopparen, recursed, NULL, f,depth+1);
2631 if (max1 < minnext + deltanext)
2632 max1 = minnext + deltanext;
2633 if (deltanext == I32_MAX)
2634 is_inf = is_inf_internal = 1;
2636 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2638 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2639 if ( stopmin > minnext)
2640 stopmin = min + min1;
2641 flags &= ~SCF_DO_SUBSTR;
2643 data->flags |= SCF_SEEN_ACCEPT;
2646 if (data_fake.flags & SF_HAS_EVAL)
2647 data->flags |= SF_HAS_EVAL;
2648 data->whilem_c = data_fake.whilem_c;
2650 if (flags & SCF_DO_STCLASS)
2651 cl_or(pRExC_state, &accum, &this_class);
2653 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2655 if (flags & SCF_DO_SUBSTR) {
2656 data->pos_min += min1;
2657 data->pos_delta += max1 - min1;
2658 if (max1 != min1 || is_inf)
2659 data->longest = &(data->longest_float);
2662 delta += max1 - min1;
2663 if (flags & SCF_DO_STCLASS_OR) {
2664 cl_or(pRExC_state, data->start_class, &accum);
2666 cl_and(data->start_class, and_withp);
2667 flags &= ~SCF_DO_STCLASS;
2670 else if (flags & SCF_DO_STCLASS_AND) {
2672 cl_and(data->start_class, &accum);
2673 flags &= ~SCF_DO_STCLASS;
2676 /* Switch to OR mode: cache the old value of
2677 * data->start_class */
2679 StructCopy(data->start_class, and_withp,
2680 struct regnode_charclass_class);
2681 flags &= ~SCF_DO_STCLASS_AND;
2682 StructCopy(&accum, data->start_class,
2683 struct regnode_charclass_class);
2684 flags |= SCF_DO_STCLASS_OR;
2685 data->start_class->flags |= ANYOF_EOS;
2689 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2692 Assuming this was/is a branch we are dealing with: 'scan' now
2693 points at the item that follows the branch sequence, whatever
2694 it is. We now start at the beginning of the sequence and look
2701 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2703 If we can find such a subseqence we need to turn the first
2704 element into a trie and then add the subsequent branch exact
2705 strings to the trie.
2709 1. patterns where the whole set of branch can be converted.
2711 2. patterns where only a subset can be converted.
2713 In case 1 we can replace the whole set with a single regop
2714 for the trie. In case 2 we need to keep the start and end
2717 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2718 becomes BRANCH TRIE; BRANCH X;
2720 There is an additional case, that being where there is a
2721 common prefix, which gets split out into an EXACT like node
2722 preceding the TRIE node.
2724 If x(1..n)==tail then we can do a simple trie, if not we make
2725 a "jump" trie, such that when we match the appropriate word
2726 we "jump" to the appopriate tail node. Essentailly we turn
2727 a nested if into a case structure of sorts.
2732 if (!re_trie_maxbuff) {
2733 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2734 if (!SvIOK(re_trie_maxbuff))
2735 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2737 if ( SvIV(re_trie_maxbuff)>=0 ) {
2739 regnode *first = (regnode *)NULL;
2740 regnode *last = (regnode *)NULL;
2741 regnode *tail = scan;
2746 SV * const mysv = sv_newmortal(); /* for dumping */
2748 /* var tail is used because there may be a TAIL
2749 regop in the way. Ie, the exacts will point to the
2750 thing following the TAIL, but the last branch will
2751 point at the TAIL. So we advance tail. If we
2752 have nested (?:) we may have to move through several
2756 while ( OP( tail ) == TAIL ) {
2757 /* this is the TAIL generated by (?:) */
2758 tail = regnext( tail );
2763 regprop(RExC_rx, mysv, tail );
2764 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2765 (int)depth * 2 + 2, "",
2766 "Looking for TRIE'able sequences. Tail node is: ",
2767 SvPV_nolen_const( mysv )
2773 step through the branches, cur represents each
2774 branch, noper is the first thing to be matched
2775 as part of that branch and noper_next is the
2776 regnext() of that node. if noper is an EXACT
2777 and noper_next is the same as scan (our current
2778 position in the regex) then the EXACT branch is
2779 a possible optimization target. Once we have
2780 two or more consequetive such branches we can
2781 create a trie of the EXACT's contents and stich
2782 it in place. If the sequence represents all of
2783 the branches we eliminate the whole thing and
2784 replace it with a single TRIE. If it is a
2785 subsequence then we need to stitch it in. This
2786 means the first branch has to remain, and needs
2787 to be repointed at the item on the branch chain
2788 following the last branch optimized. This could
2789 be either a BRANCH, in which case the
2790 subsequence is internal, or it could be the
2791 item following the branch sequence in which
2792 case the subsequence is at the end.
2796 /* dont use tail as the end marker for this traverse */
2797 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2798 regnode * const noper = NEXTOPER( cur );
2799 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2800 regnode * const noper_next = regnext( noper );
2804 regprop(RExC_rx, mysv, cur);
2805 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2806 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2808 regprop(RExC_rx, mysv, noper);
2809 PerlIO_printf( Perl_debug_log, " -> %s",
2810 SvPV_nolen_const(mysv));
2813 regprop(RExC_rx, mysv, noper_next );
2814 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2815 SvPV_nolen_const(mysv));
2817 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2818 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2820 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2821 : PL_regkind[ OP( noper ) ] == EXACT )
2822 || OP(noper) == NOTHING )
2824 && noper_next == tail
2829 if ( !first || optype == NOTHING ) {
2830 if (!first) first = cur;
2831 optype = OP( noper );
2837 Currently we do not believe that the trie logic can
2838 handle case insensitive matching properly when the
2839 pattern is not unicode (thus forcing unicode semantics).
2841 If/when this is fixed the following define can be swapped
2842 in below to fully enable trie logic.
2844 #define TRIE_TYPE_IS_SAFE 1
2847 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2849 if ( last && TRIE_TYPE_IS_SAFE ) {
2850 make_trie( pRExC_state,
2851 startbranch, first, cur, tail, count,
2854 if ( PL_regkind[ OP( noper ) ] == EXACT
2856 && noper_next == tail
2861 optype = OP( noper );
2871 regprop(RExC_rx, mysv, cur);
2872 PerlIO_printf( Perl_debug_log,
2873 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2874 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2878 if ( last && TRIE_TYPE_IS_SAFE ) {
2879 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2880 #ifdef TRIE_STUDY_OPT
2881 if ( ((made == MADE_EXACT_TRIE &&
2882 startbranch == first)
2883 || ( first_non_open == first )) &&
2885 flags |= SCF_TRIE_RESTUDY;
2886 if ( startbranch == first
2889 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2899 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2900 scan = NEXTOPER(NEXTOPER(scan));
2901 } else /* single branch is optimized. */
2902 scan = NEXTOPER(scan);
2904 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2905 scan_frame *newframe = NULL;
2910 if (OP(scan) != SUSPEND) {
2911 /* set the pointer */
2912 if (OP(scan) == GOSUB) {
2914 RExC_recurse[ARG2L(scan)] = scan;
2915 start = RExC_open_parens[paren-1];
2916 end = RExC_close_parens[paren-1];
2919 start = RExC_rxi->program + 1;
2923 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2924 SAVEFREEPV(recursed);
2926 if (!PAREN_TEST(recursed,paren+1)) {
2927 PAREN_SET(recursed,paren+1);
2928 Newx(newframe,1,scan_frame);
2930 if (flags & SCF_DO_SUBSTR) {
2931 SCAN_COMMIT(pRExC_state,data,minlenp);
2932 data->longest = &(data->longest_float);
2934 is_inf = is_inf_internal = 1;
2935 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2936 cl_anything(pRExC_state, data->start_class);
2937 flags &= ~SCF_DO_STCLASS;
2940 Newx(newframe,1,scan_frame);
2943 end = regnext(scan);
2948 SAVEFREEPV(newframe);
2949 newframe->next = regnext(scan);
2950 newframe->last = last;
2951 newframe->stop = stopparen;
2952 newframe->prev = frame;
2962 else if (OP(scan) == EXACT) {
2963 I32 l = STR_LEN(scan);
2966 const U8 * const s = (U8*)STRING(scan);
2967 l = utf8_length(s, s + l);
2968 uc = utf8_to_uvchr(s, NULL);
2970 uc = *((U8*)STRING(scan));
2973 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2974 /* The code below prefers earlier match for fixed
2975 offset, later match for variable offset. */
2976 if (data->last_end == -1) { /* Update the start info. */
2977 data->last_start_min = data->pos_min;
2978 data->last_start_max = is_inf
2979 ? I32_MAX : data->pos_min + data->pos_delta;
2981 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2983 SvUTF8_on(data->last_found);
2985 SV * const sv = data->last_found;
2986 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2987 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2988 if (mg && mg->mg_len >= 0)
2989 mg->mg_len += utf8_length((U8*)STRING(scan),
2990 (U8*)STRING(scan)+STR_LEN(scan));
2992 data->last_end = data->pos_min + l;
2993 data->pos_min += l; /* As in the first entry. */
2994 data->flags &= ~SF_BEFORE_EOL;
2996 if (flags & SCF_DO_STCLASS_AND) {
2997 /* Check whether it is compatible with what we know already! */
3001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3003 && (!(data->start_class->flags & ANYOF_FOLD)
3004 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3007 ANYOF_CLASS_ZERO(data->start_class);
3008 ANYOF_BITMAP_ZERO(data->start_class);
3010 ANYOF_BITMAP_SET(data->start_class, uc);
3011 data->start_class->flags &= ~ANYOF_EOS;
3013 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3015 else if (flags & SCF_DO_STCLASS_OR) {
3016 /* false positive possible if the class is case-folded */
3018 ANYOF_BITMAP_SET(data->start_class, uc);
3020 data->start_class->flags |= ANYOF_UNICODE_ALL;
3021 data->start_class->flags &= ~ANYOF_EOS;
3022 cl_and(data->start_class, and_withp);
3024 flags &= ~SCF_DO_STCLASS;
3026 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3027 I32 l = STR_LEN(scan);
3028 UV uc = *((U8*)STRING(scan));
3030 /* Search for fixed substrings supports EXACT only. */
3031 if (flags & SCF_DO_SUBSTR) {
3033 SCAN_COMMIT(pRExC_state, data, minlenp);
3036 const U8 * const s = (U8 *)STRING(scan);
3037 l = utf8_length(s, s + l);
3038 uc = utf8_to_uvchr(s, NULL);
3041 if (flags & SCF_DO_SUBSTR)
3043 if (flags & SCF_DO_STCLASS_AND) {
3044 /* Check whether it is compatible with what we know already! */
3048 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3049 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3050 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3052 ANYOF_CLASS_ZERO(data->start_class);
3053 ANYOF_BITMAP_ZERO(data->start_class);
3055 ANYOF_BITMAP_SET(data->start_class, uc);
3056 data->start_class->flags &= ~ANYOF_EOS;
3057 data->start_class->flags |= ANYOF_FOLD;
3058 if (OP(scan) == EXACTFL)
3059 data->start_class->flags |= ANYOF_LOCALE;
3062 else if (flags & SCF_DO_STCLASS_OR) {
3063 if (data->start_class->flags & ANYOF_FOLD) {
3064 /* false positive possible if the class is case-folded.
3065 Assume that the locale settings are the same... */
3067 ANYOF_BITMAP_SET(data->start_class, uc);
3068 data->start_class->flags &= ~ANYOF_EOS;
3070 cl_and(data->start_class, and_withp);
3072 flags &= ~SCF_DO_STCLASS;
3074 else if (strchr((const char*)PL_varies,OP(scan))) {
3075 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3076 I32 f = flags, pos_before = 0;
3077 regnode * const oscan = scan;
3078 struct regnode_charclass_class this_class;
3079 struct regnode_charclass_class *oclass = NULL;
3080 I32 next_is_eval = 0;
3082 switch (PL_regkind[OP(scan)]) {
3083 case WHILEM: /* End of (?:...)* . */
3084 scan = NEXTOPER(scan);
3087 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3088 next = NEXTOPER(scan);
3089 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3091 maxcount = REG_INFTY;
3092 next = regnext(scan);
3093 scan = NEXTOPER(scan);
3097 if (flags & SCF_DO_SUBSTR)
3102 if (flags & SCF_DO_STCLASS) {
3104 maxcount = REG_INFTY;
3105 next = regnext(scan);
3106 scan = NEXTOPER(scan);
3109 is_inf = is_inf_internal = 1;
3110 scan = regnext(scan);
3111 if (flags & SCF_DO_SUBSTR) {
3112 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3113 data->longest = &(data->longest_float);
3115 goto optimize_curly_tail;
3117 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3118 && (scan->flags == stopparen))
3123 mincount = ARG1(scan);
3124 maxcount = ARG2(scan);
3126 next = regnext(scan);
3127 if (OP(scan) == CURLYX) {
3128 I32 lp = (data ? *(data->last_closep) : 0);
3129 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3131 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3132 next_is_eval = (OP(scan) == EVAL);
3134 if (flags & SCF_DO_SUBSTR) {
3135 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3136 pos_before = data->pos_min;
3140 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3142 data->flags |= SF_IS_INF;
3144 if (flags & SCF_DO_STCLASS) {
3145 cl_init(pRExC_state, &this_class);
3146 oclass = data->start_class;
3147 data->start_class = &this_class;
3148 f |= SCF_DO_STCLASS_AND;
3149 f &= ~SCF_DO_STCLASS_OR;
3151 /* These are the cases when once a subexpression
3152 fails at a particular position, it cannot succeed
3153 even after backtracking at the enclosing scope.
3155 XXXX what if minimal match and we are at the
3156 initial run of {n,m}? */
3157 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3158 f &= ~SCF_WHILEM_VISITED_POS;
3160 /* This will finish on WHILEM, setting scan, or on NULL: */
3161 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3162 last, data, stopparen, recursed, NULL,
3164 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3166 if (flags & SCF_DO_STCLASS)
3167 data->start_class = oclass;
3168 if (mincount == 0 || minnext == 0) {
3169 if (flags & SCF_DO_STCLASS_OR) {
3170 cl_or(pRExC_state, data->start_class, &this_class);
3172 else if (flags & SCF_DO_STCLASS_AND) {
3173 /* Switch to OR mode: cache the old value of
3174 * data->start_class */
3176 StructCopy(data->start_class, and_withp,
3177 struct regnode_charclass_class);
3178 flags &= ~SCF_DO_STCLASS_AND;
3179 StructCopy(&this_class, data->start_class,
3180 struct regnode_charclass_class);
3181 flags |= SCF_DO_STCLASS_OR;
3182 data->start_class->flags |= ANYOF_EOS;
3184 } else { /* Non-zero len */
3185 if (flags & SCF_DO_STCLASS_OR) {
3186 cl_or(pRExC_state, data->start_class, &this_class);
3187 cl_and(data->start_class, and_withp);
3189 else if (flags & SCF_DO_STCLASS_AND)
3190 cl_and(data->start_class, &this_class);
3191 flags &= ~SCF_DO_STCLASS;
3193 if (!scan) /* It was not CURLYX, but CURLY. */
3195 if ( /* ? quantifier ok, except for (?{ ... }) */
3196 (next_is_eval || !(mincount == 0 && maxcount == 1))
3197 && (minnext == 0) && (deltanext == 0)
3198 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3199 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3201 ckWARNreg(RExC_parse,
3202 "Quantifier unexpected on zero-length expression");
3205 min += minnext * mincount;
3206 is_inf_internal |= ((maxcount == REG_INFTY
3207 && (minnext + deltanext) > 0)
3208 || deltanext == I32_MAX);
3209 is_inf |= is_inf_internal;
3210 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3212 /* Try powerful optimization CURLYX => CURLYN. */
3213 if ( OP(oscan) == CURLYX && data
3214 && data->flags & SF_IN_PAR
3215 && !(data->flags & SF_HAS_EVAL)
3216 && !deltanext && minnext == 1 ) {
3217 /* Try to optimize to CURLYN. */
3218 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3219 regnode * const nxt1 = nxt;
3226 if (!strchr((const char*)PL_simple,OP(nxt))
3227 && !(PL_regkind[OP(nxt)] == EXACT
3228 && STR_LEN(nxt) == 1))
3234 if (OP(nxt) != CLOSE)
3236 if (RExC_open_parens) {
3237 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3238 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3240 /* Now we know that nxt2 is the only contents: */
3241 oscan->flags = (U8)ARG(nxt);
3243 OP(nxt1) = NOTHING; /* was OPEN. */
3246 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3247 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3248 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3249 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3250 OP(nxt + 1) = OPTIMIZED; /* was count. */
3251 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3256 /* Try optimization CURLYX => CURLYM. */
3257 if ( OP(oscan) == CURLYX && data
3258 && !(data->flags & SF_HAS_PAR)
3259 && !(data->flags & SF_HAS_EVAL)
3260 && !deltanext /* atom is fixed width */
3261 && minnext != 0 /* CURLYM can't handle zero width */
3263 /* XXXX How to optimize if data == 0? */
3264 /* Optimize to a simpler form. */
3265 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3269 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3270 && (OP(nxt2) != WHILEM))
3272 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3273 /* Need to optimize away parenths. */
3274 if (data->flags & SF_IN_PAR) {
3275 /* Set the parenth number. */
3276 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3278 if (OP(nxt) != CLOSE)
3279 FAIL("Panic opt close");
3280 oscan->flags = (U8)ARG(nxt);
3281 if (RExC_open_parens) {
3282 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3283 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3285 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3286 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3289 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3290 OP(nxt + 1) = OPTIMIZED; /* was count. */
3291 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3292 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3295 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3296 regnode *nnxt = regnext(nxt1);
3299 if (reg_off_by_arg[OP(nxt1)])
3300 ARG_SET(nxt1, nxt2 - nxt1);
3301 else if (nxt2 - nxt1 < U16_MAX)
3302 NEXT_OFF(nxt1) = nxt2 - nxt1;
3304 OP(nxt) = NOTHING; /* Cannot beautify */
3309 /* Optimize again: */
3310 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3311 NULL, stopparen, recursed, NULL, 0,depth+1);
3316 else if ((OP(oscan) == CURLYX)
3317 && (flags & SCF_WHILEM_VISITED_POS)
3318 /* See the comment on a similar expression above.
3319 However, this time it not a subexpression
3320 we care about, but the expression itself. */
3321 && (maxcount == REG_INFTY)
3322 && data && ++data->whilem_c < 16) {
3323 /* This stays as CURLYX, we can put the count/of pair. */
3324 /* Find WHILEM (as in regexec.c) */
3325 regnode *nxt = oscan + NEXT_OFF(oscan);
3327 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3329 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3330 | (RExC_whilem_seen << 4)); /* On WHILEM */
3332 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3334 if (flags & SCF_DO_SUBSTR) {
3335 SV *last_str = NULL;
3336 int counted = mincount != 0;
3338 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3339 #if defined(SPARC64_GCC_WORKAROUND)
3342 const char *s = NULL;
3345 if (pos_before >= data->last_start_min)
3348 b = data->last_start_min;
3351 s = SvPV_const(data->last_found, l);
3352 old = b - data->last_start_min;
3355 I32 b = pos_before >= data->last_start_min
3356 ? pos_before : data->last_start_min;
3358 const char * const s = SvPV_const(data->last_found, l);
3359 I32 old = b - data->last_start_min;
3363 old = utf8_hop((U8*)s, old) - (U8*)s;
3366 /* Get the added string: */
3367 last_str = newSVpvn_utf8(s + old, l, UTF);
3368 if (deltanext == 0 && pos_before == b) {
3369 /* What was added is a constant string */
3371 SvGROW(last_str, (mincount * l) + 1);
3372 repeatcpy(SvPVX(last_str) + l,
3373 SvPVX_const(last_str), l, mincount - 1);
3374 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3375 /* Add additional parts. */
3376 SvCUR_set(data->last_found,
3377 SvCUR(data->last_found) - l);
3378 sv_catsv(data->last_found, last_str);
3380 SV * sv = data->last_found;
3382 SvUTF8(sv) && SvMAGICAL(sv) ?
3383 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3384 if (mg && mg->mg_len >= 0)
3385 mg->mg_len += CHR_SVLEN(last_str) - l;
3387 data->last_end += l * (mincount - 1);
3390 /* start offset must point into the last copy */
3391 data->last_start_min += minnext * (mincount - 1);
3392 data->last_start_max += is_inf ? I32_MAX
3393 : (maxcount - 1) * (minnext + data->pos_delta);
3396 /* It is counted once already... */
3397 data->pos_min += minnext * (mincount - counted);
3398 data->pos_delta += - counted * deltanext +
3399 (minnext + deltanext) * maxcount - minnext * mincount;
3400 if (mincount != maxcount) {
3401 /* Cannot extend fixed substrings found inside
3403 SCAN_COMMIT(pRExC_state,data,minlenp);
3404 if (mincount && last_str) {
3405 SV * const sv = data->last_found;
3406 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3407 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3411 sv_setsv(sv, last_str);
3412 data->last_end = data->pos_min;
3413 data->last_start_min =
3414 data->pos_min - CHR_SVLEN(last_str);
3415 data->last_start_max = is_inf
3417 : data->pos_min + data->pos_delta
3418 - CHR_SVLEN(last_str);
3420 data->longest = &(data->longest_float);
3422 SvREFCNT_dec(last_str);
3424 if (data && (fl & SF_HAS_EVAL))
3425 data->flags |= SF_HAS_EVAL;
3426 optimize_curly_tail:
3427 if (OP(oscan) != CURLYX) {
3428 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3430 NEXT_OFF(oscan) += NEXT_OFF(next);
3433 default: /* REF and CLUMP only? */
3434 if (flags & SCF_DO_SUBSTR) {
3435 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3436 data->longest = &(data->longest_float);
3438 is_inf = is_inf_internal = 1;
3439 if (flags & SCF_DO_STCLASS_OR)
3440 cl_anything(pRExC_state, data->start_class);
3441 flags &= ~SCF_DO_STCLASS;
3445 else if (OP(scan) == LNBREAK) {
3446 if (flags & SCF_DO_STCLASS) {
3448 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3449 if (flags & SCF_DO_STCLASS_AND) {
3450 for (value = 0; value < 256; value++)
3451 if (!is_VERTWS_cp(value))
3452 ANYOF_BITMAP_CLEAR(data->start_class, value);
3455 for (value = 0; value < 256; value++)
3456 if (is_VERTWS_cp(value))
3457 ANYOF_BITMAP_SET(data->start_class, value);
3459 if (flags & SCF_DO_STCLASS_OR)
3460 cl_and(data->start_class, and_withp);
3461 flags &= ~SCF_DO_STCLASS;
3465 if (flags & SCF_DO_SUBSTR) {
3466 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3468 data->pos_delta += 1;
3469 data->longest = &(data->longest_float);
3473 else if (OP(scan) == FOLDCHAR) {
3474 int d = ARG(scan)==0xDF ? 1 : 2;
3475 flags &= ~SCF_DO_STCLASS;
3478 if (flags & SCF_DO_SUBSTR) {
3479 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3481 data->pos_delta += d;
3482 data->longest = &(data->longest_float);
3485 else if (strchr((const char*)PL_simple,OP(scan))) {
3488 if (flags & SCF_DO_SUBSTR) {
3489 SCAN_COMMIT(pRExC_state,data,minlenp);
3493 if (flags & SCF_DO_STCLASS) {
3494 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3496 /* Some of the logic below assumes that switching
3497 locale on will only add false positives. */
3498 switch (PL_regkind[OP(scan)]) {
3502 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3503 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3504 cl_anything(pRExC_state, data->start_class);
3507 if (OP(scan) == SANY)
3509 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3510 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3511 || (data->start_class->flags & ANYOF_CLASS));
3512 cl_anything(pRExC_state, data->start_class);
3514 if (flags & SCF_DO_STCLASS_AND || !value)
3515 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3518 if (flags & SCF_DO_STCLASS_AND)
3519 cl_and(data->start_class,
3520 (struct regnode_charclass_class*)scan);
3522 cl_or(pRExC_state, data->start_class,
3523 (struct regnode_charclass_class*)scan);
3526 if (flags & SCF_DO_STCLASS_AND) {
3527 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3528 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3529 for (value = 0; value < 256; value++)
3530 if (!isALNUM(value))
3531 ANYOF_BITMAP_CLEAR(data->start_class, value);
3535 if (data->start_class->flags & ANYOF_LOCALE)
3536 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3538 for (value = 0; value < 256; value++)
3540 ANYOF_BITMAP_SET(data->start_class, value);
3545 if (flags & SCF_DO_STCLASS_AND) {
3546 if (data->start_class->flags & ANYOF_LOCALE)
3547 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3550 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3551 data->start_class->flags |= ANYOF_LOCALE;
3555 if (flags & SCF_DO_STCLASS_AND) {
3556 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3557 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3558 for (value = 0; value < 256; value++)
3560 ANYOF_BITMAP_CLEAR(data->start_class, value);
3564 if (data->start_class->flags & ANYOF_LOCALE)
3565 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3567 for (value = 0; value < 256; value++)
3568 if (!isALNUM(value))
3569 ANYOF_BITMAP_SET(data->start_class, value);
3574 if (flags & SCF_DO_STCLASS_AND) {
3575 if (data->start_class->flags & ANYOF_LOCALE)
3576 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3579 data->start_class->flags |= ANYOF_LOCALE;
3580 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3584 if (flags & SCF_DO_STCLASS_AND) {
3585 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3586 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3587 for (value = 0; value < 256; value++)
3588 if (!isSPACE(value))
3589 ANYOF_BITMAP_CLEAR(data->start_class, value);
3593 if (data->start_class->flags & ANYOF_LOCALE)
3594 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3596 for (value = 0; value < 256; value++)
3598 ANYOF_BITMAP_SET(data->start_class, value);
3603 if (flags & SCF_DO_STCLASS_AND) {
3604 if (data->start_class->flags & ANYOF_LOCALE)
3605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3608 data->start_class->flags |= ANYOF_LOCALE;
3609 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3613 if (flags & SCF_DO_STCLASS_AND) {
3614 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3615 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3616 for (value = 0; value < 256; value++)
3618 ANYOF_BITMAP_CLEAR(data->start_class, value);
3622 if (data->start_class->flags & ANYOF_LOCALE)
3623 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3625 for (value = 0; value < 256; value++)
3626 if (!isSPACE(value))
3627 ANYOF_BITMAP_SET(data->start_class, value);
3632 if (flags & SCF_DO_STCLASS_AND) {
3633 if (data->start_class->flags & ANYOF_LOCALE) {
3634 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3635 for (value = 0; value < 256; value++)
3636 if (!isSPACE(value))
3637 ANYOF_BITMAP_CLEAR(data->start_class, value);
3641 data->start_class->flags |= ANYOF_LOCALE;
3642 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3646 if (flags & SCF_DO_STCLASS_AND) {
3647 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3648 for (value = 0; value < 256; value++)
3649 if (!isDIGIT(value))
3650 ANYOF_BITMAP_CLEAR(data->start_class, value);
3653 if (data->start_class->flags & ANYOF_LOCALE)
3654 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3656 for (value = 0; value < 256; value++)
3658 ANYOF_BITMAP_SET(data->start_class, value);
3663 if (flags & SCF_DO_STCLASS_AND) {
3664 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3665 for (value = 0; value < 256; value++)
3667 ANYOF_BITMAP_CLEAR(data->start_class, value);
3670 if (data->start_class->flags & ANYOF_LOCALE)
3671 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3673 for (value = 0; value < 256; value++)
3674 if (!isDIGIT(value))
3675 ANYOF_BITMAP_SET(data->start_class, value);
3679 CASE_SYNST_FNC(VERTWS);
3680 CASE_SYNST_FNC(HORIZWS);
3683 if (flags & SCF_DO_STCLASS_OR)
3684 cl_and(data->start_class, and_withp);
3685 flags &= ~SCF_DO_STCLASS;
3688 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3689 data->flags |= (OP(scan) == MEOL
3693 else if ( PL_regkind[OP(scan)] == BRANCHJ
3694 /* Lookbehind, or need to calculate parens/evals/stclass: */
3695 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3696 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3697 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3698 || OP(scan) == UNLESSM )
3700 /* Negative Lookahead/lookbehind
3701 In this case we can't do fixed string optimisation.
3704 I32 deltanext, minnext, fake = 0;
3706 struct regnode_charclass_class intrnl;
3709 data_fake.flags = 0;
3711 data_fake.whilem_c = data->whilem_c;
3712 data_fake.last_closep = data->last_closep;
3715 data_fake.last_closep = &fake;
3716 data_fake.pos_delta = delta;
3717 if ( flags & SCF_DO_STCLASS && !scan->flags
3718 && OP(scan) == IFMATCH ) { /* Lookahead */
3719 cl_init(pRExC_state, &intrnl);
3720 data_fake.start_class = &intrnl;
3721 f |= SCF_DO_STCLASS_AND;
3723 if (flags & SCF_WHILEM_VISITED_POS)
3724 f |= SCF_WHILEM_VISITED_POS;
3725 next = regnext(scan);
3726 nscan = NEXTOPER(NEXTOPER(scan));
3727 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3728 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3731 FAIL("Variable length lookbehind not implemented");
3733 else if (minnext > (I32)U8_MAX) {
3734 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3736 scan->flags = (U8)minnext;
3739 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3741 if (data_fake.flags & SF_HAS_EVAL)
3742 data->flags |= SF_HAS_EVAL;
3743 data->whilem_c = data_fake.whilem_c;
3745 if (f & SCF_DO_STCLASS_AND) {
3746 if (flags & SCF_DO_STCLASS_OR) {
3747 /* OR before, AND after: ideally we would recurse with
3748 * data_fake to get the AND applied by study of the
3749 * remainder of the pattern, and then derecurse;
3750 * *** HACK *** for now just treat as "no information".
3751 * See [perl #56690].
3753 cl_init(pRExC_state, data->start_class);
3755 /* AND before and after: combine and continue */
3756 const int was = (data->start_class->flags & ANYOF_EOS);
3758 cl_and(data->start_class, &intrnl);
3760 data->start_class->flags |= ANYOF_EOS;
3764 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3766 /* Positive Lookahead/lookbehind
3767 In this case we can do fixed string optimisation,
3768 but we must be careful about it. Note in the case of
3769 lookbehind the positions will be offset by the minimum
3770 length of the pattern, something we won't know about
3771 until after the recurse.
3773 I32 deltanext, fake = 0;
3775 struct regnode_charclass_class intrnl;
3777 /* We use SAVEFREEPV so that when the full compile
3778 is finished perl will clean up the allocated
3779 minlens when its all done. This was we don't
3780 have to worry about freeing them when we know
3781 they wont be used, which would be a pain.
3784 Newx( minnextp, 1, I32 );
3785 SAVEFREEPV(minnextp);
3788 StructCopy(data, &data_fake, scan_data_t);
3789 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3792 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3793 data_fake.last_found=newSVsv(data->last_found);
3797 data_fake.last_closep = &fake;
3798 data_fake.flags = 0;
3799 data_fake.pos_delta = delta;
3801 data_fake.flags |= SF_IS_INF;
3802 if ( flags & SCF_DO_STCLASS && !scan->flags
3803 && OP(scan) == IFMATCH ) { /* Lookahead */
3804 cl_init(pRExC_state, &intrnl);
3805 data_fake.start_class = &intrnl;
3806 f |= SCF_DO_STCLASS_AND;
3808 if (flags & SCF_WHILEM_VISITED_POS)
3809 f |= SCF_WHILEM_VISITED_POS;
3810 next = regnext(scan);
3811 nscan = NEXTOPER(NEXTOPER(scan));
3813 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3814 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3817 FAIL("Variable length lookbehind not implemented");
3819 else if (*minnextp > (I32)U8_MAX) {
3820 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3822 scan->flags = (U8)*minnextp;
3827 if (f & SCF_DO_STCLASS_AND) {
3828 const int was = (data->start_class->flags & ANYOF_EOS);
3830 cl_and(data->start_class, &intrnl);
3832 data->start_class->flags |= ANYOF_EOS;
3835 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3837 if (data_fake.flags & SF_HAS_EVAL)
3838 data->flags |= SF_HAS_EVAL;
3839 data->whilem_c = data_fake.whilem_c;
3840 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3841 if (RExC_rx->minlen<*minnextp)
3842 RExC_rx->minlen=*minnextp;
3843 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3844 SvREFCNT_dec(data_fake.last_found);
3846 if ( data_fake.minlen_fixed != minlenp )
3848 data->offset_fixed= data_fake.offset_fixed;
3849 data->minlen_fixed= data_fake.minlen_fixed;
3850 data->lookbehind_fixed+= scan->flags;
3852 if ( data_fake.minlen_float != minlenp )
3854 data->minlen_float= data_fake.minlen_float;
3855 data->offset_float_min=data_fake.offset_float_min;
3856 data->offset_float_max=data_fake.offset_float_max;
3857 data->lookbehind_float+= scan->flags;
3866 else if (OP(scan) == OPEN) {
3867 if (stopparen != (I32)ARG(scan))
3870 else if (OP(scan) == CLOSE) {
3871 if (stopparen == (I32)ARG(scan)) {
3874 if ((I32)ARG(scan) == is_par) {
3875 next = regnext(scan);
3877 if ( next && (OP(next) != WHILEM) && next < last)
3878 is_par = 0; /* Disable optimization */
3881 *(data->last_closep) = ARG(scan);
3883 else if (OP(scan) == EVAL) {
3885 data->flags |= SF_HAS_EVAL;
3887 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3888 if (flags & SCF_DO_SUBSTR) {
3889 SCAN_COMMIT(pRExC_state,data,minlenp);
3890 flags &= ~SCF_DO_SUBSTR;
3892 if (data && OP(scan)==ACCEPT) {
3893 data->flags |= SCF_SEEN_ACCEPT;
3898 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3900 if (flags & SCF_DO_SUBSTR) {
3901 SCAN_COMMIT(pRExC_state,data,minlenp);
3902 data->longest = &(data->longest_float);
3904 is_inf = is_inf_internal = 1;
3905 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3906 cl_anything(pRExC_state, data->start_class);
3907 flags &= ~SCF_DO_STCLASS;
3909 else if (OP(scan) == GPOS) {
3910 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3911 !(delta || is_inf || (data && data->pos_delta)))
3913 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3914 RExC_rx->extflags |= RXf_ANCH_GPOS;
3915 if (RExC_rx->gofs < (U32)min)
3916 RExC_rx->gofs = min;
3918 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3922 #ifdef TRIE_STUDY_OPT
3923 #ifdef FULL_TRIE_STUDY
3924 else if (PL_regkind[OP(scan)] == TRIE) {
3925 /* NOTE - There is similar code to this block above for handling
3926 BRANCH nodes on the initial study. If you change stuff here
3928 regnode *trie_node= scan;
3929 regnode *tail= regnext(scan);
3930 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3931 I32 max1 = 0, min1 = I32_MAX;
3932 struct regnode_charclass_class accum;
3934 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3935 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3936 if (flags & SCF_DO_STCLASS)
3937 cl_init_zero(pRExC_state, &accum);
3943 const regnode *nextbranch= NULL;
3946 for ( word=1 ; word <= trie->wordcount ; word++)
3948 I32 deltanext=0, minnext=0, f = 0, fake;
3949 struct regnode_charclass_class this_class;
3951 data_fake.flags = 0;
3953 data_fake.whilem_c = data->whilem_c;
3954 data_fake.last_closep = data->last_closep;
3957 data_fake.last_closep = &fake;
3958 data_fake.pos_delta = delta;
3959 if (flags & SCF_DO_STCLASS) {
3960 cl_init(pRExC_state, &this_class);
3961 data_fake.start_class = &this_class;
3962 f = SCF_DO_STCLASS_AND;
3964 if (flags & SCF_WHILEM_VISITED_POS)
3965 f |= SCF_WHILEM_VISITED_POS;
3967 if (trie->jump[word]) {
3969 nextbranch = trie_node + trie->jump[0];
3970 scan= trie_node + trie->jump[word];
3971 /* We go from the jump point to the branch that follows
3972 it. Note this means we need the vestigal unused branches
3973 even though they arent otherwise used.
3975 minnext = study_chunk(pRExC_state, &scan, minlenp,
3976 &deltanext, (regnode *)nextbranch, &data_fake,
3977 stopparen, recursed, NULL, f,depth+1);
3979 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3980 nextbranch= regnext((regnode*)nextbranch);
3982 if (min1 > (I32)(minnext + trie->minlen))
3983 min1 = minnext + trie->minlen;
3984 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3985 max1 = minnext + deltanext + trie->maxlen;
3986 if (deltanext == I32_MAX)
3987 is_inf = is_inf_internal = 1;
3989 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3991 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3992 if ( stopmin > min + min1)
3993 stopmin = min + min1;
3994 flags &= ~SCF_DO_SUBSTR;
3996 data->flags |= SCF_SEEN_ACCEPT;
3999 if (data_fake.flags & SF_HAS_EVAL)
4000 data->flags |= SF_HAS_EVAL;
4001 data->whilem_c = data_fake.whilem_c;
4003 if (flags & SCF_DO_STCLASS)
4004 cl_or(pRExC_state, &accum, &this_class);
4007 if (flags & SCF_DO_SUBSTR) {
4008 data->pos_min += min1;
4009 data->pos_delta += max1 - min1;
4010 if (max1 != min1 || is_inf)
4011 data->longest = &(data->longest_float);
4014 delta += max1 - min1;
4015 if (flags & SCF_DO_STCLASS_OR) {
4016 cl_or(pRExC_state, data->start_class, &accum);
4018 cl_and(data->start_class, and_withp);
4019 flags &= ~SCF_DO_STCLASS;
4022 else if (flags & SCF_DO_STCLASS_AND) {
4024 cl_and(data->start_class, &accum);
4025 flags &= ~SCF_DO_STCLASS;
4028 /* Switch to OR mode: cache the old value of
4029 * data->start_class */
4031 StructCopy(data->start_class, and_withp,
4032 struct regnode_charclass_class);
4033 flags &= ~SCF_DO_STCLASS_AND;
4034 StructCopy(&accum, data->start_class,
4035 struct regnode_charclass_class);
4036 flags |= SCF_DO_STCLASS_OR;
4037 data->start_class->flags |= ANYOF_EOS;
4044 else if (PL_regkind[OP(scan)] == TRIE) {
4045 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4048 min += trie->minlen;
4049 delta += (trie->maxlen - trie->minlen);
4050 flags &= ~SCF_DO_STCLASS; /* xxx */
4051 if (flags & SCF_DO_SUBSTR) {
4052 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4053 data->pos_min += trie->minlen;
4054 data->pos_delta += (trie->maxlen - trie->minlen);
4055 if (trie->maxlen != trie->minlen)
4056 data->longest = &(data->longest_float);
4058 if (trie->jump) /* no more substrings -- for now /grr*/
4059 flags &= ~SCF_DO_SUBSTR;
4061 #endif /* old or new */
4062 #endif /* TRIE_STUDY_OPT */
4064 /* Else: zero-length, ignore. */
4065 scan = regnext(scan);
4070 stopparen = frame->stop;
4071 frame = frame->prev;
4072 goto fake_study_recurse;
4077 DEBUG_STUDYDATA("pre-fin:",data,depth);
4080 *deltap = is_inf_internal ? I32_MAX : delta;
4081 if (flags & SCF_DO_SUBSTR && is_inf)
4082 data->pos_delta = I32_MAX - data->pos_min;
4083 if (is_par > (I32)U8_MAX)
4085 if (is_par && pars==1 && data) {
4086 data->flags |= SF_IN_PAR;
4087 data->flags &= ~SF_HAS_PAR;
4089 else if (pars && data) {
4090 data->flags |= SF_HAS_PAR;
4091 data->flags &= ~SF_IN_PAR;
4093 if (flags & SCF_DO_STCLASS_OR)
4094 cl_and(data->start_class, and_withp);
4095 if (flags & SCF_TRIE_RESTUDY)
4096 data->flags |= SCF_TRIE_RESTUDY;
4098 DEBUG_STUDYDATA("post-fin:",data,depth);
4100 return min < stopmin ? min : stopmin;
4104 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4106 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4108 PERL_ARGS_ASSERT_ADD_DATA;
4110 Renewc(RExC_rxi->data,
4111 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4112 char, struct reg_data);
4114 Renew(RExC_rxi->data->what, count + n, U8);
4116 Newx(RExC_rxi->data->what, n, U8);
4117 RExC_rxi->data->count = count + n;
4118 Copy(s, RExC_rxi->data->what + count, n, U8);
4122 /*XXX: todo make this not included in a non debugging perl */
4123 #ifndef PERL_IN_XSUB_RE
4125 Perl_reginitcolors(pTHX)
4128 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4130 char *t = savepv(s);
4134 t = strchr(t, '\t');
4140 PL_colors[i] = t = (char *)"";
4145 PL_colors[i++] = (char *)"";
4152 #ifdef TRIE_STUDY_OPT
4153 #define CHECK_RESTUDY_GOTO \
4155 (data.flags & SCF_TRIE_RESTUDY) \
4159 #define CHECK_RESTUDY_GOTO
4163 - pregcomp - compile a regular expression into internal code
4165 * We can't allocate space until we know how big the compiled form will be,
4166 * but we can't compile it (and thus know how big it is) until we've got a
4167 * place to put the code. So we cheat: we compile it twice, once with code
4168 * generation turned off and size counting turned on, and once "for real".
4169 * This also means that we don't allocate space until we are sure that the
4170 * thing really will compile successfully, and we never have to move the
4171 * code and thus invalidate pointers into it. (Note that it has to be in
4172 * one piece because free() must be able to free it all.) [NB: not true in perl]
4174 * Beware that the optimization-preparation code in here knows about some
4175 * of the structure of the compiled regexp. [I'll say.]
4180 #ifndef PERL_IN_XSUB_RE
4181 #define RE_ENGINE_PTR &reh_regexp_engine
4183 extern const struct regexp_engine my_reg_engine;
4184 #define RE_ENGINE_PTR &my_reg_engine
4187 #ifndef PERL_IN_XSUB_RE
4189 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4192 HV * const table = GvHV(PL_hintgv);
4194 PERL_ARGS_ASSERT_PREGCOMP;
4196 /* Dispatch a request to compile a regexp to correct
4199 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4200 GET_RE_DEBUG_FLAGS_DECL;
4201 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4202 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4204 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4207 return CALLREGCOMP_ENG(eng, pattern, flags);
4210 return Perl_re_compile(aTHX_ pattern, flags);
4215 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4220 register regexp_internal *ri;
4222 char *exp = SvPV(pattern, plen);
4223 char* xend = exp + plen;
4230 RExC_state_t RExC_state;
4231 RExC_state_t * const pRExC_state = &RExC_state;
4232 #ifdef TRIE_STUDY_OPT
4234 RExC_state_t copyRExC_state;
4236 GET_RE_DEBUG_FLAGS_DECL;
4238 PERL_ARGS_ASSERT_RE_COMPILE;
4240 DEBUG_r(if (!PL_colorset) reginitcolors());
4242 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4245 SV *dsv= sv_newmortal();
4246 RE_PV_QUOTED_DECL(s, RExC_utf8,
4247 dsv, exp, plen, 60);
4248 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4249 PL_colors[4],PL_colors[5],s);
4254 RExC_flags = pm_flags;
4258 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4259 RExC_seen_evals = 0;
4262 /* First pass: determine size, legality. */
4270 RExC_emit = &PL_regdummy;
4271 RExC_whilem_seen = 0;
4272 RExC_charnames = NULL;
4273 RExC_open_parens = NULL;
4274 RExC_close_parens = NULL;
4276 RExC_paren_names = NULL;
4278 RExC_paren_name_list = NULL;
4280 RExC_recurse = NULL;
4281 RExC_recurse_count = 0;
4283 #if 0 /* REGC() is (currently) a NOP at the first pass.
4284 * Clever compilers notice this and complain. --jhi */
4285 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4287 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4288 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4289 RExC_precomp = NULL;
4292 if (RExC_utf8 && !RExC_orig_utf8) {
4293 /* It's possible to write a regexp in ascii that represents Unicode
4294 codepoints outside of the byte range, such as via \x{100}. If we
4295 detect such a sequence we have to convert the entire pattern to utf8
4296 and then recompile, as our sizing calculation will have been based
4297 on 1 byte == 1 character, but we will need to use utf8 to encode
4298 at least some part of the pattern, and therefore must convert the whole
4300 XXX: somehow figure out how to make this less expensive...
4303 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4304 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4305 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4307 RExC_orig_utf8 = RExC_utf8;
4309 goto redo_first_pass;
4312 PerlIO_printf(Perl_debug_log,
4313 "Required size %"IVdf" nodes\n"
4314 "Starting second pass (creation)\n",
4317 RExC_lastparse=NULL;
4319 /* Small enough for pointer-storage convention?
4320 If extralen==0, this means that we will not need long jumps. */
4321 if (RExC_size >= 0x10000L && RExC_extralen)
4322 RExC_size += RExC_extralen;
4325 if (RExC_whilem_seen > 15)
4326 RExC_whilem_seen = 15;
4328 /* Allocate space and zero-initialize. Note, the two step process
4329 of zeroing when in debug mode, thus anything assigned has to
4330 happen after that */
4331 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4332 r = (struct regexp*)SvANY(rx);
4333 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4334 char, regexp_internal);
4335 if ( r == NULL || ri == NULL )
4336 FAIL("Regexp out of space");
4338 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4339 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4341 /* bulk initialize base fields with 0. */
4342 Zero(ri, sizeof(regexp_internal), char);
4345 /* non-zero initialization begins here */
4347 r->engine= RE_ENGINE_PTR;
4348 r->extflags = pm_flags;
4350 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4351 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4352 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4353 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4354 >> RXf_PMf_STD_PMMOD_SHIFT);
4355 const char *fptr = STD_PAT_MODS; /*"msix"*/
4357 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4358 + (sizeof(STD_PAT_MODS) - 1)
4359 + (sizeof("(?:)") - 1);
4361 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4362 SvCUR_set(rx, wraplen);
4364 SvFLAGS(rx) |= SvUTF8(pattern);
4367 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4369 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4370 char *colon = r + 1;
4373 while((ch = *fptr++)) {
4387 Copy(RExC_precomp, p, plen, char);
4388 assert ((RX_WRAPPED(rx) - p) < 16);
4389 r->pre_prefix = p - RX_WRAPPED(rx);
4398 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4400 if (RExC_seen & REG_SEEN_RECURSE) {
4401 Newxz(RExC_open_parens, RExC_npar,regnode *);
4402 SAVEFREEPV(RExC_open_parens);
4403 Newxz(RExC_close_parens,RExC_npar,regnode *);
4404 SAVEFREEPV(RExC_close_parens);
4407 /* Useful during FAIL. */
4408 #ifdef RE_TRACK_PATTERN_OFFSETS
4409 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4410 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4411 "%s %"UVuf" bytes for offset annotations.\n",
4412 ri->u.offsets ? "Got" : "Couldn't get",
4413 (UV)((2*RExC_size+1) * sizeof(U32))));
4415 SetProgLen(ri,RExC_size);
4419 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4421 /* Second pass: emit code. */
4422 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4427 RExC_emit_start = ri->program;
4428 RExC_emit = ri->program;
4429 RExC_emit_bound = ri->program + RExC_size + 1;
4431 /* Store the count of eval-groups for security checks: */
4432 RExC_rx->seen_evals = RExC_seen_evals;
4433 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4434 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4438 /* XXXX To minimize changes to RE engine we always allocate
4439 3-units-long substrs field. */
4440 Newx(r->substrs, 1, struct reg_substr_data);
4441 if (RExC_recurse_count) {
4442 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4443 SAVEFREEPV(RExC_recurse);
4447 r->minlen = minlen = sawplus = sawopen = 0;
4448 Zero(r->substrs, 1, struct reg_substr_data);
4450 #ifdef TRIE_STUDY_OPT
4452 StructCopy(&zero_scan_data, &data, scan_data_t);
4453 copyRExC_state = RExC_state;
4456 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4458 RExC_state = copyRExC_state;
4459 if (seen & REG_TOP_LEVEL_BRANCHES)
4460 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4462 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4463 if (data.last_found) {
4464 SvREFCNT_dec(data.longest_fixed);
4465 SvREFCNT_dec(data.longest_float);
4466 SvREFCNT_dec(data.last_found);
4468 StructCopy(&zero_scan_data, &data, scan_data_t);
4471 StructCopy(&zero_scan_data, &data, scan_data_t);
4474 /* Dig out information for optimizations. */
4475 r->extflags = RExC_flags; /* was pm_op */
4476 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4479 SvUTF8_on(rx); /* Unicode in it? */
4480 ri->regstclass = NULL;
4481 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4482 r->intflags |= PREGf_NAUGHTY;
4483 scan = ri->program + 1; /* First BRANCH. */
4485 /* testing for BRANCH here tells us whether there is "must appear"
4486 data in the pattern. If there is then we can use it for optimisations */
4487 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4489 STRLEN longest_float_length, longest_fixed_length;
4490 struct regnode_charclass_class ch_class; /* pointed to by data */
4492 I32 last_close = 0; /* pointed to by data */
4493 regnode *first= scan;
4494 regnode *first_next= regnext(first);
4497 * Skip introductions and multiplicators >= 1
4498 * so that we can extract the 'meat' of the pattern that must
4499 * match in the large if() sequence following.
4500 * NOTE that EXACT is NOT covered here, as it is normally
4501 * picked up by the optimiser separately.
4503 * This is unfortunate as the optimiser isnt handling lookahead
4504 * properly currently.
4507 while ((OP(first) == OPEN && (sawopen = 1)) ||
4508 /* An OR of *one* alternative - should not happen now. */
4509 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4510 /* for now we can't handle lookbehind IFMATCH*/
4511 (OP(first) == IFMATCH && !first->flags) ||
4512 (OP(first) == PLUS) ||
4513 (OP(first) == MINMOD) ||
4514 /* An {n,m} with n>0 */
4515 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4516 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4519 * the only op that could be a regnode is PLUS, all the rest
4520 * will be regnode_1 or regnode_2.
4523 if (OP(first) == PLUS)
4526 first += regarglen[OP(first)];
4528 first = NEXTOPER(first);
4529 first_next= regnext(first);
4532 /* Starting-point info. */
4534 DEBUG_PEEP("first:",first,0);
4535 /* Ignore EXACT as we deal with it later. */
4536 if (PL_regkind[OP(first)] == EXACT) {
4537 if (OP(first) == EXACT)
4538 NOOP; /* Empty, get anchored substr later. */
4539 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4540 ri->regstclass = first;
4543 else if (PL_regkind[OP(first)] == TRIE &&
4544 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4547 /* this can happen only on restudy */
4548 if ( OP(first) == TRIE ) {
4549 struct regnode_1 *trieop = (struct regnode_1 *)
4550 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4551 StructCopy(first,trieop,struct regnode_1);
4552 trie_op=(regnode *)trieop;
4554 struct regnode_charclass *trieop = (struct regnode_charclass *)
4555 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4556 StructCopy(first,trieop,struct regnode_charclass);
4557 trie_op=(regnode *)trieop;
4560 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4561 ri->regstclass = trie_op;
4564 else if (strchr((const char*)PL_simple,OP(first)))
4565 ri->regstclass = first;
4566 else if (PL_regkind[OP(first)] == BOUND ||
4567 PL_regkind[OP(first)] == NBOUND)
4568 ri->regstclass = first;
4569 else if (PL_regkind[OP(first)] == BOL) {
4570 r->extflags |= (OP(first) == MBOL
4572 : (OP(first) == SBOL
4575 first = NEXTOPER(first);
4578 else if (OP(first) == GPOS) {
4579 r->extflags |= RXf_ANCH_GPOS;
4580 first = NEXTOPER(first);
4583 else if ((!sawopen || !RExC_sawback) &&
4584 (OP(first) == STAR &&
4585 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4586 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4588 /* turn .* into ^.* with an implied $*=1 */
4590 (OP(NEXTOPER(first)) == REG_ANY)
4593 r->extflags |= type;
4594 r->intflags |= PREGf_IMPLICIT;
4595 first = NEXTOPER(first);
4598 if (sawplus && (!sawopen || !RExC_sawback)
4599 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4600 /* x+ must match at the 1st pos of run of x's */
4601 r->intflags |= PREGf_SKIP;
4603 /* Scan is after the zeroth branch, first is atomic matcher. */
4604 #ifdef TRIE_STUDY_OPT
4607 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4608 (IV)(first - scan + 1))
4612 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4613 (IV)(first - scan + 1))
4619 * If there's something expensive in the r.e., find the
4620 * longest literal string that must appear and make it the
4621 * regmust. Resolve ties in favor of later strings, since
4622 * the regstart check works with the beginning of the r.e.
4623 * and avoiding duplication strengthens checking. Not a
4624 * strong reason, but sufficient in the absence of others.
4625 * [Now we resolve ties in favor of the earlier string if
4626 * it happens that c_offset_min has been invalidated, since the
4627 * earlier string may buy us something the later one won't.]
4630 data.longest_fixed = newSVpvs("");
4631 data.longest_float = newSVpvs("");
4632 data.last_found = newSVpvs("");
4633 data.longest = &(data.longest_fixed);
4635 if (!ri->regstclass) {
4636 cl_init(pRExC_state, &ch_class);
4637 data.start_class = &ch_class;
4638 stclass_flag = SCF_DO_STCLASS_AND;
4639 } else /* XXXX Check for BOUND? */
4641 data.last_closep = &last_close;
4643 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4644 &data, -1, NULL, NULL,
4645 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4651 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4652 && data.last_start_min == 0 && data.last_end > 0
4653 && !RExC_seen_zerolen
4654 && !(RExC_seen & REG_SEEN_VERBARG)
4655 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4656 r->extflags |= RXf_CHECK_ALL;
4657 scan_commit(pRExC_state, &data,&minlen,0);
4658 SvREFCNT_dec(data.last_found);
4660 /* Note that code very similar to this but for anchored string
4661 follows immediately below, changes may need to be made to both.
4664 longest_float_length = CHR_SVLEN(data.longest_float);
4665 if (longest_float_length
4666 || (data.flags & SF_FL_BEFORE_EOL
4667 && (!(data.flags & SF_FL_BEFORE_MEOL)
4668 || (RExC_flags & RXf_PMf_MULTILINE))))
4672 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4673 && data.offset_fixed == data.offset_float_min
4674 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4675 goto remove_float; /* As in (a)+. */
4677 /* copy the information about the longest float from the reg_scan_data
4678 over to the program. */
4679 if (SvUTF8(data.longest_float)) {
4680 r->float_utf8 = data.longest_float;
4681 r->float_substr = NULL;
4683 r->float_substr = data.longest_float;
4684 r->float_utf8 = NULL;
4686 /* float_end_shift is how many chars that must be matched that
4687 follow this item. We calculate it ahead of time as once the
4688 lookbehind offset is added in we lose the ability to correctly
4690 ml = data.minlen_float ? *(data.minlen_float)
4691 : (I32)longest_float_length;
4692 r->float_end_shift = ml - data.offset_float_min
4693 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4694 + data.lookbehind_float;
4695 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4696 r->float_max_offset = data.offset_float_max;
4697 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4698 r->float_max_offset -= data.lookbehind_float;
4700 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4701 && (!(data.flags & SF_FL_BEFORE_MEOL)
4702 || (RExC_flags & RXf_PMf_MULTILINE)));
4703 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4707 r->float_substr = r->float_utf8 = NULL;
4708 SvREFCNT_dec(data.longest_float);
4709 longest_float_length = 0;
4712 /* Note that code very similar to this but for floating string
4713 is immediately above, changes may need to be made to both.
4716 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4717 if (longest_fixed_length
4718 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4719 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4720 || (RExC_flags & RXf_PMf_MULTILINE))))
4724 /* copy the information about the longest fixed
4725 from the reg_scan_data over to the program. */
4726 if (SvUTF8(data.longest_fixed)) {
4727 r->anchored_utf8 = data.longest_fixed;
4728 r->anchored_substr = NULL;
4730 r->anchored_substr = data.longest_fixed;
4731 r->anchored_utf8 = NULL;
4733 /* fixed_end_shift is how many chars that must be matched that
4734 follow this item. We calculate it ahead of time as once the
4735 lookbehind offset is added in we lose the ability to correctly
4737 ml = data.minlen_fixed ? *(data.minlen_fixed)
4738 : (I32)longest_fixed_length;
4739 r->anchored_end_shift = ml - data.offset_fixed
4740 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4741 + data.lookbehind_fixed;
4742 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4744 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4745 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4746 || (RExC_flags & RXf_PMf_MULTILINE)));
4747 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4750 r->anchored_substr = r->anchored_utf8 = NULL;
4751 SvREFCNT_dec(data.longest_fixed);
4752 longest_fixed_length = 0;
4755 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4756 ri->regstclass = NULL;
4757 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4759 && !(data.start_class->flags & ANYOF_EOS)
4760 && !cl_is_anything(data.start_class))
4762 const U32 n = add_data(pRExC_state, 1, "f");
4764 Newx(RExC_rxi->data->data[n], 1,
4765 struct regnode_charclass_class);
4766 StructCopy(data.start_class,
4767 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4768 struct regnode_charclass_class);
4769 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4770 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4771 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4772 regprop(r, sv, (regnode*)data.start_class);
4773 PerlIO_printf(Perl_debug_log,
4774 "synthetic stclass \"%s\".\n",
4775 SvPVX_const(sv));});
4778 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4779 if (longest_fixed_length > longest_float_length) {
4780 r->check_end_shift = r->anchored_end_shift;
4781 r->check_substr = r->anchored_substr;
4782 r->check_utf8 = r->anchored_utf8;
4783 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4784 if (r->extflags & RXf_ANCH_SINGLE)
4785 r->extflags |= RXf_NOSCAN;
4788 r->check_end_shift = r->float_end_shift;
4789 r->check_substr = r->float_substr;
4790 r->check_utf8 = r->float_utf8;
4791 r->check_offset_min = r->float_min_offset;
4792 r->check_offset_max = r->float_max_offset;
4794 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4795 This should be changed ASAP! */
4796 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4797 r->extflags |= RXf_USE_INTUIT;
4798 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4799 r->extflags |= RXf_INTUIT_TAIL;
4801 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4802 if ( (STRLEN)minlen < longest_float_length )
4803 minlen= longest_float_length;
4804 if ( (STRLEN)minlen < longest_fixed_length )
4805 minlen= longest_fixed_length;
4809 /* Several toplevels. Best we can is to set minlen. */
4811 struct regnode_charclass_class ch_class;
4814 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4816 scan = ri->program + 1;
4817 cl_init(pRExC_state, &ch_class);
4818 data.start_class = &ch_class;
4819 data.last_closep = &last_close;
4822 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4823 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4827 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4828 = r->float_substr = r->float_utf8 = NULL;
4829 if (!(data.start_class->flags & ANYOF_EOS)
4830 && !cl_is_anything(data.start_class))
4832 const U32 n = add_data(pRExC_state, 1, "f");
4834 Newx(RExC_rxi->data->data[n], 1,
4835 struct regnode_charclass_class);
4836 StructCopy(data.start_class,
4837 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4838 struct regnode_charclass_class);
4839 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4840 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4841 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4842 regprop(r, sv, (regnode*)data.start_class);
4843 PerlIO_printf(Perl_debug_log,
4844 "synthetic stclass \"%s\".\n",
4845 SvPVX_const(sv));});
4849 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4850 the "real" pattern. */
4852 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4853 (IV)minlen, (IV)r->minlen);
4855 r->minlenret = minlen;
4856 if (r->minlen < minlen)
4859 if (RExC_seen & REG_SEEN_GPOS)
4860 r->extflags |= RXf_GPOS_SEEN;
4861 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4862 r->extflags |= RXf_LOOKBEHIND_SEEN;
4863 if (RExC_seen & REG_SEEN_EVAL)
4864 r->extflags |= RXf_EVAL_SEEN;
4865 if (RExC_seen & REG_SEEN_CANY)
4866 r->extflags |= RXf_CANY_SEEN;
4867 if (RExC_seen & REG_SEEN_VERBARG)
4868 r->intflags |= PREGf_VERBARG_SEEN;
4869 if (RExC_seen & REG_SEEN_CUTGROUP)
4870 r->intflags |= PREGf_CUTGROUP_SEEN;
4871 if (RExC_paren_names)
4872 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4874 RXp_PAREN_NAMES(r) = NULL;
4876 #ifdef STUPID_PATTERN_CHECKS
4877 if (RX_PRELEN(rx) == 0)
4878 r->extflags |= RXf_NULL;
4879 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4880 /* XXX: this should happen BEFORE we compile */
4881 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4882 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4883 r->extflags |= RXf_WHITE;
4884 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4885 r->extflags |= RXf_START_ONLY;
4887 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4888 /* XXX: this should happen BEFORE we compile */
4889 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4891 regnode *first = ri->program + 1;
4893 U8 nop = OP(NEXTOPER(first));
4895 if (PL_regkind[fop] == NOTHING && nop == END)
4896 r->extflags |= RXf_NULL;
4897 else if (PL_regkind[fop] == BOL && nop == END)
4898 r->extflags |= RXf_START_ONLY;
4899 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4900 r->extflags |= RXf_WHITE;
4904 if (RExC_paren_names) {
4905 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4906 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4909 ri->name_list_idx = 0;
4911 if (RExC_recurse_count) {
4912 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4913 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4914 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4917 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4918 /* assume we don't need to swap parens around before we match */
4921 PerlIO_printf(Perl_debug_log,"Final program:\n");
4924 #ifdef RE_TRACK_PATTERN_OFFSETS
4925 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4926 const U32 len = ri->u.offsets[0];
4928 GET_RE_DEBUG_FLAGS_DECL;
4929 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4930 for (i = 1; i <= len; i++) {
4931 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4932 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4933 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4935 PerlIO_printf(Perl_debug_log, "\n");
4941 #undef RE_ENGINE_PTR
4945 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4948 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4950 PERL_UNUSED_ARG(value);
4952 if (flags & RXapif_FETCH) {
4953 return reg_named_buff_fetch(rx, key, flags);
4954 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4955 Perl_croak(aTHX_ "%s", PL_no_modify);
4957 } else if (flags & RXapif_EXISTS) {
4958 return reg_named_buff_exists(rx, key, flags)
4961 } else if (flags & RXapif_REGNAMES) {
4962 return reg_named_buff_all(rx, flags);
4963 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4964 return reg_named_buff_scalar(rx, flags);
4966 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4972 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4975 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4976 PERL_UNUSED_ARG(lastkey);
4978 if (flags & RXapif_FIRSTKEY)
4979 return reg_named_buff_firstkey(rx, flags);
4980 else if (flags & RXapif_NEXTKEY)
4981 return reg_named_buff_nextkey(rx, flags);
4983 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4989 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4992 AV *retarray = NULL;
4994 struct regexp *const rx = (struct regexp *)SvANY(r);
4996 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4998 if (flags & RXapif_ALL)
5001 if (rx && RXp_PAREN_NAMES(rx)) {
5002 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5005 SV* sv_dat=HeVAL(he_str);
5006 I32 *nums=(I32*)SvPVX(sv_dat);
5007 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5008 if ((I32)(rx->nparens) >= nums[i]
5009 && rx->offs[nums[i]].start != -1
5010 && rx->offs[nums[i]].end != -1)
5013 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5017 ret = newSVsv(&PL_sv_undef);
5020 av_push(retarray, ret);
5023 return newRV_noinc(MUTABLE_SV(retarray));
5030 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5033 struct regexp *const rx = (struct regexp *)SvANY(r);
5035 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5037 if (rx && RXp_PAREN_NAMES(rx)) {
5038 if (flags & RXapif_ALL) {
5039 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5041 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5055 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5057 struct regexp *const rx = (struct regexp *)SvANY(r);
5059 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5061 if ( rx && RXp_PAREN_NAMES(rx) ) {
5062 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5064 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5071 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5073 struct regexp *const rx = (struct regexp *)SvANY(r);
5074 GET_RE_DEBUG_FLAGS_DECL;
5076 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5078 if (rx && RXp_PAREN_NAMES(rx)) {
5079 HV *hv = RXp_PAREN_NAMES(rx);
5081 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5084 SV* sv_dat = HeVAL(temphe);
5085 I32 *nums = (I32*)SvPVX(sv_dat);
5086 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5087 if ((I32)(rx->lastparen) >= nums[i] &&
5088 rx->offs[nums[i]].start != -1 &&
5089 rx->offs[nums[i]].end != -1)
5095 if (parno || flags & RXapif_ALL) {
5096 return newSVhek(HeKEY_hek(temphe));
5104 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5109 struct regexp *const rx = (struct regexp *)SvANY(r);
5111 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5113 if (rx && RXp_PAREN_NAMES(rx)) {
5114 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5115 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5116 } else if (flags & RXapif_ONE) {
5117 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5118 av = MUTABLE_AV(SvRV(ret));
5119 length = av_len(av);
5121 return newSViv(length + 1);
5123 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5127 return &PL_sv_undef;
5131 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5133 struct regexp *const rx = (struct regexp *)SvANY(r);
5136 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5138 if (rx && RXp_PAREN_NAMES(rx)) {
5139 HV *hv= RXp_PAREN_NAMES(rx);
5141 (void)hv_iterinit(hv);
5142 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5145 SV* sv_dat = HeVAL(temphe);
5146 I32 *nums = (I32*)SvPVX(sv_dat);
5147 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5148 if ((I32)(rx->lastparen) >= nums[i] &&
5149 rx->offs[nums[i]].start != -1 &&
5150 rx->offs[nums[i]].end != -1)
5156 if (parno || flags & RXapif_ALL) {
5157 av_push(av, newSVhek(HeKEY_hek(temphe)));
5162 return newRV_noinc(MUTABLE_SV(av));
5166 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5169 struct regexp *const rx = (struct regexp *)SvANY(r);
5174 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5177 sv_setsv(sv,&PL_sv_undef);
5181 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5183 i = rx->offs[0].start;
5187 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5189 s = rx->subbeg + rx->offs[0].end;
5190 i = rx->sublen - rx->offs[0].end;
5193 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5194 (s1 = rx->offs[paren].start) != -1 &&
5195 (t1 = rx->offs[paren].end) != -1)
5199 s = rx->subbeg + s1;
5201 sv_setsv(sv,&PL_sv_undef);
5204 assert(rx->sublen >= (s - rx->subbeg) + i );
5206 const int oldtainted = PL_tainted;
5208 sv_setpvn(sv, s, i);
5209 PL_tainted = oldtainted;
5210 if ( (rx->extflags & RXf_CANY_SEEN)
5211 ? (RXp_MATCH_UTF8(rx)
5212 && (!i || is_utf8_string((U8*)s, i)))
5213 : (RXp_MATCH_UTF8(rx)) )
5220 if (RXp_MATCH_TAINTED(rx)) {
5221 if (SvTYPE(sv) >= SVt_PVMG) {
5222 MAGIC* const mg = SvMAGIC(sv);
5225 SvMAGIC_set(sv, mg->mg_moremagic);
5227 if ((mgt = SvMAGIC(sv))) {
5228 mg->mg_moremagic = mgt;
5229 SvMAGIC_set(sv, mg);
5239 sv_setsv(sv,&PL_sv_undef);
5245 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5246 SV const * const value)
5248 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5250 PERL_UNUSED_ARG(rx);
5251 PERL_UNUSED_ARG(paren);
5252 PERL_UNUSED_ARG(value);
5255 Perl_croak(aTHX_ "%s", PL_no_modify);
5259 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5262 struct regexp *const rx = (struct regexp *)SvANY(r);
5266 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5268 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5270 /* $` / ${^PREMATCH} */
5271 case RX_BUFF_IDX_PREMATCH:
5272 if (rx->offs[0].start != -1) {
5273 i = rx->offs[0].start;
5281 /* $' / ${^POSTMATCH} */
5282 case RX_BUFF_IDX_POSTMATCH:
5283 if (rx->offs[0].end != -1) {
5284 i = rx->sublen - rx->offs[0].end;
5286 s1 = rx->offs[0].end;
5292 /* $& / ${^MATCH}, $1, $2, ... */
5294 if (paren <= (I32)rx->nparens &&
5295 (s1 = rx->offs[paren].start) != -1 &&
5296 (t1 = rx->offs[paren].end) != -1)
5301 if (ckWARN(WARN_UNINITIALIZED))
5302 report_uninit((const SV *)sv);
5307 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5308 const char * const s = rx->subbeg + s1;
5313 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5320 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5322 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5323 PERL_UNUSED_ARG(rx);
5327 return newSVpvs("Regexp");
5330 /* Scans the name of a named buffer from the pattern.
5331 * If flags is REG_RSN_RETURN_NULL returns null.
5332 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5333 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5334 * to the parsed name as looked up in the RExC_paren_names hash.
5335 * If there is an error throws a vFAIL().. type exception.
5338 #define REG_RSN_RETURN_NULL 0
5339 #define REG_RSN_RETURN_NAME 1
5340 #define REG_RSN_RETURN_DATA 2
5343 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5345 char *name_start = RExC_parse;
5347 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5349 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5350 /* skip IDFIRST by using do...while */
5353 RExC_parse += UTF8SKIP(RExC_parse);
5354 } while (isALNUM_utf8((U8*)RExC_parse));
5358 } while (isALNUM(*RExC_parse));
5363 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5364 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5365 if ( flags == REG_RSN_RETURN_NAME)
5367 else if (flags==REG_RSN_RETURN_DATA) {
5370 if ( ! sv_name ) /* should not happen*/
5371 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5372 if (RExC_paren_names)
5373 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5375 sv_dat = HeVAL(he_str);
5377 vFAIL("Reference to nonexistent named group");
5381 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5388 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5389 int rem=(int)(RExC_end - RExC_parse); \
5398 if (RExC_lastparse!=RExC_parse) \
5399 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5402 iscut ? "..." : "<" \
5405 PerlIO_printf(Perl_debug_log,"%16s",""); \
5408 num = RExC_size + 1; \
5410 num=REG_NODE_NUM(RExC_emit); \
5411 if (RExC_lastnum!=num) \
5412 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5414 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5415 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5416 (int)((depth*2)), "", \
5420 RExC_lastparse=RExC_parse; \
5425 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5426 DEBUG_PARSE_MSG((funcname)); \
5427 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5429 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5430 DEBUG_PARSE_MSG((funcname)); \
5431 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5434 - reg - regular expression, i.e. main body or parenthesized thing
5436 * Caller must absorb opening parenthesis.
5438 * Combining parenthesis handling with the base level of regular expression
5439 * is a trifle forced, but the need to tie the tails of the branches to what
5440 * follows makes it hard to avoid.
5442 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5444 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5446 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5450 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5451 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5454 register regnode *ret; /* Will be the head of the group. */
5455 register regnode *br;
5456 register regnode *lastbr;
5457 register regnode *ender = NULL;
5458 register I32 parno = 0;
5460 U32 oregflags = RExC_flags;
5461 bool have_branch = 0;
5463 I32 freeze_paren = 0;
5464 I32 after_freeze = 0;
5466 /* for (?g), (?gc), and (?o) warnings; warning
5467 about (?c) will warn about (?g) -- japhy */
5469 #define WASTED_O 0x01
5470 #define WASTED_G 0x02
5471 #define WASTED_C 0x04
5472 #define WASTED_GC (0x02|0x04)
5473 I32 wastedflags = 0x00;
5475 char * parse_start = RExC_parse; /* MJD */
5476 char * const oregcomp_parse = RExC_parse;
5478 GET_RE_DEBUG_FLAGS_DECL;
5480 PERL_ARGS_ASSERT_REG;
5481 DEBUG_PARSE("reg ");
5483 *flagp = 0; /* Tentatively. */
5486 /* Make an OPEN node, if parenthesized. */
5488 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5489 char *start_verb = RExC_parse;
5490 STRLEN verb_len = 0;
5491 char *start_arg = NULL;
5492 unsigned char op = 0;
5494 int internal_argval = 0; /* internal_argval is only useful if !argok */
5495 while ( *RExC_parse && *RExC_parse != ')' ) {
5496 if ( *RExC_parse == ':' ) {
5497 start_arg = RExC_parse + 1;
5503 verb_len = RExC_parse - start_verb;
5506 while ( *RExC_parse && *RExC_parse != ')' )
5508 if ( *RExC_parse != ')' )
5509 vFAIL("Unterminated verb pattern argument");
5510 if ( RExC_parse == start_arg )
5513 if ( *RExC_parse != ')' )
5514 vFAIL("Unterminated verb pattern");
5517 switch ( *start_verb ) {
5518 case 'A': /* (*ACCEPT) */
5519 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5521 internal_argval = RExC_nestroot;
5524 case 'C': /* (*COMMIT) */
5525 if ( memEQs(start_verb,verb_len,"COMMIT") )
5528 case 'F': /* (*FAIL) */
5529 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5534 case ':': /* (*:NAME) */
5535 case 'M': /* (*MARK:NAME) */
5536 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5541 case 'P': /* (*PRUNE) */
5542 if ( memEQs(start_verb,verb_len,"PRUNE") )
5545 case 'S': /* (*SKIP) */
5546 if ( memEQs(start_verb,verb_len,"SKIP") )
5549 case 'T': /* (*THEN) */
5550 /* [19:06] <TimToady> :: is then */
5551 if ( memEQs(start_verb,verb_len,"THEN") ) {
5553 RExC_seen |= REG_SEEN_CUTGROUP;
5559 vFAIL3("Unknown verb pattern '%.*s'",
5560 verb_len, start_verb);
5563 if ( start_arg && internal_argval ) {
5564 vFAIL3("Verb pattern '%.*s' may not have an argument",
5565 verb_len, start_verb);
5566 } else if ( argok < 0 && !start_arg ) {
5567 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5568 verb_len, start_verb);
5570 ret = reganode(pRExC_state, op, internal_argval);
5571 if ( ! internal_argval && ! SIZE_ONLY ) {
5573 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5574 ARG(ret) = add_data( pRExC_state, 1, "S" );
5575 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5582 if (!internal_argval)
5583 RExC_seen |= REG_SEEN_VERBARG;
5584 } else if ( start_arg ) {
5585 vFAIL3("Verb pattern '%.*s' may not have an argument",
5586 verb_len, start_verb);
5588 ret = reg_node(pRExC_state, op);
5590 nextchar(pRExC_state);
5593 if (*RExC_parse == '?') { /* (?...) */
5594 bool is_logical = 0;
5595 const char * const seqstart = RExC_parse;
5598 paren = *RExC_parse++;
5599 ret = NULL; /* For look-ahead/behind. */
5602 case 'P': /* (?P...) variants for those used to PCRE/Python */
5603 paren = *RExC_parse++;
5604 if ( paren == '<') /* (?P<...>) named capture */
5606 else if (paren == '>') { /* (?P>name) named recursion */
5607 goto named_recursion;
5609 else if (paren == '=') { /* (?P=...) named backref */
5610 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5611 you change this make sure you change that */
5612 char* name_start = RExC_parse;
5614 SV *sv_dat = reg_scan_name(pRExC_state,
5615 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5616 if (RExC_parse == name_start || *RExC_parse != ')')
5617 vFAIL2("Sequence %.3s... not terminated",parse_start);
5620 num = add_data( pRExC_state, 1, "S" );
5621 RExC_rxi->data->data[num]=(void*)sv_dat;
5622 SvREFCNT_inc_simple_void(sv_dat);
5625 ret = reganode(pRExC_state,
5626 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5630 Set_Node_Offset(ret, parse_start+1);
5631 Set_Node_Cur_Length(ret); /* MJD */
5633 nextchar(pRExC_state);
5637 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5639 case '<': /* (?<...) */
5640 if (*RExC_parse == '!')
5642 else if (*RExC_parse != '=')
5648 case '\'': /* (?'...') */
5649 name_start= RExC_parse;
5650 svname = reg_scan_name(pRExC_state,
5651 SIZE_ONLY ? /* reverse test from the others */
5652 REG_RSN_RETURN_NAME :
5653 REG_RSN_RETURN_NULL);
5654 if (RExC_parse == name_start) {
5656 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5659 if (*RExC_parse != paren)
5660 vFAIL2("Sequence (?%c... not terminated",
5661 paren=='>' ? '<' : paren);
5665 if (!svname) /* shouldnt happen */
5667 "panic: reg_scan_name returned NULL");
5668 if (!RExC_paren_names) {
5669 RExC_paren_names= newHV();
5670 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5672 RExC_paren_name_list= newAV();
5673 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5676 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5678 sv_dat = HeVAL(he_str);
5680 /* croak baby croak */
5682 "panic: paren_name hash element allocation failed");
5683 } else if ( SvPOK(sv_dat) ) {
5684 /* (?|...) can mean we have dupes so scan to check
5685 its already been stored. Maybe a flag indicating
5686 we are inside such a construct would be useful,
5687 but the arrays are likely to be quite small, so
5688 for now we punt -- dmq */
5689 IV count = SvIV(sv_dat);
5690 I32 *pv = (I32*)SvPVX(sv_dat);
5692 for ( i = 0 ; i < count ; i++ ) {
5693 if ( pv[i] == RExC_npar ) {
5699 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5700 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5701 pv[count] = RExC_npar;
5702 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5705 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5706 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5708 SvIV_set(sv_dat, 1);
5711 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5712 SvREFCNT_dec(svname);
5715 /*sv_dump(sv_dat);*/
5717 nextchar(pRExC_state);
5719 goto capturing_parens;
5721 RExC_seen |= REG_SEEN_LOOKBEHIND;
5723 case '=': /* (?=...) */
5724 RExC_seen_zerolen++;
5726 case '!': /* (?!...) */
5727 RExC_seen_zerolen++;
5728 if (*RExC_parse == ')') {
5729 ret=reg_node(pRExC_state, OPFAIL);
5730 nextchar(pRExC_state);
5734 case '|': /* (?|...) */
5735 /* branch reset, behave like a (?:...) except that
5736 buffers in alternations share the same numbers */
5738 after_freeze = freeze_paren = RExC_npar;
5740 case ':': /* (?:...) */
5741 case '>': /* (?>...) */
5743 case '$': /* (?$...) */
5744 case '@': /* (?@...) */
5745 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5747 case '#': /* (?#...) */
5748 while (*RExC_parse && *RExC_parse != ')')
5750 if (*RExC_parse != ')')
5751 FAIL("Sequence (?#... not terminated");
5752 nextchar(pRExC_state);
5755 case '0' : /* (?0) */
5756 case 'R' : /* (?R) */
5757 if (*RExC_parse != ')')
5758 FAIL("Sequence (?R) not terminated");
5759 ret = reg_node(pRExC_state, GOSTART);
5760 *flagp |= POSTPONED;
5761 nextchar(pRExC_state);
5764 { /* named and numeric backreferences */
5766 case '&': /* (?&NAME) */
5767 parse_start = RExC_parse - 1;
5770 SV *sv_dat = reg_scan_name(pRExC_state,
5771 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5772 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5774 goto gen_recurse_regop;
5777 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5779 vFAIL("Illegal pattern");
5781 goto parse_recursion;
5783 case '-': /* (?-1) */
5784 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5785 RExC_parse--; /* rewind to let it be handled later */
5789 case '1': case '2': case '3': case '4': /* (?1) */
5790 case '5': case '6': case '7': case '8': case '9':
5793 num = atoi(RExC_parse);
5794 parse_start = RExC_parse - 1; /* MJD */
5795 if (*RExC_parse == '-')
5797 while (isDIGIT(*RExC_parse))
5799 if (*RExC_parse!=')')
5800 vFAIL("Expecting close bracket");
5803 if ( paren == '-' ) {
5805 Diagram of capture buffer numbering.
5806 Top line is the normal capture buffer numbers
5807 Botton line is the negative indexing as from
5811 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5815 num = RExC_npar + num;
5818 vFAIL("Reference to nonexistent group");
5820 } else if ( paren == '+' ) {
5821 num = RExC_npar + num - 1;
5824 ret = reganode(pRExC_state, GOSUB, num);
5826 if (num > (I32)RExC_rx->nparens) {
5828 vFAIL("Reference to nonexistent group");
5830 ARG2L_SET( ret, RExC_recurse_count++);
5832 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5833 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5837 RExC_seen |= REG_SEEN_RECURSE;
5838 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5839 Set_Node_Offset(ret, parse_start); /* MJD */
5841 *flagp |= POSTPONED;
5842 nextchar(pRExC_state);
5844 } /* named and numeric backreferences */
5847 case '?': /* (??...) */
5849 if (*RExC_parse != '{') {
5851 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5854 *flagp |= POSTPONED;
5855 paren = *RExC_parse++;
5857 case '{': /* (?{...}) */
5862 char *s = RExC_parse;
5864 RExC_seen_zerolen++;
5865 RExC_seen |= REG_SEEN_EVAL;
5866 while (count && (c = *RExC_parse)) {
5877 if (*RExC_parse != ')') {
5879 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5883 OP_4tree *sop, *rop;
5884 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5887 Perl_save_re_context(aTHX);
5888 rop = sv_compile_2op(sv, &sop, "re", &pad);
5889 sop->op_private |= OPpREFCOUNTED;
5890 /* re_dup will OpREFCNT_inc */
5891 OpREFCNT_set(sop, 1);
5894 n = add_data(pRExC_state, 3, "nop");
5895 RExC_rxi->data->data[n] = (void*)rop;
5896 RExC_rxi->data->data[n+1] = (void*)sop;
5897 RExC_rxi->data->data[n+2] = (void*)pad;
5900 else { /* First pass */
5901 if (PL_reginterp_cnt < ++RExC_seen_evals
5903 /* No compiled RE interpolated, has runtime
5904 components ===> unsafe. */
5905 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5906 if (PL_tainting && PL_tainted)
5907 FAIL("Eval-group in insecure regular expression");
5908 #if PERL_VERSION > 8
5909 if (IN_PERL_COMPILETIME)
5914 nextchar(pRExC_state);
5916 ret = reg_node(pRExC_state, LOGICAL);
5919 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5920 /* deal with the length of this later - MJD */
5923 ret = reganode(pRExC_state, EVAL, n);
5924 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5925 Set_Node_Offset(ret, parse_start);
5928 case '(': /* (?(?{...})...) and (?(?=...)...) */
5931 if (RExC_parse[0] == '?') { /* (?(?...)) */
5932 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5933 || RExC_parse[1] == '<'
5934 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5937 ret = reg_node(pRExC_state, LOGICAL);
5940 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5944 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5945 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5947 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5948 char *name_start= RExC_parse++;
5950 SV *sv_dat=reg_scan_name(pRExC_state,
5951 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5952 if (RExC_parse == name_start || *RExC_parse != ch)
5953 vFAIL2("Sequence (?(%c... not terminated",
5954 (ch == '>' ? '<' : ch));
5957 num = add_data( pRExC_state, 1, "S" );
5958 RExC_rxi->data->data[num]=(void*)sv_dat;
5959 SvREFCNT_inc_simple_void(sv_dat);
5961 ret = reganode(pRExC_state,NGROUPP,num);
5962 goto insert_if_check_paren;
5964 else if (RExC_parse[0] == 'D' &&
5965 RExC_parse[1] == 'E' &&
5966 RExC_parse[2] == 'F' &&
5967 RExC_parse[3] == 'I' &&
5968 RExC_parse[4] == 'N' &&
5969 RExC_parse[5] == 'E')
5971 ret = reganode(pRExC_state,DEFINEP,0);
5974 goto insert_if_check_paren;
5976 else if (RExC_parse[0] == 'R') {
5979 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5980 parno = atoi(RExC_parse++);
5981 while (isDIGIT(*RExC_parse))
5983 } else if (RExC_parse[0] == '&') {
5986 sv_dat = reg_scan_name(pRExC_state,
5987 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5988 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5990 ret = reganode(pRExC_state,INSUBP,parno);
5991 goto insert_if_check_paren;
5993 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5996 parno = atoi(RExC_parse++);
5998 while (isDIGIT(*RExC_parse))
6000 ret = reganode(pRExC_state, GROUPP, parno);
6002 insert_if_check_paren:
6003 if ((c = *nextchar(pRExC_state)) != ')')
6004 vFAIL("Switch condition not recognized");
6006 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6007 br = regbranch(pRExC_state, &flags, 1,depth+1);
6009 br = reganode(pRExC_state, LONGJMP, 0);
6011 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6012 c = *nextchar(pRExC_state);
6017 vFAIL("(?(DEFINE)....) does not allow branches");
6018 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6019 regbranch(pRExC_state, &flags, 1,depth+1);
6020 REGTAIL(pRExC_state, ret, lastbr);
6023 c = *nextchar(pRExC_state);
6028 vFAIL("Switch (?(condition)... contains too many branches");
6029 ender = reg_node(pRExC_state, TAIL);
6030 REGTAIL(pRExC_state, br, ender);
6032 REGTAIL(pRExC_state, lastbr, ender);
6033 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6036 REGTAIL(pRExC_state, ret, ender);
6037 RExC_size++; /* XXX WHY do we need this?!!
6038 For large programs it seems to be required
6039 but I can't figure out why. -- dmq*/
6043 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6047 RExC_parse--; /* for vFAIL to print correctly */
6048 vFAIL("Sequence (? incomplete");
6052 parse_flags: /* (?i) */
6054 U32 posflags = 0, negflags = 0;
6055 U32 *flagsp = &posflags;
6057 while (*RExC_parse) {
6058 /* && strchr("iogcmsx", *RExC_parse) */
6059 /* (?g), (?gc) and (?o) are useless here
6060 and must be globally applied -- japhy */
6061 switch (*RExC_parse) {
6062 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6063 case ONCE_PAT_MOD: /* 'o' */
6064 case GLOBAL_PAT_MOD: /* 'g' */
6065 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6066 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6067 if (! (wastedflags & wflagbit) ) {
6068 wastedflags |= wflagbit;
6071 "Useless (%s%c) - %suse /%c modifier",
6072 flagsp == &negflags ? "?-" : "?",
6074 flagsp == &negflags ? "don't " : "",
6081 case CONTINUE_PAT_MOD: /* 'c' */
6082 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6083 if (! (wastedflags & WASTED_C) ) {
6084 wastedflags |= WASTED_GC;
6087 "Useless (%sc) - %suse /gc modifier",
6088 flagsp == &negflags ? "?-" : "?",
6089 flagsp == &negflags ? "don't " : ""
6094 case KEEPCOPY_PAT_MOD: /* 'p' */
6095 if (flagsp == &negflags) {
6097 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6099 *flagsp |= RXf_PMf_KEEPCOPY;
6103 if (flagsp == &negflags) {
6105 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6109 wastedflags = 0; /* reset so (?g-c) warns twice */
6115 RExC_flags |= posflags;
6116 RExC_flags &= ~negflags;
6118 oregflags |= posflags;
6119 oregflags &= ~negflags;
6121 nextchar(pRExC_state);
6132 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6137 }} /* one for the default block, one for the switch */
6144 ret = reganode(pRExC_state, OPEN, parno);
6147 RExC_nestroot = parno;
6148 if (RExC_seen & REG_SEEN_RECURSE
6149 && !RExC_open_parens[parno-1])
6151 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6152 "Setting open paren #%"IVdf" to %d\n",
6153 (IV)parno, REG_NODE_NUM(ret)));
6154 RExC_open_parens[parno-1]= ret;
6157 Set_Node_Length(ret, 1); /* MJD */
6158 Set_Node_Offset(ret, RExC_parse); /* MJD */
6166 /* Pick up the branches, linking them together. */
6167 parse_start = RExC_parse; /* MJD */
6168 br = regbranch(pRExC_state, &flags, 1,depth+1);
6171 if (RExC_npar > after_freeze)
6172 after_freeze = RExC_npar;
6173 RExC_npar = freeze_paren;
6176 /* branch_len = (paren != 0); */
6180 if (*RExC_parse == '|') {
6181 if (!SIZE_ONLY && RExC_extralen) {
6182 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6185 reginsert(pRExC_state, BRANCH, br, depth+1);
6186 Set_Node_Length(br, paren != 0);
6187 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6191 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6193 else if (paren == ':') {
6194 *flagp |= flags&SIMPLE;
6196 if (is_open) { /* Starts with OPEN. */
6197 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6199 else if (paren != '?') /* Not Conditional */
6201 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6203 while (*RExC_parse == '|') {
6204 if (!SIZE_ONLY && RExC_extralen) {
6205 ender = reganode(pRExC_state, LONGJMP,0);
6206 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6209 RExC_extralen += 2; /* Account for LONGJMP. */
6210 nextchar(pRExC_state);
6212 if (RExC_npar > after_freeze)
6213 after_freeze = RExC_npar;
6214 RExC_npar = freeze_paren;
6216 br = regbranch(pRExC_state, &flags, 0, depth+1);
6220 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6222 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6225 if (have_branch || paren != ':') {
6226 /* Make a closing node, and hook it on the end. */
6229 ender = reg_node(pRExC_state, TAIL);
6232 ender = reganode(pRExC_state, CLOSE, parno);
6233 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6234 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6235 "Setting close paren #%"IVdf" to %d\n",
6236 (IV)parno, REG_NODE_NUM(ender)));
6237 RExC_close_parens[parno-1]= ender;
6238 if (RExC_nestroot == parno)
6241 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6242 Set_Node_Length(ender,1); /* MJD */
6248 *flagp &= ~HASWIDTH;
6251 ender = reg_node(pRExC_state, SUCCEED);
6254 ender = reg_node(pRExC_state, END);
6256 assert(!RExC_opend); /* there can only be one! */
6261 REGTAIL(pRExC_state, lastbr, ender);
6263 if (have_branch && !SIZE_ONLY) {
6265 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6267 /* Hook the tails of the branches to the closing node. */
6268 for (br = ret; br; br = regnext(br)) {
6269 const U8 op = PL_regkind[OP(br)];
6271 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6273 else if (op == BRANCHJ) {
6274 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6282 static const char parens[] = "=!<,>";
6284 if (paren && (p = strchr(parens, paren))) {
6285 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6286 int flag = (p - parens) > 1;
6289 node = SUSPEND, flag = 0;
6290 reginsert(pRExC_state, node,ret, depth+1);
6291 Set_Node_Cur_Length(ret);
6292 Set_Node_Offset(ret, parse_start + 1);
6294 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6298 /* Check for proper termination. */
6300 RExC_flags = oregflags;
6301 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6302 RExC_parse = oregcomp_parse;
6303 vFAIL("Unmatched (");
6306 else if (!paren && RExC_parse < RExC_end) {
6307 if (*RExC_parse == ')') {
6309 vFAIL("Unmatched )");
6312 FAIL("Junk on end of regexp"); /* "Can't happen". */
6316 RExC_npar = after_freeze;
6321 - regbranch - one alternative of an | operator
6323 * Implements the concatenation operator.
6326 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6329 register regnode *ret;
6330 register regnode *chain = NULL;
6331 register regnode *latest;
6332 I32 flags = 0, c = 0;
6333 GET_RE_DEBUG_FLAGS_DECL;
6335 PERL_ARGS_ASSERT_REGBRANCH;
6337 DEBUG_PARSE("brnc");
6342 if (!SIZE_ONLY && RExC_extralen)
6343 ret = reganode(pRExC_state, BRANCHJ,0);
6345 ret = reg_node(pRExC_state, BRANCH);
6346 Set_Node_Length(ret, 1);
6350 if (!first && SIZE_ONLY)
6351 RExC_extralen += 1; /* BRANCHJ */
6353 *flagp = WORST; /* Tentatively. */
6356 nextchar(pRExC_state);
6357 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6359 latest = regpiece(pRExC_state, &flags,depth+1);
6360 if (latest == NULL) {
6361 if (flags & TRYAGAIN)
6365 else if (ret == NULL)
6367 *flagp |= flags&(HASWIDTH|POSTPONED);
6368 if (chain == NULL) /* First piece. */
6369 *flagp |= flags&SPSTART;
6372 REGTAIL(pRExC_state, chain, latest);
6377 if (chain == NULL) { /* Loop ran zero times. */
6378 chain = reg_node(pRExC_state, NOTHING);
6383 *flagp |= flags&SIMPLE;
6390 - regpiece - something followed by possible [*+?]
6392 * Note that the branching code sequences used for ? and the general cases
6393 * of * and + are somewhat optimized: they use the same NOTHING node as
6394 * both the endmarker for their branch list and the body of the last branch.
6395 * It might seem that this node could be dispensed with entirely, but the
6396 * endmarker role is not redundant.
6399 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6402 register regnode *ret;
6404 register char *next;
6406 const char * const origparse = RExC_parse;
6408 I32 max = REG_INFTY;
6410 const char *maxpos = NULL;
6411 GET_RE_DEBUG_FLAGS_DECL;
6413 PERL_ARGS_ASSERT_REGPIECE;
6415 DEBUG_PARSE("piec");
6417 ret = regatom(pRExC_state, &flags,depth+1);
6419 if (flags & TRYAGAIN)
6426 if (op == '{' && regcurly(RExC_parse)) {
6428 parse_start = RExC_parse; /* MJD */
6429 next = RExC_parse + 1;
6430 while (isDIGIT(*next) || *next == ',') {
6439 if (*next == '}') { /* got one */
6443 min = atoi(RExC_parse);
6447 maxpos = RExC_parse;
6449 if (!max && *maxpos != '0')
6450 max = REG_INFTY; /* meaning "infinity" */
6451 else if (max >= REG_INFTY)
6452 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6454 nextchar(pRExC_state);
6457 if ((flags&SIMPLE)) {
6458 RExC_naughty += 2 + RExC_naughty / 2;
6459 reginsert(pRExC_state, CURLY, ret, depth+1);
6460 Set_Node_Offset(ret, parse_start+1); /* MJD */
6461 Set_Node_Cur_Length(ret);
6464 regnode * const w = reg_node(pRExC_state, WHILEM);
6467 REGTAIL(pRExC_state, ret, w);
6468 if (!SIZE_ONLY && RExC_extralen) {
6469 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6470 reginsert(pRExC_state, NOTHING,ret, depth+1);
6471 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6473 reginsert(pRExC_state, CURLYX,ret, depth+1);
6475 Set_Node_Offset(ret, parse_start+1);
6476 Set_Node_Length(ret,
6477 op == '{' ? (RExC_parse - parse_start) : 1);
6479 if (!SIZE_ONLY && RExC_extralen)
6480 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6481 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6483 RExC_whilem_seen++, RExC_extralen += 3;
6484 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6493 vFAIL("Can't do {n,m} with n > m");
6495 ARG1_SET(ret, (U16)min);
6496 ARG2_SET(ret, (U16)max);
6508 #if 0 /* Now runtime fix should be reliable. */
6510 /* if this is reinstated, don't forget to put this back into perldiag:
6512 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6514 (F) The part of the regexp subject to either the * or + quantifier
6515 could match an empty string. The {#} shows in the regular
6516 expression about where the problem was discovered.
6520 if (!(flags&HASWIDTH) && op != '?')
6521 vFAIL("Regexp *+ operand could be empty");
6524 parse_start = RExC_parse;
6525 nextchar(pRExC_state);
6527 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6529 if (op == '*' && (flags&SIMPLE)) {
6530 reginsert(pRExC_state, STAR, ret, depth+1);
6534 else if (op == '*') {
6538 else if (op == '+' && (flags&SIMPLE)) {
6539 reginsert(pRExC_state, PLUS, ret, depth+1);
6543 else if (op == '+') {
6547 else if (op == '?') {
6552 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6553 ckWARN3reg(RExC_parse,
6554 "%.*s matches null string many times",
6555 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6559 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6560 nextchar(pRExC_state);
6561 reginsert(pRExC_state, MINMOD, ret, depth+1);
6562 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6564 #ifndef REG_ALLOW_MINMOD_SUSPEND
6567 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6569 nextchar(pRExC_state);
6570 ender = reg_node(pRExC_state, SUCCEED);
6571 REGTAIL(pRExC_state, ret, ender);
6572 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6574 ender = reg_node(pRExC_state, TAIL);
6575 REGTAIL(pRExC_state, ret, ender);
6579 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6581 vFAIL("Nested quantifiers");
6588 /* reg_namedseq(pRExC_state,UVp)
6590 This is expected to be called by a parser routine that has
6591 recognized '\N' and needs to handle the rest. RExC_parse is
6592 expected to point at the first char following the N at the time
6595 If valuep is non-null then it is assumed that we are parsing inside
6596 of a charclass definition and the first codepoint in the resolved
6597 string is returned via *valuep and the routine will return NULL.
6598 In this mode if a multichar string is returned from the charnames
6599 handler a warning will be issued, and only the first char in the
6600 sequence will be examined. If the string returned is zero length
6601 then the value of *valuep is undefined and NON-NULL will
6602 be returned to indicate failure. (This will NOT be a valid pointer
6605 If valuep is null then it is assumed that we are parsing normal text
6606 and inserts a new EXACT node into the program containing the resolved
6607 string and returns a pointer to the new node. If the string is
6608 zerolength a NOTHING node is emitted.
6610 On success RExC_parse is set to the char following the endbrace.
6611 Parsing failures will generate a fatal errorvia vFAIL(...)
6613 NOTE: We cache all results from the charnames handler locally in
6614 the RExC_charnames hash (created on first use) to prevent a charnames
6615 handler from playing silly-buggers and returning a short string and
6616 then a long string for a given pattern. Since the regexp program
6617 size is calculated during an initial parse this would result
6618 in a buffer overrun so we cache to prevent the charname result from
6619 changing during the course of the parse.
6623 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6625 char * name; /* start of the content of the name */
6626 char * endbrace; /* endbrace following the name */
6629 STRLEN len; /* this has various purposes throughout the code */
6630 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6631 regnode *ret = NULL;
6633 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6635 if (*RExC_parse != '{' ||
6636 (*RExC_parse == '{' && RExC_parse[1]
6637 && strchr("0123456789", RExC_parse[1])))
6639 GET_RE_DEBUG_FLAGS_DECL;
6641 /* no bare \N in a charclass */
6642 vFAIL("Missing braces on \\N{}");
6644 nextchar(pRExC_state);
6645 ret = reg_node(pRExC_state, REG_ANY);
6646 *flagp |= HASWIDTH|SIMPLE;
6649 Set_Node_Length(ret, 1); /* MJD */
6652 name = RExC_parse+1;
6653 endbrace = strchr(RExC_parse, '}');
6656 vFAIL("Missing right brace on \\N{}");
6658 RExC_parse = endbrace + 1;
6661 /* RExC_parse points at the beginning brace,
6662 endbrace points at the last */
6663 if ( name[0]=='U' && name[1]=='+' ) {
6664 /* its a "Unicode hex" notation {U+89AB} */
6665 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6666 | PERL_SCAN_DISALLOW_PREFIX
6667 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6669 len = (STRLEN)(endbrace - name - 2);
6670 cp = grok_hex(name + 2, &len, &fl, NULL);
6671 if ( len != (STRLEN)(endbrace - name - 2) ) {
6675 if (cp > 0xff) RExC_utf8 = 1;
6680 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6681 * is going to be in utf8 and the representation changes under utf8. */
6682 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6683 U8 string[UTF8_MAXBYTES+1];
6686 tmps = uvuni_to_utf8(string, cp);
6687 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6688 } else { /* Otherwise, no need for utf8, can skip that step */
6691 sv_str= newSVpvn(&string, 1);
6694 /* fetch the charnames handler for this scope */
6695 HV * const table = GvHV(PL_hintgv);
6697 hv_fetchs(table, "charnames", FALSE) :
6699 SV *cv= cvp ? *cvp : NULL;
6702 /* create an SV with the name as argument */
6703 sv_name = newSVpvn(name, endbrace - name);
6705 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6706 vFAIL2("Constant(\\N{%" SVf "}) unknown: "
6707 "(possibly a missing \"use charnames ...\")",
6710 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6711 vFAIL2("Constant(\\N{%" SVf "}): "
6712 "$^H{charnames} is not defined", SVfARG(sv_name));
6717 if (!RExC_charnames) {
6718 /* make sure our cache is allocated */
6719 RExC_charnames = newHV();
6720 sv_2mortal(MUTABLE_SV(RExC_charnames));
6722 /* see if we have looked this one up before */
6723 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6725 sv_str = HeVAL(he_str);
6738 count= call_sv(cv, G_SCALAR);
6740 if (count == 1) { /* XXXX is this right? dmq */
6742 SvREFCNT_inc_simple_void(sv_str);
6750 if ( !sv_str || !SvOK(sv_str) ) {
6751 vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
6752 "did not return a defined value", SVfARG(sv_name));
6754 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6759 char *p = SvPV(sv_str, len);
6762 if ( SvUTF8(sv_str) ) {
6763 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6767 We have to turn on utf8 for high bit chars otherwise
6768 we get failures with
6770 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6771 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6773 This is different from what \x{} would do with the same
6774 codepoint, where the condition is > 0xFF.
6781 /* warn if we havent used the whole string? */
6783 if (numlen<len && SIZE_ONLY) {
6784 ckWARN2reg(RExC_parse,
6785 "Ignoring excess chars from \\N{%" SVf "} in character class",
6789 } else if (SIZE_ONLY) {
6790 ckWARN2reg(RExC_parse,
6791 "Ignoring zero length \\N{%" SVf "} in character class",
6795 SvREFCNT_dec(sv_name);
6797 SvREFCNT_dec(sv_str);
6798 return len ? NULL : (regnode *)&len;
6799 } else if(SvCUR(sv_str)) {
6805 char * parse_start = name-3; /* needed for the offsets */
6807 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6809 ret = reg_node(pRExC_state,
6810 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6813 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6814 sv_utf8_upgrade(sv_str);
6815 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6819 p = SvPV(sv_str, len);
6821 /* len is the length written, charlen is the size the char read */
6822 for ( len = 0; p < pend; p += charlen ) {
6824 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6826 STRLEN foldlen,numlen;
6827 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6828 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6829 /* Emit all the Unicode characters. */
6831 for (foldbuf = tmpbuf;
6835 uvc = utf8_to_uvchr(foldbuf, &numlen);
6837 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6840 /* In EBCDIC the numlen
6841 * and unilen can differ. */
6843 if (numlen >= foldlen)
6847 break; /* "Can't happen." */
6850 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6862 RExC_size += STR_SZ(len);
6865 RExC_emit += STR_SZ(len);
6867 Set_Node_Cur_Length(ret); /* MJD */
6869 nextchar(pRExC_state);
6870 } else { /* zero length */
6871 ret = reg_node(pRExC_state,NOTHING);
6873 SvREFCNT_dec(sv_name);
6875 SvREFCNT_dec(sv_str);
6884 * It returns the code point in utf8 for the value in *encp.
6885 * value: a code value in the source encoding
6886 * encp: a pointer to an Encode object
6888 * If the result from Encode is not a single character,
6889 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6892 S_reg_recode(pTHX_ const char value, SV **encp)
6895 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6896 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6897 const STRLEN newlen = SvCUR(sv);
6898 UV uv = UNICODE_REPLACEMENT;
6900 PERL_ARGS_ASSERT_REG_RECODE;
6904 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6907 if (!newlen || numlen != newlen) {
6908 uv = UNICODE_REPLACEMENT;
6916 - regatom - the lowest level
6918 Try to identify anything special at the start of the pattern. If there
6919 is, then handle it as required. This may involve generating a single regop,
6920 such as for an assertion; or it may involve recursing, such as to
6921 handle a () structure.
6923 If the string doesn't start with something special then we gobble up
6924 as much literal text as we can.
6926 Once we have been able to handle whatever type of thing started the
6927 sequence, we return.
6929 Note: we have to be careful with escapes, as they can be both literal
6930 and special, and in the case of \10 and friends can either, depending
6931 on context. Specifically there are two seperate switches for handling
6932 escape sequences, with the one for handling literal escapes requiring
6933 a dummy entry for all of the special escapes that are actually handled
6938 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6941 register regnode *ret = NULL;
6943 char *parse_start = RExC_parse;
6944 GET_RE_DEBUG_FLAGS_DECL;
6945 DEBUG_PARSE("atom");
6946 *flagp = WORST; /* Tentatively. */
6948 PERL_ARGS_ASSERT_REGATOM;
6951 switch ((U8)*RExC_parse) {
6953 RExC_seen_zerolen++;
6954 nextchar(pRExC_state);
6955 if (RExC_flags & RXf_PMf_MULTILINE)
6956 ret = reg_node(pRExC_state, MBOL);
6957 else if (RExC_flags & RXf_PMf_SINGLELINE)
6958 ret = reg_node(pRExC_state, SBOL);
6960 ret = reg_node(pRExC_state, BOL);
6961 Set_Node_Length(ret, 1); /* MJD */
6964 nextchar(pRExC_state);
6966 RExC_seen_zerolen++;
6967 if (RExC_flags & RXf_PMf_MULTILINE)
6968 ret = reg_node(pRExC_state, MEOL);
6969 else if (RExC_flags & RXf_PMf_SINGLELINE)
6970 ret = reg_node(pRExC_state, SEOL);
6972 ret = reg_node(pRExC_state, EOL);
6973 Set_Node_Length(ret, 1); /* MJD */
6976 nextchar(pRExC_state);
6977 if (RExC_flags & RXf_PMf_SINGLELINE)
6978 ret = reg_node(pRExC_state, SANY);
6980 ret = reg_node(pRExC_state, REG_ANY);
6981 *flagp |= HASWIDTH|SIMPLE;
6983 Set_Node_Length(ret, 1); /* MJD */
6987 char * const oregcomp_parse = ++RExC_parse;
6988 ret = regclass(pRExC_state,depth+1);
6989 if (*RExC_parse != ']') {
6990 RExC_parse = oregcomp_parse;
6991 vFAIL("Unmatched [");
6993 nextchar(pRExC_state);
6994 *flagp |= HASWIDTH|SIMPLE;
6995 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6999 nextchar(pRExC_state);
7000 ret = reg(pRExC_state, 1, &flags,depth+1);
7002 if (flags & TRYAGAIN) {
7003 if (RExC_parse == RExC_end) {
7004 /* Make parent create an empty node if needed. */
7012 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7016 if (flags & TRYAGAIN) {
7020 vFAIL("Internal urp");
7021 /* Supposed to be caught earlier. */
7024 if (!regcurly(RExC_parse)) {
7033 vFAIL("Quantifier follows nothing");
7041 len=0; /* silence a spurious compiler warning */
7042 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7043 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7044 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7045 ret = reganode(pRExC_state, FOLDCHAR, cp);
7046 Set_Node_Length(ret, 1); /* MJD */
7047 nextchar(pRExC_state); /* kill whitespace under /x */
7055 This switch handles escape sequences that resolve to some kind
7056 of special regop and not to literal text. Escape sequnces that
7057 resolve to literal text are handled below in the switch marked
7060 Every entry in this switch *must* have a corresponding entry
7061 in the literal escape switch. However, the opposite is not
7062 required, as the default for this switch is to jump to the
7063 literal text handling code.
7065 switch ((U8)*++RExC_parse) {
7070 /* Special Escapes */
7072 RExC_seen_zerolen++;
7073 ret = reg_node(pRExC_state, SBOL);
7075 goto finish_meta_pat;
7077 ret = reg_node(pRExC_state, GPOS);
7078 RExC_seen |= REG_SEEN_GPOS;
7080 goto finish_meta_pat;
7082 RExC_seen_zerolen++;
7083 ret = reg_node(pRExC_state, KEEPS);
7085 /* XXX:dmq : disabling in-place substitution seems to
7086 * be necessary here to avoid cases of memory corruption, as
7087 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7089 RExC_seen |= REG_SEEN_LOOKBEHIND;
7090 goto finish_meta_pat;
7092 ret = reg_node(pRExC_state, SEOL);
7094 RExC_seen_zerolen++; /* Do not optimize RE away */
7095 goto finish_meta_pat;
7097 ret = reg_node(pRExC_state, EOS);
7099 RExC_seen_zerolen++; /* Do not optimize RE away */
7100 goto finish_meta_pat;
7102 ret = reg_node(pRExC_state, CANY);
7103 RExC_seen |= REG_SEEN_CANY;
7104 *flagp |= HASWIDTH|SIMPLE;
7105 goto finish_meta_pat;
7107 ret = reg_node(pRExC_state, CLUMP);
7109 goto finish_meta_pat;
7111 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7112 *flagp |= HASWIDTH|SIMPLE;
7113 goto finish_meta_pat;
7115 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7116 *flagp |= HASWIDTH|SIMPLE;
7117 goto finish_meta_pat;
7119 RExC_seen_zerolen++;
7120 RExC_seen |= REG_SEEN_LOOKBEHIND;
7121 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7123 goto finish_meta_pat;
7125 RExC_seen_zerolen++;
7126 RExC_seen |= REG_SEEN_LOOKBEHIND;
7127 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7129 goto finish_meta_pat;
7131 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7132 *flagp |= HASWIDTH|SIMPLE;
7133 goto finish_meta_pat;
7135 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7136 *flagp |= HASWIDTH|SIMPLE;
7137 goto finish_meta_pat;
7139 ret = reg_node(pRExC_state, DIGIT);
7140 *flagp |= HASWIDTH|SIMPLE;
7141 goto finish_meta_pat;
7143 ret = reg_node(pRExC_state, NDIGIT);
7144 *flagp |= HASWIDTH|SIMPLE;
7145 goto finish_meta_pat;
7147 ret = reg_node(pRExC_state, LNBREAK);
7148 *flagp |= HASWIDTH|SIMPLE;
7149 goto finish_meta_pat;
7151 ret = reg_node(pRExC_state, HORIZWS);
7152 *flagp |= HASWIDTH|SIMPLE;
7153 goto finish_meta_pat;
7155 ret = reg_node(pRExC_state, NHORIZWS);
7156 *flagp |= HASWIDTH|SIMPLE;
7157 goto finish_meta_pat;
7159 ret = reg_node(pRExC_state, VERTWS);
7160 *flagp |= HASWIDTH|SIMPLE;
7161 goto finish_meta_pat;
7163 ret = reg_node(pRExC_state, NVERTWS);
7164 *flagp |= HASWIDTH|SIMPLE;
7166 nextchar(pRExC_state);
7167 Set_Node_Length(ret, 2); /* MJD */
7172 char* const oldregxend = RExC_end;
7174 char* parse_start = RExC_parse - 2;
7177 if (RExC_parse[1] == '{') {
7178 /* a lovely hack--pretend we saw [\pX] instead */
7179 RExC_end = strchr(RExC_parse, '}');
7181 const U8 c = (U8)*RExC_parse;
7183 RExC_end = oldregxend;
7184 vFAIL2("Missing right brace on \\%c{}", c);
7189 RExC_end = RExC_parse + 2;
7190 if (RExC_end > oldregxend)
7191 RExC_end = oldregxend;
7195 ret = regclass(pRExC_state,depth+1);
7197 RExC_end = oldregxend;
7200 Set_Node_Offset(ret, parse_start + 2);
7201 Set_Node_Cur_Length(ret);
7202 nextchar(pRExC_state);
7203 *flagp |= HASWIDTH|SIMPLE;
7207 /* Handle \N and \N{NAME} here and not below because it can be
7208 multicharacter. join_exact() will join them up later on.
7209 Also this makes sure that things like /\N{BLAH}+/ and
7210 \N{BLAH} being multi char Just Happen. dmq*/
7212 ret= reg_namedseq(pRExC_state, NULL, flagp);
7214 case 'k': /* Handle \k<NAME> and \k'NAME' */
7217 char ch= RExC_parse[1];
7218 if (ch != '<' && ch != '\'' && ch != '{') {
7220 vFAIL2("Sequence %.2s... not terminated",parse_start);
7222 /* this pretty much dupes the code for (?P=...) in reg(), if
7223 you change this make sure you change that */
7224 char* name_start = (RExC_parse += 2);
7226 SV *sv_dat = reg_scan_name(pRExC_state,
7227 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7228 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7229 if (RExC_parse == name_start || *RExC_parse != ch)
7230 vFAIL2("Sequence %.3s... not terminated",parse_start);
7233 num = add_data( pRExC_state, 1, "S" );
7234 RExC_rxi->data->data[num]=(void*)sv_dat;
7235 SvREFCNT_inc_simple_void(sv_dat);
7239 ret = reganode(pRExC_state,
7240 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7244 /* override incorrect value set in reganode MJD */
7245 Set_Node_Offset(ret, parse_start+1);
7246 Set_Node_Cur_Length(ret); /* MJD */
7247 nextchar(pRExC_state);
7253 case '1': case '2': case '3': case '4':
7254 case '5': case '6': case '7': case '8': case '9':
7257 bool isg = *RExC_parse == 'g';
7262 if (*RExC_parse == '{') {
7266 if (*RExC_parse == '-') {
7270 if (hasbrace && !isDIGIT(*RExC_parse)) {
7271 if (isrel) RExC_parse--;
7273 goto parse_named_seq;
7275 num = atoi(RExC_parse);
7276 if (isg && num == 0)
7277 vFAIL("Reference to invalid group 0");
7279 num = RExC_npar - num;
7281 vFAIL("Reference to nonexistent or unclosed group");
7283 if (!isg && num > 9 && num >= RExC_npar)
7286 char * const parse_start = RExC_parse - 1; /* MJD */
7287 while (isDIGIT(*RExC_parse))
7289 if (parse_start == RExC_parse - 1)
7290 vFAIL("Unterminated \\g... pattern");
7292 if (*RExC_parse != '}')
7293 vFAIL("Unterminated \\g{...} pattern");
7297 if (num > (I32)RExC_rx->nparens)
7298 vFAIL("Reference to nonexistent group");
7301 ret = reganode(pRExC_state,
7302 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7306 /* override incorrect value set in reganode MJD */
7307 Set_Node_Offset(ret, parse_start+1);
7308 Set_Node_Cur_Length(ret); /* MJD */
7310 nextchar(pRExC_state);
7315 if (RExC_parse >= RExC_end)
7316 FAIL("Trailing \\");
7319 /* Do not generate "unrecognized" warnings here, we fall
7320 back into the quick-grab loop below */
7327 if (RExC_flags & RXf_PMf_EXTENDED) {
7328 if ( reg_skipcomment( pRExC_state ) )
7335 register STRLEN len;
7340 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7342 parse_start = RExC_parse - 1;
7348 ret = reg_node(pRExC_state,
7349 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7351 for (len = 0, p = RExC_parse - 1;
7352 len < 127 && p < RExC_end;
7355 char * const oldp = p;
7357 if (RExC_flags & RXf_PMf_EXTENDED)
7358 p = regwhite( pRExC_state, p );
7363 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7364 goto normal_default;
7374 /* Literal Escapes Switch
7376 This switch is meant to handle escape sequences that
7377 resolve to a literal character.
7379 Every escape sequence that represents something
7380 else, like an assertion or a char class, is handled
7381 in the switch marked 'Special Escapes' above in this
7382 routine, but also has an entry here as anything that
7383 isn't explicitly mentioned here will be treated as
7384 an unescaped equivalent literal.
7388 /* These are all the special escapes. */
7392 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7393 goto normal_default;
7394 case 'A': /* Start assertion */
7395 case 'b': case 'B': /* Word-boundary assertion*/
7396 case 'C': /* Single char !DANGEROUS! */
7397 case 'd': case 'D': /* digit class */
7398 case 'g': case 'G': /* generic-backref, pos assertion */
7399 case 'h': case 'H': /* HORIZWS */
7400 case 'k': case 'K': /* named backref, keep marker */
7401 case 'N': /* named char sequence */
7402 case 'p': case 'P': /* Unicode property */
7403 case 'R': /* LNBREAK */
7404 case 's': case 'S': /* space class */
7405 case 'v': case 'V': /* VERTWS */
7406 case 'w': case 'W': /* word class */
7407 case 'X': /* eXtended Unicode "combining character sequence" */
7408 case 'z': case 'Z': /* End of line/string assertion */
7412 /* Anything after here is an escape that resolves to a
7413 literal. (Except digits, which may or may not)
7432 ender = ASCII_TO_NATIVE('\033');
7436 ender = ASCII_TO_NATIVE('\007');
7441 char* const e = strchr(p, '}');
7445 vFAIL("Missing right brace on \\x{}");
7448 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7449 | PERL_SCAN_DISALLOW_PREFIX;
7450 STRLEN numlen = e - p - 1;
7451 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7458 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7460 ender = grok_hex(p, &numlen, &flags, NULL);
7463 if (PL_encoding && ender < 0x100)
7464 goto recode_encoding;
7468 ender = UCHARAT(p++);
7469 ender = toCTRL(ender);
7471 case '0': case '1': case '2': case '3':case '4':
7472 case '5': case '6': case '7': case '8':case '9':
7474 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7477 ender = grok_oct(p, &numlen, &flags, NULL);
7479 /* An octal above 0xff is interpreted differently
7480 * depending on if the re is in utf8 or not. If it
7481 * is in utf8, the value will be itself, otherwise
7482 * it is interpreted as modulo 0x100. It has been
7483 * decided to discourage the use of octal above the
7484 * single-byte range. For now, warn only when
7485 * it ends up modulo */
7486 if (SIZE_ONLY && ender >= 0x100
7487 && ! UTF && ! PL_encoding) {
7488 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7496 if (PL_encoding && ender < 0x100)
7497 goto recode_encoding;
7501 SV* enc = PL_encoding;
7502 ender = reg_recode((const char)(U8)ender, &enc);
7503 if (!enc && SIZE_ONLY)
7504 ckWARNreg(p, "Invalid escape in the specified encoding");
7510 FAIL("Trailing \\");
7513 if (!SIZE_ONLY&& isALPHA(*p))
7514 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7515 goto normal_default;
7520 if (UTF8_IS_START(*p) && UTF) {
7522 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7523 &numlen, UTF8_ALLOW_DEFAULT);
7530 if ( RExC_flags & RXf_PMf_EXTENDED)
7531 p = regwhite( pRExC_state, p );
7533 /* Prime the casefolded buffer. */
7534 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7536 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7541 /* Emit all the Unicode characters. */
7543 for (foldbuf = tmpbuf;
7545 foldlen -= numlen) {
7546 ender = utf8_to_uvchr(foldbuf, &numlen);
7548 const STRLEN unilen = reguni(pRExC_state, ender, s);
7551 /* In EBCDIC the numlen
7552 * and unilen can differ. */
7554 if (numlen >= foldlen)
7558 break; /* "Can't happen." */
7562 const STRLEN unilen = reguni(pRExC_state, ender, s);
7571 REGC((char)ender, s++);
7577 /* Emit all the Unicode characters. */
7579 for (foldbuf = tmpbuf;
7581 foldlen -= numlen) {
7582 ender = utf8_to_uvchr(foldbuf, &numlen);
7584 const STRLEN unilen = reguni(pRExC_state, ender, s);
7587 /* In EBCDIC the numlen
7588 * and unilen can differ. */
7590 if (numlen >= foldlen)
7598 const STRLEN unilen = reguni(pRExC_state, ender, s);
7607 REGC((char)ender, s++);
7611 Set_Node_Cur_Length(ret); /* MJD */
7612 nextchar(pRExC_state);
7614 /* len is STRLEN which is unsigned, need to copy to signed */
7617 vFAIL("Internal disaster");
7621 if (len == 1 && UNI_IS_INVARIANT(ender))
7625 RExC_size += STR_SZ(len);
7628 RExC_emit += STR_SZ(len);
7638 S_regwhite( RExC_state_t *pRExC_state, char *p )
7640 const char *e = RExC_end;
7642 PERL_ARGS_ASSERT_REGWHITE;
7647 else if (*p == '#') {
7656 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7664 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7665 Character classes ([:foo:]) can also be negated ([:^foo:]).
7666 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7667 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7668 but trigger failures because they are currently unimplemented. */
7670 #define POSIXCC_DONE(c) ((c) == ':')
7671 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7672 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7675 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7678 I32 namedclass = OOB_NAMEDCLASS;
7680 PERL_ARGS_ASSERT_REGPPOSIXCC;
7682 if (value == '[' && RExC_parse + 1 < RExC_end &&
7683 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7684 POSIXCC(UCHARAT(RExC_parse))) {
7685 const char c = UCHARAT(RExC_parse);
7686 char* const s = RExC_parse++;
7688 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7690 if (RExC_parse == RExC_end)
7691 /* Grandfather lone [:, [=, [. */
7694 const char* const t = RExC_parse++; /* skip over the c */
7697 if (UCHARAT(RExC_parse) == ']') {
7698 const char *posixcc = s + 1;
7699 RExC_parse++; /* skip over the ending ] */
7702 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7703 const I32 skip = t - posixcc;
7705 /* Initially switch on the length of the name. */
7708 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7709 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7712 /* Names all of length 5. */
7713 /* alnum alpha ascii blank cntrl digit graph lower
7714 print punct space upper */
7715 /* Offset 4 gives the best switch position. */
7716 switch (posixcc[4]) {
7718 if (memEQ(posixcc, "alph", 4)) /* alpha */
7719 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7722 if (memEQ(posixcc, "spac", 4)) /* space */
7723 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7726 if (memEQ(posixcc, "grap", 4)) /* graph */
7727 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7730 if (memEQ(posixcc, "asci", 4)) /* ascii */
7731 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7734 if (memEQ(posixcc, "blan", 4)) /* blank */
7735 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7738 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7739 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7742 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7743 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7746 if (memEQ(posixcc, "lowe", 4)) /* lower */
7747 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7748 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7749 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7752 if (memEQ(posixcc, "digi", 4)) /* digit */
7753 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7754 else if (memEQ(posixcc, "prin", 4)) /* print */
7755 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7756 else if (memEQ(posixcc, "punc", 4)) /* punct */
7757 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7762 if (memEQ(posixcc, "xdigit", 6))
7763 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7767 if (namedclass == OOB_NAMEDCLASS)
7768 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7770 assert (posixcc[skip] == ':');
7771 assert (posixcc[skip+1] == ']');
7772 } else if (!SIZE_ONLY) {
7773 /* [[=foo=]] and [[.foo.]] are still future. */
7775 /* adjust RExC_parse so the warning shows after
7777 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7779 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7782 /* Maternal grandfather:
7783 * "[:" ending in ":" but not in ":]" */
7793 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7797 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7799 if (POSIXCC(UCHARAT(RExC_parse))) {
7800 const char *s = RExC_parse;
7801 const char c = *s++;
7805 if (*s && c == *s && s[1] == ']') {
7807 "POSIX syntax [%c %c] belongs inside character classes",
7810 /* [[=foo=]] and [[.foo.]] are still future. */
7811 if (POSIXCC_NOTYET(c)) {
7812 /* adjust RExC_parse so the error shows after
7814 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7816 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7823 #define _C_C_T_(NAME,TEST,WORD) \
7826 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7828 for (value = 0; value < 256; value++) \
7830 ANYOF_BITMAP_SET(ret, value); \
7835 case ANYOF_N##NAME: \
7837 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7839 for (value = 0; value < 256; value++) \
7841 ANYOF_BITMAP_SET(ret, value); \
7847 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7849 for (value = 0; value < 256; value++) \
7851 ANYOF_BITMAP_SET(ret, value); \
7855 case ANYOF_N##NAME: \
7856 for (value = 0; value < 256; value++) \
7858 ANYOF_BITMAP_SET(ret, value); \
7864 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7865 so that it is possible to override the option here without having to
7866 rebuild the entire core. as we are required to do if we change regcomp.h
7867 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7869 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7870 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7873 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7874 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7876 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7880 parse a class specification and produce either an ANYOF node that
7881 matches the pattern or if the pattern matches a single char only and
7882 that char is < 256 and we are case insensitive then we produce an
7887 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7890 register UV nextvalue;
7891 register IV prevvalue = OOB_UNICODE;
7892 register IV range = 0;
7893 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7894 register regnode *ret;
7897 char *rangebegin = NULL;
7898 bool need_class = 0;
7901 bool optimize_invert = TRUE;
7902 AV* unicode_alternate = NULL;
7904 UV literal_endpoint = 0;
7906 UV stored = 0; /* number of chars stored in the class */
7908 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7909 case we need to change the emitted regop to an EXACT. */
7910 const char * orig_parse = RExC_parse;
7911 GET_RE_DEBUG_FLAGS_DECL;
7913 PERL_ARGS_ASSERT_REGCLASS;
7915 PERL_UNUSED_ARG(depth);
7918 DEBUG_PARSE("clas");
7920 /* Assume we are going to generate an ANYOF node. */
7921 ret = reganode(pRExC_state, ANYOF, 0);
7924 ANYOF_FLAGS(ret) = 0;
7926 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7930 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7934 RExC_size += ANYOF_SKIP;
7935 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7938 RExC_emit += ANYOF_SKIP;
7940 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7942 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7943 ANYOF_BITMAP_ZERO(ret);
7944 listsv = newSVpvs("# comment\n");
7947 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7949 if (!SIZE_ONLY && POSIXCC(nextvalue))
7950 checkposixcc(pRExC_state);
7952 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7953 if (UCHARAT(RExC_parse) == ']')
7957 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7961 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7964 rangebegin = RExC_parse;
7966 value = utf8n_to_uvchr((U8*)RExC_parse,
7967 RExC_end - RExC_parse,
7968 &numlen, UTF8_ALLOW_DEFAULT);
7969 RExC_parse += numlen;
7972 value = UCHARAT(RExC_parse++);
7974 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7975 if (value == '[' && POSIXCC(nextvalue))
7976 namedclass = regpposixcc(pRExC_state, value);
7977 else if (value == '\\') {
7979 value = utf8n_to_uvchr((U8*)RExC_parse,
7980 RExC_end - RExC_parse,
7981 &numlen, UTF8_ALLOW_DEFAULT);
7982 RExC_parse += numlen;
7985 value = UCHARAT(RExC_parse++);
7986 /* Some compilers cannot handle switching on 64-bit integer
7987 * values, therefore value cannot be an UV. Yes, this will
7988 * be a problem later if we want switch on Unicode.
7989 * A similar issue a little bit later when switching on
7990 * namedclass. --jhi */
7991 switch ((I32)value) {
7992 case 'w': namedclass = ANYOF_ALNUM; break;
7993 case 'W': namedclass = ANYOF_NALNUM; break;
7994 case 's': namedclass = ANYOF_SPACE; break;
7995 case 'S': namedclass = ANYOF_NSPACE; break;
7996 case 'd': namedclass = ANYOF_DIGIT; break;
7997 case 'D': namedclass = ANYOF_NDIGIT; break;
7998 case 'v': namedclass = ANYOF_VERTWS; break;
7999 case 'V': namedclass = ANYOF_NVERTWS; break;
8000 case 'h': namedclass = ANYOF_HORIZWS; break;
8001 case 'H': namedclass = ANYOF_NHORIZWS; break;
8002 case 'N': /* Handle \N{NAME} in class */
8004 /* We only pay attention to the first char of
8005 multichar strings being returned. I kinda wonder
8006 if this makes sense as it does change the behaviour
8007 from earlier versions, OTOH that behaviour was broken
8009 UV v; /* value is register so we cant & it /grrr */
8010 if (reg_namedseq(pRExC_state, &v, NULL)) {
8020 if (RExC_parse >= RExC_end)
8021 vFAIL2("Empty \\%c{}", (U8)value);
8022 if (*RExC_parse == '{') {
8023 const U8 c = (U8)value;
8024 e = strchr(RExC_parse++, '}');
8026 vFAIL2("Missing right brace on \\%c{}", c);
8027 while (isSPACE(UCHARAT(RExC_parse)))
8029 if (e == RExC_parse)
8030 vFAIL2("Empty \\%c{}", c);
8032 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8040 if (UCHARAT(RExC_parse) == '^') {
8043 value = value == 'p' ? 'P' : 'p'; /* toggle */
8044 while (isSPACE(UCHARAT(RExC_parse))) {
8049 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8050 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8053 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8054 namedclass = ANYOF_MAX; /* no official name, but it's named */
8057 case 'n': value = '\n'; break;
8058 case 'r': value = '\r'; break;
8059 case 't': value = '\t'; break;
8060 case 'f': value = '\f'; break;
8061 case 'b': value = '\b'; break;
8062 case 'e': value = ASCII_TO_NATIVE('\033');break;
8063 case 'a': value = ASCII_TO_NATIVE('\007');break;
8065 if (*RExC_parse == '{') {
8066 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8067 | PERL_SCAN_DISALLOW_PREFIX;
8068 char * const e = strchr(RExC_parse++, '}');
8070 vFAIL("Missing right brace on \\x{}");
8072 numlen = e - RExC_parse;
8073 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8077 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8079 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8080 RExC_parse += numlen;
8082 if (PL_encoding && value < 0x100)
8083 goto recode_encoding;
8086 value = UCHARAT(RExC_parse++);
8087 value = toCTRL(value);
8089 case '0': case '1': case '2': case '3': case '4':
8090 case '5': case '6': case '7': case '8': case '9':
8094 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8095 RExC_parse += numlen;
8096 if (PL_encoding && value < 0x100)
8097 goto recode_encoding;
8102 SV* enc = PL_encoding;
8103 value = reg_recode((const char)(U8)value, &enc);
8104 if (!enc && SIZE_ONLY)
8105 ckWARNreg(RExC_parse,
8106 "Invalid escape in the specified encoding");
8110 if (!SIZE_ONLY && isALPHA(value))
8111 ckWARN2reg(RExC_parse,
8112 "Unrecognized escape \\%c in character class passed through",
8116 } /* end of \blah */
8122 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8124 if (!SIZE_ONLY && !need_class)
8125 ANYOF_CLASS_ZERO(ret);
8129 /* a bad range like a-\d, a-[:digit:] ? */
8133 RExC_parse >= rangebegin ?
8134 RExC_parse - rangebegin : 0;
8135 ckWARN4reg(RExC_parse,
8136 "False [] range \"%*.*s\"",
8139 if (prevvalue < 256) {
8140 ANYOF_BITMAP_SET(ret, prevvalue);
8141 ANYOF_BITMAP_SET(ret, '-');
8144 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8145 Perl_sv_catpvf(aTHX_ listsv,
8146 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8150 range = 0; /* this was not a true range */
8156 const char *what = NULL;
8159 if (namedclass > OOB_NAMEDCLASS)
8160 optimize_invert = FALSE;
8161 /* Possible truncation here but in some 64-bit environments
8162 * the compiler gets heartburn about switch on 64-bit values.
8163 * A similar issue a little earlier when switching on value.
8165 switch ((I32)namedclass) {
8167 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8168 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8169 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8170 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8171 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8172 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8173 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8174 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8175 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8176 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8177 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8178 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8179 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8181 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8182 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8184 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8185 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8186 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8189 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8192 for (value = 0; value < 128; value++)
8193 ANYOF_BITMAP_SET(ret, value);
8195 for (value = 0; value < 256; value++) {
8197 ANYOF_BITMAP_SET(ret, value);
8206 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8209 for (value = 128; value < 256; value++)
8210 ANYOF_BITMAP_SET(ret, value);
8212 for (value = 0; value < 256; value++) {
8213 if (!isASCII(value))
8214 ANYOF_BITMAP_SET(ret, value);
8223 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8225 /* consecutive digits assumed */
8226 for (value = '0'; value <= '9'; value++)
8227 ANYOF_BITMAP_SET(ret, value);
8230 what = POSIX_CC_UNI_NAME("Digit");
8234 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8236 /* consecutive digits assumed */
8237 for (value = 0; value < '0'; value++)
8238 ANYOF_BITMAP_SET(ret, value);
8239 for (value = '9' + 1; value < 256; value++)
8240 ANYOF_BITMAP_SET(ret, value);
8243 what = POSIX_CC_UNI_NAME("Digit");
8246 /* this is to handle \p and \P */
8249 vFAIL("Invalid [::] class");
8253 /* Strings such as "+utf8::isWord\n" */
8254 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8257 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8260 } /* end of namedclass \blah */
8263 if (prevvalue > (IV)value) /* b-a */ {
8264 const int w = RExC_parse - rangebegin;
8265 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8266 range = 0; /* not a valid range */
8270 prevvalue = value; /* save the beginning of the range */
8271 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8272 RExC_parse[1] != ']') {
8275 /* a bad range like \w-, [:word:]- ? */
8276 if (namedclass > OOB_NAMEDCLASS) {
8277 if (ckWARN(WARN_REGEXP)) {
8279 RExC_parse >= rangebegin ?
8280 RExC_parse - rangebegin : 0;
8282 "False [] range \"%*.*s\"",
8286 ANYOF_BITMAP_SET(ret, '-');
8288 range = 1; /* yeah, it's a range! */
8289 continue; /* but do it the next time */
8293 /* now is the next time */
8294 /*stored += (value - prevvalue + 1);*/
8296 if (prevvalue < 256) {
8297 const IV ceilvalue = value < 256 ? value : 255;
8300 /* In EBCDIC [\x89-\x91] should include
8301 * the \x8e but [i-j] should not. */
8302 if (literal_endpoint == 2 &&
8303 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8304 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8306 if (isLOWER(prevvalue)) {
8307 for (i = prevvalue; i <= ceilvalue; i++)
8308 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8310 ANYOF_BITMAP_SET(ret, i);
8313 for (i = prevvalue; i <= ceilvalue; i++)
8314 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8316 ANYOF_BITMAP_SET(ret, i);
8322 for (i = prevvalue; i <= ceilvalue; i++) {
8323 if (!ANYOF_BITMAP_TEST(ret,i)) {
8325 ANYOF_BITMAP_SET(ret, i);
8329 if (value > 255 || UTF) {
8330 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8331 const UV natvalue = NATIVE_TO_UNI(value);
8332 stored+=2; /* can't optimize this class */
8333 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8334 if (prevnatvalue < natvalue) { /* what about > ? */
8335 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8336 prevnatvalue, natvalue);
8338 else if (prevnatvalue == natvalue) {
8339 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8341 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8343 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8345 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8346 if (RExC_precomp[0] == ':' &&
8347 RExC_precomp[1] == '[' &&
8348 (f == 0xDF || f == 0x92)) {
8349 f = NATIVE_TO_UNI(f);
8352 /* If folding and foldable and a single
8353 * character, insert also the folded version
8354 * to the charclass. */
8356 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8357 if ((RExC_precomp[0] == ':' &&
8358 RExC_precomp[1] == '[' &&
8360 (value == 0xFB05 || value == 0xFB06))) ?
8361 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8362 foldlen == (STRLEN)UNISKIP(f) )
8364 if (foldlen == (STRLEN)UNISKIP(f))
8366 Perl_sv_catpvf(aTHX_ listsv,
8369 /* Any multicharacter foldings
8370 * require the following transform:
8371 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8372 * where E folds into "pq" and F folds
8373 * into "rst", all other characters
8374 * fold to single characters. We save
8375 * away these multicharacter foldings,
8376 * to be later saved as part of the
8377 * additional "s" data. */
8380 if (!unicode_alternate)
8381 unicode_alternate = newAV();
8382 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8384 av_push(unicode_alternate, sv);
8388 /* If folding and the value is one of the Greek
8389 * sigmas insert a few more sigmas to make the
8390 * folding rules of the sigmas to work right.
8391 * Note that not all the possible combinations
8392 * are handled here: some of them are handled
8393 * by the standard folding rules, and some of
8394 * them (literal or EXACTF cases) are handled
8395 * during runtime in regexec.c:S_find_byclass(). */
8396 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8397 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8398 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8399 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8400 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8402 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8403 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8404 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8409 literal_endpoint = 0;
8413 range = 0; /* this range (if it was one) is done now */
8417 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8419 RExC_size += ANYOF_CLASS_ADD_SKIP;
8421 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8427 /****** !SIZE_ONLY AFTER HERE *********/
8429 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8430 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8432 /* optimize single char class to an EXACT node
8433 but *only* when its not a UTF/high char */
8434 const char * cur_parse= RExC_parse;
8435 RExC_emit = (regnode *)orig_emit;
8436 RExC_parse = (char *)orig_parse;
8437 ret = reg_node(pRExC_state,
8438 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8439 RExC_parse = (char *)cur_parse;
8440 *STRING(ret)= (char)value;
8442 RExC_emit += STR_SZ(1);
8443 SvREFCNT_dec(listsv);
8446 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8447 if ( /* If the only flag is folding (plus possibly inversion). */
8448 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8450 for (value = 0; value < 256; ++value) {
8451 if (ANYOF_BITMAP_TEST(ret, value)) {
8452 UV fold = PL_fold[value];
8455 ANYOF_BITMAP_SET(ret, fold);
8458 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8461 /* optimize inverted simple patterns (e.g. [^a-z]) */
8462 if (optimize_invert &&
8463 /* If the only flag is inversion. */
8464 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8465 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8466 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8467 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8470 AV * const av = newAV();
8472 /* The 0th element stores the character class description
8473 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8474 * to initialize the appropriate swash (which gets stored in
8475 * the 1st element), and also useful for dumping the regnode.
8476 * The 2nd element stores the multicharacter foldings,
8477 * used later (regexec.c:S_reginclass()). */
8478 av_store(av, 0, listsv);
8479 av_store(av, 1, NULL);
8480 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8481 rv = newRV_noinc(MUTABLE_SV(av));
8482 n = add_data(pRExC_state, 1, "s");
8483 RExC_rxi->data->data[n] = (void*)rv;
8491 /* reg_skipcomment()
8493 Absorbs an /x style # comments from the input stream.
8494 Returns true if there is more text remaining in the stream.
8495 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8496 terminates the pattern without including a newline.
8498 Note its the callers responsibility to ensure that we are
8504 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8508 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8510 while (RExC_parse < RExC_end)
8511 if (*RExC_parse++ == '\n') {
8516 /* we ran off the end of the pattern without ending
8517 the comment, so we have to add an \n when wrapping */
8518 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8526 Advance that parse position, and optionall absorbs
8527 "whitespace" from the inputstream.
8529 Without /x "whitespace" means (?#...) style comments only,
8530 with /x this means (?#...) and # comments and whitespace proper.
8532 Returns the RExC_parse point from BEFORE the scan occurs.
8534 This is the /x friendly way of saying RExC_parse++.
8538 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8540 char* const retval = RExC_parse++;
8542 PERL_ARGS_ASSERT_NEXTCHAR;
8545 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8546 RExC_parse[2] == '#') {
8547 while (*RExC_parse != ')') {
8548 if (RExC_parse == RExC_end)
8549 FAIL("Sequence (?#... not terminated");
8555 if (RExC_flags & RXf_PMf_EXTENDED) {
8556 if (isSPACE(*RExC_parse)) {
8560 else if (*RExC_parse == '#') {
8561 if ( reg_skipcomment( pRExC_state ) )
8570 - reg_node - emit a node
8572 STATIC regnode * /* Location. */
8573 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8576 register regnode *ptr;
8577 regnode * const ret = RExC_emit;
8578 GET_RE_DEBUG_FLAGS_DECL;
8580 PERL_ARGS_ASSERT_REG_NODE;
8583 SIZE_ALIGN(RExC_size);
8587 if (RExC_emit >= RExC_emit_bound)
8588 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8590 NODE_ALIGN_FILL(ret);
8592 FILL_ADVANCE_NODE(ptr, op);
8593 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8594 #ifdef RE_TRACK_PATTERN_OFFSETS
8595 if (RExC_offsets) { /* MJD */
8596 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8597 "reg_node", __LINE__,
8599 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8600 ? "Overwriting end of array!\n" : "OK",
8601 (UV)(RExC_emit - RExC_emit_start),
8602 (UV)(RExC_parse - RExC_start),
8603 (UV)RExC_offsets[0]));
8604 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8612 - reganode - emit a node with an argument
8614 STATIC regnode * /* Location. */
8615 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8618 register regnode *ptr;
8619 regnode * const ret = RExC_emit;
8620 GET_RE_DEBUG_FLAGS_DECL;
8622 PERL_ARGS_ASSERT_REGANODE;
8625 SIZE_ALIGN(RExC_size);
8630 assert(2==regarglen[op]+1);
8632 Anything larger than this has to allocate the extra amount.
8633 If we changed this to be:
8635 RExC_size += (1 + regarglen[op]);
8637 then it wouldn't matter. Its not clear what side effect
8638 might come from that so its not done so far.
8643 if (RExC_emit >= RExC_emit_bound)
8644 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8646 NODE_ALIGN_FILL(ret);
8648 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8649 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8650 #ifdef RE_TRACK_PATTERN_OFFSETS
8651 if (RExC_offsets) { /* MJD */
8652 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8656 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8657 "Overwriting end of array!\n" : "OK",
8658 (UV)(RExC_emit - RExC_emit_start),
8659 (UV)(RExC_parse - RExC_start),
8660 (UV)RExC_offsets[0]));
8661 Set_Cur_Node_Offset;
8669 - reguni - emit (if appropriate) a Unicode character
8672 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8676 PERL_ARGS_ASSERT_REGUNI;
8678 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8682 - reginsert - insert an operator in front of already-emitted operand
8684 * Means relocating the operand.
8687 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8690 register regnode *src;
8691 register regnode *dst;
8692 register regnode *place;
8693 const int offset = regarglen[(U8)op];
8694 const int size = NODE_STEP_REGNODE + offset;
8695 GET_RE_DEBUG_FLAGS_DECL;
8697 PERL_ARGS_ASSERT_REGINSERT;
8698 PERL_UNUSED_ARG(depth);
8699 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8700 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8709 if (RExC_open_parens) {
8711 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8712 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8713 if ( RExC_open_parens[paren] >= opnd ) {
8714 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8715 RExC_open_parens[paren] += size;
8717 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8719 if ( RExC_close_parens[paren] >= opnd ) {
8720 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8721 RExC_close_parens[paren] += size;
8723 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8728 while (src > opnd) {
8729 StructCopy(--src, --dst, regnode);
8730 #ifdef RE_TRACK_PATTERN_OFFSETS
8731 if (RExC_offsets) { /* MJD 20010112 */
8732 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8736 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8737 ? "Overwriting end of array!\n" : "OK",
8738 (UV)(src - RExC_emit_start),
8739 (UV)(dst - RExC_emit_start),
8740 (UV)RExC_offsets[0]));
8741 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8742 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8748 place = opnd; /* Op node, where operand used to be. */
8749 #ifdef RE_TRACK_PATTERN_OFFSETS
8750 if (RExC_offsets) { /* MJD */
8751 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8755 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8756 ? "Overwriting end of array!\n" : "OK",
8757 (UV)(place - RExC_emit_start),
8758 (UV)(RExC_parse - RExC_start),
8759 (UV)RExC_offsets[0]));
8760 Set_Node_Offset(place, RExC_parse);
8761 Set_Node_Length(place, 1);
8764 src = NEXTOPER(place);
8765 FILL_ADVANCE_NODE(place, op);
8766 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8767 Zero(src, offset, regnode);
8771 - regtail - set the next-pointer at the end of a node chain of p to val.
8772 - SEE ALSO: regtail_study
8774 /* TODO: All three parms should be const */
8776 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8779 register regnode *scan;
8780 GET_RE_DEBUG_FLAGS_DECL;
8782 PERL_ARGS_ASSERT_REGTAIL;
8784 PERL_UNUSED_ARG(depth);
8790 /* Find last node. */
8793 regnode * const temp = regnext(scan);
8795 SV * const mysv=sv_newmortal();
8796 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8797 regprop(RExC_rx, mysv, scan);
8798 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8799 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8800 (temp == NULL ? "->" : ""),
8801 (temp == NULL ? PL_reg_name[OP(val)] : "")
8809 if (reg_off_by_arg[OP(scan)]) {
8810 ARG_SET(scan, val - scan);
8813 NEXT_OFF(scan) = val - scan;
8819 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8820 - Look for optimizable sequences at the same time.
8821 - currently only looks for EXACT chains.
8823 This is expermental code. The idea is to use this routine to perform
8824 in place optimizations on branches and groups as they are constructed,
8825 with the long term intention of removing optimization from study_chunk so
8826 that it is purely analytical.
8828 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8829 to control which is which.
8832 /* TODO: All four parms should be const */
8835 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8838 register regnode *scan;
8840 #ifdef EXPERIMENTAL_INPLACESCAN
8843 GET_RE_DEBUG_FLAGS_DECL;
8845 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8851 /* Find last node. */
8855 regnode * const temp = regnext(scan);
8856 #ifdef EXPERIMENTAL_INPLACESCAN
8857 if (PL_regkind[OP(scan)] == EXACT)
8858 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8866 if( exact == PSEUDO )
8868 else if ( exact != OP(scan) )
8877 SV * const mysv=sv_newmortal();
8878 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8879 regprop(RExC_rx, mysv, scan);
8880 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8881 SvPV_nolen_const(mysv),
8883 PL_reg_name[exact]);
8890 SV * const mysv_val=sv_newmortal();
8891 DEBUG_PARSE_MSG("");
8892 regprop(RExC_rx, mysv_val, val);
8893 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8894 SvPV_nolen_const(mysv_val),
8895 (IV)REG_NODE_NUM(val),
8899 if (reg_off_by_arg[OP(scan)]) {
8900 ARG_SET(scan, val - scan);
8903 NEXT_OFF(scan) = val - scan;
8911 - regcurly - a little FSA that accepts {\d+,?\d*}
8914 S_regcurly(register const char *s)
8916 PERL_ARGS_ASSERT_REGCURLY;
8935 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8939 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8944 for (bit=0; bit<32; bit++) {
8945 if (flags & (1<<bit)) {
8947 PerlIO_printf(Perl_debug_log, "%s",lead);
8948 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8953 PerlIO_printf(Perl_debug_log, "\n");
8955 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8961 Perl_regdump(pTHX_ const regexp *r)
8965 SV * const sv = sv_newmortal();
8966 SV *dsv= sv_newmortal();
8968 GET_RE_DEBUG_FLAGS_DECL;
8970 PERL_ARGS_ASSERT_REGDUMP;
8972 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8974 /* Header fields of interest. */
8975 if (r->anchored_substr) {
8976 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8977 RE_SV_DUMPLEN(r->anchored_substr), 30);
8978 PerlIO_printf(Perl_debug_log,
8979 "anchored %s%s at %"IVdf" ",
8980 s, RE_SV_TAIL(r->anchored_substr),
8981 (IV)r->anchored_offset);
8982 } else if (r->anchored_utf8) {
8983 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8984 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8985 PerlIO_printf(Perl_debug_log,
8986 "anchored utf8 %s%s at %"IVdf" ",
8987 s, RE_SV_TAIL(r->anchored_utf8),
8988 (IV)r->anchored_offset);
8990 if (r->float_substr) {
8991 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8992 RE_SV_DUMPLEN(r->float_substr), 30);
8993 PerlIO_printf(Perl_debug_log,
8994 "floating %s%s at %"IVdf"..%"UVuf" ",
8995 s, RE_SV_TAIL(r->float_substr),
8996 (IV)r->float_min_offset, (UV)r->float_max_offset);
8997 } else if (r->float_utf8) {
8998 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8999 RE_SV_DUMPLEN(r->float_utf8), 30);
9000 PerlIO_printf(Perl_debug_log,
9001 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9002 s, RE_SV_TAIL(r->float_utf8),
9003 (IV)r->float_min_offset, (UV)r->float_max_offset);
9005 if (r->check_substr || r->check_utf8)
9006 PerlIO_printf(Perl_debug_log,
9008 (r->check_substr == r->float_substr
9009 && r->check_utf8 == r->float_utf8
9010 ? "(checking floating" : "(checking anchored"));
9011 if (r->extflags & RXf_NOSCAN)
9012 PerlIO_printf(Perl_debug_log, " noscan");
9013 if (r->extflags & RXf_CHECK_ALL)
9014 PerlIO_printf(Perl_debug_log, " isall");
9015 if (r->check_substr || r->check_utf8)
9016 PerlIO_printf(Perl_debug_log, ") ");
9018 if (ri->regstclass) {
9019 regprop(r, sv, ri->regstclass);
9020 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9022 if (r->extflags & RXf_ANCH) {
9023 PerlIO_printf(Perl_debug_log, "anchored");
9024 if (r->extflags & RXf_ANCH_BOL)
9025 PerlIO_printf(Perl_debug_log, "(BOL)");
9026 if (r->extflags & RXf_ANCH_MBOL)
9027 PerlIO_printf(Perl_debug_log, "(MBOL)");
9028 if (r->extflags & RXf_ANCH_SBOL)
9029 PerlIO_printf(Perl_debug_log, "(SBOL)");
9030 if (r->extflags & RXf_ANCH_GPOS)
9031 PerlIO_printf(Perl_debug_log, "(GPOS)");
9032 PerlIO_putc(Perl_debug_log, ' ');
9034 if (r->extflags & RXf_GPOS_SEEN)
9035 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9036 if (r->intflags & PREGf_SKIP)
9037 PerlIO_printf(Perl_debug_log, "plus ");
9038 if (r->intflags & PREGf_IMPLICIT)
9039 PerlIO_printf(Perl_debug_log, "implicit ");
9040 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9041 if (r->extflags & RXf_EVAL_SEEN)
9042 PerlIO_printf(Perl_debug_log, "with eval ");
9043 PerlIO_printf(Perl_debug_log, "\n");
9044 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9046 PERL_ARGS_ASSERT_REGDUMP;
9047 PERL_UNUSED_CONTEXT;
9049 #endif /* DEBUGGING */
9053 - regprop - printable representation of opcode
9055 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9058 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9059 if (flags & ANYOF_INVERT) \
9060 /*make sure the invert info is in each */ \
9061 sv_catpvs(sv, "^"); \
9067 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9072 RXi_GET_DECL(prog,progi);
9073 GET_RE_DEBUG_FLAGS_DECL;
9075 PERL_ARGS_ASSERT_REGPROP;
9079 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9080 /* It would be nice to FAIL() here, but this may be called from
9081 regexec.c, and it would be hard to supply pRExC_state. */
9082 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9083 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9085 k = PL_regkind[OP(o)];
9089 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9090 * is a crude hack but it may be the best for now since
9091 * we have no flag "this EXACTish node was UTF-8"
9093 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9094 PERL_PV_ESCAPE_UNI_DETECT |
9095 PERL_PV_PRETTY_ELLIPSES |
9096 PERL_PV_PRETTY_LTGT |
9097 PERL_PV_PRETTY_NOCLEAR
9099 } else if (k == TRIE) {
9100 /* print the details of the trie in dumpuntil instead, as
9101 * progi->data isn't available here */
9102 const char op = OP(o);
9103 const U32 n = ARG(o);
9104 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9105 (reg_ac_data *)progi->data->data[n] :
9107 const reg_trie_data * const trie
9108 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9110 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9111 DEBUG_TRIE_COMPILE_r(
9112 Perl_sv_catpvf(aTHX_ sv,
9113 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9114 (UV)trie->startstate,
9115 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9116 (UV)trie->wordcount,
9119 (UV)TRIE_CHARCOUNT(trie),
9120 (UV)trie->uniquecharcount
9123 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9125 int rangestart = -1;
9126 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9128 for (i = 0; i <= 256; i++) {
9129 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9130 if (rangestart == -1)
9132 } else if (rangestart != -1) {
9133 if (i <= rangestart + 3)
9134 for (; rangestart < i; rangestart++)
9135 put_byte(sv, rangestart);
9137 put_byte(sv, rangestart);
9139 put_byte(sv, i - 1);
9147 } else if (k == CURLY) {
9148 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9149 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9150 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9152 else if (k == WHILEM && o->flags) /* Ordinal/of */
9153 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9154 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9155 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9156 if ( RXp_PAREN_NAMES(prog) ) {
9157 if ( k != REF || OP(o) < NREF) {
9158 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9159 SV **name= av_fetch(list, ARG(o), 0 );
9161 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9164 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9165 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9166 I32 *nums=(I32*)SvPVX(sv_dat);
9167 SV **name= av_fetch(list, nums[0], 0 );
9170 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9171 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9172 (n ? "," : ""), (IV)nums[n]);
9174 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9178 } else if (k == GOSUB)
9179 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9180 else if (k == VERB) {
9182 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9183 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9184 } else if (k == LOGICAL)
9185 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9186 else if (k == FOLDCHAR)
9187 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9188 else if (k == ANYOF) {
9189 int i, rangestart = -1;
9190 const U8 flags = ANYOF_FLAGS(o);
9193 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9194 static const char * const anyofs[] = {
9227 if (flags & ANYOF_LOCALE)
9228 sv_catpvs(sv, "{loc}");
9229 if (flags & ANYOF_FOLD)
9230 sv_catpvs(sv, "{i}");
9231 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9232 if (flags & ANYOF_INVERT)
9235 /* output what the standard cp 0-255 bitmap matches */
9236 for (i = 0; i <= 256; i++) {
9237 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9238 if (rangestart == -1)
9240 } else if (rangestart != -1) {
9241 if (i <= rangestart + 3)
9242 for (; rangestart < i; rangestart++)
9243 put_byte(sv, rangestart);
9245 put_byte(sv, rangestart);
9247 put_byte(sv, i - 1);
9254 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9255 /* output any special charclass tests (used mostly under use locale) */
9256 if (o->flags & ANYOF_CLASS)
9257 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9258 if (ANYOF_CLASS_TEST(o,i)) {
9259 sv_catpv(sv, anyofs[i]);
9263 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9265 /* output information about the unicode matching */
9266 if (flags & ANYOF_UNICODE)
9267 sv_catpvs(sv, "{unicode}");
9268 else if (flags & ANYOF_UNICODE_ALL)
9269 sv_catpvs(sv, "{unicode_all}");
9273 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9277 U8 s[UTF8_MAXBYTES_CASE+1];
9279 for (i = 0; i <= 256; i++) { /* just the first 256 */
9280 uvchr_to_utf8(s, i);
9282 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9283 if (rangestart == -1)
9285 } else if (rangestart != -1) {
9286 if (i <= rangestart + 3)
9287 for (; rangestart < i; rangestart++) {
9288 const U8 * const e = uvchr_to_utf8(s,rangestart);
9290 for(p = s; p < e; p++)
9294 const U8 *e = uvchr_to_utf8(s,rangestart);
9296 for (p = s; p < e; p++)
9299 e = uvchr_to_utf8(s, i-1);
9300 for (p = s; p < e; p++)
9307 sv_catpvs(sv, "..."); /* et cetera */
9311 char *s = savesvpv(lv);
9312 char * const origs = s;
9314 while (*s && *s != '\n')
9318 const char * const t = ++s;
9336 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9338 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9339 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9341 PERL_UNUSED_CONTEXT;
9342 PERL_UNUSED_ARG(sv);
9344 PERL_UNUSED_ARG(prog);
9345 #endif /* DEBUGGING */
9349 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9350 { /* Assume that RE_INTUIT is set */
9352 struct regexp *const prog = (struct regexp *)SvANY(r);
9353 GET_RE_DEBUG_FLAGS_DECL;
9355 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9356 PERL_UNUSED_CONTEXT;
9360 const char * const s = SvPV_nolen_const(prog->check_substr
9361 ? prog->check_substr : prog->check_utf8);
9363 if (!PL_colorset) reginitcolors();
9364 PerlIO_printf(Perl_debug_log,
9365 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9367 prog->check_substr ? "" : "utf8 ",
9368 PL_colors[5],PL_colors[0],
9371 (strlen(s) > 60 ? "..." : ""));
9374 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9380 handles refcounting and freeing the perl core regexp structure. When
9381 it is necessary to actually free the structure the first thing it
9382 does is call the 'free' method of the regexp_engine associated to to
9383 the regexp, allowing the handling of the void *pprivate; member
9384 first. (This routine is not overridable by extensions, which is why
9385 the extensions free is called first.)
9387 See regdupe and regdupe_internal if you change anything here.
9389 #ifndef PERL_IN_XSUB_RE
9391 Perl_pregfree(pTHX_ REGEXP *r)
9397 Perl_pregfree2(pTHX_ REGEXP *rx)
9400 struct regexp *const r = (struct regexp *)SvANY(rx);
9401 GET_RE_DEBUG_FLAGS_DECL;
9403 PERL_ARGS_ASSERT_PREGFREE2;
9406 ReREFCNT_dec(r->mother_re);
9408 CALLREGFREE_PVT(rx); /* free the private data */
9409 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9412 SvREFCNT_dec(r->anchored_substr);
9413 SvREFCNT_dec(r->anchored_utf8);
9414 SvREFCNT_dec(r->float_substr);
9415 SvREFCNT_dec(r->float_utf8);
9416 Safefree(r->substrs);
9418 RX_MATCH_COPY_FREE(rx);
9419 #ifdef PERL_OLD_COPY_ON_WRITE
9420 SvREFCNT_dec(r->saved_copy);
9427 This is a hacky workaround to the structural issue of match results
9428 being stored in the regexp structure which is in turn stored in
9429 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9430 could be PL_curpm in multiple contexts, and could require multiple
9431 result sets being associated with the pattern simultaneously, such
9432 as when doing a recursive match with (??{$qr})
9434 The solution is to make a lightweight copy of the regexp structure
9435 when a qr// is returned from the code executed by (??{$qr}) this
9436 lightweight copy doesnt actually own any of its data except for
9437 the starp/end and the actual regexp structure itself.
9443 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9446 struct regexp *const r = (struct regexp *)SvANY(rx);
9447 register const I32 npar = r->nparens+1;
9449 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9452 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9453 ret = (struct regexp *)SvANY(ret_x);
9455 (void)ReREFCNT_inc(rx);
9456 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9457 by pointing directly at the buffer, but flagging that the allocated
9458 space in the copy is zero. As we've just done a struct copy, it's now
9459 a case of zero-ing that, rather than copying the current length. */
9460 SvPV_set(ret_x, RX_WRAPPED(rx));
9461 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9462 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9463 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9464 SvLEN_set(ret_x, 0);
9465 Newx(ret->offs, npar, regexp_paren_pair);
9466 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9468 Newx(ret->substrs, 1, struct reg_substr_data);
9469 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9471 SvREFCNT_inc_void(ret->anchored_substr);
9472 SvREFCNT_inc_void(ret->anchored_utf8);
9473 SvREFCNT_inc_void(ret->float_substr);
9474 SvREFCNT_inc_void(ret->float_utf8);
9476 /* check_substr and check_utf8, if non-NULL, point to either their
9477 anchored or float namesakes, and don't hold a second reference. */
9479 RX_MATCH_COPIED_off(ret_x);
9480 #ifdef PERL_OLD_COPY_ON_WRITE
9481 ret->saved_copy = NULL;
9483 ret->mother_re = rx;
9489 /* regfree_internal()
9491 Free the private data in a regexp. This is overloadable by
9492 extensions. Perl takes care of the regexp structure in pregfree(),
9493 this covers the *pprivate pointer which technically perldoesnt
9494 know about, however of course we have to handle the
9495 regexp_internal structure when no extension is in use.
9497 Note this is called before freeing anything in the regexp
9502 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9505 struct regexp *const r = (struct regexp *)SvANY(rx);
9507 GET_RE_DEBUG_FLAGS_DECL;
9509 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9515 SV *dsv= sv_newmortal();
9516 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9517 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9518 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9519 PL_colors[4],PL_colors[5],s);
9522 #ifdef RE_TRACK_PATTERN_OFFSETS
9524 Safefree(ri->u.offsets); /* 20010421 MJD */
9527 int n = ri->data->count;
9528 PAD* new_comppad = NULL;
9533 /* If you add a ->what type here, update the comment in regcomp.h */
9534 switch (ri->data->what[n]) {
9538 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9541 Safefree(ri->data->data[n]);
9544 new_comppad = MUTABLE_AV(ri->data->data[n]);
9547 if (new_comppad == NULL)
9548 Perl_croak(aTHX_ "panic: pregfree comppad");
9549 PAD_SAVE_LOCAL(old_comppad,
9550 /* Watch out for global destruction's random ordering. */
9551 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9554 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9557 op_free((OP_4tree*)ri->data->data[n]);
9559 PAD_RESTORE_LOCAL(old_comppad);
9560 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9566 { /* Aho Corasick add-on structure for a trie node.
9567 Used in stclass optimization only */
9569 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9571 refcount = --aho->refcount;
9574 PerlMemShared_free(aho->states);
9575 PerlMemShared_free(aho->fail);
9576 /* do this last!!!! */
9577 PerlMemShared_free(ri->data->data[n]);
9578 PerlMemShared_free(ri->regstclass);
9584 /* trie structure. */
9586 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9588 refcount = --trie->refcount;
9591 PerlMemShared_free(trie->charmap);
9592 PerlMemShared_free(trie->states);
9593 PerlMemShared_free(trie->trans);
9595 PerlMemShared_free(trie->bitmap);
9597 PerlMemShared_free(trie->wordlen);
9599 PerlMemShared_free(trie->jump);
9601 PerlMemShared_free(trie->nextword);
9602 /* do this last!!!! */
9603 PerlMemShared_free(ri->data->data[n]);
9608 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9611 Safefree(ri->data->what);
9618 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9619 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9620 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9621 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9624 re_dup - duplicate a regexp.
9626 This routine is expected to clone a given regexp structure. It is only
9627 compiled under USE_ITHREADS.
9629 After all of the core data stored in struct regexp is duplicated
9630 the regexp_engine.dupe method is used to copy any private data
9631 stored in the *pprivate pointer. This allows extensions to handle
9632 any duplication it needs to do.
9634 See pregfree() and regfree_internal() if you change anything here.
9636 #if defined(USE_ITHREADS)
9637 #ifndef PERL_IN_XSUB_RE
9639 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9643 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9644 struct regexp *ret = (struct regexp *)SvANY(dstr);
9646 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9648 npar = r->nparens+1;
9649 Newx(ret->offs, npar, regexp_paren_pair);
9650 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9652 /* no need to copy these */
9653 Newx(ret->swap, npar, regexp_paren_pair);
9657 /* Do it this way to avoid reading from *r after the StructCopy().
9658 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9659 cache, it doesn't matter. */
9660 const bool anchored = r->check_substr
9661 ? r->check_substr == r->anchored_substr
9662 : r->check_utf8 == r->anchored_utf8;
9663 Newx(ret->substrs, 1, struct reg_substr_data);
9664 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9666 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9667 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9668 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9669 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9671 /* check_substr and check_utf8, if non-NULL, point to either their
9672 anchored or float namesakes, and don't hold a second reference. */
9674 if (ret->check_substr) {
9676 assert(r->check_utf8 == r->anchored_utf8);
9677 ret->check_substr = ret->anchored_substr;
9678 ret->check_utf8 = ret->anchored_utf8;
9680 assert(r->check_substr == r->float_substr);
9681 assert(r->check_utf8 == r->float_utf8);
9682 ret->check_substr = ret->float_substr;
9683 ret->check_utf8 = ret->float_utf8;
9685 } else if (ret->check_utf8) {
9687 ret->check_utf8 = ret->anchored_utf8;
9689 ret->check_utf8 = ret->float_utf8;
9694 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9697 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9699 if (RX_MATCH_COPIED(dstr))
9700 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9703 #ifdef PERL_OLD_COPY_ON_WRITE
9704 ret->saved_copy = NULL;
9707 if (ret->mother_re) {
9708 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9709 /* Our storage points directly to our mother regexp, but that's
9710 1: a buffer in a different thread
9711 2: something we no longer hold a reference on
9712 so we need to copy it locally. */
9713 /* Note we need to sue SvCUR() on our mother_re, because it, in
9714 turn, may well be pointing to its own mother_re. */
9715 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9716 SvCUR(ret->mother_re)+1));
9717 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9719 ret->mother_re = NULL;
9723 #endif /* PERL_IN_XSUB_RE */
9728 This is the internal complement to regdupe() which is used to copy
9729 the structure pointed to by the *pprivate pointer in the regexp.
9730 This is the core version of the extension overridable cloning hook.
9731 The regexp structure being duplicated will be copied by perl prior
9732 to this and will be provided as the regexp *r argument, however
9733 with the /old/ structures pprivate pointer value. Thus this routine
9734 may override any copying normally done by perl.
9736 It returns a pointer to the new regexp_internal structure.
9740 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9743 struct regexp *const r = (struct regexp *)SvANY(rx);
9744 regexp_internal *reti;
9748 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9750 npar = r->nparens+1;
9753 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9754 Copy(ri->program, reti->program, len+1, regnode);
9757 reti->regstclass = NULL;
9761 const int count = ri->data->count;
9764 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9765 char, struct reg_data);
9766 Newx(d->what, count, U8);
9769 for (i = 0; i < count; i++) {
9770 d->what[i] = ri->data->what[i];
9771 switch (d->what[i]) {
9772 /* legal options are one of: sSfpontTu
9773 see also regcomp.h and pregfree() */
9776 case 'p': /* actually an AV, but the dup function is identical. */
9777 case 'u': /* actually an HV, but the dup function is identical. */
9778 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9781 /* This is cheating. */
9782 Newx(d->data[i], 1, struct regnode_charclass_class);
9783 StructCopy(ri->data->data[i], d->data[i],
9784 struct regnode_charclass_class);
9785 reti->regstclass = (regnode*)d->data[i];
9788 /* Compiled op trees are readonly and in shared memory,
9789 and can thus be shared without duplication. */
9791 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9795 /* Trie stclasses are readonly and can thus be shared
9796 * without duplication. We free the stclass in pregfree
9797 * when the corresponding reg_ac_data struct is freed.
9799 reti->regstclass= ri->regstclass;
9803 ((reg_trie_data*)ri->data->data[i])->refcount++;
9807 d->data[i] = ri->data->data[i];
9810 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9819 reti->name_list_idx = ri->name_list_idx;
9821 #ifdef RE_TRACK_PATTERN_OFFSETS
9822 if (ri->u.offsets) {
9823 Newx(reti->u.offsets, 2*len+1, U32);
9824 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9827 SetProgLen(reti,len);
9833 #endif /* USE_ITHREADS */
9835 #ifndef PERL_IN_XSUB_RE
9838 - regnext - dig the "next" pointer out of a node
9841 Perl_regnext(pTHX_ register regnode *p)
9844 register I32 offset;
9849 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9858 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9861 STRLEN l1 = strlen(pat1);
9862 STRLEN l2 = strlen(pat2);
9865 const char *message;
9867 PERL_ARGS_ASSERT_RE_CROAK2;
9873 Copy(pat1, buf, l1 , char);
9874 Copy(pat2, buf + l1, l2 , char);
9875 buf[l1 + l2] = '\n';
9876 buf[l1 + l2 + 1] = '\0';
9878 /* ANSI variant takes additional second argument */
9879 va_start(args, pat2);
9883 msv = vmess(buf, &args);
9885 message = SvPV_const(msv,l1);
9888 Copy(message, buf, l1 , char);
9889 buf[l1-1] = '\0'; /* Overwrite \n */
9890 Perl_croak(aTHX_ "%s", buf);
9893 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9895 #ifndef PERL_IN_XSUB_RE
9897 Perl_save_re_context(pTHX)
9901 struct re_save_state *state;
9903 SAVEVPTR(PL_curcop);
9904 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9906 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9907 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9908 SSPUSHINT(SAVEt_RE_STATE);
9910 Copy(&PL_reg_state, state, 1, struct re_save_state);
9912 PL_reg_start_tmp = 0;
9913 PL_reg_start_tmpl = 0;
9914 PL_reg_oldsaved = NULL;
9915 PL_reg_oldsavedlen = 0;
9917 PL_reg_leftiter = 0;
9918 PL_reg_poscache = NULL;
9919 PL_reg_poscache_size = 0;
9920 #ifdef PERL_OLD_COPY_ON_WRITE
9924 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9926 const REGEXP * const rx = PM_GETRE(PL_curpm);
9929 for (i = 1; i <= RX_NPARENS(rx); i++) {
9930 char digits[TYPE_CHARS(long)];
9931 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9932 GV *const *const gvp
9933 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9936 GV * const gv = *gvp;
9937 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9947 clear_re(pTHX_ void *r)
9950 ReREFCNT_dec((REGEXP *)r);
9956 S_put_byte(pTHX_ SV *sv, int c)
9958 PERL_ARGS_ASSERT_PUT_BYTE;
9960 /* Our definition of isPRINT() ignores locales, so only bytes that are
9961 not part of UTF-8 are considered printable. I assume that the same
9962 holds for UTF-EBCDIC.
9963 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9964 which Wikipedia says:
9966 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9967 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9968 identical, to the ASCII delete (DEL) or rubout control character.
9969 ) So the old condition can be simplified to !isPRINT(c) */
9971 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9973 const char string = c;
9974 if (c == '-' || c == ']' || c == '\\' || c == '^')
9975 sv_catpvs(sv, "\\");
9976 sv_catpvn(sv, &string, 1);
9981 #define CLEAR_OPTSTART \
9982 if (optstart) STMT_START { \
9983 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9987 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9989 STATIC const regnode *
9990 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9991 const regnode *last, const regnode *plast,
9992 SV* sv, I32 indent, U32 depth)
9995 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9996 register const regnode *next;
9997 const regnode *optstart= NULL;
10000 GET_RE_DEBUG_FLAGS_DECL;
10002 PERL_ARGS_ASSERT_DUMPUNTIL;
10004 #ifdef DEBUG_DUMPUNTIL
10005 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10006 last ? last-start : 0,plast ? plast-start : 0);
10009 if (plast && plast < last)
10012 while (PL_regkind[op] != END && (!last || node < last)) {
10013 /* While that wasn't END last time... */
10016 if (op == CLOSE || op == WHILEM)
10018 next = regnext((regnode *)node);
10021 if (OP(node) == OPTIMIZED) {
10022 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10029 regprop(r, sv, node);
10030 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10031 (int)(2*indent + 1), "", SvPVX_const(sv));
10033 if (OP(node) != OPTIMIZED) {
10034 if (next == NULL) /* Next ptr. */
10035 PerlIO_printf(Perl_debug_log, " (0)");
10036 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10037 PerlIO_printf(Perl_debug_log, " (FAIL)");
10039 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10040 (void)PerlIO_putc(Perl_debug_log, '\n');
10044 if (PL_regkind[(U8)op] == BRANCHJ) {
10047 register const regnode *nnode = (OP(next) == LONGJMP
10048 ? regnext((regnode *)next)
10050 if (last && nnode > last)
10052 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10055 else if (PL_regkind[(U8)op] == BRANCH) {
10057 DUMPUNTIL(NEXTOPER(node), next);
10059 else if ( PL_regkind[(U8)op] == TRIE ) {
10060 const regnode *this_trie = node;
10061 const char op = OP(node);
10062 const U32 n = ARG(node);
10063 const reg_ac_data * const ac = op>=AHOCORASICK ?
10064 (reg_ac_data *)ri->data->data[n] :
10066 const reg_trie_data * const trie =
10067 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10069 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10071 const regnode *nextbranch= NULL;
10074 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10075 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10077 PerlIO_printf(Perl_debug_log, "%*s%s ",
10078 (int)(2*(indent+3)), "",
10079 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10080 PL_colors[0], PL_colors[1],
10081 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10082 PERL_PV_PRETTY_ELLIPSES |
10083 PERL_PV_PRETTY_LTGT
10088 U16 dist= trie->jump[word_idx+1];
10089 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10090 (UV)((dist ? this_trie + dist : next) - start));
10093 nextbranch= this_trie + trie->jump[0];
10094 DUMPUNTIL(this_trie + dist, nextbranch);
10096 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10097 nextbranch= regnext((regnode *)nextbranch);
10099 PerlIO_printf(Perl_debug_log, "\n");
10102 if (last && next > last)
10107 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10108 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10109 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10111 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10113 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10115 else if ( op == PLUS || op == STAR) {
10116 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10118 else if (op == ANYOF) {
10119 /* arglen 1 + class block */
10120 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10121 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10122 node = NEXTOPER(node);
10124 else if (PL_regkind[(U8)op] == EXACT) {
10125 /* Literal string, where present. */
10126 node += NODE_SZ_STR(node) - 1;
10127 node = NEXTOPER(node);
10130 node = NEXTOPER(node);
10131 node += regarglen[(U8)op];
10133 if (op == CURLYX || op == OPEN)
10137 #ifdef DEBUG_DUMPUNTIL
10138 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10143 #endif /* DEBUGGING */
10147 * c-indentation-style: bsd
10148 * c-basic-offset: 4
10149 * indent-tabs-mode: t
10152 * ex: set ts=8 sts=4 sw=4 noet: