5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *charnames; /* cache of named sequences */
136 HV *paren_names; /* Paren names */
138 regnode **recurse; /* Recurse regops */
139 I32 recurse_count; /* Number of recurse regops */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
145 const char *lastparse;
147 AV *paren_name_list; /* idx -> name */
148 #define RExC_lastparse (pRExC_state->lastparse)
149 #define RExC_lastnum (pRExC_state->lastnum)
150 #define RExC_paren_name_list (pRExC_state->paren_name_list)
154 #define RExC_flags (pRExC_state->flags)
155 #define RExC_precomp (pRExC_state->precomp)
156 #define RExC_rx_sv (pRExC_state->rx_sv)
157 #define RExC_rx (pRExC_state->rx)
158 #define RExC_rxi (pRExC_state->rxi)
159 #define RExC_start (pRExC_state->start)
160 #define RExC_end (pRExC_state->end)
161 #define RExC_parse (pRExC_state->parse)
162 #define RExC_whilem_seen (pRExC_state->whilem_seen)
163 #ifdef RE_TRACK_PATTERN_OFFSETS
164 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166 #define RExC_emit (pRExC_state->emit)
167 #define RExC_emit_start (pRExC_state->emit_start)
168 #define RExC_emit_bound (pRExC_state->emit_bound)
169 #define RExC_naughty (pRExC_state->naughty)
170 #define RExC_sawback (pRExC_state->sawback)
171 #define RExC_seen (pRExC_state->seen)
172 #define RExC_size (pRExC_state->size)
173 #define RExC_npar (pRExC_state->npar)
174 #define RExC_nestroot (pRExC_state->nestroot)
175 #define RExC_extralen (pRExC_state->extralen)
176 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
177 #define RExC_seen_evals (pRExC_state->seen_evals)
178 #define RExC_utf8 (pRExC_state->utf8)
179 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
180 #define RExC_charnames (pRExC_state->charnames)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
201 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
202 #define SPSTART 0x04 /* Starts with * or +. */
203 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
204 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
206 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
208 /* whether trie related optimizations are enabled */
209 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
210 #define TRIE_STUDY_OPT
211 #define FULL_TRIE_STUDY
217 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
218 #define PBITVAL(paren) (1 << ((paren) & 7))
219 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
220 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
221 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
224 /* About scan_data_t.
226 During optimisation we recurse through the regexp program performing
227 various inplace (keyhole style) optimisations. In addition study_chunk
228 and scan_commit populate this data structure with information about
229 what strings MUST appear in the pattern. We look for the longest
230 string that must appear for at a fixed location, and we look for the
231 longest string that may appear at a floating location. So for instance
236 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
237 strings (because they follow a .* construct). study_chunk will identify
238 both FOO and BAR as being the longest fixed and floating strings respectively.
240 The strings can be composites, for instance
244 will result in a composite fixed substring 'foo'.
246 For each string some basic information is maintained:
248 - offset or min_offset
249 This is the position the string must appear at, or not before.
250 It also implicitly (when combined with minlenp) tells us how many
251 character must match before the string we are searching.
252 Likewise when combined with minlenp and the length of the string
253 tells us how many characters must appear after the string we have
257 Only used for floating strings. This is the rightmost point that
258 the string can appear at. Ifset to I32 max it indicates that the
259 string can occur infinitely far to the right.
262 A pointer to the minimum length of the pattern that the string
263 was found inside. This is important as in the case of positive
264 lookahead or positive lookbehind we can have multiple patterns
269 The minimum length of the pattern overall is 3, the minimum length
270 of the lookahead part is 3, but the minimum length of the part that
271 will actually match is 1. So 'FOO's minimum length is 3, but the
272 minimum length for the F is 1. This is important as the minimum length
273 is used to determine offsets in front of and behind the string being
274 looked for. Since strings can be composites this is the length of the
275 pattern at the time it was commited with a scan_commit. Note that
276 the length is calculated by study_chunk, so that the minimum lengths
277 are not known until the full pattern has been compiled, thus the
278 pointer to the value.
282 In the case of lookbehind the string being searched for can be
283 offset past the start point of the final matching string.
284 If this value was just blithely removed from the min_offset it would
285 invalidate some of the calculations for how many chars must match
286 before or after (as they are derived from min_offset and minlen and
287 the length of the string being searched for).
288 When the final pattern is compiled and the data is moved from the
289 scan_data_t structure into the regexp structure the information
290 about lookbehind is factored in, with the information that would
291 have been lost precalculated in the end_shift field for the
294 The fields pos_min and pos_delta are used to store the minimum offset
295 and the delta to the maximum offset at the current point in the pattern.
299 typedef struct scan_data_t {
300 /*I32 len_min; unused */
301 /*I32 len_delta; unused */
305 I32 last_end; /* min value, <0 unless valid. */
308 SV **longest; /* Either &l_fixed, or &l_float. */
309 SV *longest_fixed; /* longest fixed string found in pattern */
310 I32 offset_fixed; /* offset where it starts */
311 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
313 SV *longest_float; /* longest floating string found in pattern */
314 I32 offset_float_min; /* earliest point in string it can appear */
315 I32 offset_float_max; /* latest point in string it can appear */
316 I32 *minlen_float; /* pointer to the minlen relevent to the string */
317 I32 lookbehind_float; /* is the position of the string modified by LB */
321 struct regnode_charclass_class *start_class;
325 * Forward declarations for pregcomp()'s friends.
328 static const scan_data_t zero_scan_data =
329 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
331 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
332 #define SF_BEFORE_SEOL 0x0001
333 #define SF_BEFORE_MEOL 0x0002
334 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
335 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
338 # define SF_FIX_SHIFT_EOL (0+2)
339 # define SF_FL_SHIFT_EOL (0+4)
341 # define SF_FIX_SHIFT_EOL (+2)
342 # define SF_FL_SHIFT_EOL (+4)
345 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
348 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
349 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
350 #define SF_IS_INF 0x0040
351 #define SF_HAS_PAR 0x0080
352 #define SF_IN_PAR 0x0100
353 #define SF_HAS_EVAL 0x0200
354 #define SCF_DO_SUBSTR 0x0400
355 #define SCF_DO_STCLASS_AND 0x0800
356 #define SCF_DO_STCLASS_OR 0x1000
357 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
358 #define SCF_WHILEM_VISITED_POS 0x2000
360 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
361 #define SCF_SEEN_ACCEPT 0x8000
363 #define UTF (RExC_utf8 != 0)
364 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
365 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
367 #define OOB_UNICODE 12345678
368 #define OOB_NAMEDCLASS -1
370 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
371 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
374 /* length of regex to show in messages that don't mark a position within */
375 #define RegexLengthToShowInErrorMessages 127
378 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
379 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
380 * op/pragma/warn/regcomp.
382 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
383 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
385 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
388 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
389 * arg. Show regex, up to a maximum length. If it's too long, chop and add
392 #define _FAIL(code) STMT_START { \
393 const char *ellipses = ""; \
394 IV len = RExC_end - RExC_precomp; \
397 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
398 if (len > RegexLengthToShowInErrorMessages) { \
399 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
400 len = RegexLengthToShowInErrorMessages - 10; \
406 #define FAIL(msg) _FAIL( \
407 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
408 msg, (int)len, RExC_precomp, ellipses))
410 #define FAIL2(msg,arg) _FAIL( \
411 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
412 arg, (int)len, RExC_precomp, ellipses))
415 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
417 #define Simple_vFAIL(m) STMT_START { \
418 const IV offset = RExC_parse - RExC_precomp; \
419 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
420 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
426 #define vFAIL(m) STMT_START { \
428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
433 * Like Simple_vFAIL(), but accepts two arguments.
435 #define Simple_vFAIL2(m,a1) STMT_START { \
436 const IV offset = RExC_parse - RExC_precomp; \
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
444 #define vFAIL2(m,a1) STMT_START { \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
447 Simple_vFAIL2(m, a1); \
452 * Like Simple_vFAIL(), but accepts three arguments.
454 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
455 const IV offset = RExC_parse - RExC_precomp; \
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
463 #define vFAIL3(m,a1,a2) STMT_START { \
465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
466 Simple_vFAIL3(m, a1, a2); \
470 * Like Simple_vFAIL(), but accepts four arguments.
472 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
475 (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define ckWARNreg(loc,m) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define ckWARNregdep(loc,m) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
491 #define ckWARN2reg(loc, m, a1) STMT_START { \
492 const IV offset = loc - RExC_precomp; \
493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
497 #define vWARN3(loc, m, a1, a2) STMT_START { \
498 const IV offset = loc - RExC_precomp; \
499 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
509 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
510 const IV offset = loc - RExC_precomp; \
511 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
512 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
515 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
516 const IV offset = loc - RExC_precomp; \
517 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
518 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
528 /* Allow for side effects in s */
529 #define REGC(c,s) STMT_START { \
530 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
533 /* Macros for recording node offsets. 20001227 mjd@plover.com
534 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
535 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
536 * Element 0 holds the number n.
537 * Position is 1 indexed.
539 #ifndef RE_TRACK_PATTERN_OFFSETS
540 #define Set_Node_Offset_To_R(node,byte)
541 #define Set_Node_Offset(node,byte)
542 #define Set_Cur_Node_Offset
543 #define Set_Node_Length_To_R(node,len)
544 #define Set_Node_Length(node,len)
545 #define Set_Node_Cur_Length(node)
546 #define Node_Offset(n)
547 #define Node_Length(n)
548 #define Set_Node_Offset_Length(node,offset,len)
549 #define ProgLen(ri) ri->u.proglen
550 #define SetProgLen(ri,x) ri->u.proglen = x
552 #define ProgLen(ri) ri->u.offsets[0]
553 #define SetProgLen(ri,x) ri->u.offsets[0] = x
554 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
556 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
557 __LINE__, (int)(node), (int)(byte))); \
559 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
561 RExC_offsets[2*(node)-1] = (byte); \
566 #define Set_Node_Offset(node,byte) \
567 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
568 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
570 #define Set_Node_Length_To_R(node,len) STMT_START { \
572 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
573 __LINE__, (int)(node), (int)(len))); \
575 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
577 RExC_offsets[2*(node)] = (len); \
582 #define Set_Node_Length(node,len) \
583 Set_Node_Length_To_R((node)-RExC_emit_start, len)
584 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
585 #define Set_Node_Cur_Length(node) \
586 Set_Node_Length(node, RExC_parse - parse_start)
588 /* Get offsets and lengths */
589 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
590 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
592 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
593 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
594 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
598 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
599 #define EXPERIMENTAL_INPLACESCAN
600 #endif /*RE_TRACK_PATTERN_OFFSETS*/
602 #define DEBUG_STUDYDATA(str,data,depth) \
603 DEBUG_OPTIMISE_MORE_r(if(data){ \
604 PerlIO_printf(Perl_debug_log, \
605 "%*s" str "Pos:%"IVdf"/%"IVdf \
606 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
607 (int)(depth)*2, "", \
608 (IV)((data)->pos_min), \
609 (IV)((data)->pos_delta), \
610 (UV)((data)->flags), \
611 (IV)((data)->whilem_c), \
612 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
613 is_inf ? "INF " : "" \
615 if ((data)->last_found) \
616 PerlIO_printf(Perl_debug_log, \
617 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
618 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
619 SvPVX_const((data)->last_found), \
620 (IV)((data)->last_end), \
621 (IV)((data)->last_start_min), \
622 (IV)((data)->last_start_max), \
623 ((data)->longest && \
624 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
625 SvPVX_const((data)->longest_fixed), \
626 (IV)((data)->offset_fixed), \
627 ((data)->longest && \
628 (data)->longest==&((data)->longest_float)) ? "*" : "", \
629 SvPVX_const((data)->longest_float), \
630 (IV)((data)->offset_float_min), \
631 (IV)((data)->offset_float_max) \
633 PerlIO_printf(Perl_debug_log,"\n"); \
636 static void clear_re(pTHX_ void *r);
638 /* Mark that we cannot extend a found fixed substring at this point.
639 Update the longest found anchored substring and the longest found
640 floating substrings if needed. */
643 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
645 const STRLEN l = CHR_SVLEN(data->last_found);
646 const STRLEN old_l = CHR_SVLEN(*data->longest);
647 GET_RE_DEBUG_FLAGS_DECL;
649 PERL_ARGS_ASSERT_SCAN_COMMIT;
651 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
652 SvSetMagicSV(*data->longest, data->last_found);
653 if (*data->longest == data->longest_fixed) {
654 data->offset_fixed = l ? data->last_start_min : data->pos_min;
655 if (data->flags & SF_BEFORE_EOL)
657 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
659 data->flags &= ~SF_FIX_BEFORE_EOL;
660 data->minlen_fixed=minlenp;
661 data->lookbehind_fixed=0;
663 else { /* *data->longest == data->longest_float */
664 data->offset_float_min = l ? data->last_start_min : data->pos_min;
665 data->offset_float_max = (l
666 ? data->last_start_max
667 : data->pos_min + data->pos_delta);
668 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
669 data->offset_float_max = I32_MAX;
670 if (data->flags & SF_BEFORE_EOL)
672 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
674 data->flags &= ~SF_FL_BEFORE_EOL;
675 data->minlen_float=minlenp;
676 data->lookbehind_float=0;
679 SvCUR_set(data->last_found, 0);
681 SV * const sv = data->last_found;
682 if (SvUTF8(sv) && SvMAGICAL(sv)) {
683 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
689 data->flags &= ~SF_BEFORE_EOL;
690 DEBUG_STUDYDATA("commit: ",data,0);
693 /* Can match anything (initialization) */
695 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
697 PERL_ARGS_ASSERT_CL_ANYTHING;
699 ANYOF_CLASS_ZERO(cl);
700 ANYOF_BITMAP_SETALL(cl);
701 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
703 cl->flags |= ANYOF_LOCALE;
706 /* Can match anything (initialization) */
708 S_cl_is_anything(const struct regnode_charclass_class *cl)
712 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
714 for (value = 0; value <= ANYOF_MAX; value += 2)
715 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
717 if (!(cl->flags & ANYOF_UNICODE_ALL))
719 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
724 /* Can match anything (initialization) */
726 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
728 PERL_ARGS_ASSERT_CL_INIT;
730 Zero(cl, 1, struct regnode_charclass_class);
732 cl_anything(pRExC_state, cl);
736 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
738 PERL_ARGS_ASSERT_CL_INIT_ZERO;
740 Zero(cl, 1, struct regnode_charclass_class);
742 cl_anything(pRExC_state, cl);
744 cl->flags |= ANYOF_LOCALE;
747 /* 'And' a given class with another one. Can create false positives */
748 /* We assume that cl is not inverted */
750 S_cl_and(struct regnode_charclass_class *cl,
751 const struct regnode_charclass_class *and_with)
753 PERL_ARGS_ASSERT_CL_AND;
755 assert(and_with->type == ANYOF);
756 if (!(and_with->flags & ANYOF_CLASS)
757 && !(cl->flags & ANYOF_CLASS)
758 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
759 && !(and_with->flags & ANYOF_FOLD)
760 && !(cl->flags & ANYOF_FOLD)) {
763 if (and_with->flags & ANYOF_INVERT)
764 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
765 cl->bitmap[i] &= ~and_with->bitmap[i];
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] &= and_with->bitmap[i];
769 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
770 if (!(and_with->flags & ANYOF_EOS))
771 cl->flags &= ~ANYOF_EOS;
773 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
774 !(and_with->flags & ANYOF_INVERT)) {
775 cl->flags &= ~ANYOF_UNICODE_ALL;
776 cl->flags |= ANYOF_UNICODE;
777 ARG_SET(cl, ARG(and_with));
779 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
780 !(and_with->flags & ANYOF_INVERT))
781 cl->flags &= ~ANYOF_UNICODE_ALL;
782 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
783 !(and_with->flags & ANYOF_INVERT))
784 cl->flags &= ~ANYOF_UNICODE;
787 /* 'OR' a given class with another one. Can create false positives */
788 /* We assume that cl is not inverted */
790 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
792 PERL_ARGS_ASSERT_CL_OR;
794 if (or_with->flags & ANYOF_INVERT) {
796 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
797 * <= (B1 | !B2) | (CL1 | !CL2)
798 * which is wasteful if CL2 is small, but we ignore CL2:
799 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
800 * XXXX Can we handle case-fold? Unclear:
801 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
802 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(or_with->flags & ANYOF_FOLD)
806 && !(cl->flags & ANYOF_FOLD) ) {
809 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
810 cl->bitmap[i] |= ~or_with->bitmap[i];
811 } /* XXXX: logic is complicated otherwise */
813 cl_anything(pRExC_state, cl);
816 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
817 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
818 && (!(or_with->flags & ANYOF_FOLD)
819 || (cl->flags & ANYOF_FOLD)) ) {
822 /* OR char bitmap and class bitmap separately */
823 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
824 cl->bitmap[i] |= or_with->bitmap[i];
825 if (or_with->flags & ANYOF_CLASS) {
826 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
827 cl->classflags[i] |= or_with->classflags[i];
828 cl->flags |= ANYOF_CLASS;
831 else { /* XXXX: logic is complicated, leave it along for a moment. */
832 cl_anything(pRExC_state, cl);
835 if (or_with->flags & ANYOF_EOS)
836 cl->flags |= ANYOF_EOS;
838 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
839 ARG(cl) != ARG(or_with)) {
840 cl->flags |= ANYOF_UNICODE_ALL;
841 cl->flags &= ~ANYOF_UNICODE;
843 if (or_with->flags & ANYOF_UNICODE_ALL) {
844 cl->flags |= ANYOF_UNICODE_ALL;
845 cl->flags &= ~ANYOF_UNICODE;
849 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
850 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
851 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
852 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
857 dump_trie(trie,widecharmap,revcharmap)
858 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
859 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
861 These routines dump out a trie in a somewhat readable format.
862 The _interim_ variants are used for debugging the interim
863 tables that are used to generate the final compressed
864 representation which is what dump_trie expects.
866 Part of the reason for their existance is to provide a form
867 of documentation as to how the different representations function.
872 Dumps the final compressed table form of the trie to Perl_debug_log.
873 Used for debugging make_trie().
877 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
878 AV *revcharmap, U32 depth)
881 SV *sv=sv_newmortal();
882 int colwidth= widecharmap ? 6 : 4;
883 GET_RE_DEBUG_FLAGS_DECL;
885 PERL_ARGS_ASSERT_DUMP_TRIE;
887 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
888 (int)depth * 2 + 2,"",
889 "Match","Base","Ofs" );
891 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
892 SV ** const tmp = av_fetch( revcharmap, state, 0);
894 PerlIO_printf( Perl_debug_log, "%*s",
896 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
897 PL_colors[0], PL_colors[1],
898 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
899 PERL_PV_ESCAPE_FIRSTCHAR
904 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
905 (int)depth * 2 + 2,"");
907 for( state = 0 ; state < trie->uniquecharcount ; state++ )
908 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
909 PerlIO_printf( Perl_debug_log, "\n");
911 for( state = 1 ; state < trie->statecount ; state++ ) {
912 const U32 base = trie->states[ state ].trans.base;
914 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
916 if ( trie->states[ state ].wordnum ) {
917 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
919 PerlIO_printf( Perl_debug_log, "%6s", "" );
922 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
927 while( ( base + ofs < trie->uniquecharcount ) ||
928 ( base + ofs - trie->uniquecharcount < trie->lasttrans
929 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
932 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
934 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
935 if ( ( base + ofs >= trie->uniquecharcount ) &&
936 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
937 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
939 PerlIO_printf( Perl_debug_log, "%*"UVXf,
941 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
943 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
947 PerlIO_printf( Perl_debug_log, "]");
950 PerlIO_printf( Perl_debug_log, "\n" );
954 Dumps a fully constructed but uncompressed trie in list form.
955 List tries normally only are used for construction when the number of
956 possible chars (trie->uniquecharcount) is very high.
957 Used for debugging make_trie().
960 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
965 SV *sv=sv_newmortal();
966 int colwidth= widecharmap ? 6 : 4;
967 GET_RE_DEBUG_FLAGS_DECL;
969 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
971 /* print out the table precompression. */
972 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
973 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
974 "------:-----+-----------------\n" );
976 for( state=1 ; state < next_alloc ; state ++ ) {
979 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
980 (int)depth * 2 + 2,"", (UV)state );
981 if ( ! trie->states[ state ].wordnum ) {
982 PerlIO_printf( Perl_debug_log, "%5s| ","");
984 PerlIO_printf( Perl_debug_log, "W%4x| ",
985 trie->states[ state ].wordnum
988 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
989 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
991 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
993 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
994 PL_colors[0], PL_colors[1],
995 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
996 PERL_PV_ESCAPE_FIRSTCHAR
998 TRIE_LIST_ITEM(state,charid).forid,
999 (UV)TRIE_LIST_ITEM(state,charid).newstate
1002 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1003 (int)((depth * 2) + 14), "");
1006 PerlIO_printf( Perl_debug_log, "\n");
1011 Dumps a fully constructed but uncompressed trie in table form.
1012 This is the normal DFA style state transition table, with a few
1013 twists to facilitate compression later.
1014 Used for debugging make_trie().
1017 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1018 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1023 SV *sv=sv_newmortal();
1024 int colwidth= widecharmap ? 6 : 4;
1025 GET_RE_DEBUG_FLAGS_DECL;
1027 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1030 print out the table precompression so that we can do a visual check
1031 that they are identical.
1034 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1036 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1037 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1039 PerlIO_printf( Perl_debug_log, "%*s",
1041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1042 PL_colors[0], PL_colors[1],
1043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044 PERL_PV_ESCAPE_FIRSTCHAR
1050 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1052 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1053 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1056 PerlIO_printf( Perl_debug_log, "\n" );
1058 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1060 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1061 (int)depth * 2 + 2,"",
1062 (UV)TRIE_NODENUM( state ) );
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1065 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1067 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1069 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1071 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1074 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1075 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1082 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1083 startbranch: the first branch in the whole branch sequence
1084 first : start branch of sequence of branch-exact nodes.
1085 May be the same as startbranch
1086 last : Thing following the last branch.
1087 May be the same as tail.
1088 tail : item following the branch sequence
1089 count : words in the sequence
1090 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1091 depth : indent depth
1093 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1095 A trie is an N'ary tree where the branches are determined by digital
1096 decomposition of the key. IE, at the root node you look up the 1st character and
1097 follow that branch repeat until you find the end of the branches. Nodes can be
1098 marked as "accepting" meaning they represent a complete word. Eg:
1102 would convert into the following structure. Numbers represent states, letters
1103 following numbers represent valid transitions on the letter from that state, if
1104 the number is in square brackets it represents an accepting state, otherwise it
1105 will be in parenthesis.
1107 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1111 (1) +-i->(6)-+-s->[7]
1113 +-s->(3)-+-h->(4)-+-e->[5]
1115 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1117 This shows that when matching against the string 'hers' we will begin at state 1
1118 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1119 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1120 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1121 single traverse. We store a mapping from accepting to state to which word was
1122 matched, and then when we have multiple possibilities we try to complete the
1123 rest of the regex in the order in which they occured in the alternation.
1125 The only prior NFA like behaviour that would be changed by the TRIE support is
1126 the silent ignoring of duplicate alternations which are of the form:
1128 / (DUPE|DUPE) X? (?{ ... }) Y /x
1130 Thus EVAL blocks follwing a trie may be called a different number of times with
1131 and without the optimisation. With the optimisations dupes will be silently
1132 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1133 the following demonstrates:
1135 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1137 which prints out 'word' three times, but
1139 'words'=~/(word|word|word)(?{ print $1 })S/
1141 which doesnt print it out at all. This is due to other optimisations kicking in.
1143 Example of what happens on a structural level:
1145 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1147 1: CURLYM[1] {1,32767}(18)
1158 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1159 and should turn into:
1161 1: CURLYM[1] {1,32767}(18)
1163 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1171 Cases where tail != last would be like /(?foo|bar)baz/:
1181 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1182 and would end up looking like:
1185 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1192 d = uvuni_to_utf8_flags(d, uv, 0);
1194 is the recommended Unicode-aware way of saying
1199 #define TRIE_STORE_REVCHAR \
1202 SV *zlopp = newSV(2); \
1203 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1204 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1205 SvCUR_set(zlopp, kapow - flrbbbbb); \
1208 av_push(revcharmap, zlopp); \
1210 char ooooff = (char)uvc; \
1211 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1215 #define TRIE_READ_CHAR STMT_START { \
1219 if ( foldlen > 0 ) { \
1220 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1225 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1226 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1227 foldlen -= UNISKIP( uvc ); \
1228 scan = foldbuf + UNISKIP( uvc ); \
1231 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1241 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1242 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1243 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1244 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1246 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1247 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1248 TRIE_LIST_CUR( state )++; \
1251 #define TRIE_LIST_NEW(state) STMT_START { \
1252 Newxz( trie->states[ state ].trans.list, \
1253 4, reg_trie_trans_le ); \
1254 TRIE_LIST_CUR( state ) = 1; \
1255 TRIE_LIST_LEN( state ) = 4; \
1258 #define TRIE_HANDLE_WORD(state) STMT_START { \
1259 U16 dupe= trie->states[ state ].wordnum; \
1260 regnode * const noper_next = regnext( noper ); \
1262 if (trie->wordlen) \
1263 trie->wordlen[ curword ] = wordlen; \
1265 /* store the word for dumping */ \
1267 if (OP(noper) != NOTHING) \
1268 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1270 tmp = newSVpvn_utf8( "", 0, UTF ); \
1271 av_push( trie_words, tmp ); \
1276 if ( noper_next < tail ) { \
1278 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1279 trie->jump[curword] = (U16)(noper_next - convert); \
1281 jumper = noper_next; \
1283 nextbranch= regnext(cur); \
1287 /* So it's a dupe. This means we need to maintain a */\
1288 /* linked-list from the first to the next. */\
1289 /* we only allocate the nextword buffer when there */\
1290 /* a dupe, so first time we have to do the allocation */\
1291 if (!trie->nextword) \
1292 trie->nextword = (U16 *) \
1293 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1294 while ( trie->nextword[dupe] ) \
1295 dupe= trie->nextword[dupe]; \
1296 trie->nextword[dupe]= curword; \
1298 /* we haven't inserted this word yet. */ \
1299 trie->states[ state ].wordnum = curword; \
1304 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1305 ( ( base + charid >= ucharcount \
1306 && base + charid < ubound \
1307 && state == trie->trans[ base - ucharcount + charid ].check \
1308 && trie->trans[ base - ucharcount + charid ].next ) \
1309 ? trie->trans[ base - ucharcount + charid ].next \
1310 : ( state==1 ? special : 0 ) \
1314 #define MADE_JUMP_TRIE 2
1315 #define MADE_EXACT_TRIE 4
1318 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1321 /* first pass, loop through and scan words */
1322 reg_trie_data *trie;
1323 HV *widecharmap = NULL;
1324 AV *revcharmap = newAV();
1326 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1331 regnode *jumper = NULL;
1332 regnode *nextbranch = NULL;
1333 regnode *convert = NULL;
1334 /* we just use folder as a flag in utf8 */
1335 const U8 * const folder = ( flags == EXACTF
1337 : ( flags == EXACTFL
1344 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1345 AV *trie_words = NULL;
1346 /* along with revcharmap, this only used during construction but both are
1347 * useful during debugging so we store them in the struct when debugging.
1350 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1351 STRLEN trie_charcount=0;
1353 SV *re_trie_maxbuff;
1354 GET_RE_DEBUG_FLAGS_DECL;
1356 PERL_ARGS_ASSERT_MAKE_TRIE;
1358 PERL_UNUSED_ARG(depth);
1361 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1363 trie->startstate = 1;
1364 trie->wordcount = word_count;
1365 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1366 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1367 if (!(UTF && folder))
1368 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1370 trie_words = newAV();
1373 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1374 if (!SvIOK(re_trie_maxbuff)) {
1375 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1378 PerlIO_printf( Perl_debug_log,
1379 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1380 (int)depth * 2 + 2, "",
1381 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1382 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1386 /* Find the node we are going to overwrite */
1387 if ( first == startbranch && OP( last ) != BRANCH ) {
1388 /* whole branch chain */
1391 /* branch sub-chain */
1392 convert = NEXTOPER( first );
1395 /* -- First loop and Setup --
1397 We first traverse the branches and scan each word to determine if it
1398 contains widechars, and how many unique chars there are, this is
1399 important as we have to build a table with at least as many columns as we
1402 We use an array of integers to represent the character codes 0..255
1403 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1404 native representation of the character value as the key and IV's for the
1407 *TODO* If we keep track of how many times each character is used we can
1408 remap the columns so that the table compression later on is more
1409 efficient in terms of memory by ensuring most common value is in the
1410 middle and the least common are on the outside. IMO this would be better
1411 than a most to least common mapping as theres a decent chance the most
1412 common letter will share a node with the least common, meaning the node
1413 will not be compressable. With a middle is most common approach the worst
1414 case is when we have the least common nodes twice.
1418 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1419 regnode * const noper = NEXTOPER( cur );
1420 const U8 *uc = (U8*)STRING( noper );
1421 const U8 * const e = uc + STR_LEN( noper );
1423 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1424 const U8 *scan = (U8*)NULL;
1425 U32 wordlen = 0; /* required init */
1427 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1429 if (OP(noper) == NOTHING) {
1433 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1434 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1435 regardless of encoding */
1437 for ( ; uc < e ; uc += len ) {
1438 TRIE_CHARCOUNT(trie)++;
1442 if ( !trie->charmap[ uvc ] ) {
1443 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1445 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1449 /* store the codepoint in the bitmap, and if its ascii
1450 also store its folded equivelent. */
1451 TRIE_BITMAP_SET(trie,uvc);
1453 /* store the folded codepoint */
1454 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1457 /* store first byte of utf8 representation of
1458 codepoints in the 127 < uvc < 256 range */
1459 if (127 < uvc && uvc < 192) {
1460 TRIE_BITMAP_SET(trie,194);
1461 } else if (191 < uvc ) {
1462 TRIE_BITMAP_SET(trie,195);
1463 /* && uvc < 256 -- we know uvc is < 256 already */
1466 set_bit = 0; /* We've done our bit :-) */
1471 widecharmap = newHV();
1473 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1476 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1478 if ( !SvTRUE( *svpp ) ) {
1479 sv_setiv( *svpp, ++trie->uniquecharcount );
1484 if( cur == first ) {
1487 } else if (chars < trie->minlen) {
1489 } else if (chars > trie->maxlen) {
1493 } /* end first pass */
1494 DEBUG_TRIE_COMPILE_r(
1495 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1496 (int)depth * 2 + 2,"",
1497 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1498 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1499 (int)trie->minlen, (int)trie->maxlen )
1501 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1504 We now know what we are dealing with in terms of unique chars and
1505 string sizes so we can calculate how much memory a naive
1506 representation using a flat table will take. If it's over a reasonable
1507 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1508 conservative but potentially much slower representation using an array
1511 At the end we convert both representations into the same compressed
1512 form that will be used in regexec.c for matching with. The latter
1513 is a form that cannot be used to construct with but has memory
1514 properties similar to the list form and access properties similar
1515 to the table form making it both suitable for fast searches and
1516 small enough that its feasable to store for the duration of a program.
1518 See the comment in the code where the compressed table is produced
1519 inplace from the flat tabe representation for an explanation of how
1520 the compression works.
1525 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1527 Second Pass -- Array Of Lists Representation
1529 Each state will be represented by a list of charid:state records
1530 (reg_trie_trans_le) the first such element holds the CUR and LEN
1531 points of the allocated array. (See defines above).
1533 We build the initial structure using the lists, and then convert
1534 it into the compressed table form which allows faster lookups
1535 (but cant be modified once converted).
1538 STRLEN transcount = 1;
1540 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1541 "%*sCompiling trie using list compiler\n",
1542 (int)depth * 2 + 2, ""));
1544 trie->states = (reg_trie_state *)
1545 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1546 sizeof(reg_trie_state) );
1550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1552 regnode * const noper = NEXTOPER( cur );
1553 U8 *uc = (U8*)STRING( noper );
1554 const U8 * const e = uc + STR_LEN( noper );
1555 U32 state = 1; /* required init */
1556 U16 charid = 0; /* sanity init */
1557 U8 *scan = (U8*)NULL; /* sanity init */
1558 STRLEN foldlen = 0; /* required init */
1559 U32 wordlen = 0; /* required init */
1560 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1562 if (OP(noper) != NOTHING) {
1563 for ( ; uc < e ; uc += len ) {
1568 charid = trie->charmap[ uvc ];
1570 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1574 charid=(U16)SvIV( *svpp );
1577 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1584 if ( !trie->states[ state ].trans.list ) {
1585 TRIE_LIST_NEW( state );
1587 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1588 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1589 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1594 newstate = next_alloc++;
1595 TRIE_LIST_PUSH( state, charid, newstate );
1600 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1604 TRIE_HANDLE_WORD(state);
1606 } /* end second pass */
1608 /* next alloc is the NEXT state to be allocated */
1609 trie->statecount = next_alloc;
1610 trie->states = (reg_trie_state *)
1611 PerlMemShared_realloc( trie->states,
1613 * sizeof(reg_trie_state) );
1615 /* and now dump it out before we compress it */
1616 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1617 revcharmap, next_alloc,
1621 trie->trans = (reg_trie_trans *)
1622 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1629 for( state=1 ; state < next_alloc ; state ++ ) {
1633 DEBUG_TRIE_COMPILE_MORE_r(
1634 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1638 if (trie->states[state].trans.list) {
1639 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1643 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1644 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1645 if ( forid < minid ) {
1647 } else if ( forid > maxid ) {
1651 if ( transcount < tp + maxid - minid + 1) {
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_realloc( trie->trans,
1656 * sizeof(reg_trie_trans) );
1657 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1659 base = trie->uniquecharcount + tp - minid;
1660 if ( maxid == minid ) {
1662 for ( ; zp < tp ; zp++ ) {
1663 if ( ! trie->trans[ zp ].next ) {
1664 base = trie->uniquecharcount + zp - minid;
1665 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1666 trie->trans[ zp ].check = state;
1672 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1673 trie->trans[ tp ].check = state;
1678 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1679 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1680 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1681 trie->trans[ tid ].check = state;
1683 tp += ( maxid - minid + 1 );
1685 Safefree(trie->states[ state ].trans.list);
1688 DEBUG_TRIE_COMPILE_MORE_r(
1689 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1692 trie->states[ state ].trans.base=base;
1694 trie->lasttrans = tp + 1;
1698 Second Pass -- Flat Table Representation.
1700 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1701 We know that we will need Charcount+1 trans at most to store the data
1702 (one row per char at worst case) So we preallocate both structures
1703 assuming worst case.
1705 We then construct the trie using only the .next slots of the entry
1708 We use the .check field of the first entry of the node temporarily to
1709 make compression both faster and easier by keeping track of how many non
1710 zero fields are in the node.
1712 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1715 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1716 number representing the first entry of the node, and state as a
1717 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1718 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1719 are 2 entrys per node. eg:
1727 The table is internally in the right hand, idx form. However as we also
1728 have to deal with the states array which is indexed by nodenum we have to
1729 use TRIE_NODENUM() to convert.
1732 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1733 "%*sCompiling trie using table compiler\n",
1734 (int)depth * 2 + 2, ""));
1736 trie->trans = (reg_trie_trans *)
1737 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1738 * trie->uniquecharcount + 1,
1739 sizeof(reg_trie_trans) );
1740 trie->states = (reg_trie_state *)
1741 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1742 sizeof(reg_trie_state) );
1743 next_alloc = trie->uniquecharcount + 1;
1746 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1748 regnode * const noper = NEXTOPER( cur );
1749 const U8 *uc = (U8*)STRING( noper );
1750 const U8 * const e = uc + STR_LEN( noper );
1752 U32 state = 1; /* required init */
1754 U16 charid = 0; /* sanity init */
1755 U32 accept_state = 0; /* sanity init */
1756 U8 *scan = (U8*)NULL; /* sanity init */
1758 STRLEN foldlen = 0; /* required init */
1759 U32 wordlen = 0; /* required init */
1760 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762 if ( OP(noper) != NOTHING ) {
1763 for ( ; uc < e ; uc += len ) {
1768 charid = trie->charmap[ uvc ];
1770 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1771 charid = svpp ? (U16)SvIV(*svpp) : 0;
1775 if ( !trie->trans[ state + charid ].next ) {
1776 trie->trans[ state + charid ].next = next_alloc;
1777 trie->trans[ state ].check++;
1778 next_alloc += trie->uniquecharcount;
1780 state = trie->trans[ state + charid ].next;
1782 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1784 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1787 accept_state = TRIE_NODENUM( state );
1788 TRIE_HANDLE_WORD(accept_state);
1790 } /* end second pass */
1792 /* and now dump it out before we compress it */
1793 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1795 next_alloc, depth+1));
1799 * Inplace compress the table.*
1801 For sparse data sets the table constructed by the trie algorithm will
1802 be mostly 0/FAIL transitions or to put it another way mostly empty.
1803 (Note that leaf nodes will not contain any transitions.)
1805 This algorithm compresses the tables by eliminating most such
1806 transitions, at the cost of a modest bit of extra work during lookup:
1808 - Each states[] entry contains a .base field which indicates the
1809 index in the state[] array wheres its transition data is stored.
1811 - If .base is 0 there are no valid transitions from that node.
1813 - If .base is nonzero then charid is added to it to find an entry in
1816 -If trans[states[state].base+charid].check!=state then the
1817 transition is taken to be a 0/Fail transition. Thus if there are fail
1818 transitions at the front of the node then the .base offset will point
1819 somewhere inside the previous nodes data (or maybe even into a node
1820 even earlier), but the .check field determines if the transition is
1824 The following process inplace converts the table to the compressed
1825 table: We first do not compress the root node 1,and mark its all its
1826 .check pointers as 1 and set its .base pointer as 1 as well. This
1827 allows to do a DFA construction from the compressed table later, and
1828 ensures that any .base pointers we calculate later are greater than
1831 - We set 'pos' to indicate the first entry of the second node.
1833 - We then iterate over the columns of the node, finding the first and
1834 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1835 and set the .check pointers accordingly, and advance pos
1836 appropriately and repreat for the next node. Note that when we copy
1837 the next pointers we have to convert them from the original
1838 NODEIDX form to NODENUM form as the former is not valid post
1841 - If a node has no transitions used we mark its base as 0 and do not
1842 advance the pos pointer.
1844 - If a node only has one transition we use a second pointer into the
1845 structure to fill in allocated fail transitions from other states.
1846 This pointer is independent of the main pointer and scans forward
1847 looking for null transitions that are allocated to a state. When it
1848 finds one it writes the single transition into the "hole". If the
1849 pointer doesnt find one the single transition is appended as normal.
1851 - Once compressed we can Renew/realloc the structures to release the
1854 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1855 specifically Fig 3.47 and the associated pseudocode.
1859 const U32 laststate = TRIE_NODENUM( next_alloc );
1862 trie->statecount = laststate;
1864 for ( state = 1 ; state < laststate ; state++ ) {
1866 const U32 stateidx = TRIE_NODEIDX( state );
1867 const U32 o_used = trie->trans[ stateidx ].check;
1868 U32 used = trie->trans[ stateidx ].check;
1869 trie->trans[ stateidx ].check = 0;
1871 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1872 if ( flag || trie->trans[ stateidx + charid ].next ) {
1873 if ( trie->trans[ stateidx + charid ].next ) {
1875 for ( ; zp < pos ; zp++ ) {
1876 if ( ! trie->trans[ zp ].next ) {
1880 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1881 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882 trie->trans[ zp ].check = state;
1883 if ( ++zp > pos ) pos = zp;
1890 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1892 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1893 trie->trans[ pos ].check = state;
1898 trie->lasttrans = pos + 1;
1899 trie->states = (reg_trie_state *)
1900 PerlMemShared_realloc( trie->states, laststate
1901 * sizeof(reg_trie_state) );
1902 DEBUG_TRIE_COMPILE_MORE_r(
1903 PerlIO_printf( Perl_debug_log,
1904 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1905 (int)depth * 2 + 2,"",
1906 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1909 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1912 } /* end table compress */
1914 DEBUG_TRIE_COMPILE_MORE_r(
1915 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1916 (int)depth * 2 + 2, "",
1917 (UV)trie->statecount,
1918 (UV)trie->lasttrans)
1920 /* resize the trans array to remove unused space */
1921 trie->trans = (reg_trie_trans *)
1922 PerlMemShared_realloc( trie->trans, trie->lasttrans
1923 * sizeof(reg_trie_trans) );
1925 /* and now dump out the compressed format */
1926 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1928 { /* Modify the program and insert the new TRIE node*/
1929 U8 nodetype =(U8)(flags & 0xFF);
1933 regnode *optimize = NULL;
1934 #ifdef RE_TRACK_PATTERN_OFFSETS
1937 U32 mjd_nodelen = 0;
1938 #endif /* RE_TRACK_PATTERN_OFFSETS */
1939 #endif /* DEBUGGING */
1941 This means we convert either the first branch or the first Exact,
1942 depending on whether the thing following (in 'last') is a branch
1943 or not and whther first is the startbranch (ie is it a sub part of
1944 the alternation or is it the whole thing.)
1945 Assuming its a sub part we conver the EXACT otherwise we convert
1946 the whole branch sequence, including the first.
1948 /* Find the node we are going to overwrite */
1949 if ( first != startbranch || OP( last ) == BRANCH ) {
1950 /* branch sub-chain */
1951 NEXT_OFF( first ) = (U16)(last - first);
1952 #ifdef RE_TRACK_PATTERN_OFFSETS
1954 mjd_offset= Node_Offset((convert));
1955 mjd_nodelen= Node_Length((convert));
1958 /* whole branch chain */
1960 #ifdef RE_TRACK_PATTERN_OFFSETS
1963 const regnode *nop = NEXTOPER( convert );
1964 mjd_offset= Node_Offset((nop));
1965 mjd_nodelen= Node_Length((nop));
1969 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1970 (int)depth * 2 + 2, "",
1971 (UV)mjd_offset, (UV)mjd_nodelen)
1974 /* But first we check to see if there is a common prefix we can
1975 split out as an EXACT and put in front of the TRIE node. */
1976 trie->startstate= 1;
1977 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1979 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1983 const U32 base = trie->states[ state ].trans.base;
1985 if ( trie->states[state].wordnum )
1988 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1989 if ( ( base + ofs >= trie->uniquecharcount ) &&
1990 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1991 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1993 if ( ++count > 1 ) {
1994 SV **tmp = av_fetch( revcharmap, ofs, 0);
1995 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1996 if ( state == 1 ) break;
1998 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2000 PerlIO_printf(Perl_debug_log,
2001 "%*sNew Start State=%"UVuf" Class: [",
2002 (int)depth * 2 + 2, "",
2005 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2006 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2008 TRIE_BITMAP_SET(trie,*ch);
2010 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2012 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2016 TRIE_BITMAP_SET(trie,*ch);
2018 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2019 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2025 SV **tmp = av_fetch( revcharmap, idx, 0);
2027 char *ch = SvPV( *tmp, len );
2029 SV *sv=sv_newmortal();
2030 PerlIO_printf( Perl_debug_log,
2031 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2032 (int)depth * 2 + 2, "",
2034 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2035 PL_colors[0], PL_colors[1],
2036 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2037 PERL_PV_ESCAPE_FIRSTCHAR
2042 OP( convert ) = nodetype;
2043 str=STRING(convert);
2046 STR_LEN(convert) += len;
2052 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2058 regnode *n = convert+NODE_SZ_STR(convert);
2059 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2060 trie->startstate = state;
2061 trie->minlen -= (state - 1);
2062 trie->maxlen -= (state - 1);
2064 /* At least the UNICOS C compiler choked on this
2065 * being argument to DEBUG_r(), so let's just have
2068 #ifdef PERL_EXT_RE_BUILD
2074 regnode *fix = convert;
2075 U32 word = trie->wordcount;
2077 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2078 while( ++fix < n ) {
2079 Set_Node_Offset_Length(fix, 0, 0);
2082 SV ** const tmp = av_fetch( trie_words, word, 0 );
2084 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2087 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2095 NEXT_OFF(convert) = (U16)(tail - convert);
2096 DEBUG_r(optimize= n);
2102 if ( trie->maxlen ) {
2103 NEXT_OFF( convert ) = (U16)(tail - convert);
2104 ARG_SET( convert, data_slot );
2105 /* Store the offset to the first unabsorbed branch in
2106 jump[0], which is otherwise unused by the jump logic.
2107 We use this when dumping a trie and during optimisation. */
2109 trie->jump[0] = (U16)(nextbranch - convert);
2112 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2113 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2115 OP( convert ) = TRIEC;
2116 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2117 PerlMemShared_free(trie->bitmap);
2120 OP( convert ) = TRIE;
2122 /* store the type in the flags */
2123 convert->flags = nodetype;
2127 + regarglen[ OP( convert ) ];
2129 /* XXX We really should free up the resource in trie now,
2130 as we won't use them - (which resources?) dmq */
2132 /* needed for dumping*/
2133 DEBUG_r(if (optimize) {
2134 regnode *opt = convert;
2136 while ( ++opt < optimize) {
2137 Set_Node_Offset_Length(opt,0,0);
2140 Try to clean up some of the debris left after the
2143 while( optimize < jumper ) {
2144 mjd_nodelen += Node_Length((optimize));
2145 OP( optimize ) = OPTIMIZED;
2146 Set_Node_Offset_Length(optimize,0,0);
2149 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2151 } /* end node insert */
2152 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2154 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2155 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2157 SvREFCNT_dec(revcharmap);
2161 : trie->startstate>1
2167 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2169 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2171 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2172 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2175 We find the fail state for each state in the trie, this state is the longest proper
2176 suffix of the current states 'word' that is also a proper prefix of another word in our
2177 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2178 the DFA not to have to restart after its tried and failed a word at a given point, it
2179 simply continues as though it had been matching the other word in the first place.
2181 'abcdgu'=~/abcdefg|cdgu/
2182 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2183 fail, which would bring use to the state representing 'd' in the second word where we would
2184 try 'g' and succeed, prodceding to match 'cdgu'.
2186 /* add a fail transition */
2187 const U32 trie_offset = ARG(source);
2188 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2190 const U32 ucharcount = trie->uniquecharcount;
2191 const U32 numstates = trie->statecount;
2192 const U32 ubound = trie->lasttrans + ucharcount;
2196 U32 base = trie->states[ 1 ].trans.base;
2199 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2200 GET_RE_DEBUG_FLAGS_DECL;
2202 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2204 PERL_UNUSED_ARG(depth);
2208 ARG_SET( stclass, data_slot );
2209 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2210 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2211 aho->trie=trie_offset;
2212 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2213 Copy( trie->states, aho->states, numstates, reg_trie_state );
2214 Newxz( q, numstates, U32);
2215 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2218 /* initialize fail[0..1] to be 1 so that we always have
2219 a valid final fail state */
2220 fail[ 0 ] = fail[ 1 ] = 1;
2222 for ( charid = 0; charid < ucharcount ; charid++ ) {
2223 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2225 q[ q_write ] = newstate;
2226 /* set to point at the root */
2227 fail[ q[ q_write++ ] ]=1;
2230 while ( q_read < q_write) {
2231 const U32 cur = q[ q_read++ % numstates ];
2232 base = trie->states[ cur ].trans.base;
2234 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2235 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2237 U32 fail_state = cur;
2240 fail_state = fail[ fail_state ];
2241 fail_base = aho->states[ fail_state ].trans.base;
2242 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2244 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2245 fail[ ch_state ] = fail_state;
2246 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2248 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2250 q[ q_write++ % numstates] = ch_state;
2254 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2255 when we fail in state 1, this allows us to use the
2256 charclass scan to find a valid start char. This is based on the principle
2257 that theres a good chance the string being searched contains lots of stuff
2258 that cant be a start char.
2260 fail[ 0 ] = fail[ 1 ] = 0;
2261 DEBUG_TRIE_COMPILE_r({
2262 PerlIO_printf(Perl_debug_log,
2263 "%*sStclass Failtable (%"UVuf" states): 0",
2264 (int)(depth * 2), "", (UV)numstates
2266 for( q_read=1; q_read<numstates; q_read++ ) {
2267 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2269 PerlIO_printf(Perl_debug_log, "\n");
2272 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2277 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2278 * These need to be revisited when a newer toolchain becomes available.
2280 #if defined(__sparc64__) && defined(__GNUC__)
2281 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2282 # undef SPARC64_GCC_WORKAROUND
2283 # define SPARC64_GCC_WORKAROUND 1
2287 #define DEBUG_PEEP(str,scan,depth) \
2288 DEBUG_OPTIMISE_r({if (scan){ \
2289 SV * const mysv=sv_newmortal(); \
2290 regnode *Next = regnext(scan); \
2291 regprop(RExC_rx, mysv, scan); \
2292 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2293 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2294 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2301 #define JOIN_EXACT(scan,min,flags) \
2302 if (PL_regkind[OP(scan)] == EXACT) \
2303 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2306 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2307 /* Merge several consecutive EXACTish nodes into one. */
2308 regnode *n = regnext(scan);
2310 regnode *next = scan + NODE_SZ_STR(scan);
2314 regnode *stop = scan;
2315 GET_RE_DEBUG_FLAGS_DECL;
2317 PERL_UNUSED_ARG(depth);
2320 PERL_ARGS_ASSERT_JOIN_EXACT;
2321 #ifndef EXPERIMENTAL_INPLACESCAN
2322 PERL_UNUSED_ARG(flags);
2323 PERL_UNUSED_ARG(val);
2325 DEBUG_PEEP("join",scan,depth);
2327 /* Skip NOTHING, merge EXACT*. */
2329 ( PL_regkind[OP(n)] == NOTHING ||
2330 (stringok && (OP(n) == OP(scan))))
2332 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2334 if (OP(n) == TAIL || n > next)
2336 if (PL_regkind[OP(n)] == NOTHING) {
2337 DEBUG_PEEP("skip:",n,depth);
2338 NEXT_OFF(scan) += NEXT_OFF(n);
2339 next = n + NODE_STEP_REGNODE;
2346 else if (stringok) {
2347 const unsigned int oldl = STR_LEN(scan);
2348 regnode * const nnext = regnext(n);
2350 DEBUG_PEEP("merg",n,depth);
2353 if (oldl + STR_LEN(n) > U8_MAX)
2355 NEXT_OFF(scan) += NEXT_OFF(n);
2356 STR_LEN(scan) += STR_LEN(n);
2357 next = n + NODE_SZ_STR(n);
2358 /* Now we can overwrite *n : */
2359 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2367 #ifdef EXPERIMENTAL_INPLACESCAN
2368 if (flags && !NEXT_OFF(n)) {
2369 DEBUG_PEEP("atch", val, depth);
2370 if (reg_off_by_arg[OP(n)]) {
2371 ARG_SET(n, val - n);
2374 NEXT_OFF(n) = val - n;
2381 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2383 Two problematic code points in Unicode casefolding of EXACT nodes:
2385 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2386 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2392 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2393 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2395 This means that in case-insensitive matching (or "loose matching",
2396 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2397 length of the above casefolded versions) can match a target string
2398 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2399 This would rather mess up the minimum length computation.
2401 What we'll do is to look for the tail four bytes, and then peek
2402 at the preceding two bytes to see whether we need to decrease
2403 the minimum length by four (six minus two).
2405 Thanks to the design of UTF-8, there cannot be false matches:
2406 A sequence of valid UTF-8 bytes cannot be a subsequence of
2407 another valid sequence of UTF-8 bytes.
2410 char * const s0 = STRING(scan), *s, *t;
2411 char * const s1 = s0 + STR_LEN(scan) - 1;
2412 char * const s2 = s1 - 4;
2413 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2414 const char t0[] = "\xaf\x49\xaf\x42";
2416 const char t0[] = "\xcc\x88\xcc\x81";
2418 const char * const t1 = t0 + 3;
2421 s < s2 && (t = ninstr(s, s1, t0, t1));
2424 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2425 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2427 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2428 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2436 n = scan + NODE_SZ_STR(scan);
2438 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2445 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2449 /* REx optimizer. Converts nodes into quickier variants "in place".
2450 Finds fixed substrings. */
2452 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2453 to the position after last scanned or to NULL. */
2455 #define INIT_AND_WITHP \
2456 assert(!and_withp); \
2457 Newx(and_withp,1,struct regnode_charclass_class); \
2458 SAVEFREEPV(and_withp)
2460 /* this is a chain of data about sub patterns we are processing that
2461 need to be handled seperately/specially in study_chunk. Its so
2462 we can simulate recursion without losing state. */
2464 typedef struct scan_frame {
2465 regnode *last; /* last node to process in this frame */
2466 regnode *next; /* next node to process when last is reached */
2467 struct scan_frame *prev; /*previous frame*/
2468 I32 stop; /* what stopparen do we use */
2472 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2474 #define CASE_SYNST_FNC(nAmE) \
2476 if (flags & SCF_DO_STCLASS_AND) { \
2477 for (value = 0; value < 256; value++) \
2478 if (!is_ ## nAmE ## _cp(value)) \
2479 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2482 for (value = 0; value < 256; value++) \
2483 if (is_ ## nAmE ## _cp(value)) \
2484 ANYOF_BITMAP_SET(data->start_class, value); \
2488 if (flags & SCF_DO_STCLASS_AND) { \
2489 for (value = 0; value < 256; value++) \
2490 if (is_ ## nAmE ## _cp(value)) \
2491 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2494 for (value = 0; value < 256; value++) \
2495 if (!is_ ## nAmE ## _cp(value)) \
2496 ANYOF_BITMAP_SET(data->start_class, value); \
2503 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2504 I32 *minlenp, I32 *deltap,
2509 struct regnode_charclass_class *and_withp,
2510 U32 flags, U32 depth)
2511 /* scanp: Start here (read-write). */
2512 /* deltap: Write maxlen-minlen here. */
2513 /* last: Stop before this one. */
2514 /* data: string data about the pattern */
2515 /* stopparen: treat close N as END */
2516 /* recursed: which subroutines have we recursed into */
2517 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2520 I32 min = 0, pars = 0, code;
2521 regnode *scan = *scanp, *next;
2523 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2524 int is_inf_internal = 0; /* The studied chunk is infinite */
2525 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2526 scan_data_t data_fake;
2527 SV *re_trie_maxbuff = NULL;
2528 regnode *first_non_open = scan;
2529 I32 stopmin = I32_MAX;
2530 scan_frame *frame = NULL;
2531 GET_RE_DEBUG_FLAGS_DECL;
2533 PERL_ARGS_ASSERT_STUDY_CHUNK;
2536 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2540 while (first_non_open && OP(first_non_open) == OPEN)
2541 first_non_open=regnext(first_non_open);
2546 while ( scan && OP(scan) != END && scan < last ){
2547 /* Peephole optimizer: */
2548 DEBUG_STUDYDATA("Peep:", data,depth);
2549 DEBUG_PEEP("Peep",scan,depth);
2550 JOIN_EXACT(scan,&min,0);
2552 /* Follow the next-chain of the current node and optimize
2553 away all the NOTHINGs from it. */
2554 if (OP(scan) != CURLYX) {
2555 const int max = (reg_off_by_arg[OP(scan)]
2557 /* I32 may be smaller than U16 on CRAYs! */
2558 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2559 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2563 /* Skip NOTHING and LONGJMP. */
2564 while ((n = regnext(n))
2565 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2566 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2567 && off + noff < max)
2569 if (reg_off_by_arg[OP(scan)])
2572 NEXT_OFF(scan) = off;
2577 /* The principal pseudo-switch. Cannot be a switch, since we
2578 look into several different things. */
2579 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2580 || OP(scan) == IFTHEN) {
2581 next = regnext(scan);
2583 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2585 if (OP(next) == code || code == IFTHEN) {
2586 /* NOTE - There is similar code to this block below for handling
2587 TRIE nodes on a re-study. If you change stuff here check there
2589 I32 max1 = 0, min1 = I32_MAX, num = 0;
2590 struct regnode_charclass_class accum;
2591 regnode * const startbranch=scan;
2593 if (flags & SCF_DO_SUBSTR)
2594 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2595 if (flags & SCF_DO_STCLASS)
2596 cl_init_zero(pRExC_state, &accum);
2598 while (OP(scan) == code) {
2599 I32 deltanext, minnext, f = 0, fake;
2600 struct regnode_charclass_class this_class;
2603 data_fake.flags = 0;
2605 data_fake.whilem_c = data->whilem_c;
2606 data_fake.last_closep = data->last_closep;
2609 data_fake.last_closep = &fake;
2611 data_fake.pos_delta = delta;
2612 next = regnext(scan);
2613 scan = NEXTOPER(scan);
2615 scan = NEXTOPER(scan);
2616 if (flags & SCF_DO_STCLASS) {
2617 cl_init(pRExC_state, &this_class);
2618 data_fake.start_class = &this_class;
2619 f = SCF_DO_STCLASS_AND;
2621 if (flags & SCF_WHILEM_VISITED_POS)
2622 f |= SCF_WHILEM_VISITED_POS;
2624 /* we suppose the run is continuous, last=next...*/
2625 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2627 stopparen, recursed, NULL, f,depth+1);
2630 if (max1 < minnext + deltanext)
2631 max1 = minnext + deltanext;
2632 if (deltanext == I32_MAX)
2633 is_inf = is_inf_internal = 1;
2635 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2637 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2638 if ( stopmin > minnext)
2639 stopmin = min + min1;
2640 flags &= ~SCF_DO_SUBSTR;
2642 data->flags |= SCF_SEEN_ACCEPT;
2645 if (data_fake.flags & SF_HAS_EVAL)
2646 data->flags |= SF_HAS_EVAL;
2647 data->whilem_c = data_fake.whilem_c;
2649 if (flags & SCF_DO_STCLASS)
2650 cl_or(pRExC_state, &accum, &this_class);
2652 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2654 if (flags & SCF_DO_SUBSTR) {
2655 data->pos_min += min1;
2656 data->pos_delta += max1 - min1;
2657 if (max1 != min1 || is_inf)
2658 data->longest = &(data->longest_float);
2661 delta += max1 - min1;
2662 if (flags & SCF_DO_STCLASS_OR) {
2663 cl_or(pRExC_state, data->start_class, &accum);
2665 cl_and(data->start_class, and_withp);
2666 flags &= ~SCF_DO_STCLASS;
2669 else if (flags & SCF_DO_STCLASS_AND) {
2671 cl_and(data->start_class, &accum);
2672 flags &= ~SCF_DO_STCLASS;
2675 /* Switch to OR mode: cache the old value of
2676 * data->start_class */
2678 StructCopy(data->start_class, and_withp,
2679 struct regnode_charclass_class);
2680 flags &= ~SCF_DO_STCLASS_AND;
2681 StructCopy(&accum, data->start_class,
2682 struct regnode_charclass_class);
2683 flags |= SCF_DO_STCLASS_OR;
2684 data->start_class->flags |= ANYOF_EOS;
2688 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2691 Assuming this was/is a branch we are dealing with: 'scan' now
2692 points at the item that follows the branch sequence, whatever
2693 it is. We now start at the beginning of the sequence and look
2700 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2702 If we can find such a subseqence we need to turn the first
2703 element into a trie and then add the subsequent branch exact
2704 strings to the trie.
2708 1. patterns where the whole set of branch can be converted.
2710 2. patterns where only a subset can be converted.
2712 In case 1 we can replace the whole set with a single regop
2713 for the trie. In case 2 we need to keep the start and end
2716 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2717 becomes BRANCH TRIE; BRANCH X;
2719 There is an additional case, that being where there is a
2720 common prefix, which gets split out into an EXACT like node
2721 preceding the TRIE node.
2723 If x(1..n)==tail then we can do a simple trie, if not we make
2724 a "jump" trie, such that when we match the appropriate word
2725 we "jump" to the appopriate tail node. Essentailly we turn
2726 a nested if into a case structure of sorts.
2731 if (!re_trie_maxbuff) {
2732 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2733 if (!SvIOK(re_trie_maxbuff))
2734 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2736 if ( SvIV(re_trie_maxbuff)>=0 ) {
2738 regnode *first = (regnode *)NULL;
2739 regnode *last = (regnode *)NULL;
2740 regnode *tail = scan;
2745 SV * const mysv = sv_newmortal(); /* for dumping */
2747 /* var tail is used because there may be a TAIL
2748 regop in the way. Ie, the exacts will point to the
2749 thing following the TAIL, but the last branch will
2750 point at the TAIL. So we advance tail. If we
2751 have nested (?:) we may have to move through several
2755 while ( OP( tail ) == TAIL ) {
2756 /* this is the TAIL generated by (?:) */
2757 tail = regnext( tail );
2762 regprop(RExC_rx, mysv, tail );
2763 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2764 (int)depth * 2 + 2, "",
2765 "Looking for TRIE'able sequences. Tail node is: ",
2766 SvPV_nolen_const( mysv )
2772 step through the branches, cur represents each
2773 branch, noper is the first thing to be matched
2774 as part of that branch and noper_next is the
2775 regnext() of that node. if noper is an EXACT
2776 and noper_next is the same as scan (our current
2777 position in the regex) then the EXACT branch is
2778 a possible optimization target. Once we have
2779 two or more consequetive such branches we can
2780 create a trie of the EXACT's contents and stich
2781 it in place. If the sequence represents all of
2782 the branches we eliminate the whole thing and
2783 replace it with a single TRIE. If it is a
2784 subsequence then we need to stitch it in. This
2785 means the first branch has to remain, and needs
2786 to be repointed at the item on the branch chain
2787 following the last branch optimized. This could
2788 be either a BRANCH, in which case the
2789 subsequence is internal, or it could be the
2790 item following the branch sequence in which
2791 case the subsequence is at the end.
2795 /* dont use tail as the end marker for this traverse */
2796 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2797 regnode * const noper = NEXTOPER( cur );
2798 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2799 regnode * const noper_next = regnext( noper );
2803 regprop(RExC_rx, mysv, cur);
2804 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2805 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2807 regprop(RExC_rx, mysv, noper);
2808 PerlIO_printf( Perl_debug_log, " -> %s",
2809 SvPV_nolen_const(mysv));
2812 regprop(RExC_rx, mysv, noper_next );
2813 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2814 SvPV_nolen_const(mysv));
2816 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2817 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2819 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2820 : PL_regkind[ OP( noper ) ] == EXACT )
2821 || OP(noper) == NOTHING )
2823 && noper_next == tail
2828 if ( !first || optype == NOTHING ) {
2829 if (!first) first = cur;
2830 optype = OP( noper );
2836 Currently we assume that the trie can handle unicode and ascii
2837 matches fold cased matches. If this proves true then the following
2838 define will prevent tries in this situation.
2840 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2842 #define TRIE_TYPE_IS_SAFE 1
2843 if ( last && TRIE_TYPE_IS_SAFE ) {
2844 make_trie( pRExC_state,
2845 startbranch, first, cur, tail, count,
2848 if ( PL_regkind[ OP( noper ) ] == EXACT
2850 && noper_next == tail
2855 optype = OP( noper );
2865 regprop(RExC_rx, mysv, cur);
2866 PerlIO_printf( Perl_debug_log,
2867 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2868 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2872 if ( last && TRIE_TYPE_IS_SAFE ) {
2873 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2874 #ifdef TRIE_STUDY_OPT
2875 if ( ((made == MADE_EXACT_TRIE &&
2876 startbranch == first)
2877 || ( first_non_open == first )) &&
2879 flags |= SCF_TRIE_RESTUDY;
2880 if ( startbranch == first
2883 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2893 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2894 scan = NEXTOPER(NEXTOPER(scan));
2895 } else /* single branch is optimized. */
2896 scan = NEXTOPER(scan);
2898 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2899 scan_frame *newframe = NULL;
2904 if (OP(scan) != SUSPEND) {
2905 /* set the pointer */
2906 if (OP(scan) == GOSUB) {
2908 RExC_recurse[ARG2L(scan)] = scan;
2909 start = RExC_open_parens[paren-1];
2910 end = RExC_close_parens[paren-1];
2913 start = RExC_rxi->program + 1;
2917 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2918 SAVEFREEPV(recursed);
2920 if (!PAREN_TEST(recursed,paren+1)) {
2921 PAREN_SET(recursed,paren+1);
2922 Newx(newframe,1,scan_frame);
2924 if (flags & SCF_DO_SUBSTR) {
2925 SCAN_COMMIT(pRExC_state,data,minlenp);
2926 data->longest = &(data->longest_float);
2928 is_inf = is_inf_internal = 1;
2929 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2930 cl_anything(pRExC_state, data->start_class);
2931 flags &= ~SCF_DO_STCLASS;
2934 Newx(newframe,1,scan_frame);
2937 end = regnext(scan);
2942 SAVEFREEPV(newframe);
2943 newframe->next = regnext(scan);
2944 newframe->last = last;
2945 newframe->stop = stopparen;
2946 newframe->prev = frame;
2956 else if (OP(scan) == EXACT) {
2957 I32 l = STR_LEN(scan);
2960 const U8 * const s = (U8*)STRING(scan);
2961 l = utf8_length(s, s + l);
2962 uc = utf8_to_uvchr(s, NULL);
2964 uc = *((U8*)STRING(scan));
2967 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2968 /* The code below prefers earlier match for fixed
2969 offset, later match for variable offset. */
2970 if (data->last_end == -1) { /* Update the start info. */
2971 data->last_start_min = data->pos_min;
2972 data->last_start_max = is_inf
2973 ? I32_MAX : data->pos_min + data->pos_delta;
2975 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2977 SvUTF8_on(data->last_found);
2979 SV * const sv = data->last_found;
2980 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2981 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2982 if (mg && mg->mg_len >= 0)
2983 mg->mg_len += utf8_length((U8*)STRING(scan),
2984 (U8*)STRING(scan)+STR_LEN(scan));
2986 data->last_end = data->pos_min + l;
2987 data->pos_min += l; /* As in the first entry. */
2988 data->flags &= ~SF_BEFORE_EOL;
2990 if (flags & SCF_DO_STCLASS_AND) {
2991 /* Check whether it is compatible with what we know already! */
2995 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2996 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2997 && (!(data->start_class->flags & ANYOF_FOLD)
2998 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3001 ANYOF_CLASS_ZERO(data->start_class);
3002 ANYOF_BITMAP_ZERO(data->start_class);
3004 ANYOF_BITMAP_SET(data->start_class, uc);
3005 data->start_class->flags &= ~ANYOF_EOS;
3007 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3009 else if (flags & SCF_DO_STCLASS_OR) {
3010 /* false positive possible if the class is case-folded */
3012 ANYOF_BITMAP_SET(data->start_class, uc);
3014 data->start_class->flags |= ANYOF_UNICODE_ALL;
3015 data->start_class->flags &= ~ANYOF_EOS;
3016 cl_and(data->start_class, and_withp);
3018 flags &= ~SCF_DO_STCLASS;
3020 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3021 I32 l = STR_LEN(scan);
3022 UV uc = *((U8*)STRING(scan));
3024 /* Search for fixed substrings supports EXACT only. */
3025 if (flags & SCF_DO_SUBSTR) {
3027 SCAN_COMMIT(pRExC_state, data, minlenp);
3030 const U8 * const s = (U8 *)STRING(scan);
3031 l = utf8_length(s, s + l);
3032 uc = utf8_to_uvchr(s, NULL);
3035 if (flags & SCF_DO_SUBSTR)
3037 if (flags & SCF_DO_STCLASS_AND) {
3038 /* Check whether it is compatible with what we know already! */
3042 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3043 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3044 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3046 ANYOF_CLASS_ZERO(data->start_class);
3047 ANYOF_BITMAP_ZERO(data->start_class);
3049 ANYOF_BITMAP_SET(data->start_class, uc);
3050 data->start_class->flags &= ~ANYOF_EOS;
3051 data->start_class->flags |= ANYOF_FOLD;
3052 if (OP(scan) == EXACTFL)
3053 data->start_class->flags |= ANYOF_LOCALE;
3056 else if (flags & SCF_DO_STCLASS_OR) {
3057 if (data->start_class->flags & ANYOF_FOLD) {
3058 /* false positive possible if the class is case-folded.
3059 Assume that the locale settings are the same... */
3061 ANYOF_BITMAP_SET(data->start_class, uc);
3062 data->start_class->flags &= ~ANYOF_EOS;
3064 cl_and(data->start_class, and_withp);
3066 flags &= ~SCF_DO_STCLASS;
3068 else if (strchr((const char*)PL_varies,OP(scan))) {
3069 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3070 I32 f = flags, pos_before = 0;
3071 regnode * const oscan = scan;
3072 struct regnode_charclass_class this_class;
3073 struct regnode_charclass_class *oclass = NULL;
3074 I32 next_is_eval = 0;
3076 switch (PL_regkind[OP(scan)]) {
3077 case WHILEM: /* End of (?:...)* . */
3078 scan = NEXTOPER(scan);
3081 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3082 next = NEXTOPER(scan);
3083 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3085 maxcount = REG_INFTY;
3086 next = regnext(scan);
3087 scan = NEXTOPER(scan);
3091 if (flags & SCF_DO_SUBSTR)
3096 if (flags & SCF_DO_STCLASS) {
3098 maxcount = REG_INFTY;
3099 next = regnext(scan);
3100 scan = NEXTOPER(scan);
3103 is_inf = is_inf_internal = 1;
3104 scan = regnext(scan);
3105 if (flags & SCF_DO_SUBSTR) {
3106 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3107 data->longest = &(data->longest_float);
3109 goto optimize_curly_tail;
3111 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3112 && (scan->flags == stopparen))
3117 mincount = ARG1(scan);
3118 maxcount = ARG2(scan);
3120 next = regnext(scan);
3121 if (OP(scan) == CURLYX) {
3122 I32 lp = (data ? *(data->last_closep) : 0);
3123 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3125 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3126 next_is_eval = (OP(scan) == EVAL);
3128 if (flags & SCF_DO_SUBSTR) {
3129 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3130 pos_before = data->pos_min;
3134 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3136 data->flags |= SF_IS_INF;
3138 if (flags & SCF_DO_STCLASS) {
3139 cl_init(pRExC_state, &this_class);
3140 oclass = data->start_class;
3141 data->start_class = &this_class;
3142 f |= SCF_DO_STCLASS_AND;
3143 f &= ~SCF_DO_STCLASS_OR;
3145 /* These are the cases when once a subexpression
3146 fails at a particular position, it cannot succeed
3147 even after backtracking at the enclosing scope.
3149 XXXX what if minimal match and we are at the
3150 initial run of {n,m}? */
3151 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3152 f &= ~SCF_WHILEM_VISITED_POS;
3154 /* This will finish on WHILEM, setting scan, or on NULL: */
3155 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3156 last, data, stopparen, recursed, NULL,
3158 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3160 if (flags & SCF_DO_STCLASS)
3161 data->start_class = oclass;
3162 if (mincount == 0 || minnext == 0) {
3163 if (flags & SCF_DO_STCLASS_OR) {
3164 cl_or(pRExC_state, data->start_class, &this_class);
3166 else if (flags & SCF_DO_STCLASS_AND) {
3167 /* Switch to OR mode: cache the old value of
3168 * data->start_class */
3170 StructCopy(data->start_class, and_withp,
3171 struct regnode_charclass_class);
3172 flags &= ~SCF_DO_STCLASS_AND;
3173 StructCopy(&this_class, data->start_class,
3174 struct regnode_charclass_class);
3175 flags |= SCF_DO_STCLASS_OR;
3176 data->start_class->flags |= ANYOF_EOS;
3178 } else { /* Non-zero len */
3179 if (flags & SCF_DO_STCLASS_OR) {
3180 cl_or(pRExC_state, data->start_class, &this_class);
3181 cl_and(data->start_class, and_withp);
3183 else if (flags & SCF_DO_STCLASS_AND)
3184 cl_and(data->start_class, &this_class);
3185 flags &= ~SCF_DO_STCLASS;
3187 if (!scan) /* It was not CURLYX, but CURLY. */
3189 if ( /* ? quantifier ok, except for (?{ ... }) */
3190 (next_is_eval || !(mincount == 0 && maxcount == 1))
3191 && (minnext == 0) && (deltanext == 0)
3192 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3193 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3195 ckWARNreg(RExC_parse,
3196 "Quantifier unexpected on zero-length expression");
3199 min += minnext * mincount;
3200 is_inf_internal |= ((maxcount == REG_INFTY
3201 && (minnext + deltanext) > 0)
3202 || deltanext == I32_MAX);
3203 is_inf |= is_inf_internal;
3204 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3206 /* Try powerful optimization CURLYX => CURLYN. */
3207 if ( OP(oscan) == CURLYX && data
3208 && data->flags & SF_IN_PAR
3209 && !(data->flags & SF_HAS_EVAL)
3210 && !deltanext && minnext == 1 ) {
3211 /* Try to optimize to CURLYN. */
3212 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3213 regnode * const nxt1 = nxt;
3220 if (!strchr((const char*)PL_simple,OP(nxt))
3221 && !(PL_regkind[OP(nxt)] == EXACT
3222 && STR_LEN(nxt) == 1))
3228 if (OP(nxt) != CLOSE)
3230 if (RExC_open_parens) {
3231 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3232 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3234 /* Now we know that nxt2 is the only contents: */
3235 oscan->flags = (U8)ARG(nxt);
3237 OP(nxt1) = NOTHING; /* was OPEN. */
3240 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3241 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3242 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3243 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3244 OP(nxt + 1) = OPTIMIZED; /* was count. */
3245 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3250 /* Try optimization CURLYX => CURLYM. */
3251 if ( OP(oscan) == CURLYX && data
3252 && !(data->flags & SF_HAS_PAR)
3253 && !(data->flags & SF_HAS_EVAL)
3254 && !deltanext /* atom is fixed width */
3255 && minnext != 0 /* CURLYM can't handle zero width */
3257 /* XXXX How to optimize if data == 0? */
3258 /* Optimize to a simpler form. */
3259 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3263 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3264 && (OP(nxt2) != WHILEM))
3266 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3267 /* Need to optimize away parenths. */
3268 if (data->flags & SF_IN_PAR) {
3269 /* Set the parenth number. */
3270 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3272 if (OP(nxt) != CLOSE)
3273 FAIL("Panic opt close");
3274 oscan->flags = (U8)ARG(nxt);
3275 if (RExC_open_parens) {
3276 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3277 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3279 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3280 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3283 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3284 OP(nxt + 1) = OPTIMIZED; /* was count. */
3285 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3286 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3289 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3290 regnode *nnxt = regnext(nxt1);
3293 if (reg_off_by_arg[OP(nxt1)])
3294 ARG_SET(nxt1, nxt2 - nxt1);
3295 else if (nxt2 - nxt1 < U16_MAX)
3296 NEXT_OFF(nxt1) = nxt2 - nxt1;
3298 OP(nxt) = NOTHING; /* Cannot beautify */
3303 /* Optimize again: */
3304 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3305 NULL, stopparen, recursed, NULL, 0,depth+1);
3310 else if ((OP(oscan) == CURLYX)
3311 && (flags & SCF_WHILEM_VISITED_POS)
3312 /* See the comment on a similar expression above.
3313 However, this time it not a subexpression
3314 we care about, but the expression itself. */
3315 && (maxcount == REG_INFTY)
3316 && data && ++data->whilem_c < 16) {
3317 /* This stays as CURLYX, we can put the count/of pair. */
3318 /* Find WHILEM (as in regexec.c) */
3319 regnode *nxt = oscan + NEXT_OFF(oscan);
3321 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3323 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3324 | (RExC_whilem_seen << 4)); /* On WHILEM */
3326 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3328 if (flags & SCF_DO_SUBSTR) {
3329 SV *last_str = NULL;
3330 int counted = mincount != 0;
3332 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3333 #if defined(SPARC64_GCC_WORKAROUND)
3336 const char *s = NULL;
3339 if (pos_before >= data->last_start_min)
3342 b = data->last_start_min;
3345 s = SvPV_const(data->last_found, l);
3346 old = b - data->last_start_min;
3349 I32 b = pos_before >= data->last_start_min
3350 ? pos_before : data->last_start_min;
3352 const char * const s = SvPV_const(data->last_found, l);
3353 I32 old = b - data->last_start_min;
3357 old = utf8_hop((U8*)s, old) - (U8*)s;
3360 /* Get the added string: */
3361 last_str = newSVpvn_utf8(s + old, l, UTF);
3362 if (deltanext == 0 && pos_before == b) {
3363 /* What was added is a constant string */
3365 SvGROW(last_str, (mincount * l) + 1);
3366 repeatcpy(SvPVX(last_str) + l,
3367 SvPVX_const(last_str), l, mincount - 1);
3368 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3369 /* Add additional parts. */
3370 SvCUR_set(data->last_found,
3371 SvCUR(data->last_found) - l);
3372 sv_catsv(data->last_found, last_str);
3374 SV * sv = data->last_found;
3376 SvUTF8(sv) && SvMAGICAL(sv) ?
3377 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3378 if (mg && mg->mg_len >= 0)
3379 mg->mg_len += CHR_SVLEN(last_str) - l;
3381 data->last_end += l * (mincount - 1);
3384 /* start offset must point into the last copy */
3385 data->last_start_min += minnext * (mincount - 1);
3386 data->last_start_max += is_inf ? I32_MAX
3387 : (maxcount - 1) * (minnext + data->pos_delta);
3390 /* It is counted once already... */
3391 data->pos_min += minnext * (mincount - counted);
3392 data->pos_delta += - counted * deltanext +
3393 (minnext + deltanext) * maxcount - minnext * mincount;
3394 if (mincount != maxcount) {
3395 /* Cannot extend fixed substrings found inside
3397 SCAN_COMMIT(pRExC_state,data,minlenp);
3398 if (mincount && last_str) {
3399 SV * const sv = data->last_found;
3400 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3401 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3405 sv_setsv(sv, last_str);
3406 data->last_end = data->pos_min;
3407 data->last_start_min =
3408 data->pos_min - CHR_SVLEN(last_str);
3409 data->last_start_max = is_inf
3411 : data->pos_min + data->pos_delta
3412 - CHR_SVLEN(last_str);
3414 data->longest = &(data->longest_float);
3416 SvREFCNT_dec(last_str);
3418 if (data && (fl & SF_HAS_EVAL))
3419 data->flags |= SF_HAS_EVAL;
3420 optimize_curly_tail:
3421 if (OP(oscan) != CURLYX) {
3422 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3424 NEXT_OFF(oscan) += NEXT_OFF(next);
3427 default: /* REF and CLUMP only? */
3428 if (flags & SCF_DO_SUBSTR) {
3429 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3430 data->longest = &(data->longest_float);
3432 is_inf = is_inf_internal = 1;
3433 if (flags & SCF_DO_STCLASS_OR)
3434 cl_anything(pRExC_state, data->start_class);
3435 flags &= ~SCF_DO_STCLASS;
3439 else if (OP(scan) == LNBREAK) {
3440 if (flags & SCF_DO_STCLASS) {
3442 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3443 if (flags & SCF_DO_STCLASS_AND) {
3444 for (value = 0; value < 256; value++)
3445 if (!is_VERTWS_cp(value))
3446 ANYOF_BITMAP_CLEAR(data->start_class, value);
3449 for (value = 0; value < 256; value++)
3450 if (is_VERTWS_cp(value))
3451 ANYOF_BITMAP_SET(data->start_class, value);
3453 if (flags & SCF_DO_STCLASS_OR)
3454 cl_and(data->start_class, and_withp);
3455 flags &= ~SCF_DO_STCLASS;
3459 if (flags & SCF_DO_SUBSTR) {
3460 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3462 data->pos_delta += 1;
3463 data->longest = &(data->longest_float);
3467 else if (OP(scan) == FOLDCHAR) {
3468 int d = ARG(scan)==0xDF ? 1 : 2;
3469 flags &= ~SCF_DO_STCLASS;
3472 if (flags & SCF_DO_SUBSTR) {
3473 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3475 data->pos_delta += d;
3476 data->longest = &(data->longest_float);
3479 else if (strchr((const char*)PL_simple,OP(scan))) {
3482 if (flags & SCF_DO_SUBSTR) {
3483 SCAN_COMMIT(pRExC_state,data,minlenp);
3487 if (flags & SCF_DO_STCLASS) {
3488 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3490 /* Some of the logic below assumes that switching
3491 locale on will only add false positives. */
3492 switch (PL_regkind[OP(scan)]) {
3496 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3497 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3498 cl_anything(pRExC_state, data->start_class);
3501 if (OP(scan) == SANY)
3503 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3504 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3505 || (data->start_class->flags & ANYOF_CLASS));
3506 cl_anything(pRExC_state, data->start_class);
3508 if (flags & SCF_DO_STCLASS_AND || !value)
3509 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3512 if (flags & SCF_DO_STCLASS_AND)
3513 cl_and(data->start_class,
3514 (struct regnode_charclass_class*)scan);
3516 cl_or(pRExC_state, data->start_class,
3517 (struct regnode_charclass_class*)scan);
3520 if (flags & SCF_DO_STCLASS_AND) {
3521 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3522 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3523 for (value = 0; value < 256; value++)
3524 if (!isALNUM(value))
3525 ANYOF_BITMAP_CLEAR(data->start_class, value);
3529 if (data->start_class->flags & ANYOF_LOCALE)
3530 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3532 for (value = 0; value < 256; value++)
3534 ANYOF_BITMAP_SET(data->start_class, value);
3539 if (flags & SCF_DO_STCLASS_AND) {
3540 if (data->start_class->flags & ANYOF_LOCALE)
3541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3544 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3545 data->start_class->flags |= ANYOF_LOCALE;
3549 if (flags & SCF_DO_STCLASS_AND) {
3550 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3552 for (value = 0; value < 256; value++)
3554 ANYOF_BITMAP_CLEAR(data->start_class, value);
3558 if (data->start_class->flags & ANYOF_LOCALE)
3559 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3561 for (value = 0; value < 256; value++)
3562 if (!isALNUM(value))
3563 ANYOF_BITMAP_SET(data->start_class, value);
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 if (data->start_class->flags & ANYOF_LOCALE)
3570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3573 data->start_class->flags |= ANYOF_LOCALE;
3574 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3578 if (flags & SCF_DO_STCLASS_AND) {
3579 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3580 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3581 for (value = 0; value < 256; value++)
3582 if (!isSPACE(value))
3583 ANYOF_BITMAP_CLEAR(data->start_class, value);
3587 if (data->start_class->flags & ANYOF_LOCALE)
3588 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3590 for (value = 0; value < 256; value++)
3592 ANYOF_BITMAP_SET(data->start_class, value);
3597 if (flags & SCF_DO_STCLASS_AND) {
3598 if (data->start_class->flags & ANYOF_LOCALE)
3599 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3602 data->start_class->flags |= ANYOF_LOCALE;
3603 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3607 if (flags & SCF_DO_STCLASS_AND) {
3608 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3609 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3610 for (value = 0; value < 256; value++)
3612 ANYOF_BITMAP_CLEAR(data->start_class, value);
3616 if (data->start_class->flags & ANYOF_LOCALE)
3617 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3619 for (value = 0; value < 256; value++)
3620 if (!isSPACE(value))
3621 ANYOF_BITMAP_SET(data->start_class, value);
3626 if (flags & SCF_DO_STCLASS_AND) {
3627 if (data->start_class->flags & ANYOF_LOCALE) {
3628 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3629 for (value = 0; value < 256; value++)
3630 if (!isSPACE(value))
3631 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 data->start_class->flags |= ANYOF_LOCALE;
3636 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3640 if (flags & SCF_DO_STCLASS_AND) {
3641 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3642 for (value = 0; value < 256; value++)
3643 if (!isDIGIT(value))
3644 ANYOF_BITMAP_CLEAR(data->start_class, value);
3647 if (data->start_class->flags & ANYOF_LOCALE)
3648 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3650 for (value = 0; value < 256; value++)
3652 ANYOF_BITMAP_SET(data->start_class, value);
3657 if (flags & SCF_DO_STCLASS_AND) {
3658 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3659 for (value = 0; value < 256; value++)
3661 ANYOF_BITMAP_CLEAR(data->start_class, value);
3664 if (data->start_class->flags & ANYOF_LOCALE)
3665 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3667 for (value = 0; value < 256; value++)
3668 if (!isDIGIT(value))
3669 ANYOF_BITMAP_SET(data->start_class, value);
3673 CASE_SYNST_FNC(VERTWS);
3674 CASE_SYNST_FNC(HORIZWS);
3677 if (flags & SCF_DO_STCLASS_OR)
3678 cl_and(data->start_class, and_withp);
3679 flags &= ~SCF_DO_STCLASS;
3682 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3683 data->flags |= (OP(scan) == MEOL
3687 else if ( PL_regkind[OP(scan)] == BRANCHJ
3688 /* Lookbehind, or need to calculate parens/evals/stclass: */
3689 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3690 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3691 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3692 || OP(scan) == UNLESSM )
3694 /* Negative Lookahead/lookbehind
3695 In this case we can't do fixed string optimisation.
3698 I32 deltanext, minnext, fake = 0;
3700 struct regnode_charclass_class intrnl;
3703 data_fake.flags = 0;
3705 data_fake.whilem_c = data->whilem_c;
3706 data_fake.last_closep = data->last_closep;
3709 data_fake.last_closep = &fake;
3710 data_fake.pos_delta = delta;
3711 if ( flags & SCF_DO_STCLASS && !scan->flags
3712 && OP(scan) == IFMATCH ) { /* Lookahead */
3713 cl_init(pRExC_state, &intrnl);
3714 data_fake.start_class = &intrnl;
3715 f |= SCF_DO_STCLASS_AND;
3717 if (flags & SCF_WHILEM_VISITED_POS)
3718 f |= SCF_WHILEM_VISITED_POS;
3719 next = regnext(scan);
3720 nscan = NEXTOPER(NEXTOPER(scan));
3721 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3722 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3725 FAIL("Variable length lookbehind not implemented");
3727 else if (minnext > (I32)U8_MAX) {
3728 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3730 scan->flags = (U8)minnext;
3733 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3735 if (data_fake.flags & SF_HAS_EVAL)
3736 data->flags |= SF_HAS_EVAL;
3737 data->whilem_c = data_fake.whilem_c;
3739 if (f & SCF_DO_STCLASS_AND) {
3740 if (flags & SCF_DO_STCLASS_OR) {
3741 /* OR before, AND after: ideally we would recurse with
3742 * data_fake to get the AND applied by study of the
3743 * remainder of the pattern, and then derecurse;
3744 * *** HACK *** for now just treat as "no information".
3745 * See [perl #56690].
3747 cl_init(pRExC_state, data->start_class);
3749 /* AND before and after: combine and continue */
3750 const int was = (data->start_class->flags & ANYOF_EOS);
3752 cl_and(data->start_class, &intrnl);
3754 data->start_class->flags |= ANYOF_EOS;
3758 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3760 /* Positive Lookahead/lookbehind
3761 In this case we can do fixed string optimisation,
3762 but we must be careful about it. Note in the case of
3763 lookbehind the positions will be offset by the minimum
3764 length of the pattern, something we won't know about
3765 until after the recurse.
3767 I32 deltanext, fake = 0;
3769 struct regnode_charclass_class intrnl;
3771 /* We use SAVEFREEPV so that when the full compile
3772 is finished perl will clean up the allocated
3773 minlens when its all done. This was we don't
3774 have to worry about freeing them when we know
3775 they wont be used, which would be a pain.
3778 Newx( minnextp, 1, I32 );
3779 SAVEFREEPV(minnextp);
3782 StructCopy(data, &data_fake, scan_data_t);
3783 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3786 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3787 data_fake.last_found=newSVsv(data->last_found);
3791 data_fake.last_closep = &fake;
3792 data_fake.flags = 0;
3793 data_fake.pos_delta = delta;
3795 data_fake.flags |= SF_IS_INF;
3796 if ( flags & SCF_DO_STCLASS && !scan->flags
3797 && OP(scan) == IFMATCH ) { /* Lookahead */
3798 cl_init(pRExC_state, &intrnl);
3799 data_fake.start_class = &intrnl;
3800 f |= SCF_DO_STCLASS_AND;
3802 if (flags & SCF_WHILEM_VISITED_POS)
3803 f |= SCF_WHILEM_VISITED_POS;
3804 next = regnext(scan);
3805 nscan = NEXTOPER(NEXTOPER(scan));
3807 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3808 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3811 FAIL("Variable length lookbehind not implemented");
3813 else if (*minnextp > (I32)U8_MAX) {
3814 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3816 scan->flags = (U8)*minnextp;
3821 if (f & SCF_DO_STCLASS_AND) {
3822 const int was = (data->start_class->flags & ANYOF_EOS);
3824 cl_and(data->start_class, &intrnl);
3826 data->start_class->flags |= ANYOF_EOS;
3829 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3831 if (data_fake.flags & SF_HAS_EVAL)
3832 data->flags |= SF_HAS_EVAL;
3833 data->whilem_c = data_fake.whilem_c;
3834 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3835 if (RExC_rx->minlen<*minnextp)
3836 RExC_rx->minlen=*minnextp;
3837 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3838 SvREFCNT_dec(data_fake.last_found);
3840 if ( data_fake.minlen_fixed != minlenp )
3842 data->offset_fixed= data_fake.offset_fixed;
3843 data->minlen_fixed= data_fake.minlen_fixed;
3844 data->lookbehind_fixed+= scan->flags;
3846 if ( data_fake.minlen_float != minlenp )
3848 data->minlen_float= data_fake.minlen_float;
3849 data->offset_float_min=data_fake.offset_float_min;
3850 data->offset_float_max=data_fake.offset_float_max;
3851 data->lookbehind_float+= scan->flags;
3860 else if (OP(scan) == OPEN) {
3861 if (stopparen != (I32)ARG(scan))
3864 else if (OP(scan) == CLOSE) {
3865 if (stopparen == (I32)ARG(scan)) {
3868 if ((I32)ARG(scan) == is_par) {
3869 next = regnext(scan);
3871 if ( next && (OP(next) != WHILEM) && next < last)
3872 is_par = 0; /* Disable optimization */
3875 *(data->last_closep) = ARG(scan);
3877 else if (OP(scan) == EVAL) {
3879 data->flags |= SF_HAS_EVAL;
3881 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3882 if (flags & SCF_DO_SUBSTR) {
3883 SCAN_COMMIT(pRExC_state,data,minlenp);
3884 flags &= ~SCF_DO_SUBSTR;
3886 if (data && OP(scan)==ACCEPT) {
3887 data->flags |= SCF_SEEN_ACCEPT;
3892 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3894 if (flags & SCF_DO_SUBSTR) {
3895 SCAN_COMMIT(pRExC_state,data,minlenp);
3896 data->longest = &(data->longest_float);
3898 is_inf = is_inf_internal = 1;
3899 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3900 cl_anything(pRExC_state, data->start_class);
3901 flags &= ~SCF_DO_STCLASS;
3903 else if (OP(scan) == GPOS) {
3904 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3905 !(delta || is_inf || (data && data->pos_delta)))
3907 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3908 RExC_rx->extflags |= RXf_ANCH_GPOS;
3909 if (RExC_rx->gofs < (U32)min)
3910 RExC_rx->gofs = min;
3912 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3916 #ifdef TRIE_STUDY_OPT
3917 #ifdef FULL_TRIE_STUDY
3918 else if (PL_regkind[OP(scan)] == TRIE) {
3919 /* NOTE - There is similar code to this block above for handling
3920 BRANCH nodes on the initial study. If you change stuff here
3922 regnode *trie_node= scan;
3923 regnode *tail= regnext(scan);
3924 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3925 I32 max1 = 0, min1 = I32_MAX;
3926 struct regnode_charclass_class accum;
3928 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3929 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3930 if (flags & SCF_DO_STCLASS)
3931 cl_init_zero(pRExC_state, &accum);
3937 const regnode *nextbranch= NULL;
3940 for ( word=1 ; word <= trie->wordcount ; word++)
3942 I32 deltanext=0, minnext=0, f = 0, fake;
3943 struct regnode_charclass_class this_class;
3945 data_fake.flags = 0;
3947 data_fake.whilem_c = data->whilem_c;
3948 data_fake.last_closep = data->last_closep;
3951 data_fake.last_closep = &fake;
3952 data_fake.pos_delta = delta;
3953 if (flags & SCF_DO_STCLASS) {
3954 cl_init(pRExC_state, &this_class);
3955 data_fake.start_class = &this_class;
3956 f = SCF_DO_STCLASS_AND;
3958 if (flags & SCF_WHILEM_VISITED_POS)
3959 f |= SCF_WHILEM_VISITED_POS;
3961 if (trie->jump[word]) {
3963 nextbranch = trie_node + trie->jump[0];
3964 scan= trie_node + trie->jump[word];
3965 /* We go from the jump point to the branch that follows
3966 it. Note this means we need the vestigal unused branches
3967 even though they arent otherwise used.
3969 minnext = study_chunk(pRExC_state, &scan, minlenp,
3970 &deltanext, (regnode *)nextbranch, &data_fake,
3971 stopparen, recursed, NULL, f,depth+1);
3973 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3974 nextbranch= regnext((regnode*)nextbranch);
3976 if (min1 > (I32)(minnext + trie->minlen))
3977 min1 = minnext + trie->minlen;
3978 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3979 max1 = minnext + deltanext + trie->maxlen;
3980 if (deltanext == I32_MAX)
3981 is_inf = is_inf_internal = 1;
3983 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3985 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3986 if ( stopmin > min + min1)
3987 stopmin = min + min1;
3988 flags &= ~SCF_DO_SUBSTR;
3990 data->flags |= SCF_SEEN_ACCEPT;
3993 if (data_fake.flags & SF_HAS_EVAL)
3994 data->flags |= SF_HAS_EVAL;
3995 data->whilem_c = data_fake.whilem_c;
3997 if (flags & SCF_DO_STCLASS)
3998 cl_or(pRExC_state, &accum, &this_class);
4001 if (flags & SCF_DO_SUBSTR) {
4002 data->pos_min += min1;
4003 data->pos_delta += max1 - min1;
4004 if (max1 != min1 || is_inf)
4005 data->longest = &(data->longest_float);
4008 delta += max1 - min1;
4009 if (flags & SCF_DO_STCLASS_OR) {
4010 cl_or(pRExC_state, data->start_class, &accum);
4012 cl_and(data->start_class, and_withp);
4013 flags &= ~SCF_DO_STCLASS;
4016 else if (flags & SCF_DO_STCLASS_AND) {
4018 cl_and(data->start_class, &accum);
4019 flags &= ~SCF_DO_STCLASS;
4022 /* Switch to OR mode: cache the old value of
4023 * data->start_class */
4025 StructCopy(data->start_class, and_withp,
4026 struct regnode_charclass_class);
4027 flags &= ~SCF_DO_STCLASS_AND;
4028 StructCopy(&accum, data->start_class,
4029 struct regnode_charclass_class);
4030 flags |= SCF_DO_STCLASS_OR;
4031 data->start_class->flags |= ANYOF_EOS;
4038 else if (PL_regkind[OP(scan)] == TRIE) {
4039 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4042 min += trie->minlen;
4043 delta += (trie->maxlen - trie->minlen);
4044 flags &= ~SCF_DO_STCLASS; /* xxx */
4045 if (flags & SCF_DO_SUBSTR) {
4046 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4047 data->pos_min += trie->minlen;
4048 data->pos_delta += (trie->maxlen - trie->minlen);
4049 if (trie->maxlen != trie->minlen)
4050 data->longest = &(data->longest_float);
4052 if (trie->jump) /* no more substrings -- for now /grr*/
4053 flags &= ~SCF_DO_SUBSTR;
4055 #endif /* old or new */
4056 #endif /* TRIE_STUDY_OPT */
4058 /* Else: zero-length, ignore. */
4059 scan = regnext(scan);
4064 stopparen = frame->stop;
4065 frame = frame->prev;
4066 goto fake_study_recurse;
4071 DEBUG_STUDYDATA("pre-fin:",data,depth);
4074 *deltap = is_inf_internal ? I32_MAX : delta;
4075 if (flags & SCF_DO_SUBSTR && is_inf)
4076 data->pos_delta = I32_MAX - data->pos_min;
4077 if (is_par > (I32)U8_MAX)
4079 if (is_par && pars==1 && data) {
4080 data->flags |= SF_IN_PAR;
4081 data->flags &= ~SF_HAS_PAR;
4083 else if (pars && data) {
4084 data->flags |= SF_HAS_PAR;
4085 data->flags &= ~SF_IN_PAR;
4087 if (flags & SCF_DO_STCLASS_OR)
4088 cl_and(data->start_class, and_withp);
4089 if (flags & SCF_TRIE_RESTUDY)
4090 data->flags |= SCF_TRIE_RESTUDY;
4092 DEBUG_STUDYDATA("post-fin:",data,depth);
4094 return min < stopmin ? min : stopmin;
4098 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4100 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4102 PERL_ARGS_ASSERT_ADD_DATA;
4104 Renewc(RExC_rxi->data,
4105 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4106 char, struct reg_data);
4108 Renew(RExC_rxi->data->what, count + n, U8);
4110 Newx(RExC_rxi->data->what, n, U8);
4111 RExC_rxi->data->count = count + n;
4112 Copy(s, RExC_rxi->data->what + count, n, U8);
4116 /*XXX: todo make this not included in a non debugging perl */
4117 #ifndef PERL_IN_XSUB_RE
4119 Perl_reginitcolors(pTHX)
4122 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4124 char *t = savepv(s);
4128 t = strchr(t, '\t');
4134 PL_colors[i] = t = (char *)"";
4139 PL_colors[i++] = (char *)"";
4146 #ifdef TRIE_STUDY_OPT
4147 #define CHECK_RESTUDY_GOTO \
4149 (data.flags & SCF_TRIE_RESTUDY) \
4153 #define CHECK_RESTUDY_GOTO
4157 - pregcomp - compile a regular expression into internal code
4159 * We can't allocate space until we know how big the compiled form will be,
4160 * but we can't compile it (and thus know how big it is) until we've got a
4161 * place to put the code. So we cheat: we compile it twice, once with code
4162 * generation turned off and size counting turned on, and once "for real".
4163 * This also means that we don't allocate space until we are sure that the
4164 * thing really will compile successfully, and we never have to move the
4165 * code and thus invalidate pointers into it. (Note that it has to be in
4166 * one piece because free() must be able to free it all.) [NB: not true in perl]
4168 * Beware that the optimization-preparation code in here knows about some
4169 * of the structure of the compiled regexp. [I'll say.]
4174 #ifndef PERL_IN_XSUB_RE
4175 #define RE_ENGINE_PTR &PL_core_reg_engine
4177 extern const struct regexp_engine my_reg_engine;
4178 #define RE_ENGINE_PTR &my_reg_engine
4181 #ifndef PERL_IN_XSUB_RE
4183 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4186 HV * const table = GvHV(PL_hintgv);
4188 PERL_ARGS_ASSERT_PREGCOMP;
4190 /* Dispatch a request to compile a regexp to correct
4193 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4194 GET_RE_DEBUG_FLAGS_DECL;
4195 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4196 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4198 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4201 return CALLREGCOMP_ENG(eng, pattern, flags);
4204 return Perl_re_compile(aTHX_ pattern, flags);
4209 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4214 register regexp_internal *ri;
4216 char *exp = SvPV(pattern, plen);
4217 char* xend = exp + plen;
4224 RExC_state_t RExC_state;
4225 RExC_state_t * const pRExC_state = &RExC_state;
4226 #ifdef TRIE_STUDY_OPT
4228 RExC_state_t copyRExC_state;
4230 GET_RE_DEBUG_FLAGS_DECL;
4232 PERL_ARGS_ASSERT_RE_COMPILE;
4234 DEBUG_r(if (!PL_colorset) reginitcolors());
4236 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4239 SV *dsv= sv_newmortal();
4240 RE_PV_QUOTED_DECL(s, RExC_utf8,
4241 dsv, exp, plen, 60);
4242 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4243 PL_colors[4],PL_colors[5],s);
4248 RExC_flags = pm_flags;
4252 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4253 RExC_seen_evals = 0;
4256 /* First pass: determine size, legality. */
4264 RExC_emit = &PL_regdummy;
4265 RExC_whilem_seen = 0;
4266 RExC_charnames = NULL;
4267 RExC_open_parens = NULL;
4268 RExC_close_parens = NULL;
4270 RExC_paren_names = NULL;
4272 RExC_paren_name_list = NULL;
4274 RExC_recurse = NULL;
4275 RExC_recurse_count = 0;
4277 #if 0 /* REGC() is (currently) a NOP at the first pass.
4278 * Clever compilers notice this and complain. --jhi */
4279 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4281 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4282 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4283 RExC_precomp = NULL;
4286 if (RExC_utf8 && !RExC_orig_utf8) {
4287 /* It's possible to write a regexp in ascii that represents Unicode
4288 codepoints outside of the byte range, such as via \x{100}. If we
4289 detect such a sequence we have to convert the entire pattern to utf8
4290 and then recompile, as our sizing calculation will have been based
4291 on 1 byte == 1 character, but we will need to use utf8 to encode
4292 at least some part of the pattern, and therefore must convert the whole
4294 XXX: somehow figure out how to make this less expensive...
4297 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4298 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4299 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4301 RExC_orig_utf8 = RExC_utf8;
4303 goto redo_first_pass;
4306 PerlIO_printf(Perl_debug_log,
4307 "Required size %"IVdf" nodes\n"
4308 "Starting second pass (creation)\n",
4311 RExC_lastparse=NULL;
4313 /* Small enough for pointer-storage convention?
4314 If extralen==0, this means that we will not need long jumps. */
4315 if (RExC_size >= 0x10000L && RExC_extralen)
4316 RExC_size += RExC_extralen;
4319 if (RExC_whilem_seen > 15)
4320 RExC_whilem_seen = 15;
4322 /* Allocate space and zero-initialize. Note, the two step process
4323 of zeroing when in debug mode, thus anything assigned has to
4324 happen after that */
4325 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4326 r = (struct regexp*)SvANY(rx);
4327 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4328 char, regexp_internal);
4329 if ( r == NULL || ri == NULL )
4330 FAIL("Regexp out of space");
4332 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4333 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4335 /* bulk initialize base fields with 0. */
4336 Zero(ri, sizeof(regexp_internal), char);
4339 /* non-zero initialization begins here */
4341 r->engine= RE_ENGINE_PTR;
4342 r->extflags = pm_flags;
4344 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4345 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4346 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4347 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4348 >> RXf_PMf_STD_PMMOD_SHIFT);
4349 const char *fptr = STD_PAT_MODS; /*"msix"*/
4351 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4352 + (sizeof(STD_PAT_MODS) - 1)
4353 + (sizeof("(?:)") - 1);
4355 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4356 SvCUR_set(rx, wraplen);
4358 SvFLAGS(rx) |= SvUTF8(pattern);
4361 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4363 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4364 char *colon = r + 1;
4367 while((ch = *fptr++)) {
4381 Copy(RExC_precomp, p, plen, char);
4382 assert ((RX_WRAPPED(rx) - p) < 16);
4383 r->pre_prefix = p - RX_WRAPPED(rx);
4392 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4394 if (RExC_seen & REG_SEEN_RECURSE) {
4395 Newxz(RExC_open_parens, RExC_npar,regnode *);
4396 SAVEFREEPV(RExC_open_parens);
4397 Newxz(RExC_close_parens,RExC_npar,regnode *);
4398 SAVEFREEPV(RExC_close_parens);
4401 /* Useful during FAIL. */
4402 #ifdef RE_TRACK_PATTERN_OFFSETS
4403 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4404 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4405 "%s %"UVuf" bytes for offset annotations.\n",
4406 ri->u.offsets ? "Got" : "Couldn't get",
4407 (UV)((2*RExC_size+1) * sizeof(U32))));
4409 SetProgLen(ri,RExC_size);
4413 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4415 /* Second pass: emit code. */
4416 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4421 RExC_emit_start = ri->program;
4422 RExC_emit = ri->program;
4423 RExC_emit_bound = ri->program + RExC_size + 1;
4425 /* Store the count of eval-groups for security checks: */
4426 RExC_rx->seen_evals = RExC_seen_evals;
4427 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4428 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4432 /* XXXX To minimize changes to RE engine we always allocate
4433 3-units-long substrs field. */
4434 Newx(r->substrs, 1, struct reg_substr_data);
4435 if (RExC_recurse_count) {
4436 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4437 SAVEFREEPV(RExC_recurse);
4441 r->minlen = minlen = sawplus = sawopen = 0;
4442 Zero(r->substrs, 1, struct reg_substr_data);
4444 #ifdef TRIE_STUDY_OPT
4446 StructCopy(&zero_scan_data, &data, scan_data_t);
4447 copyRExC_state = RExC_state;
4450 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4452 RExC_state = copyRExC_state;
4453 if (seen & REG_TOP_LEVEL_BRANCHES)
4454 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4456 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4457 if (data.last_found) {
4458 SvREFCNT_dec(data.longest_fixed);
4459 SvREFCNT_dec(data.longest_float);
4460 SvREFCNT_dec(data.last_found);
4462 StructCopy(&zero_scan_data, &data, scan_data_t);
4465 StructCopy(&zero_scan_data, &data, scan_data_t);
4468 /* Dig out information for optimizations. */
4469 r->extflags = RExC_flags; /* was pm_op */
4470 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4473 SvUTF8_on(rx); /* Unicode in it? */
4474 ri->regstclass = NULL;
4475 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4476 r->intflags |= PREGf_NAUGHTY;
4477 scan = ri->program + 1; /* First BRANCH. */
4479 /* testing for BRANCH here tells us whether there is "must appear"
4480 data in the pattern. If there is then we can use it for optimisations */
4481 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4483 STRLEN longest_float_length, longest_fixed_length;
4484 struct regnode_charclass_class ch_class; /* pointed to by data */
4486 I32 last_close = 0; /* pointed to by data */
4487 regnode *first= scan;
4488 regnode *first_next= regnext(first);
4491 * Skip introductions and multiplicators >= 1
4492 * so that we can extract the 'meat' of the pattern that must
4493 * match in the large if() sequence following.
4494 * NOTE that EXACT is NOT covered here, as it is normally
4495 * picked up by the optimiser separately.
4497 * This is unfortunate as the optimiser isnt handling lookahead
4498 * properly currently.
4501 while ((OP(first) == OPEN && (sawopen = 1)) ||
4502 /* An OR of *one* alternative - should not happen now. */
4503 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4504 /* for now we can't handle lookbehind IFMATCH*/
4505 (OP(first) == IFMATCH && !first->flags) ||
4506 (OP(first) == PLUS) ||
4507 (OP(first) == MINMOD) ||
4508 /* An {n,m} with n>0 */
4509 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4510 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4513 * the only op that could be a regnode is PLUS, all the rest
4514 * will be regnode_1 or regnode_2.
4517 if (OP(first) == PLUS)
4520 first += regarglen[OP(first)];
4522 first = NEXTOPER(first);
4523 first_next= regnext(first);
4526 /* Starting-point info. */
4528 DEBUG_PEEP("first:",first,0);
4529 /* Ignore EXACT as we deal with it later. */
4530 if (PL_regkind[OP(first)] == EXACT) {
4531 if (OP(first) == EXACT)
4532 NOOP; /* Empty, get anchored substr later. */
4533 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4534 ri->regstclass = first;
4537 else if (PL_regkind[OP(first)] == TRIE &&
4538 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4541 /* this can happen only on restudy */
4542 if ( OP(first) == TRIE ) {
4543 struct regnode_1 *trieop = (struct regnode_1 *)
4544 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4545 StructCopy(first,trieop,struct regnode_1);
4546 trie_op=(regnode *)trieop;
4548 struct regnode_charclass *trieop = (struct regnode_charclass *)
4549 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4550 StructCopy(first,trieop,struct regnode_charclass);
4551 trie_op=(regnode *)trieop;
4554 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4555 ri->regstclass = trie_op;
4558 else if (strchr((const char*)PL_simple,OP(first)))
4559 ri->regstclass = first;
4560 else if (PL_regkind[OP(first)] == BOUND ||
4561 PL_regkind[OP(first)] == NBOUND)
4562 ri->regstclass = first;
4563 else if (PL_regkind[OP(first)] == BOL) {
4564 r->extflags |= (OP(first) == MBOL
4566 : (OP(first) == SBOL
4569 first = NEXTOPER(first);
4572 else if (OP(first) == GPOS) {
4573 r->extflags |= RXf_ANCH_GPOS;
4574 first = NEXTOPER(first);
4577 else if ((!sawopen || !RExC_sawback) &&
4578 (OP(first) == STAR &&
4579 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4580 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4582 /* turn .* into ^.* with an implied $*=1 */
4584 (OP(NEXTOPER(first)) == REG_ANY)
4587 r->extflags |= type;
4588 r->intflags |= PREGf_IMPLICIT;
4589 first = NEXTOPER(first);
4592 if (sawplus && (!sawopen || !RExC_sawback)
4593 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4594 /* x+ must match at the 1st pos of run of x's */
4595 r->intflags |= PREGf_SKIP;
4597 /* Scan is after the zeroth branch, first is atomic matcher. */
4598 #ifdef TRIE_STUDY_OPT
4601 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4602 (IV)(first - scan + 1))
4606 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4607 (IV)(first - scan + 1))
4613 * If there's something expensive in the r.e., find the
4614 * longest literal string that must appear and make it the
4615 * regmust. Resolve ties in favor of later strings, since
4616 * the regstart check works with the beginning of the r.e.
4617 * and avoiding duplication strengthens checking. Not a
4618 * strong reason, but sufficient in the absence of others.
4619 * [Now we resolve ties in favor of the earlier string if
4620 * it happens that c_offset_min has been invalidated, since the
4621 * earlier string may buy us something the later one won't.]
4624 data.longest_fixed = newSVpvs("");
4625 data.longest_float = newSVpvs("");
4626 data.last_found = newSVpvs("");
4627 data.longest = &(data.longest_fixed);
4629 if (!ri->regstclass) {
4630 cl_init(pRExC_state, &ch_class);
4631 data.start_class = &ch_class;
4632 stclass_flag = SCF_DO_STCLASS_AND;
4633 } else /* XXXX Check for BOUND? */
4635 data.last_closep = &last_close;
4637 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4638 &data, -1, NULL, NULL,
4639 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4645 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4646 && data.last_start_min == 0 && data.last_end > 0
4647 && !RExC_seen_zerolen
4648 && !(RExC_seen & REG_SEEN_VERBARG)
4649 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4650 r->extflags |= RXf_CHECK_ALL;
4651 scan_commit(pRExC_state, &data,&minlen,0);
4652 SvREFCNT_dec(data.last_found);
4654 /* Note that code very similar to this but for anchored string
4655 follows immediately below, changes may need to be made to both.
4658 longest_float_length = CHR_SVLEN(data.longest_float);
4659 if (longest_float_length
4660 || (data.flags & SF_FL_BEFORE_EOL
4661 && (!(data.flags & SF_FL_BEFORE_MEOL)
4662 || (RExC_flags & RXf_PMf_MULTILINE))))
4666 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4667 && data.offset_fixed == data.offset_float_min
4668 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4669 goto remove_float; /* As in (a)+. */
4671 /* copy the information about the longest float from the reg_scan_data
4672 over to the program. */
4673 if (SvUTF8(data.longest_float)) {
4674 r->float_utf8 = data.longest_float;
4675 r->float_substr = NULL;
4677 r->float_substr = data.longest_float;
4678 r->float_utf8 = NULL;
4680 /* float_end_shift is how many chars that must be matched that
4681 follow this item. We calculate it ahead of time as once the
4682 lookbehind offset is added in we lose the ability to correctly
4684 ml = data.minlen_float ? *(data.minlen_float)
4685 : (I32)longest_float_length;
4686 r->float_end_shift = ml - data.offset_float_min
4687 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4688 + data.lookbehind_float;
4689 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4690 r->float_max_offset = data.offset_float_max;
4691 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4692 r->float_max_offset -= data.lookbehind_float;
4694 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4695 && (!(data.flags & SF_FL_BEFORE_MEOL)
4696 || (RExC_flags & RXf_PMf_MULTILINE)));
4697 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4701 r->float_substr = r->float_utf8 = NULL;
4702 SvREFCNT_dec(data.longest_float);
4703 longest_float_length = 0;
4706 /* Note that code very similar to this but for floating string
4707 is immediately above, changes may need to be made to both.
4710 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4711 if (longest_fixed_length
4712 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4713 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4714 || (RExC_flags & RXf_PMf_MULTILINE))))
4718 /* copy the information about the longest fixed
4719 from the reg_scan_data over to the program. */
4720 if (SvUTF8(data.longest_fixed)) {
4721 r->anchored_utf8 = data.longest_fixed;
4722 r->anchored_substr = NULL;
4724 r->anchored_substr = data.longest_fixed;
4725 r->anchored_utf8 = NULL;
4727 /* fixed_end_shift is how many chars that must be matched that
4728 follow this item. We calculate it ahead of time as once the
4729 lookbehind offset is added in we lose the ability to correctly
4731 ml = data.minlen_fixed ? *(data.minlen_fixed)
4732 : (I32)longest_fixed_length;
4733 r->anchored_end_shift = ml - data.offset_fixed
4734 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4735 + data.lookbehind_fixed;
4736 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4738 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4739 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4740 || (RExC_flags & RXf_PMf_MULTILINE)));
4741 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4744 r->anchored_substr = r->anchored_utf8 = NULL;
4745 SvREFCNT_dec(data.longest_fixed);
4746 longest_fixed_length = 0;
4749 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4750 ri->regstclass = NULL;
4751 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4753 && !(data.start_class->flags & ANYOF_EOS)
4754 && !cl_is_anything(data.start_class))
4756 const U32 n = add_data(pRExC_state, 1, "f");
4758 Newx(RExC_rxi->data->data[n], 1,
4759 struct regnode_charclass_class);
4760 StructCopy(data.start_class,
4761 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4762 struct regnode_charclass_class);
4763 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4764 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4765 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4766 regprop(r, sv, (regnode*)data.start_class);
4767 PerlIO_printf(Perl_debug_log,
4768 "synthetic stclass \"%s\".\n",
4769 SvPVX_const(sv));});
4772 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4773 if (longest_fixed_length > longest_float_length) {
4774 r->check_end_shift = r->anchored_end_shift;
4775 r->check_substr = r->anchored_substr;
4776 r->check_utf8 = r->anchored_utf8;
4777 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4778 if (r->extflags & RXf_ANCH_SINGLE)
4779 r->extflags |= RXf_NOSCAN;
4782 r->check_end_shift = r->float_end_shift;
4783 r->check_substr = r->float_substr;
4784 r->check_utf8 = r->float_utf8;
4785 r->check_offset_min = r->float_min_offset;
4786 r->check_offset_max = r->float_max_offset;
4788 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4789 This should be changed ASAP! */
4790 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4791 r->extflags |= RXf_USE_INTUIT;
4792 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4793 r->extflags |= RXf_INTUIT_TAIL;
4795 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4796 if ( (STRLEN)minlen < longest_float_length )
4797 minlen= longest_float_length;
4798 if ( (STRLEN)minlen < longest_fixed_length )
4799 minlen= longest_fixed_length;
4803 /* Several toplevels. Best we can is to set minlen. */
4805 struct regnode_charclass_class ch_class;
4808 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4810 scan = ri->program + 1;
4811 cl_init(pRExC_state, &ch_class);
4812 data.start_class = &ch_class;
4813 data.last_closep = &last_close;
4816 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4817 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4821 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4822 = r->float_substr = r->float_utf8 = NULL;
4823 if (!(data.start_class->flags & ANYOF_EOS)
4824 && !cl_is_anything(data.start_class))
4826 const U32 n = add_data(pRExC_state, 1, "f");
4828 Newx(RExC_rxi->data->data[n], 1,
4829 struct regnode_charclass_class);
4830 StructCopy(data.start_class,
4831 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4832 struct regnode_charclass_class);
4833 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4834 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4835 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4836 regprop(r, sv, (regnode*)data.start_class);
4837 PerlIO_printf(Perl_debug_log,
4838 "synthetic stclass \"%s\".\n",
4839 SvPVX_const(sv));});
4843 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4844 the "real" pattern. */
4846 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4847 (IV)minlen, (IV)r->minlen);
4849 r->minlenret = minlen;
4850 if (r->minlen < minlen)
4853 if (RExC_seen & REG_SEEN_GPOS)
4854 r->extflags |= RXf_GPOS_SEEN;
4855 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4856 r->extflags |= RXf_LOOKBEHIND_SEEN;
4857 if (RExC_seen & REG_SEEN_EVAL)
4858 r->extflags |= RXf_EVAL_SEEN;
4859 if (RExC_seen & REG_SEEN_CANY)
4860 r->extflags |= RXf_CANY_SEEN;
4861 if (RExC_seen & REG_SEEN_VERBARG)
4862 r->intflags |= PREGf_VERBARG_SEEN;
4863 if (RExC_seen & REG_SEEN_CUTGROUP)
4864 r->intflags |= PREGf_CUTGROUP_SEEN;
4865 if (RExC_paren_names)
4866 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4868 RXp_PAREN_NAMES(r) = NULL;
4870 #ifdef STUPID_PATTERN_CHECKS
4871 if (RX_PRELEN(rx) == 0)
4872 r->extflags |= RXf_NULL;
4873 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4874 /* XXX: this should happen BEFORE we compile */
4875 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4876 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4877 r->extflags |= RXf_WHITE;
4878 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4879 r->extflags |= RXf_START_ONLY;
4881 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4882 /* XXX: this should happen BEFORE we compile */
4883 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4885 regnode *first = ri->program + 1;
4887 U8 nop = OP(NEXTOPER(first));
4889 if (PL_regkind[fop] == NOTHING && nop == END)
4890 r->extflags |= RXf_NULL;
4891 else if (PL_regkind[fop] == BOL && nop == END)
4892 r->extflags |= RXf_START_ONLY;
4893 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4894 r->extflags |= RXf_WHITE;
4898 if (RExC_paren_names) {
4899 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4900 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4903 ri->name_list_idx = 0;
4905 if (RExC_recurse_count) {
4906 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4907 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4908 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4911 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4912 /* assume we don't need to swap parens around before we match */
4915 PerlIO_printf(Perl_debug_log,"Final program:\n");
4918 #ifdef RE_TRACK_PATTERN_OFFSETS
4919 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4920 const U32 len = ri->u.offsets[0];
4922 GET_RE_DEBUG_FLAGS_DECL;
4923 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4924 for (i = 1; i <= len; i++) {
4925 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4926 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4927 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4929 PerlIO_printf(Perl_debug_log, "\n");
4935 #undef RE_ENGINE_PTR
4939 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4942 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4944 PERL_UNUSED_ARG(value);
4946 if (flags & RXapif_FETCH) {
4947 return reg_named_buff_fetch(rx, key, flags);
4948 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4949 Perl_croak(aTHX_ "%s", PL_no_modify);
4951 } else if (flags & RXapif_EXISTS) {
4952 return reg_named_buff_exists(rx, key, flags)
4955 } else if (flags & RXapif_REGNAMES) {
4956 return reg_named_buff_all(rx, flags);
4957 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4958 return reg_named_buff_scalar(rx, flags);
4960 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4966 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4969 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4970 PERL_UNUSED_ARG(lastkey);
4972 if (flags & RXapif_FIRSTKEY)
4973 return reg_named_buff_firstkey(rx, flags);
4974 else if (flags & RXapif_NEXTKEY)
4975 return reg_named_buff_nextkey(rx, flags);
4977 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4983 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4986 AV *retarray = NULL;
4988 struct regexp *const rx = (struct regexp *)SvANY(r);
4990 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4992 if (flags & RXapif_ALL)
4995 if (rx && RXp_PAREN_NAMES(rx)) {
4996 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4999 SV* sv_dat=HeVAL(he_str);
5000 I32 *nums=(I32*)SvPVX(sv_dat);
5001 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5002 if ((I32)(rx->nparens) >= nums[i]
5003 && rx->offs[nums[i]].start != -1
5004 && rx->offs[nums[i]].end != -1)
5007 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5011 ret = newSVsv(&PL_sv_undef);
5014 av_push(retarray, ret);
5017 return newRV_noinc(MUTABLE_SV(retarray));
5024 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5027 struct regexp *const rx = (struct regexp *)SvANY(r);
5029 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5031 if (rx && RXp_PAREN_NAMES(rx)) {
5032 if (flags & RXapif_ALL) {
5033 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5035 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5049 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5051 struct regexp *const rx = (struct regexp *)SvANY(r);
5053 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5055 if ( rx && RXp_PAREN_NAMES(rx) ) {
5056 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5058 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5065 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5067 struct regexp *const rx = (struct regexp *)SvANY(r);
5068 GET_RE_DEBUG_FLAGS_DECL;
5070 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5072 if (rx && RXp_PAREN_NAMES(rx)) {
5073 HV *hv = RXp_PAREN_NAMES(rx);
5075 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5078 SV* sv_dat = HeVAL(temphe);
5079 I32 *nums = (I32*)SvPVX(sv_dat);
5080 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5081 if ((I32)(rx->lastparen) >= nums[i] &&
5082 rx->offs[nums[i]].start != -1 &&
5083 rx->offs[nums[i]].end != -1)
5089 if (parno || flags & RXapif_ALL) {
5090 return newSVhek(HeKEY_hek(temphe));
5098 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5103 struct regexp *const rx = (struct regexp *)SvANY(r);
5105 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5107 if (rx && RXp_PAREN_NAMES(rx)) {
5108 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5109 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5110 } else if (flags & RXapif_ONE) {
5111 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5112 av = MUTABLE_AV(SvRV(ret));
5113 length = av_len(av);
5115 return newSViv(length + 1);
5117 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5121 return &PL_sv_undef;
5125 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5127 struct regexp *const rx = (struct regexp *)SvANY(r);
5130 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5132 if (rx && RXp_PAREN_NAMES(rx)) {
5133 HV *hv= RXp_PAREN_NAMES(rx);
5135 (void)hv_iterinit(hv);
5136 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5139 SV* sv_dat = HeVAL(temphe);
5140 I32 *nums = (I32*)SvPVX(sv_dat);
5141 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5142 if ((I32)(rx->lastparen) >= nums[i] &&
5143 rx->offs[nums[i]].start != -1 &&
5144 rx->offs[nums[i]].end != -1)
5150 if (parno || flags & RXapif_ALL) {
5151 av_push(av, newSVhek(HeKEY_hek(temphe)));
5156 return newRV_noinc(MUTABLE_SV(av));
5160 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5163 struct regexp *const rx = (struct regexp *)SvANY(r);
5168 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5171 sv_setsv(sv,&PL_sv_undef);
5175 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5177 i = rx->offs[0].start;
5181 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5183 s = rx->subbeg + rx->offs[0].end;
5184 i = rx->sublen - rx->offs[0].end;
5187 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5188 (s1 = rx->offs[paren].start) != -1 &&
5189 (t1 = rx->offs[paren].end) != -1)
5193 s = rx->subbeg + s1;
5195 sv_setsv(sv,&PL_sv_undef);
5198 assert(rx->sublen >= (s - rx->subbeg) + i );
5200 const int oldtainted = PL_tainted;
5202 sv_setpvn(sv, s, i);
5203 PL_tainted = oldtainted;
5204 if ( (rx->extflags & RXf_CANY_SEEN)
5205 ? (RXp_MATCH_UTF8(rx)
5206 && (!i || is_utf8_string((U8*)s, i)))
5207 : (RXp_MATCH_UTF8(rx)) )
5214 if (RXp_MATCH_TAINTED(rx)) {
5215 if (SvTYPE(sv) >= SVt_PVMG) {
5216 MAGIC* const mg = SvMAGIC(sv);
5219 SvMAGIC_set(sv, mg->mg_moremagic);
5221 if ((mgt = SvMAGIC(sv))) {
5222 mg->mg_moremagic = mgt;
5223 SvMAGIC_set(sv, mg);
5233 sv_setsv(sv,&PL_sv_undef);
5239 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5240 SV const * const value)
5242 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5244 PERL_UNUSED_ARG(rx);
5245 PERL_UNUSED_ARG(paren);
5246 PERL_UNUSED_ARG(value);
5249 Perl_croak(aTHX_ "%s", PL_no_modify);
5253 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5256 struct regexp *const rx = (struct regexp *)SvANY(r);
5260 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5262 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5264 /* $` / ${^PREMATCH} */
5265 case RX_BUFF_IDX_PREMATCH:
5266 if (rx->offs[0].start != -1) {
5267 i = rx->offs[0].start;
5275 /* $' / ${^POSTMATCH} */
5276 case RX_BUFF_IDX_POSTMATCH:
5277 if (rx->offs[0].end != -1) {
5278 i = rx->sublen - rx->offs[0].end;
5280 s1 = rx->offs[0].end;
5286 /* $& / ${^MATCH}, $1, $2, ... */
5288 if (paren <= (I32)rx->nparens &&
5289 (s1 = rx->offs[paren].start) != -1 &&
5290 (t1 = rx->offs[paren].end) != -1)
5295 if (ckWARN(WARN_UNINITIALIZED))
5296 report_uninit((const SV *)sv);
5301 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5302 const char * const s = rx->subbeg + s1;
5307 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5314 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5316 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5317 PERL_UNUSED_ARG(rx);
5321 return newSVpvs("Regexp");
5324 /* Scans the name of a named buffer from the pattern.
5325 * If flags is REG_RSN_RETURN_NULL returns null.
5326 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5327 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5328 * to the parsed name as looked up in the RExC_paren_names hash.
5329 * If there is an error throws a vFAIL().. type exception.
5332 #define REG_RSN_RETURN_NULL 0
5333 #define REG_RSN_RETURN_NAME 1
5334 #define REG_RSN_RETURN_DATA 2
5337 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5339 char *name_start = RExC_parse;
5341 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5343 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5344 /* skip IDFIRST by using do...while */
5347 RExC_parse += UTF8SKIP(RExC_parse);
5348 } while (isALNUM_utf8((U8*)RExC_parse));
5352 } while (isALNUM(*RExC_parse));
5357 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5358 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5359 if ( flags == REG_RSN_RETURN_NAME)
5361 else if (flags==REG_RSN_RETURN_DATA) {
5364 if ( ! sv_name ) /* should not happen*/
5365 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5366 if (RExC_paren_names)
5367 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5369 sv_dat = HeVAL(he_str);
5371 vFAIL("Reference to nonexistent named group");
5375 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5382 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5383 int rem=(int)(RExC_end - RExC_parse); \
5392 if (RExC_lastparse!=RExC_parse) \
5393 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5396 iscut ? "..." : "<" \
5399 PerlIO_printf(Perl_debug_log,"%16s",""); \
5402 num = RExC_size + 1; \
5404 num=REG_NODE_NUM(RExC_emit); \
5405 if (RExC_lastnum!=num) \
5406 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5408 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5409 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5410 (int)((depth*2)), "", \
5414 RExC_lastparse=RExC_parse; \
5419 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5420 DEBUG_PARSE_MSG((funcname)); \
5421 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5423 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5424 DEBUG_PARSE_MSG((funcname)); \
5425 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5428 - reg - regular expression, i.e. main body or parenthesized thing
5430 * Caller must absorb opening parenthesis.
5432 * Combining parenthesis handling with the base level of regular expression
5433 * is a trifle forced, but the need to tie the tails of the branches to what
5434 * follows makes it hard to avoid.
5436 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5438 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5440 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5444 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5445 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5448 register regnode *ret; /* Will be the head of the group. */
5449 register regnode *br;
5450 register regnode *lastbr;
5451 register regnode *ender = NULL;
5452 register I32 parno = 0;
5454 U32 oregflags = RExC_flags;
5455 bool have_branch = 0;
5457 I32 freeze_paren = 0;
5458 I32 after_freeze = 0;
5460 /* for (?g), (?gc), and (?o) warnings; warning
5461 about (?c) will warn about (?g) -- japhy */
5463 #define WASTED_O 0x01
5464 #define WASTED_G 0x02
5465 #define WASTED_C 0x04
5466 #define WASTED_GC (0x02|0x04)
5467 I32 wastedflags = 0x00;
5469 char * parse_start = RExC_parse; /* MJD */
5470 char * const oregcomp_parse = RExC_parse;
5472 GET_RE_DEBUG_FLAGS_DECL;
5474 PERL_ARGS_ASSERT_REG;
5475 DEBUG_PARSE("reg ");
5477 *flagp = 0; /* Tentatively. */
5480 /* Make an OPEN node, if parenthesized. */
5482 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5483 char *start_verb = RExC_parse;
5484 STRLEN verb_len = 0;
5485 char *start_arg = NULL;
5486 unsigned char op = 0;
5488 int internal_argval = 0; /* internal_argval is only useful if !argok */
5489 while ( *RExC_parse && *RExC_parse != ')' ) {
5490 if ( *RExC_parse == ':' ) {
5491 start_arg = RExC_parse + 1;
5497 verb_len = RExC_parse - start_verb;
5500 while ( *RExC_parse && *RExC_parse != ')' )
5502 if ( *RExC_parse != ')' )
5503 vFAIL("Unterminated verb pattern argument");
5504 if ( RExC_parse == start_arg )
5507 if ( *RExC_parse != ')' )
5508 vFAIL("Unterminated verb pattern");
5511 switch ( *start_verb ) {
5512 case 'A': /* (*ACCEPT) */
5513 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5515 internal_argval = RExC_nestroot;
5518 case 'C': /* (*COMMIT) */
5519 if ( memEQs(start_verb,verb_len,"COMMIT") )
5522 case 'F': /* (*FAIL) */
5523 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5528 case ':': /* (*:NAME) */
5529 case 'M': /* (*MARK:NAME) */
5530 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5535 case 'P': /* (*PRUNE) */
5536 if ( memEQs(start_verb,verb_len,"PRUNE") )
5539 case 'S': /* (*SKIP) */
5540 if ( memEQs(start_verb,verb_len,"SKIP") )
5543 case 'T': /* (*THEN) */
5544 /* [19:06] <TimToady> :: is then */
5545 if ( memEQs(start_verb,verb_len,"THEN") ) {
5547 RExC_seen |= REG_SEEN_CUTGROUP;
5553 vFAIL3("Unknown verb pattern '%.*s'",
5554 verb_len, start_verb);
5557 if ( start_arg && internal_argval ) {
5558 vFAIL3("Verb pattern '%.*s' may not have an argument",
5559 verb_len, start_verb);
5560 } else if ( argok < 0 && !start_arg ) {
5561 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5562 verb_len, start_verb);
5564 ret = reganode(pRExC_state, op, internal_argval);
5565 if ( ! internal_argval && ! SIZE_ONLY ) {
5567 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5568 ARG(ret) = add_data( pRExC_state, 1, "S" );
5569 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5576 if (!internal_argval)
5577 RExC_seen |= REG_SEEN_VERBARG;
5578 } else if ( start_arg ) {
5579 vFAIL3("Verb pattern '%.*s' may not have an argument",
5580 verb_len, start_verb);
5582 ret = reg_node(pRExC_state, op);
5584 nextchar(pRExC_state);
5587 if (*RExC_parse == '?') { /* (?...) */
5588 bool is_logical = 0;
5589 const char * const seqstart = RExC_parse;
5592 paren = *RExC_parse++;
5593 ret = NULL; /* For look-ahead/behind. */
5596 case 'P': /* (?P...) variants for those used to PCRE/Python */
5597 paren = *RExC_parse++;
5598 if ( paren == '<') /* (?P<...>) named capture */
5600 else if (paren == '>') { /* (?P>name) named recursion */
5601 goto named_recursion;
5603 else if (paren == '=') { /* (?P=...) named backref */
5604 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5605 you change this make sure you change that */
5606 char* name_start = RExC_parse;
5608 SV *sv_dat = reg_scan_name(pRExC_state,
5609 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5610 if (RExC_parse == name_start || *RExC_parse != ')')
5611 vFAIL2("Sequence %.3s... not terminated",parse_start);
5614 num = add_data( pRExC_state, 1, "S" );
5615 RExC_rxi->data->data[num]=(void*)sv_dat;
5616 SvREFCNT_inc_simple_void(sv_dat);
5619 ret = reganode(pRExC_state,
5620 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5624 Set_Node_Offset(ret, parse_start+1);
5625 Set_Node_Cur_Length(ret); /* MJD */
5627 nextchar(pRExC_state);
5631 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5633 case '<': /* (?<...) */
5634 if (*RExC_parse == '!')
5636 else if (*RExC_parse != '=')
5642 case '\'': /* (?'...') */
5643 name_start= RExC_parse;
5644 svname = reg_scan_name(pRExC_state,
5645 SIZE_ONLY ? /* reverse test from the others */
5646 REG_RSN_RETURN_NAME :
5647 REG_RSN_RETURN_NULL);
5648 if (RExC_parse == name_start) {
5650 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5653 if (*RExC_parse != paren)
5654 vFAIL2("Sequence (?%c... not terminated",
5655 paren=='>' ? '<' : paren);
5659 if (!svname) /* shouldnt happen */
5661 "panic: reg_scan_name returned NULL");
5662 if (!RExC_paren_names) {
5663 RExC_paren_names= newHV();
5664 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5666 RExC_paren_name_list= newAV();
5667 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5670 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5672 sv_dat = HeVAL(he_str);
5674 /* croak baby croak */
5676 "panic: paren_name hash element allocation failed");
5677 } else if ( SvPOK(sv_dat) ) {
5678 /* (?|...) can mean we have dupes so scan to check
5679 its already been stored. Maybe a flag indicating
5680 we are inside such a construct would be useful,
5681 but the arrays are likely to be quite small, so
5682 for now we punt -- dmq */
5683 IV count = SvIV(sv_dat);
5684 I32 *pv = (I32*)SvPVX(sv_dat);
5686 for ( i = 0 ; i < count ; i++ ) {
5687 if ( pv[i] == RExC_npar ) {
5693 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5694 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5695 pv[count] = RExC_npar;
5696 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5699 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5700 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5702 SvIV_set(sv_dat, 1);
5705 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5706 SvREFCNT_dec(svname);
5709 /*sv_dump(sv_dat);*/
5711 nextchar(pRExC_state);
5713 goto capturing_parens;
5715 RExC_seen |= REG_SEEN_LOOKBEHIND;
5717 case '=': /* (?=...) */
5718 RExC_seen_zerolen++;
5720 case '!': /* (?!...) */
5721 RExC_seen_zerolen++;
5722 if (*RExC_parse == ')') {
5723 ret=reg_node(pRExC_state, OPFAIL);
5724 nextchar(pRExC_state);
5728 case '|': /* (?|...) */
5729 /* branch reset, behave like a (?:...) except that
5730 buffers in alternations share the same numbers */
5732 after_freeze = freeze_paren = RExC_npar;
5734 case ':': /* (?:...) */
5735 case '>': /* (?>...) */
5737 case '$': /* (?$...) */
5738 case '@': /* (?@...) */
5739 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5741 case '#': /* (?#...) */
5742 while (*RExC_parse && *RExC_parse != ')')
5744 if (*RExC_parse != ')')
5745 FAIL("Sequence (?#... not terminated");
5746 nextchar(pRExC_state);
5749 case '0' : /* (?0) */
5750 case 'R' : /* (?R) */
5751 if (*RExC_parse != ')')
5752 FAIL("Sequence (?R) not terminated");
5753 ret = reg_node(pRExC_state, GOSTART);
5754 *flagp |= POSTPONED;
5755 nextchar(pRExC_state);
5758 { /* named and numeric backreferences */
5760 case '&': /* (?&NAME) */
5761 parse_start = RExC_parse - 1;
5764 SV *sv_dat = reg_scan_name(pRExC_state,
5765 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5766 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5768 goto gen_recurse_regop;
5771 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5773 vFAIL("Illegal pattern");
5775 goto parse_recursion;
5777 case '-': /* (?-1) */
5778 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5779 RExC_parse--; /* rewind to let it be handled later */
5783 case '1': case '2': case '3': case '4': /* (?1) */
5784 case '5': case '6': case '7': case '8': case '9':
5787 num = atoi(RExC_parse);
5788 parse_start = RExC_parse - 1; /* MJD */
5789 if (*RExC_parse == '-')
5791 while (isDIGIT(*RExC_parse))
5793 if (*RExC_parse!=')')
5794 vFAIL("Expecting close bracket");
5797 if ( paren == '-' ) {
5799 Diagram of capture buffer numbering.
5800 Top line is the normal capture buffer numbers
5801 Botton line is the negative indexing as from
5805 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5809 num = RExC_npar + num;
5812 vFAIL("Reference to nonexistent group");
5814 } else if ( paren == '+' ) {
5815 num = RExC_npar + num - 1;
5818 ret = reganode(pRExC_state, GOSUB, num);
5820 if (num > (I32)RExC_rx->nparens) {
5822 vFAIL("Reference to nonexistent group");
5824 ARG2L_SET( ret, RExC_recurse_count++);
5826 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5827 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5831 RExC_seen |= REG_SEEN_RECURSE;
5832 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5833 Set_Node_Offset(ret, parse_start); /* MJD */
5835 *flagp |= POSTPONED;
5836 nextchar(pRExC_state);
5838 } /* named and numeric backreferences */
5841 case '?': /* (??...) */
5843 if (*RExC_parse != '{') {
5845 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5848 *flagp |= POSTPONED;
5849 paren = *RExC_parse++;
5851 case '{': /* (?{...}) */
5856 char *s = RExC_parse;
5858 RExC_seen_zerolen++;
5859 RExC_seen |= REG_SEEN_EVAL;
5860 while (count && (c = *RExC_parse)) {
5871 if (*RExC_parse != ')') {
5873 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5877 OP_4tree *sop, *rop;
5878 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5881 Perl_save_re_context(aTHX);
5882 rop = sv_compile_2op(sv, &sop, "re", &pad);
5883 sop->op_private |= OPpREFCOUNTED;
5884 /* re_dup will OpREFCNT_inc */
5885 OpREFCNT_set(sop, 1);
5888 n = add_data(pRExC_state, 3, "nop");
5889 RExC_rxi->data->data[n] = (void*)rop;
5890 RExC_rxi->data->data[n+1] = (void*)sop;
5891 RExC_rxi->data->data[n+2] = (void*)pad;
5894 else { /* First pass */
5895 if (PL_reginterp_cnt < ++RExC_seen_evals
5897 /* No compiled RE interpolated, has runtime
5898 components ===> unsafe. */
5899 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5900 if (PL_tainting && PL_tainted)
5901 FAIL("Eval-group in insecure regular expression");
5902 #if PERL_VERSION > 8
5903 if (IN_PERL_COMPILETIME)
5908 nextchar(pRExC_state);
5910 ret = reg_node(pRExC_state, LOGICAL);
5913 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5914 /* deal with the length of this later - MJD */
5917 ret = reganode(pRExC_state, EVAL, n);
5918 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5919 Set_Node_Offset(ret, parse_start);
5922 case '(': /* (?(?{...})...) and (?(?=...)...) */
5925 if (RExC_parse[0] == '?') { /* (?(?...)) */
5926 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5927 || RExC_parse[1] == '<'
5928 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5931 ret = reg_node(pRExC_state, LOGICAL);
5934 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5938 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5939 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5941 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5942 char *name_start= RExC_parse++;
5944 SV *sv_dat=reg_scan_name(pRExC_state,
5945 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5946 if (RExC_parse == name_start || *RExC_parse != ch)
5947 vFAIL2("Sequence (?(%c... not terminated",
5948 (ch == '>' ? '<' : ch));
5951 num = add_data( pRExC_state, 1, "S" );
5952 RExC_rxi->data->data[num]=(void*)sv_dat;
5953 SvREFCNT_inc_simple_void(sv_dat);
5955 ret = reganode(pRExC_state,NGROUPP,num);
5956 goto insert_if_check_paren;
5958 else if (RExC_parse[0] == 'D' &&
5959 RExC_parse[1] == 'E' &&
5960 RExC_parse[2] == 'F' &&
5961 RExC_parse[3] == 'I' &&
5962 RExC_parse[4] == 'N' &&
5963 RExC_parse[5] == 'E')
5965 ret = reganode(pRExC_state,DEFINEP,0);
5968 goto insert_if_check_paren;
5970 else if (RExC_parse[0] == 'R') {
5973 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5974 parno = atoi(RExC_parse++);
5975 while (isDIGIT(*RExC_parse))
5977 } else if (RExC_parse[0] == '&') {
5980 sv_dat = reg_scan_name(pRExC_state,
5981 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5982 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5984 ret = reganode(pRExC_state,INSUBP,parno);
5985 goto insert_if_check_paren;
5987 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5990 parno = atoi(RExC_parse++);
5992 while (isDIGIT(*RExC_parse))
5994 ret = reganode(pRExC_state, GROUPP, parno);
5996 insert_if_check_paren:
5997 if ((c = *nextchar(pRExC_state)) != ')')
5998 vFAIL("Switch condition not recognized");
6000 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6001 br = regbranch(pRExC_state, &flags, 1,depth+1);
6003 br = reganode(pRExC_state, LONGJMP, 0);
6005 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6006 c = *nextchar(pRExC_state);
6011 vFAIL("(?(DEFINE)....) does not allow branches");
6012 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6013 regbranch(pRExC_state, &flags, 1,depth+1);
6014 REGTAIL(pRExC_state, ret, lastbr);
6017 c = *nextchar(pRExC_state);
6022 vFAIL("Switch (?(condition)... contains too many branches");
6023 ender = reg_node(pRExC_state, TAIL);
6024 REGTAIL(pRExC_state, br, ender);
6026 REGTAIL(pRExC_state, lastbr, ender);
6027 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6030 REGTAIL(pRExC_state, ret, ender);
6031 RExC_size++; /* XXX WHY do we need this?!!
6032 For large programs it seems to be required
6033 but I can't figure out why. -- dmq*/
6037 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6041 RExC_parse--; /* for vFAIL to print correctly */
6042 vFAIL("Sequence (? incomplete");
6046 parse_flags: /* (?i) */
6048 U32 posflags = 0, negflags = 0;
6049 U32 *flagsp = &posflags;
6051 while (*RExC_parse) {
6052 /* && strchr("iogcmsx", *RExC_parse) */
6053 /* (?g), (?gc) and (?o) are useless here
6054 and must be globally applied -- japhy */
6055 switch (*RExC_parse) {
6056 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6057 case ONCE_PAT_MOD: /* 'o' */
6058 case GLOBAL_PAT_MOD: /* 'g' */
6059 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6060 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6061 if (! (wastedflags & wflagbit) ) {
6062 wastedflags |= wflagbit;
6065 "Useless (%s%c) - %suse /%c modifier",
6066 flagsp == &negflags ? "?-" : "?",
6068 flagsp == &negflags ? "don't " : "",
6075 case CONTINUE_PAT_MOD: /* 'c' */
6076 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6077 if (! (wastedflags & WASTED_C) ) {
6078 wastedflags |= WASTED_GC;
6081 "Useless (%sc) - %suse /gc modifier",
6082 flagsp == &negflags ? "?-" : "?",
6083 flagsp == &negflags ? "don't " : ""
6088 case KEEPCOPY_PAT_MOD: /* 'p' */
6089 if (flagsp == &negflags) {
6091 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6093 *flagsp |= RXf_PMf_KEEPCOPY;
6097 if (flagsp == &negflags) {
6099 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6103 wastedflags = 0; /* reset so (?g-c) warns twice */
6109 RExC_flags |= posflags;
6110 RExC_flags &= ~negflags;
6112 oregflags |= posflags;
6113 oregflags &= ~negflags;
6115 nextchar(pRExC_state);
6126 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6131 }} /* one for the default block, one for the switch */
6138 ret = reganode(pRExC_state, OPEN, parno);
6141 RExC_nestroot = parno;
6142 if (RExC_seen & REG_SEEN_RECURSE
6143 && !RExC_open_parens[parno-1])
6145 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6146 "Setting open paren #%"IVdf" to %d\n",
6147 (IV)parno, REG_NODE_NUM(ret)));
6148 RExC_open_parens[parno-1]= ret;
6151 Set_Node_Length(ret, 1); /* MJD */
6152 Set_Node_Offset(ret, RExC_parse); /* MJD */
6160 /* Pick up the branches, linking them together. */
6161 parse_start = RExC_parse; /* MJD */
6162 br = regbranch(pRExC_state, &flags, 1,depth+1);
6165 if (RExC_npar > after_freeze)
6166 after_freeze = RExC_npar;
6167 RExC_npar = freeze_paren;
6170 /* branch_len = (paren != 0); */
6174 if (*RExC_parse == '|') {
6175 if (!SIZE_ONLY && RExC_extralen) {
6176 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6179 reginsert(pRExC_state, BRANCH, br, depth+1);
6180 Set_Node_Length(br, paren != 0);
6181 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6185 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6187 else if (paren == ':') {
6188 *flagp |= flags&SIMPLE;
6190 if (is_open) { /* Starts with OPEN. */
6191 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6193 else if (paren != '?') /* Not Conditional */
6195 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6197 while (*RExC_parse == '|') {
6198 if (!SIZE_ONLY && RExC_extralen) {
6199 ender = reganode(pRExC_state, LONGJMP,0);
6200 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6203 RExC_extralen += 2; /* Account for LONGJMP. */
6204 nextchar(pRExC_state);
6206 if (RExC_npar > after_freeze)
6207 after_freeze = RExC_npar;
6208 RExC_npar = freeze_paren;
6210 br = regbranch(pRExC_state, &flags, 0, depth+1);
6214 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6216 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6219 if (have_branch || paren != ':') {
6220 /* Make a closing node, and hook it on the end. */
6223 ender = reg_node(pRExC_state, TAIL);
6226 ender = reganode(pRExC_state, CLOSE, parno);
6227 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6228 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6229 "Setting close paren #%"IVdf" to %d\n",
6230 (IV)parno, REG_NODE_NUM(ender)));
6231 RExC_close_parens[parno-1]= ender;
6232 if (RExC_nestroot == parno)
6235 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6236 Set_Node_Length(ender,1); /* MJD */
6242 *flagp &= ~HASWIDTH;
6245 ender = reg_node(pRExC_state, SUCCEED);
6248 ender = reg_node(pRExC_state, END);
6250 assert(!RExC_opend); /* there can only be one! */
6255 REGTAIL(pRExC_state, lastbr, ender);
6257 if (have_branch && !SIZE_ONLY) {
6259 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6261 /* Hook the tails of the branches to the closing node. */
6262 for (br = ret; br; br = regnext(br)) {
6263 const U8 op = PL_regkind[OP(br)];
6265 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6267 else if (op == BRANCHJ) {
6268 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6276 static const char parens[] = "=!<,>";
6278 if (paren && (p = strchr(parens, paren))) {
6279 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6280 int flag = (p - parens) > 1;
6283 node = SUSPEND, flag = 0;
6284 reginsert(pRExC_state, node,ret, depth+1);
6285 Set_Node_Cur_Length(ret);
6286 Set_Node_Offset(ret, parse_start + 1);
6288 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6292 /* Check for proper termination. */
6294 RExC_flags = oregflags;
6295 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6296 RExC_parse = oregcomp_parse;
6297 vFAIL("Unmatched (");
6300 else if (!paren && RExC_parse < RExC_end) {
6301 if (*RExC_parse == ')') {
6303 vFAIL("Unmatched )");
6306 FAIL("Junk on end of regexp"); /* "Can't happen". */
6310 RExC_npar = after_freeze;
6315 - regbranch - one alternative of an | operator
6317 * Implements the concatenation operator.
6320 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6323 register regnode *ret;
6324 register regnode *chain = NULL;
6325 register regnode *latest;
6326 I32 flags = 0, c = 0;
6327 GET_RE_DEBUG_FLAGS_DECL;
6329 PERL_ARGS_ASSERT_REGBRANCH;
6331 DEBUG_PARSE("brnc");
6336 if (!SIZE_ONLY && RExC_extralen)
6337 ret = reganode(pRExC_state, BRANCHJ,0);
6339 ret = reg_node(pRExC_state, BRANCH);
6340 Set_Node_Length(ret, 1);
6344 if (!first && SIZE_ONLY)
6345 RExC_extralen += 1; /* BRANCHJ */
6347 *flagp = WORST; /* Tentatively. */
6350 nextchar(pRExC_state);
6351 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6353 latest = regpiece(pRExC_state, &flags,depth+1);
6354 if (latest == NULL) {
6355 if (flags & TRYAGAIN)
6359 else if (ret == NULL)
6361 *flagp |= flags&(HASWIDTH|POSTPONED);
6362 if (chain == NULL) /* First piece. */
6363 *flagp |= flags&SPSTART;
6366 REGTAIL(pRExC_state, chain, latest);
6371 if (chain == NULL) { /* Loop ran zero times. */
6372 chain = reg_node(pRExC_state, NOTHING);
6377 *flagp |= flags&SIMPLE;
6384 - regpiece - something followed by possible [*+?]
6386 * Note that the branching code sequences used for ? and the general cases
6387 * of * and + are somewhat optimized: they use the same NOTHING node as
6388 * both the endmarker for their branch list and the body of the last branch.
6389 * It might seem that this node could be dispensed with entirely, but the
6390 * endmarker role is not redundant.
6393 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6396 register regnode *ret;
6398 register char *next;
6400 const char * const origparse = RExC_parse;
6402 I32 max = REG_INFTY;
6404 const char *maxpos = NULL;
6405 GET_RE_DEBUG_FLAGS_DECL;
6407 PERL_ARGS_ASSERT_REGPIECE;
6409 DEBUG_PARSE("piec");
6411 ret = regatom(pRExC_state, &flags,depth+1);
6413 if (flags & TRYAGAIN)
6420 if (op == '{' && regcurly(RExC_parse)) {
6422 parse_start = RExC_parse; /* MJD */
6423 next = RExC_parse + 1;
6424 while (isDIGIT(*next) || *next == ',') {
6433 if (*next == '}') { /* got one */
6437 min = atoi(RExC_parse);
6441 maxpos = RExC_parse;
6443 if (!max && *maxpos != '0')
6444 max = REG_INFTY; /* meaning "infinity" */
6445 else if (max >= REG_INFTY)
6446 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6448 nextchar(pRExC_state);
6451 if ((flags&SIMPLE)) {
6452 RExC_naughty += 2 + RExC_naughty / 2;
6453 reginsert(pRExC_state, CURLY, ret, depth+1);
6454 Set_Node_Offset(ret, parse_start+1); /* MJD */
6455 Set_Node_Cur_Length(ret);
6458 regnode * const w = reg_node(pRExC_state, WHILEM);
6461 REGTAIL(pRExC_state, ret, w);
6462 if (!SIZE_ONLY && RExC_extralen) {
6463 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6464 reginsert(pRExC_state, NOTHING,ret, depth+1);
6465 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6467 reginsert(pRExC_state, CURLYX,ret, depth+1);
6469 Set_Node_Offset(ret, parse_start+1);
6470 Set_Node_Length(ret,
6471 op == '{' ? (RExC_parse - parse_start) : 1);
6473 if (!SIZE_ONLY && RExC_extralen)
6474 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6475 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6477 RExC_whilem_seen++, RExC_extralen += 3;
6478 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6487 vFAIL("Can't do {n,m} with n > m");
6489 ARG1_SET(ret, (U16)min);
6490 ARG2_SET(ret, (U16)max);
6502 #if 0 /* Now runtime fix should be reliable. */
6504 /* if this is reinstated, don't forget to put this back into perldiag:
6506 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6508 (F) The part of the regexp subject to either the * or + quantifier
6509 could match an empty string. The {#} shows in the regular
6510 expression about where the problem was discovered.
6514 if (!(flags&HASWIDTH) && op != '?')
6515 vFAIL("Regexp *+ operand could be empty");
6518 parse_start = RExC_parse;
6519 nextchar(pRExC_state);
6521 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6523 if (op == '*' && (flags&SIMPLE)) {
6524 reginsert(pRExC_state, STAR, ret, depth+1);
6528 else if (op == '*') {
6532 else if (op == '+' && (flags&SIMPLE)) {
6533 reginsert(pRExC_state, PLUS, ret, depth+1);
6537 else if (op == '+') {
6541 else if (op == '?') {
6546 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6547 ckWARN3reg(RExC_parse,
6548 "%.*s matches null string many times",
6549 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6553 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6554 nextchar(pRExC_state);
6555 reginsert(pRExC_state, MINMOD, ret, depth+1);
6556 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6558 #ifndef REG_ALLOW_MINMOD_SUSPEND
6561 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6563 nextchar(pRExC_state);
6564 ender = reg_node(pRExC_state, SUCCEED);
6565 REGTAIL(pRExC_state, ret, ender);
6566 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6568 ender = reg_node(pRExC_state, TAIL);
6569 REGTAIL(pRExC_state, ret, ender);
6573 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6575 vFAIL("Nested quantifiers");
6582 /* reg_namedseq(pRExC_state,UVp)
6584 This is expected to be called by a parser routine that has
6585 recognized '\N' and needs to handle the rest. RExC_parse is
6586 expected to point at the first char following the N at the time
6589 If valuep is non-null then it is assumed that we are parsing inside
6590 of a charclass definition and the first codepoint in the resolved
6591 string is returned via *valuep and the routine will return NULL.
6592 In this mode if a multichar string is returned from the charnames
6593 handler a warning will be issued, and only the first char in the
6594 sequence will be examined. If the string returned is zero length
6595 then the value of *valuep is undefined and NON-NULL will
6596 be returned to indicate failure. (This will NOT be a valid pointer
6599 If valuep is null then it is assumed that we are parsing normal text
6600 and inserts a new EXACT node into the program containing the resolved
6601 string and returns a pointer to the new node. If the string is
6602 zerolength a NOTHING node is emitted.
6604 On success RExC_parse is set to the char following the endbrace.
6605 Parsing failures will generate a fatal errorvia vFAIL(...)
6607 NOTE: We cache all results from the charnames handler locally in
6608 the RExC_charnames hash (created on first use) to prevent a charnames
6609 handler from playing silly-buggers and returning a short string and
6610 then a long string for a given pattern. Since the regexp program
6611 size is calculated during an initial parse this would result
6612 in a buffer overrun so we cache to prevent the charname result from
6613 changing during the course of the parse.
6617 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6619 char * name; /* start of the content of the name */
6620 char * endbrace; /* endbrace following the name */
6623 STRLEN len; /* this has various purposes throughout the code */
6624 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6625 regnode *ret = NULL;
6627 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6629 if (*RExC_parse != '{' ||
6630 (*RExC_parse == '{' && RExC_parse[1]
6631 && strchr("0123456789", RExC_parse[1])))
6633 GET_RE_DEBUG_FLAGS_DECL;
6635 /* no bare \N in a charclass */
6636 vFAIL("Missing braces on \\N{}");
6638 nextchar(pRExC_state);
6639 ret = reg_node(pRExC_state, REG_ANY);
6640 *flagp |= HASWIDTH|SIMPLE;
6643 Set_Node_Length(ret, 1); /* MJD */
6646 name = RExC_parse+1;
6647 endbrace = strchr(RExC_parse, '}');
6650 vFAIL("Missing right brace on \\N{}");
6652 RExC_parse = endbrace + 1;
6655 /* RExC_parse points at the beginning brace,
6656 endbrace points at the last */
6657 if ( name[0]=='U' && name[1]=='+' ) {
6658 /* its a "Unicode hex" notation {U+89AB} */
6659 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6660 | PERL_SCAN_DISALLOW_PREFIX
6661 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6663 len = (STRLEN)(endbrace - name - 2);
6664 cp = grok_hex(name + 2, &len, &fl, NULL);
6665 if ( len != (STRLEN)(endbrace - name - 2) ) {
6669 if (cp > 0xff) RExC_utf8 = 1;
6674 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6675 * is going to be in utf8 and the representation changes under utf8. */
6676 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6677 U8 string[UTF8_MAXBYTES+1];
6680 tmps = uvuni_to_utf8(string, cp);
6681 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6682 } else { /* Otherwise, no need for utf8, can skip that step */
6685 sv_str= newSVpvn(&string, 1);
6688 /* fetch the charnames handler for this scope */
6689 HV * const table = GvHV(PL_hintgv);
6691 hv_fetchs(table, "charnames", FALSE) :
6693 SV *cv= cvp ? *cvp : NULL;
6696 /* create an SV with the name as argument */
6697 sv_name = newSVpvn(name, endbrace - name);
6699 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6700 vFAIL2("Constant(\\N{%" SVf "}) unknown: "
6701 "(possibly a missing \"use charnames ...\")",
6704 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6705 vFAIL2("Constant(\\N{%" SVf "}): "
6706 "$^H{charnames} is not defined", SVfARG(sv_name));
6711 if (!RExC_charnames) {
6712 /* make sure our cache is allocated */
6713 RExC_charnames = newHV();
6714 sv_2mortal(MUTABLE_SV(RExC_charnames));
6716 /* see if we have looked this one up before */
6717 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6719 sv_str = HeVAL(he_str);
6732 count= call_sv(cv, G_SCALAR);
6734 if (count == 1) { /* XXXX is this right? dmq */
6736 SvREFCNT_inc_simple_void(sv_str);
6744 if ( !sv_str || !SvOK(sv_str) ) {
6745 vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
6746 "did not return a defined value", SVfARG(sv_name));
6748 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6753 char *p = SvPV(sv_str, len);
6756 if ( SvUTF8(sv_str) ) {
6757 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6761 We have to turn on utf8 for high bit chars otherwise
6762 we get failures with
6764 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6765 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6767 This is different from what \x{} would do with the same
6768 codepoint, where the condition is > 0xFF.
6775 /* warn if we havent used the whole string? */
6777 if (numlen<len && SIZE_ONLY) {
6778 ckWARN2reg(RExC_parse,
6779 "Ignoring excess chars from \\N{%" SVf "} in character class",
6783 } else if (SIZE_ONLY) {
6784 ckWARN2reg(RExC_parse,
6785 "Ignoring zero length \\N{%" SVf "} in character class",
6790 SvREFCNT_dec(sv_name);
6792 SvREFCNT_dec(sv_str);
6793 return len ? NULL : (regnode *)&len;
6794 } else if(SvCUR(sv_str)) {
6800 char * parse_start = name-3; /* needed for the offsets */
6802 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6804 ret = reg_node(pRExC_state,
6805 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6808 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6809 sv_utf8_upgrade(sv_str);
6810 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6814 p = SvPV(sv_str, len);
6816 /* len is the length written, charlen is the size the char read */
6817 for ( len = 0; p < pend; p += charlen ) {
6819 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6821 STRLEN foldlen,numlen;
6822 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6823 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6824 /* Emit all the Unicode characters. */
6826 for (foldbuf = tmpbuf;
6830 uvc = utf8_to_uvchr(foldbuf, &numlen);
6832 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6835 /* In EBCDIC the numlen
6836 * and unilen can differ. */
6838 if (numlen >= foldlen)
6842 break; /* "Can't happen." */
6845 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6857 RExC_size += STR_SZ(len);
6860 RExC_emit += STR_SZ(len);
6862 Set_Node_Cur_Length(ret); /* MJD */
6864 nextchar(pRExC_state);
6865 } else { /* zero length */
6866 ret = reg_node(pRExC_state,NOTHING);
6869 SvREFCNT_dec(sv_str);
6872 SvREFCNT_dec(sv_name);
6882 * It returns the code point in utf8 for the value in *encp.
6883 * value: a code value in the source encoding
6884 * encp: a pointer to an Encode object
6886 * If the result from Encode is not a single character,
6887 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6890 S_reg_recode(pTHX_ const char value, SV **encp)
6893 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6894 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6895 const STRLEN newlen = SvCUR(sv);
6896 UV uv = UNICODE_REPLACEMENT;
6898 PERL_ARGS_ASSERT_REG_RECODE;
6902 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6905 if (!newlen || numlen != newlen) {
6906 uv = UNICODE_REPLACEMENT;
6914 - regatom - the lowest level
6916 Try to identify anything special at the start of the pattern. If there
6917 is, then handle it as required. This may involve generating a single regop,
6918 such as for an assertion; or it may involve recursing, such as to
6919 handle a () structure.
6921 If the string doesn't start with something special then we gobble up
6922 as much literal text as we can.
6924 Once we have been able to handle whatever type of thing started the
6925 sequence, we return.
6927 Note: we have to be careful with escapes, as they can be both literal
6928 and special, and in the case of \10 and friends can either, depending
6929 on context. Specifically there are two seperate switches for handling
6930 escape sequences, with the one for handling literal escapes requiring
6931 a dummy entry for all of the special escapes that are actually handled
6936 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6939 register regnode *ret = NULL;
6941 char *parse_start = RExC_parse;
6942 GET_RE_DEBUG_FLAGS_DECL;
6943 DEBUG_PARSE("atom");
6944 *flagp = WORST; /* Tentatively. */
6946 PERL_ARGS_ASSERT_REGATOM;
6949 switch ((U8)*RExC_parse) {
6951 RExC_seen_zerolen++;
6952 nextchar(pRExC_state);
6953 if (RExC_flags & RXf_PMf_MULTILINE)
6954 ret = reg_node(pRExC_state, MBOL);
6955 else if (RExC_flags & RXf_PMf_SINGLELINE)
6956 ret = reg_node(pRExC_state, SBOL);
6958 ret = reg_node(pRExC_state, BOL);
6959 Set_Node_Length(ret, 1); /* MJD */
6962 nextchar(pRExC_state);
6964 RExC_seen_zerolen++;
6965 if (RExC_flags & RXf_PMf_MULTILINE)
6966 ret = reg_node(pRExC_state, MEOL);
6967 else if (RExC_flags & RXf_PMf_SINGLELINE)
6968 ret = reg_node(pRExC_state, SEOL);
6970 ret = reg_node(pRExC_state, EOL);
6971 Set_Node_Length(ret, 1); /* MJD */
6974 nextchar(pRExC_state);
6975 if (RExC_flags & RXf_PMf_SINGLELINE)
6976 ret = reg_node(pRExC_state, SANY);
6978 ret = reg_node(pRExC_state, REG_ANY);
6979 *flagp |= HASWIDTH|SIMPLE;
6981 Set_Node_Length(ret, 1); /* MJD */
6985 char * const oregcomp_parse = ++RExC_parse;
6986 ret = regclass(pRExC_state,depth+1);
6987 if (*RExC_parse != ']') {
6988 RExC_parse = oregcomp_parse;
6989 vFAIL("Unmatched [");
6991 nextchar(pRExC_state);
6992 *flagp |= HASWIDTH|SIMPLE;
6993 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6997 nextchar(pRExC_state);
6998 ret = reg(pRExC_state, 1, &flags,depth+1);
7000 if (flags & TRYAGAIN) {
7001 if (RExC_parse == RExC_end) {
7002 /* Make parent create an empty node if needed. */
7010 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7014 if (flags & TRYAGAIN) {
7018 vFAIL("Internal urp");
7019 /* Supposed to be caught earlier. */
7022 if (!regcurly(RExC_parse)) {
7031 vFAIL("Quantifier follows nothing");
7039 len=0; /* silence a spurious compiler warning */
7040 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7041 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7042 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7043 ret = reganode(pRExC_state, FOLDCHAR, cp);
7044 Set_Node_Length(ret, 1); /* MJD */
7045 nextchar(pRExC_state); /* kill whitespace under /x */
7053 This switch handles escape sequences that resolve to some kind
7054 of special regop and not to literal text. Escape sequnces that
7055 resolve to literal text are handled below in the switch marked
7058 Every entry in this switch *must* have a corresponding entry
7059 in the literal escape switch. However, the opposite is not
7060 required, as the default for this switch is to jump to the
7061 literal text handling code.
7063 switch ((U8)*++RExC_parse) {
7068 /* Special Escapes */
7070 RExC_seen_zerolen++;
7071 ret = reg_node(pRExC_state, SBOL);
7073 goto finish_meta_pat;
7075 ret = reg_node(pRExC_state, GPOS);
7076 RExC_seen |= REG_SEEN_GPOS;
7078 goto finish_meta_pat;
7080 RExC_seen_zerolen++;
7081 ret = reg_node(pRExC_state, KEEPS);
7083 /* XXX:dmq : disabling in-place substitution seems to
7084 * be necessary here to avoid cases of memory corruption, as
7085 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7087 RExC_seen |= REG_SEEN_LOOKBEHIND;
7088 goto finish_meta_pat;
7090 ret = reg_node(pRExC_state, SEOL);
7092 RExC_seen_zerolen++; /* Do not optimize RE away */
7093 goto finish_meta_pat;
7095 ret = reg_node(pRExC_state, EOS);
7097 RExC_seen_zerolen++; /* Do not optimize RE away */
7098 goto finish_meta_pat;
7100 ret = reg_node(pRExC_state, CANY);
7101 RExC_seen |= REG_SEEN_CANY;
7102 *flagp |= HASWIDTH|SIMPLE;
7103 goto finish_meta_pat;
7105 ret = reg_node(pRExC_state, CLUMP);
7107 goto finish_meta_pat;
7109 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7110 *flagp |= HASWIDTH|SIMPLE;
7111 goto finish_meta_pat;
7113 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7114 *flagp |= HASWIDTH|SIMPLE;
7115 goto finish_meta_pat;
7117 RExC_seen_zerolen++;
7118 RExC_seen |= REG_SEEN_LOOKBEHIND;
7119 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7121 goto finish_meta_pat;
7123 RExC_seen_zerolen++;
7124 RExC_seen |= REG_SEEN_LOOKBEHIND;
7125 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7127 goto finish_meta_pat;
7129 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7130 *flagp |= HASWIDTH|SIMPLE;
7131 goto finish_meta_pat;
7133 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7134 *flagp |= HASWIDTH|SIMPLE;
7135 goto finish_meta_pat;
7137 ret = reg_node(pRExC_state, DIGIT);
7138 *flagp |= HASWIDTH|SIMPLE;
7139 goto finish_meta_pat;
7141 ret = reg_node(pRExC_state, NDIGIT);
7142 *flagp |= HASWIDTH|SIMPLE;
7143 goto finish_meta_pat;
7145 ret = reg_node(pRExC_state, LNBREAK);
7146 *flagp |= HASWIDTH|SIMPLE;
7147 goto finish_meta_pat;
7149 ret = reg_node(pRExC_state, HORIZWS);
7150 *flagp |= HASWIDTH|SIMPLE;
7151 goto finish_meta_pat;
7153 ret = reg_node(pRExC_state, NHORIZWS);
7154 *flagp |= HASWIDTH|SIMPLE;
7155 goto finish_meta_pat;
7157 ret = reg_node(pRExC_state, VERTWS);
7158 *flagp |= HASWIDTH|SIMPLE;
7159 goto finish_meta_pat;
7161 ret = reg_node(pRExC_state, NVERTWS);
7162 *flagp |= HASWIDTH|SIMPLE;
7164 nextchar(pRExC_state);
7165 Set_Node_Length(ret, 2); /* MJD */
7170 char* const oldregxend = RExC_end;
7172 char* parse_start = RExC_parse - 2;
7175 if (RExC_parse[1] == '{') {
7176 /* a lovely hack--pretend we saw [\pX] instead */
7177 RExC_end = strchr(RExC_parse, '}');
7179 const U8 c = (U8)*RExC_parse;
7181 RExC_end = oldregxend;
7182 vFAIL2("Missing right brace on \\%c{}", c);
7187 RExC_end = RExC_parse + 2;
7188 if (RExC_end > oldregxend)
7189 RExC_end = oldregxend;
7193 ret = regclass(pRExC_state,depth+1);
7195 RExC_end = oldregxend;
7198 Set_Node_Offset(ret, parse_start + 2);
7199 Set_Node_Cur_Length(ret);
7200 nextchar(pRExC_state);
7201 *flagp |= HASWIDTH|SIMPLE;
7205 /* Handle \N and \N{NAME} here and not below because it can be
7206 multicharacter. join_exact() will join them up later on.
7207 Also this makes sure that things like /\N{BLAH}+/ and
7208 \N{BLAH} being multi char Just Happen. dmq*/
7210 ret= reg_namedseq(pRExC_state, NULL, flagp);
7212 case 'k': /* Handle \k<NAME> and \k'NAME' */
7215 char ch= RExC_parse[1];
7216 if (ch != '<' && ch != '\'' && ch != '{') {
7218 vFAIL2("Sequence %.2s... not terminated",parse_start);
7220 /* this pretty much dupes the code for (?P=...) in reg(), if
7221 you change this make sure you change that */
7222 char* name_start = (RExC_parse += 2);
7224 SV *sv_dat = reg_scan_name(pRExC_state,
7225 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7226 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7227 if (RExC_parse == name_start || *RExC_parse != ch)
7228 vFAIL2("Sequence %.3s... not terminated",parse_start);
7231 num = add_data( pRExC_state, 1, "S" );
7232 RExC_rxi->data->data[num]=(void*)sv_dat;
7233 SvREFCNT_inc_simple_void(sv_dat);
7237 ret = reganode(pRExC_state,
7238 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7242 /* override incorrect value set in reganode MJD */
7243 Set_Node_Offset(ret, parse_start+1);
7244 Set_Node_Cur_Length(ret); /* MJD */
7245 nextchar(pRExC_state);
7251 case '1': case '2': case '3': case '4':
7252 case '5': case '6': case '7': case '8': case '9':
7255 bool isg = *RExC_parse == 'g';
7260 if (*RExC_parse == '{') {
7264 if (*RExC_parse == '-') {
7268 if (hasbrace && !isDIGIT(*RExC_parse)) {
7269 if (isrel) RExC_parse--;
7271 goto parse_named_seq;
7273 num = atoi(RExC_parse);
7274 if (isg && num == 0)
7275 vFAIL("Reference to invalid group 0");
7277 num = RExC_npar - num;
7279 vFAIL("Reference to nonexistent or unclosed group");
7281 if (!isg && num > 9 && num >= RExC_npar)
7284 char * const parse_start = RExC_parse - 1; /* MJD */
7285 while (isDIGIT(*RExC_parse))
7287 if (parse_start == RExC_parse - 1)
7288 vFAIL("Unterminated \\g... pattern");
7290 if (*RExC_parse != '}')
7291 vFAIL("Unterminated \\g{...} pattern");
7295 if (num > (I32)RExC_rx->nparens)
7296 vFAIL("Reference to nonexistent group");
7299 ret = reganode(pRExC_state,
7300 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7304 /* override incorrect value set in reganode MJD */
7305 Set_Node_Offset(ret, parse_start+1);
7306 Set_Node_Cur_Length(ret); /* MJD */
7308 nextchar(pRExC_state);
7313 if (RExC_parse >= RExC_end)
7314 FAIL("Trailing \\");
7317 /* Do not generate "unrecognized" warnings here, we fall
7318 back into the quick-grab loop below */
7325 if (RExC_flags & RXf_PMf_EXTENDED) {
7326 if ( reg_skipcomment( pRExC_state ) )
7333 register STRLEN len;
7338 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7340 parse_start = RExC_parse - 1;
7346 ret = reg_node(pRExC_state,
7347 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7349 for (len = 0, p = RExC_parse - 1;
7350 len < 127 && p < RExC_end;
7353 char * const oldp = p;
7355 if (RExC_flags & RXf_PMf_EXTENDED)
7356 p = regwhite( pRExC_state, p );
7361 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7362 goto normal_default;
7372 /* Literal Escapes Switch
7374 This switch is meant to handle escape sequences that
7375 resolve to a literal character.
7377 Every escape sequence that represents something
7378 else, like an assertion or a char class, is handled
7379 in the switch marked 'Special Escapes' above in this
7380 routine, but also has an entry here as anything that
7381 isn't explicitly mentioned here will be treated as
7382 an unescaped equivalent literal.
7386 /* These are all the special escapes. */
7390 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7391 goto normal_default;
7392 case 'A': /* Start assertion */
7393 case 'b': case 'B': /* Word-boundary assertion*/
7394 case 'C': /* Single char !DANGEROUS! */
7395 case 'd': case 'D': /* digit class */
7396 case 'g': case 'G': /* generic-backref, pos assertion */
7397 case 'h': case 'H': /* HORIZWS */
7398 case 'k': case 'K': /* named backref, keep marker */
7399 case 'N': /* named char sequence */
7400 case 'p': case 'P': /* Unicode property */
7401 case 'R': /* LNBREAK */
7402 case 's': case 'S': /* space class */
7403 case 'v': case 'V': /* VERTWS */
7404 case 'w': case 'W': /* word class */
7405 case 'X': /* eXtended Unicode "combining character sequence" */
7406 case 'z': case 'Z': /* End of line/string assertion */
7410 /* Anything after here is an escape that resolves to a
7411 literal. (Except digits, which may or may not)
7430 ender = ASCII_TO_NATIVE('\033');
7434 ender = ASCII_TO_NATIVE('\007');
7439 char* const e = strchr(p, '}');
7443 vFAIL("Missing right brace on \\x{}");
7446 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7447 | PERL_SCAN_DISALLOW_PREFIX;
7448 STRLEN numlen = e - p - 1;
7449 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7456 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7458 ender = grok_hex(p, &numlen, &flags, NULL);
7461 if (PL_encoding && ender < 0x100)
7462 goto recode_encoding;
7466 ender = UCHARAT(p++);
7467 ender = toCTRL(ender);
7469 case '0': case '1': case '2': case '3':case '4':
7470 case '5': case '6': case '7': case '8':case '9':
7472 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7475 ender = grok_oct(p, &numlen, &flags, NULL);
7477 /* An octal above 0xff is interpreted differently
7478 * depending on if the re is in utf8 or not. If it
7479 * is in utf8, the value will be itself, otherwise
7480 * it is interpreted as modulo 0x100. It has been
7481 * decided to discourage the use of octal above the
7482 * single-byte range. For now, warn only when
7483 * it ends up modulo */
7484 if (SIZE_ONLY && ender >= 0x100
7485 && ! UTF && ! PL_encoding) {
7486 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7494 if (PL_encoding && ender < 0x100)
7495 goto recode_encoding;
7499 SV* enc = PL_encoding;
7500 ender = reg_recode((const char)(U8)ender, &enc);
7501 if (!enc && SIZE_ONLY)
7502 ckWARNreg(p, "Invalid escape in the specified encoding");
7508 FAIL("Trailing \\");
7511 if (!SIZE_ONLY&& isALPHA(*p))
7512 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7513 goto normal_default;
7518 if (UTF8_IS_START(*p) && UTF) {
7520 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7521 &numlen, UTF8_ALLOW_DEFAULT);
7528 if ( RExC_flags & RXf_PMf_EXTENDED)
7529 p = regwhite( pRExC_state, p );
7531 /* Prime the casefolded buffer. */
7532 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7534 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7539 /* Emit all the Unicode characters. */
7541 for (foldbuf = tmpbuf;
7543 foldlen -= numlen) {
7544 ender = utf8_to_uvchr(foldbuf, &numlen);
7546 const STRLEN unilen = reguni(pRExC_state, ender, s);
7549 /* In EBCDIC the numlen
7550 * and unilen can differ. */
7552 if (numlen >= foldlen)
7556 break; /* "Can't happen." */
7560 const STRLEN unilen = reguni(pRExC_state, ender, s);
7569 REGC((char)ender, s++);
7575 /* Emit all the Unicode characters. */
7577 for (foldbuf = tmpbuf;
7579 foldlen -= numlen) {
7580 ender = utf8_to_uvchr(foldbuf, &numlen);
7582 const STRLEN unilen = reguni(pRExC_state, ender, s);
7585 /* In EBCDIC the numlen
7586 * and unilen can differ. */
7588 if (numlen >= foldlen)
7596 const STRLEN unilen = reguni(pRExC_state, ender, s);
7605 REGC((char)ender, s++);
7609 Set_Node_Cur_Length(ret); /* MJD */
7610 nextchar(pRExC_state);
7612 /* len is STRLEN which is unsigned, need to copy to signed */
7615 vFAIL("Internal disaster");
7619 if (len == 1 && UNI_IS_INVARIANT(ender))
7623 RExC_size += STR_SZ(len);
7626 RExC_emit += STR_SZ(len);
7636 S_regwhite( RExC_state_t *pRExC_state, char *p )
7638 const char *e = RExC_end;
7640 PERL_ARGS_ASSERT_REGWHITE;
7645 else if (*p == '#') {
7654 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7662 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7663 Character classes ([:foo:]) can also be negated ([:^foo:]).
7664 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7665 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7666 but trigger failures because they are currently unimplemented. */
7668 #define POSIXCC_DONE(c) ((c) == ':')
7669 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7670 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7673 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7676 I32 namedclass = OOB_NAMEDCLASS;
7678 PERL_ARGS_ASSERT_REGPPOSIXCC;
7680 if (value == '[' && RExC_parse + 1 < RExC_end &&
7681 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7682 POSIXCC(UCHARAT(RExC_parse))) {
7683 const char c = UCHARAT(RExC_parse);
7684 char* const s = RExC_parse++;
7686 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7688 if (RExC_parse == RExC_end)
7689 /* Grandfather lone [:, [=, [. */
7692 const char* const t = RExC_parse++; /* skip over the c */
7695 if (UCHARAT(RExC_parse) == ']') {
7696 const char *posixcc = s + 1;
7697 RExC_parse++; /* skip over the ending ] */
7700 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7701 const I32 skip = t - posixcc;
7703 /* Initially switch on the length of the name. */
7706 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7707 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7710 /* Names all of length 5. */
7711 /* alnum alpha ascii blank cntrl digit graph lower
7712 print punct space upper */
7713 /* Offset 4 gives the best switch position. */
7714 switch (posixcc[4]) {
7716 if (memEQ(posixcc, "alph", 4)) /* alpha */
7717 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7720 if (memEQ(posixcc, "spac", 4)) /* space */
7721 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7724 if (memEQ(posixcc, "grap", 4)) /* graph */
7725 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7728 if (memEQ(posixcc, "asci", 4)) /* ascii */
7729 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7732 if (memEQ(posixcc, "blan", 4)) /* blank */
7733 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7736 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7737 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7740 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7741 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7744 if (memEQ(posixcc, "lowe", 4)) /* lower */
7745 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7746 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7747 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7750 if (memEQ(posixcc, "digi", 4)) /* digit */
7751 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7752 else if (memEQ(posixcc, "prin", 4)) /* print */
7753 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7754 else if (memEQ(posixcc, "punc", 4)) /* punct */
7755 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7760 if (memEQ(posixcc, "xdigit", 6))
7761 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7765 if (namedclass == OOB_NAMEDCLASS)
7766 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7768 assert (posixcc[skip] == ':');
7769 assert (posixcc[skip+1] == ']');
7770 } else if (!SIZE_ONLY) {
7771 /* [[=foo=]] and [[.foo.]] are still future. */
7773 /* adjust RExC_parse so the warning shows after
7775 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7777 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7780 /* Maternal grandfather:
7781 * "[:" ending in ":" but not in ":]" */
7791 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7795 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7797 if (POSIXCC(UCHARAT(RExC_parse))) {
7798 const char *s = RExC_parse;
7799 const char c = *s++;
7803 if (*s && c == *s && s[1] == ']') {
7805 "POSIX syntax [%c %c] belongs inside character classes",
7808 /* [[=foo=]] and [[.foo.]] are still future. */
7809 if (POSIXCC_NOTYET(c)) {
7810 /* adjust RExC_parse so the error shows after
7812 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7814 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7821 #define _C_C_T_(NAME,TEST,WORD) \
7824 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7826 for (value = 0; value < 256; value++) \
7828 ANYOF_BITMAP_SET(ret, value); \
7833 case ANYOF_N##NAME: \
7835 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7837 for (value = 0; value < 256; value++) \
7839 ANYOF_BITMAP_SET(ret, value); \
7845 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7847 for (value = 0; value < 256; value++) \
7849 ANYOF_BITMAP_SET(ret, value); \
7853 case ANYOF_N##NAME: \
7854 for (value = 0; value < 256; value++) \
7856 ANYOF_BITMAP_SET(ret, value); \
7862 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7863 so that it is possible to override the option here without having to
7864 rebuild the entire core. as we are required to do if we change regcomp.h
7865 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7867 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7868 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7871 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7872 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7874 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7878 parse a class specification and produce either an ANYOF node that
7879 matches the pattern or if the pattern matches a single char only and
7880 that char is < 256 and we are case insensitive then we produce an
7885 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7888 register UV nextvalue;
7889 register IV prevvalue = OOB_UNICODE;
7890 register IV range = 0;
7891 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7892 register regnode *ret;
7895 char *rangebegin = NULL;
7896 bool need_class = 0;
7899 bool optimize_invert = TRUE;
7900 AV* unicode_alternate = NULL;
7902 UV literal_endpoint = 0;
7904 UV stored = 0; /* number of chars stored in the class */
7906 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7907 case we need to change the emitted regop to an EXACT. */
7908 const char * orig_parse = RExC_parse;
7909 GET_RE_DEBUG_FLAGS_DECL;
7911 PERL_ARGS_ASSERT_REGCLASS;
7913 PERL_UNUSED_ARG(depth);
7916 DEBUG_PARSE("clas");
7918 /* Assume we are going to generate an ANYOF node. */
7919 ret = reganode(pRExC_state, ANYOF, 0);
7922 ANYOF_FLAGS(ret) = 0;
7924 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7928 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7932 RExC_size += ANYOF_SKIP;
7933 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7936 RExC_emit += ANYOF_SKIP;
7938 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7940 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7941 ANYOF_BITMAP_ZERO(ret);
7942 listsv = newSVpvs("# comment\n");
7945 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7947 if (!SIZE_ONLY && POSIXCC(nextvalue))
7948 checkposixcc(pRExC_state);
7950 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7951 if (UCHARAT(RExC_parse) == ']')
7955 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7959 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7962 rangebegin = RExC_parse;
7964 value = utf8n_to_uvchr((U8*)RExC_parse,
7965 RExC_end - RExC_parse,
7966 &numlen, UTF8_ALLOW_DEFAULT);
7967 RExC_parse += numlen;
7970 value = UCHARAT(RExC_parse++);
7972 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7973 if (value == '[' && POSIXCC(nextvalue))
7974 namedclass = regpposixcc(pRExC_state, value);
7975 else if (value == '\\') {
7977 value = utf8n_to_uvchr((U8*)RExC_parse,
7978 RExC_end - RExC_parse,
7979 &numlen, UTF8_ALLOW_DEFAULT);
7980 RExC_parse += numlen;
7983 value = UCHARAT(RExC_parse++);
7984 /* Some compilers cannot handle switching on 64-bit integer
7985 * values, therefore value cannot be an UV. Yes, this will
7986 * be a problem later if we want switch on Unicode.
7987 * A similar issue a little bit later when switching on
7988 * namedclass. --jhi */
7989 switch ((I32)value) {
7990 case 'w': namedclass = ANYOF_ALNUM; break;
7991 case 'W': namedclass = ANYOF_NALNUM; break;
7992 case 's': namedclass = ANYOF_SPACE; break;
7993 case 'S': namedclass = ANYOF_NSPACE; break;
7994 case 'd': namedclass = ANYOF_DIGIT; break;
7995 case 'D': namedclass = ANYOF_NDIGIT; break;
7996 case 'v': namedclass = ANYOF_VERTWS; break;
7997 case 'V': namedclass = ANYOF_NVERTWS; break;
7998 case 'h': namedclass = ANYOF_HORIZWS; break;
7999 case 'H': namedclass = ANYOF_NHORIZWS; break;
8000 case 'N': /* Handle \N{NAME} in class */
8002 /* We only pay attention to the first char of
8003 multichar strings being returned. I kinda wonder
8004 if this makes sense as it does change the behaviour
8005 from earlier versions, OTOH that behaviour was broken
8007 UV v; /* value is register so we cant & it /grrr */
8008 if (reg_namedseq(pRExC_state, &v, NULL)) {
8018 if (RExC_parse >= RExC_end)
8019 vFAIL2("Empty \\%c{}", (U8)value);
8020 if (*RExC_parse == '{') {
8021 const U8 c = (U8)value;
8022 e = strchr(RExC_parse++, '}');
8024 vFAIL2("Missing right brace on \\%c{}", c);
8025 while (isSPACE(UCHARAT(RExC_parse)))
8027 if (e == RExC_parse)
8028 vFAIL2("Empty \\%c{}", c);
8030 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8038 if (UCHARAT(RExC_parse) == '^') {
8041 value = value == 'p' ? 'P' : 'p'; /* toggle */
8042 while (isSPACE(UCHARAT(RExC_parse))) {
8047 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8048 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8051 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8052 namedclass = ANYOF_MAX; /* no official name, but it's named */
8055 case 'n': value = '\n'; break;
8056 case 'r': value = '\r'; break;
8057 case 't': value = '\t'; break;
8058 case 'f': value = '\f'; break;
8059 case 'b': value = '\b'; break;
8060 case 'e': value = ASCII_TO_NATIVE('\033');break;
8061 case 'a': value = ASCII_TO_NATIVE('\007');break;
8063 if (*RExC_parse == '{') {
8064 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8065 | PERL_SCAN_DISALLOW_PREFIX;
8066 char * const e = strchr(RExC_parse++, '}');
8068 vFAIL("Missing right brace on \\x{}");
8070 numlen = e - RExC_parse;
8071 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8075 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8077 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8078 RExC_parse += numlen;
8080 if (PL_encoding && value < 0x100)
8081 goto recode_encoding;
8084 value = UCHARAT(RExC_parse++);
8085 value = toCTRL(value);
8087 case '0': case '1': case '2': case '3': case '4':
8088 case '5': case '6': case '7': case '8': case '9':
8092 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8093 RExC_parse += numlen;
8094 if (PL_encoding && value < 0x100)
8095 goto recode_encoding;
8100 SV* enc = PL_encoding;
8101 value = reg_recode((const char)(U8)value, &enc);
8102 if (!enc && SIZE_ONLY)
8103 ckWARNreg(RExC_parse,
8104 "Invalid escape in the specified encoding");
8108 if (!SIZE_ONLY && isALPHA(value))
8109 ckWARN2reg(RExC_parse,
8110 "Unrecognized escape \\%c in character class passed through",
8114 } /* end of \blah */
8120 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8122 if (!SIZE_ONLY && !need_class)
8123 ANYOF_CLASS_ZERO(ret);
8127 /* a bad range like a-\d, a-[:digit:] ? */
8131 RExC_parse >= rangebegin ?
8132 RExC_parse - rangebegin : 0;
8133 ckWARN4reg(RExC_parse,
8134 "False [] range \"%*.*s\"",
8137 if (prevvalue < 256) {
8138 ANYOF_BITMAP_SET(ret, prevvalue);
8139 ANYOF_BITMAP_SET(ret, '-');
8142 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8143 Perl_sv_catpvf(aTHX_ listsv,
8144 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8148 range = 0; /* this was not a true range */
8154 const char *what = NULL;
8157 if (namedclass > OOB_NAMEDCLASS)
8158 optimize_invert = FALSE;
8159 /* Possible truncation here but in some 64-bit environments
8160 * the compiler gets heartburn about switch on 64-bit values.
8161 * A similar issue a little earlier when switching on value.
8163 switch ((I32)namedclass) {
8165 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8166 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8167 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8168 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8169 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8170 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8171 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8172 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8173 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8174 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8175 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8176 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8177 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8179 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8180 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8182 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8183 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8184 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8187 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8190 for (value = 0; value < 128; value++)
8191 ANYOF_BITMAP_SET(ret, value);
8193 for (value = 0; value < 256; value++) {
8195 ANYOF_BITMAP_SET(ret, value);
8204 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8207 for (value = 128; value < 256; value++)
8208 ANYOF_BITMAP_SET(ret, value);
8210 for (value = 0; value < 256; value++) {
8211 if (!isASCII(value))
8212 ANYOF_BITMAP_SET(ret, value);
8221 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8223 /* consecutive digits assumed */
8224 for (value = '0'; value <= '9'; value++)
8225 ANYOF_BITMAP_SET(ret, value);
8228 what = POSIX_CC_UNI_NAME("Digit");
8232 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8234 /* consecutive digits assumed */
8235 for (value = 0; value < '0'; value++)
8236 ANYOF_BITMAP_SET(ret, value);
8237 for (value = '9' + 1; value < 256; value++)
8238 ANYOF_BITMAP_SET(ret, value);
8241 what = POSIX_CC_UNI_NAME("Digit");
8244 /* this is to handle \p and \P */
8247 vFAIL("Invalid [::] class");
8251 /* Strings such as "+utf8::isWord\n" */
8252 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8255 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8258 } /* end of namedclass \blah */
8261 if (prevvalue > (IV)value) /* b-a */ {
8262 const int w = RExC_parse - rangebegin;
8263 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8264 range = 0; /* not a valid range */
8268 prevvalue = value; /* save the beginning of the range */
8269 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8270 RExC_parse[1] != ']') {
8273 /* a bad range like \w-, [:word:]- ? */
8274 if (namedclass > OOB_NAMEDCLASS) {
8275 if (ckWARN(WARN_REGEXP)) {
8277 RExC_parse >= rangebegin ?
8278 RExC_parse - rangebegin : 0;
8280 "False [] range \"%*.*s\"",
8284 ANYOF_BITMAP_SET(ret, '-');
8286 range = 1; /* yeah, it's a range! */
8287 continue; /* but do it the next time */
8291 /* now is the next time */
8292 /*stored += (value - prevvalue + 1);*/
8294 if (prevvalue < 256) {
8295 const IV ceilvalue = value < 256 ? value : 255;
8298 /* In EBCDIC [\x89-\x91] should include
8299 * the \x8e but [i-j] should not. */
8300 if (literal_endpoint == 2 &&
8301 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8302 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8304 if (isLOWER(prevvalue)) {
8305 for (i = prevvalue; i <= ceilvalue; i++)
8306 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8308 ANYOF_BITMAP_SET(ret, i);
8311 for (i = prevvalue; i <= ceilvalue; i++)
8312 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8314 ANYOF_BITMAP_SET(ret, i);
8320 for (i = prevvalue; i <= ceilvalue; i++) {
8321 if (!ANYOF_BITMAP_TEST(ret,i)) {
8323 ANYOF_BITMAP_SET(ret, i);
8327 if (value > 255 || UTF) {
8328 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8329 const UV natvalue = NATIVE_TO_UNI(value);
8330 stored+=2; /* can't optimize this class */
8331 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8332 if (prevnatvalue < natvalue) { /* what about > ? */
8333 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8334 prevnatvalue, natvalue);
8336 else if (prevnatvalue == natvalue) {
8337 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8339 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8341 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8343 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8344 if (RExC_precomp[0] == ':' &&
8345 RExC_precomp[1] == '[' &&
8346 (f == 0xDF || f == 0x92)) {
8347 f = NATIVE_TO_UNI(f);
8350 /* If folding and foldable and a single
8351 * character, insert also the folded version
8352 * to the charclass. */
8354 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8355 if ((RExC_precomp[0] == ':' &&
8356 RExC_precomp[1] == '[' &&
8358 (value == 0xFB05 || value == 0xFB06))) ?
8359 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8360 foldlen == (STRLEN)UNISKIP(f) )
8362 if (foldlen == (STRLEN)UNISKIP(f))
8364 Perl_sv_catpvf(aTHX_ listsv,
8367 /* Any multicharacter foldings
8368 * require the following transform:
8369 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8370 * where E folds into "pq" and F folds
8371 * into "rst", all other characters
8372 * fold to single characters. We save
8373 * away these multicharacter foldings,
8374 * to be later saved as part of the
8375 * additional "s" data. */
8378 if (!unicode_alternate)
8379 unicode_alternate = newAV();
8380 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8382 av_push(unicode_alternate, sv);
8386 /* If folding and the value is one of the Greek
8387 * sigmas insert a few more sigmas to make the
8388 * folding rules of the sigmas to work right.
8389 * Note that not all the possible combinations
8390 * are handled here: some of them are handled
8391 * by the standard folding rules, and some of
8392 * them (literal or EXACTF cases) are handled
8393 * during runtime in regexec.c:S_find_byclass(). */
8394 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8395 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8396 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8397 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8398 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8400 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8401 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8402 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8407 literal_endpoint = 0;
8411 range = 0; /* this range (if it was one) is done now */
8415 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8417 RExC_size += ANYOF_CLASS_ADD_SKIP;
8419 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8425 /****** !SIZE_ONLY AFTER HERE *********/
8427 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8428 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8430 /* optimize single char class to an EXACT node
8431 but *only* when its not a UTF/high char */
8432 const char * cur_parse= RExC_parse;
8433 RExC_emit = (regnode *)orig_emit;
8434 RExC_parse = (char *)orig_parse;
8435 ret = reg_node(pRExC_state,
8436 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8437 RExC_parse = (char *)cur_parse;
8438 *STRING(ret)= (char)value;
8440 RExC_emit += STR_SZ(1);
8442 SvREFCNT_dec(listsv);
8446 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8447 if ( /* If the only flag is folding (plus possibly inversion). */
8448 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8450 for (value = 0; value < 256; ++value) {
8451 if (ANYOF_BITMAP_TEST(ret, value)) {
8452 UV fold = PL_fold[value];
8455 ANYOF_BITMAP_SET(ret, fold);
8458 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8461 /* optimize inverted simple patterns (e.g. [^a-z]) */
8462 if (optimize_invert &&
8463 /* If the only flag is inversion. */
8464 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8465 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8466 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8467 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8470 AV * const av = newAV();
8472 /* The 0th element stores the character class description
8473 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8474 * to initialize the appropriate swash (which gets stored in
8475 * the 1st element), and also useful for dumping the regnode.
8476 * The 2nd element stores the multicharacter foldings,
8477 * used later (regexec.c:S_reginclass()). */
8478 av_store(av, 0, listsv);
8479 av_store(av, 1, NULL);
8480 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8481 rv = newRV_noinc(MUTABLE_SV(av));
8482 n = add_data(pRExC_state, 1, "s");
8483 RExC_rxi->data->data[n] = (void*)rv;
8491 /* reg_skipcomment()
8493 Absorbs an /x style # comments from the input stream.
8494 Returns true if there is more text remaining in the stream.
8495 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8496 terminates the pattern without including a newline.
8498 Note its the callers responsibility to ensure that we are
8504 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8508 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8510 while (RExC_parse < RExC_end)
8511 if (*RExC_parse++ == '\n') {
8516 /* we ran off the end of the pattern without ending
8517 the comment, so we have to add an \n when wrapping */
8518 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8526 Advance that parse position, and optionall absorbs
8527 "whitespace" from the inputstream.
8529 Without /x "whitespace" means (?#...) style comments only,
8530 with /x this means (?#...) and # comments and whitespace proper.
8532 Returns the RExC_parse point from BEFORE the scan occurs.
8534 This is the /x friendly way of saying RExC_parse++.
8538 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8540 char* const retval = RExC_parse++;
8542 PERL_ARGS_ASSERT_NEXTCHAR;
8545 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8546 RExC_parse[2] == '#') {
8547 while (*RExC_parse != ')') {
8548 if (RExC_parse == RExC_end)
8549 FAIL("Sequence (?#... not terminated");
8555 if (RExC_flags & RXf_PMf_EXTENDED) {
8556 if (isSPACE(*RExC_parse)) {
8560 else if (*RExC_parse == '#') {
8561 if ( reg_skipcomment( pRExC_state ) )
8570 - reg_node - emit a node
8572 STATIC regnode * /* Location. */
8573 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8576 register regnode *ptr;
8577 regnode * const ret = RExC_emit;
8578 GET_RE_DEBUG_FLAGS_DECL;
8580 PERL_ARGS_ASSERT_REG_NODE;
8583 SIZE_ALIGN(RExC_size);
8587 if (RExC_emit >= RExC_emit_bound)
8588 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8590 NODE_ALIGN_FILL(ret);
8592 FILL_ADVANCE_NODE(ptr, op);
8593 REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
8594 #ifdef RE_TRACK_PATTERN_OFFSETS
8595 if (RExC_offsets) { /* MJD */
8596 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8597 "reg_node", __LINE__,
8599 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8600 ? "Overwriting end of array!\n" : "OK",
8601 (UV)(RExC_emit - RExC_emit_start),
8602 (UV)(RExC_parse - RExC_start),
8603 (UV)RExC_offsets[0]));
8604 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8612 - reganode - emit a node with an argument
8614 STATIC regnode * /* Location. */
8615 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8618 register regnode *ptr;
8619 regnode * const ret = RExC_emit;
8620 GET_RE_DEBUG_FLAGS_DECL;
8622 PERL_ARGS_ASSERT_REGANODE;
8625 SIZE_ALIGN(RExC_size);
8630 assert(2==regarglen[op]+1);
8632 Anything larger than this has to allocate the extra amount.
8633 If we changed this to be:
8635 RExC_size += (1 + regarglen[op]);
8637 then it wouldn't matter. Its not clear what side effect
8638 might come from that so its not done so far.
8643 if (RExC_emit >= RExC_emit_bound)
8644 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8646 NODE_ALIGN_FILL(ret);
8648 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8649 REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
8650 #ifdef RE_TRACK_PATTERN_OFFSETS
8651 if (RExC_offsets) { /* MJD */
8652 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8656 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8657 "Overwriting end of array!\n" : "OK",
8658 (UV)(RExC_emit - RExC_emit_start),
8659 (UV)(RExC_parse - RExC_start),
8660 (UV)RExC_offsets[0]));
8661 Set_Cur_Node_Offset;
8669 - reguni - emit (if appropriate) a Unicode character
8672 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8676 PERL_ARGS_ASSERT_REGUNI;
8678 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8682 - reginsert - insert an operator in front of already-emitted operand
8684 * Means relocating the operand.
8687 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8690 register regnode *src;
8691 register regnode *dst;
8692 register regnode *place;
8693 const int offset = regarglen[(U8)op];
8694 const int size = NODE_STEP_REGNODE + offset;
8695 GET_RE_DEBUG_FLAGS_DECL;
8697 PERL_ARGS_ASSERT_REGINSERT;
8698 PERL_UNUSED_ARG(depth);
8699 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8700 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8709 if (RExC_open_parens) {
8711 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8712 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8713 if ( RExC_open_parens[paren] >= opnd ) {
8714 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8715 RExC_open_parens[paren] += size;
8717 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8719 if ( RExC_close_parens[paren] >= opnd ) {
8720 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8721 RExC_close_parens[paren] += size;
8723 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8728 while (src > opnd) {
8729 StructCopy(--src, --dst, regnode);
8730 #ifdef RE_TRACK_PATTERN_OFFSETS
8731 if (RExC_offsets) { /* MJD 20010112 */
8732 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8736 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8737 ? "Overwriting end of array!\n" : "OK",
8738 (UV)(src - RExC_emit_start),
8739 (UV)(dst - RExC_emit_start),
8740 (UV)RExC_offsets[0]));
8741 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8742 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8748 place = opnd; /* Op node, where operand used to be. */
8749 #ifdef RE_TRACK_PATTERN_OFFSETS
8750 if (RExC_offsets) { /* MJD */
8751 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8755 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8756 ? "Overwriting end of array!\n" : "OK",
8757 (UV)(place - RExC_emit_start),
8758 (UV)(RExC_parse - RExC_start),
8759 (UV)RExC_offsets[0]));
8760 Set_Node_Offset(place, RExC_parse);
8761 Set_Node_Length(place, 1);
8764 src = NEXTOPER(place);
8765 FILL_ADVANCE_NODE(place, op);
8766 REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
8767 Zero(src, offset, regnode);
8771 - regtail - set the next-pointer at the end of a node chain of p to val.
8772 - SEE ALSO: regtail_study
8774 /* TODO: All three parms should be const */
8776 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8779 register regnode *scan;
8780 GET_RE_DEBUG_FLAGS_DECL;
8782 PERL_ARGS_ASSERT_REGTAIL;
8784 PERL_UNUSED_ARG(depth);
8790 /* Find last node. */
8793 regnode * const temp = regnext(scan);
8795 SV * const mysv=sv_newmortal();
8796 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8797 regprop(RExC_rx, mysv, scan);
8798 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8799 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8800 (temp == NULL ? "->" : ""),
8801 (temp == NULL ? PL_reg_name[OP(val)] : "")
8809 if (reg_off_by_arg[OP(scan)]) {
8810 ARG_SET(scan, val - scan);
8813 NEXT_OFF(scan) = val - scan;
8819 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8820 - Look for optimizable sequences at the same time.
8821 - currently only looks for EXACT chains.
8823 This is expermental code. The idea is to use this routine to perform
8824 in place optimizations on branches and groups as they are constructed,
8825 with the long term intention of removing optimization from study_chunk so
8826 that it is purely analytical.
8828 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8829 to control which is which.
8832 /* TODO: All four parms should be const */
8835 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8838 register regnode *scan;
8840 #ifdef EXPERIMENTAL_INPLACESCAN
8843 GET_RE_DEBUG_FLAGS_DECL;
8845 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8851 /* Find last node. */
8855 regnode * const temp = regnext(scan);
8856 #ifdef EXPERIMENTAL_INPLACESCAN
8857 if (PL_regkind[OP(scan)] == EXACT)
8858 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8866 if( exact == PSEUDO )
8868 else if ( exact != OP(scan) )
8877 SV * const mysv=sv_newmortal();
8878 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8879 regprop(RExC_rx, mysv, scan);
8880 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8881 SvPV_nolen_const(mysv),
8883 PL_reg_name[exact]);
8890 SV * const mysv_val=sv_newmortal();
8891 DEBUG_PARSE_MSG("");
8892 regprop(RExC_rx, mysv_val, val);
8893 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8894 SvPV_nolen_const(mysv_val),
8895 (IV)REG_NODE_NUM(val),
8899 if (reg_off_by_arg[OP(scan)]) {
8900 ARG_SET(scan, val - scan);
8903 NEXT_OFF(scan) = val - scan;
8911 - regcurly - a little FSA that accepts {\d+,?\d*}
8914 S_regcurly(register const char *s)
8916 PERL_ARGS_ASSERT_REGCURLY;
8935 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8939 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8944 for (bit=0; bit<32; bit++) {
8945 if (flags & (1<<bit)) {
8947 PerlIO_printf(Perl_debug_log, "%s",lead);
8948 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8953 PerlIO_printf(Perl_debug_log, "\n");
8955 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8961 Perl_regdump(pTHX_ const regexp *r)
8965 SV * const sv = sv_newmortal();
8966 SV *dsv= sv_newmortal();
8968 GET_RE_DEBUG_FLAGS_DECL;
8970 PERL_ARGS_ASSERT_REGDUMP;
8972 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8974 /* Header fields of interest. */
8975 if (r->anchored_substr) {
8976 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8977 RE_SV_DUMPLEN(r->anchored_substr), 30);
8978 PerlIO_printf(Perl_debug_log,
8979 "anchored %s%s at %"IVdf" ",
8980 s, RE_SV_TAIL(r->anchored_substr),
8981 (IV)r->anchored_offset);
8982 } else if (r->anchored_utf8) {
8983 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8984 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8985 PerlIO_printf(Perl_debug_log,
8986 "anchored utf8 %s%s at %"IVdf" ",
8987 s, RE_SV_TAIL(r->anchored_utf8),
8988 (IV)r->anchored_offset);
8990 if (r->float_substr) {
8991 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8992 RE_SV_DUMPLEN(r->float_substr), 30);
8993 PerlIO_printf(Perl_debug_log,
8994 "floating %s%s at %"IVdf"..%"UVuf" ",
8995 s, RE_SV_TAIL(r->float_substr),
8996 (IV)r->float_min_offset, (UV)r->float_max_offset);
8997 } else if (r->float_utf8) {
8998 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8999 RE_SV_DUMPLEN(r->float_utf8), 30);
9000 PerlIO_printf(Perl_debug_log,
9001 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9002 s, RE_SV_TAIL(r->float_utf8),
9003 (IV)r->float_min_offset, (UV)r->float_max_offset);
9005 if (r->check_substr || r->check_utf8)
9006 PerlIO_printf(Perl_debug_log,
9008 (r->check_substr == r->float_substr
9009 && r->check_utf8 == r->float_utf8
9010 ? "(checking floating" : "(checking anchored"));
9011 if (r->extflags & RXf_NOSCAN)
9012 PerlIO_printf(Perl_debug_log, " noscan");
9013 if (r->extflags & RXf_CHECK_ALL)
9014 PerlIO_printf(Perl_debug_log, " isall");
9015 if (r->check_substr || r->check_utf8)
9016 PerlIO_printf(Perl_debug_log, ") ");
9018 if (ri->regstclass) {
9019 regprop(r, sv, ri->regstclass);
9020 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9022 if (r->extflags & RXf_ANCH) {
9023 PerlIO_printf(Perl_debug_log, "anchored");
9024 if (r->extflags & RXf_ANCH_BOL)
9025 PerlIO_printf(Perl_debug_log, "(BOL)");
9026 if (r->extflags & RXf_ANCH_MBOL)
9027 PerlIO_printf(Perl_debug_log, "(MBOL)");
9028 if (r->extflags & RXf_ANCH_SBOL)
9029 PerlIO_printf(Perl_debug_log, "(SBOL)");
9030 if (r->extflags & RXf_ANCH_GPOS)
9031 PerlIO_printf(Perl_debug_log, "(GPOS)");
9032 PerlIO_putc(Perl_debug_log, ' ');
9034 if (r->extflags & RXf_GPOS_SEEN)
9035 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9036 if (r->intflags & PREGf_SKIP)
9037 PerlIO_printf(Perl_debug_log, "plus ");
9038 if (r->intflags & PREGf_IMPLICIT)
9039 PerlIO_printf(Perl_debug_log, "implicit ");
9040 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9041 if (r->extflags & RXf_EVAL_SEEN)
9042 PerlIO_printf(Perl_debug_log, "with eval ");
9043 PerlIO_printf(Perl_debug_log, "\n");
9044 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9046 PERL_ARGS_ASSERT_REGDUMP;
9047 PERL_UNUSED_CONTEXT;
9049 #endif /* DEBUGGING */
9053 - regprop - printable representation of opcode
9055 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9058 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9059 if (flags & ANYOF_INVERT) \
9060 /*make sure the invert info is in each */ \
9061 sv_catpvs(sv, "^"); \
9067 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9072 RXi_GET_DECL(prog,progi);
9073 GET_RE_DEBUG_FLAGS_DECL;
9075 PERL_ARGS_ASSERT_REGPROP;
9079 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9080 /* It would be nice to FAIL() here, but this may be called from
9081 regexec.c, and it would be hard to supply pRExC_state. */
9082 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9083 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9085 k = PL_regkind[OP(o)];
9089 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9090 * is a crude hack but it may be the best for now since
9091 * we have no flag "this EXACTish node was UTF-8"
9093 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9094 PERL_PV_ESCAPE_UNI_DETECT |
9095 PERL_PV_PRETTY_ELLIPSES |
9096 PERL_PV_PRETTY_LTGT |
9097 PERL_PV_PRETTY_NOCLEAR
9099 } else if (k == TRIE) {
9100 /* print the details of the trie in dumpuntil instead, as
9101 * progi->data isn't available here */
9102 const char op = OP(o);
9103 const U32 n = ARG(o);
9104 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9105 (reg_ac_data *)progi->data->data[n] :
9107 const reg_trie_data * const trie
9108 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9110 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9111 DEBUG_TRIE_COMPILE_r(
9112 Perl_sv_catpvf(aTHX_ sv,
9113 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9114 (UV)trie->startstate,
9115 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9116 (UV)trie->wordcount,
9119 (UV)TRIE_CHARCOUNT(trie),
9120 (UV)trie->uniquecharcount
9123 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9125 int rangestart = -1;
9126 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9128 for (i = 0; i <= 256; i++) {
9129 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9130 if (rangestart == -1)
9132 } else if (rangestart != -1) {
9133 if (i <= rangestart + 3)
9134 for (; rangestart < i; rangestart++)
9135 put_byte(sv, rangestart);
9137 put_byte(sv, rangestart);
9139 put_byte(sv, i - 1);
9147 } else if (k == CURLY) {
9148 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9149 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9150 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9152 else if (k == WHILEM && o->flags) /* Ordinal/of */
9153 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9154 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9155 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9156 if ( RXp_PAREN_NAMES(prog) ) {
9157 if ( k != REF || OP(o) < NREF) {
9158 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9159 SV **name= av_fetch(list, ARG(o), 0 );
9161 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9164 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9165 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9166 I32 *nums=(I32*)SvPVX(sv_dat);
9167 SV **name= av_fetch(list, nums[0], 0 );
9170 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9171 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9172 (n ? "," : ""), (IV)nums[n]);
9174 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9178 } else if (k == GOSUB)
9179 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9180 else if (k == VERB) {
9182 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9183 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9184 } else if (k == LOGICAL)
9185 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9186 else if (k == FOLDCHAR)
9187 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9188 else if (k == ANYOF) {
9189 int i, rangestart = -1;
9190 const U8 flags = ANYOF_FLAGS(o);
9193 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9194 static const char * const anyofs[] = {
9227 if (flags & ANYOF_LOCALE)
9228 sv_catpvs(sv, "{loc}");
9229 if (flags & ANYOF_FOLD)
9230 sv_catpvs(sv, "{i}");
9231 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9232 if (flags & ANYOF_INVERT)
9235 /* output what the standard cp 0-255 bitmap matches */
9236 for (i = 0; i <= 256; i++) {
9237 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9238 if (rangestart == -1)
9240 } else if (rangestart != -1) {
9241 if (i <= rangestart + 3)
9242 for (; rangestart < i; rangestart++)
9243 put_byte(sv, rangestart);
9245 put_byte(sv, rangestart);
9247 put_byte(sv, i - 1);
9254 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9255 /* output any special charclass tests (used mostly under use locale) */
9256 if (o->flags & ANYOF_CLASS)
9257 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9258 if (ANYOF_CLASS_TEST(o,i)) {
9259 sv_catpv(sv, anyofs[i]);
9263 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9265 /* output information about the unicode matching */
9266 if (flags & ANYOF_UNICODE)
9267 sv_catpvs(sv, "{unicode}");
9268 else if (flags & ANYOF_UNICODE_ALL)
9269 sv_catpvs(sv, "{unicode_all}");
9273 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9277 U8 s[UTF8_MAXBYTES_CASE+1];
9279 for (i = 0; i <= 256; i++) { /* just the first 256 */
9280 uvchr_to_utf8(s, i);
9282 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9283 if (rangestart == -1)
9285 } else if (rangestart != -1) {
9286 if (i <= rangestart + 3)
9287 for (; rangestart < i; rangestart++) {
9288 const U8 * const e = uvchr_to_utf8(s,rangestart);
9290 for(p = s; p < e; p++)
9294 const U8 *e = uvchr_to_utf8(s,rangestart);
9296 for (p = s; p < e; p++)
9299 e = uvchr_to_utf8(s, i-1);
9300 for (p = s; p < e; p++)
9307 sv_catpvs(sv, "..."); /* et cetera */
9311 char *s = savesvpv(lv);
9312 char * const origs = s;
9314 while (*s && *s != '\n')
9318 const char * const t = ++s;
9336 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9338 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9339 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9341 PERL_UNUSED_CONTEXT;
9342 PERL_UNUSED_ARG(sv);
9344 PERL_UNUSED_ARG(prog);
9345 #endif /* DEBUGGING */
9349 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9350 { /* Assume that RE_INTUIT is set */
9352 struct regexp *const prog = (struct regexp *)SvANY(r);
9353 GET_RE_DEBUG_FLAGS_DECL;
9355 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9356 PERL_UNUSED_CONTEXT;
9360 const char * const s = SvPV_nolen_const(prog->check_substr
9361 ? prog->check_substr : prog->check_utf8);
9363 if (!PL_colorset) reginitcolors();
9364 PerlIO_printf(Perl_debug_log,
9365 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9367 prog->check_substr ? "" : "utf8 ",
9368 PL_colors[5],PL_colors[0],
9371 (strlen(s) > 60 ? "..." : ""));
9374 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9380 handles refcounting and freeing the perl core regexp structure. When
9381 it is necessary to actually free the structure the first thing it
9382 does is call the 'free' method of the regexp_engine associated to to
9383 the regexp, allowing the handling of the void *pprivate; member
9384 first. (This routine is not overridable by extensions, which is why
9385 the extensions free is called first.)
9387 See regdupe and regdupe_internal if you change anything here.
9389 #ifndef PERL_IN_XSUB_RE
9391 Perl_pregfree(pTHX_ REGEXP *r)
9397 Perl_pregfree2(pTHX_ REGEXP *rx)
9400 struct regexp *const r = (struct regexp *)SvANY(rx);
9401 GET_RE_DEBUG_FLAGS_DECL;
9403 PERL_ARGS_ASSERT_PREGFREE2;
9406 ReREFCNT_dec(r->mother_re);
9408 CALLREGFREE_PVT(rx); /* free the private data */
9409 if (RXp_PAREN_NAMES(r))
9410 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9413 if (r->anchored_substr)
9414 SvREFCNT_dec(r->anchored_substr);
9415 if (r->anchored_utf8)
9416 SvREFCNT_dec(r->anchored_utf8);
9417 if (r->float_substr)
9418 SvREFCNT_dec(r->float_substr);
9420 SvREFCNT_dec(r->float_utf8);
9421 Safefree(r->substrs);
9423 RX_MATCH_COPY_FREE(rx);
9424 #ifdef PERL_OLD_COPY_ON_WRITE
9426 SvREFCNT_dec(r->saved_copy);
9433 This is a hacky workaround to the structural issue of match results
9434 being stored in the regexp structure which is in turn stored in
9435 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9436 could be PL_curpm in multiple contexts, and could require multiple
9437 result sets being associated with the pattern simultaneously, such
9438 as when doing a recursive match with (??{$qr})
9440 The solution is to make a lightweight copy of the regexp structure
9441 when a qr// is returned from the code executed by (??{$qr}) this
9442 lightweight copy doesnt actually own any of its data except for
9443 the starp/end and the actual regexp structure itself.
9449 Perl_reg_temp_copy (pTHX_ REGEXP *rx)
9451 REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9452 struct regexp *ret = (struct regexp *)SvANY(ret_x);
9453 struct regexp *const r = (struct regexp *)SvANY(rx);
9454 register const I32 npar = r->nparens+1;
9456 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9458 (void)ReREFCNT_inc(rx);
9459 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9460 by pointing directly at the buffer, but flagging that the allocated
9461 space in the copy is zero. As we've just done a struct copy, it's now
9462 a case of zero-ing that, rather than copying the current length. */
9463 SvPV_set(ret_x, RX_WRAPPED(rx));
9464 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9465 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9466 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9467 SvLEN_set(ret_x, 0);
9468 Newx(ret->offs, npar, regexp_paren_pair);
9469 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9471 Newx(ret->substrs, 1, struct reg_substr_data);
9472 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9474 SvREFCNT_inc_void(ret->anchored_substr);
9475 SvREFCNT_inc_void(ret->anchored_utf8);
9476 SvREFCNT_inc_void(ret->float_substr);
9477 SvREFCNT_inc_void(ret->float_utf8);
9479 /* check_substr and check_utf8, if non-NULL, point to either their
9480 anchored or float namesakes, and don't hold a second reference. */
9482 RX_MATCH_COPIED_off(ret_x);
9483 #ifdef PERL_OLD_COPY_ON_WRITE
9484 ret->saved_copy = NULL;
9486 ret->mother_re = rx;
9492 /* regfree_internal()
9494 Free the private data in a regexp. This is overloadable by
9495 extensions. Perl takes care of the regexp structure in pregfree(),
9496 this covers the *pprivate pointer which technically perldoesnt
9497 know about, however of course we have to handle the
9498 regexp_internal structure when no extension is in use.
9500 Note this is called before freeing anything in the regexp
9505 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9508 struct regexp *const r = (struct regexp *)SvANY(rx);
9510 GET_RE_DEBUG_FLAGS_DECL;
9512 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9518 SV *dsv= sv_newmortal();
9519 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9520 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9521 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9522 PL_colors[4],PL_colors[5],s);
9525 #ifdef RE_TRACK_PATTERN_OFFSETS
9527 Safefree(ri->u.offsets); /* 20010421 MJD */
9530 int n = ri->data->count;
9531 PAD* new_comppad = NULL;
9536 /* If you add a ->what type here, update the comment in regcomp.h */
9537 switch (ri->data->what[n]) {
9541 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9544 Safefree(ri->data->data[n]);
9547 new_comppad = MUTABLE_AV(ri->data->data[n]);
9550 if (new_comppad == NULL)
9551 Perl_croak(aTHX_ "panic: pregfree comppad");
9552 PAD_SAVE_LOCAL(old_comppad,
9553 /* Watch out for global destruction's random ordering. */
9554 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9557 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9560 op_free((OP_4tree*)ri->data->data[n]);
9562 PAD_RESTORE_LOCAL(old_comppad);
9563 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9569 { /* Aho Corasick add-on structure for a trie node.
9570 Used in stclass optimization only */
9572 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9574 refcount = --aho->refcount;
9577 PerlMemShared_free(aho->states);
9578 PerlMemShared_free(aho->fail);
9579 /* do this last!!!! */
9580 PerlMemShared_free(ri->data->data[n]);
9581 PerlMemShared_free(ri->regstclass);
9587 /* trie structure. */
9589 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9591 refcount = --trie->refcount;
9594 PerlMemShared_free(trie->charmap);
9595 PerlMemShared_free(trie->states);
9596 PerlMemShared_free(trie->trans);
9598 PerlMemShared_free(trie->bitmap);
9600 PerlMemShared_free(trie->wordlen);
9602 PerlMemShared_free(trie->jump);
9604 PerlMemShared_free(trie->nextword);
9605 /* do this last!!!! */
9606 PerlMemShared_free(ri->data->data[n]);
9611 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9614 Safefree(ri->data->what);
9621 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9622 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9623 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9624 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9627 re_dup - duplicate a regexp.
9629 This routine is expected to clone a given regexp structure. It is only
9630 compiled under USE_ITHREADS.
9632 After all of the core data stored in struct regexp is duplicated
9633 the regexp_engine.dupe method is used to copy any private data
9634 stored in the *pprivate pointer. This allows extensions to handle
9635 any duplication it needs to do.
9637 See pregfree() and regfree_internal() if you change anything here.
9639 #if defined(USE_ITHREADS)
9640 #ifndef PERL_IN_XSUB_RE
9642 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9646 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9647 struct regexp *ret = (struct regexp *)SvANY(dstr);
9649 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9651 npar = r->nparens+1;
9652 Newx(ret->offs, npar, regexp_paren_pair);
9653 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9655 /* no need to copy these */
9656 Newx(ret->swap, npar, regexp_paren_pair);
9660 /* Do it this way to avoid reading from *r after the StructCopy().
9661 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9662 cache, it doesn't matter. */
9663 const bool anchored = r->check_substr
9664 ? r->check_substr == r->anchored_substr
9665 : r->check_utf8 == r->anchored_utf8;
9666 Newx(ret->substrs, 1, struct reg_substr_data);
9667 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9669 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9670 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9671 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9672 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9674 /* check_substr and check_utf8, if non-NULL, point to either their
9675 anchored or float namesakes, and don't hold a second reference. */
9677 if (ret->check_substr) {
9679 assert(r->check_utf8 == r->anchored_utf8);
9680 ret->check_substr = ret->anchored_substr;
9681 ret->check_utf8 = ret->anchored_utf8;
9683 assert(r->check_substr == r->float_substr);
9684 assert(r->check_utf8 == r->float_utf8);
9685 ret->check_substr = ret->float_substr;
9686 ret->check_utf8 = ret->float_utf8;
9688 } else if (ret->check_utf8) {
9690 ret->check_utf8 = ret->anchored_utf8;
9692 ret->check_utf8 = ret->float_utf8;
9697 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9700 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9702 if (RX_MATCH_COPIED(dstr))
9703 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9706 #ifdef PERL_OLD_COPY_ON_WRITE
9707 ret->saved_copy = NULL;
9710 ret->mother_re = NULL;
9713 #endif /* PERL_IN_XSUB_RE */
9718 This is the internal complement to regdupe() which is used to copy
9719 the structure pointed to by the *pprivate pointer in the regexp.
9720 This is the core version of the extension overridable cloning hook.
9721 The regexp structure being duplicated will be copied by perl prior
9722 to this and will be provided as the regexp *r argument, however
9723 with the /old/ structures pprivate pointer value. Thus this routine
9724 may override any copying normally done by perl.
9726 It returns a pointer to the new regexp_internal structure.
9730 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9733 struct regexp *const r = (struct regexp *)SvANY(rx);
9734 regexp_internal *reti;
9738 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9740 npar = r->nparens+1;
9743 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9744 Copy(ri->program, reti->program, len+1, regnode);
9747 reti->regstclass = NULL;
9751 const int count = ri->data->count;
9754 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9755 char, struct reg_data);
9756 Newx(d->what, count, U8);
9759 for (i = 0; i < count; i++) {
9760 d->what[i] = ri->data->what[i];
9761 switch (d->what[i]) {
9762 /* legal options are one of: sSfpontTu
9763 see also regcomp.h and pregfree() */
9766 case 'p': /* actually an AV, but the dup function is identical. */
9767 case 'u': /* actually an HV, but the dup function is identical. */
9768 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9771 /* This is cheating. */
9772 Newx(d->data[i], 1, struct regnode_charclass_class);
9773 StructCopy(ri->data->data[i], d->data[i],
9774 struct regnode_charclass_class);
9775 reti->regstclass = (regnode*)d->data[i];
9778 /* Compiled op trees are readonly and in shared memory,
9779 and can thus be shared without duplication. */
9781 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9785 /* Trie stclasses are readonly and can thus be shared
9786 * without duplication. We free the stclass in pregfree
9787 * when the corresponding reg_ac_data struct is freed.
9789 reti->regstclass= ri->regstclass;
9793 ((reg_trie_data*)ri->data->data[i])->refcount++;
9797 d->data[i] = ri->data->data[i];
9800 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9809 reti->name_list_idx = ri->name_list_idx;
9811 #ifdef RE_TRACK_PATTERN_OFFSETS
9812 if (ri->u.offsets) {
9813 Newx(reti->u.offsets, 2*len+1, U32);
9814 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9817 SetProgLen(reti,len);
9823 #endif /* USE_ITHREADS */
9825 #ifndef PERL_IN_XSUB_RE
9828 - regnext - dig the "next" pointer out of a node
9831 Perl_regnext(pTHX_ register regnode *p)
9834 register I32 offset;
9839 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9848 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9851 STRLEN l1 = strlen(pat1);
9852 STRLEN l2 = strlen(pat2);
9855 const char *message;
9857 PERL_ARGS_ASSERT_RE_CROAK2;
9863 Copy(pat1, buf, l1 , char);
9864 Copy(pat2, buf + l1, l2 , char);
9865 buf[l1 + l2] = '\n';
9866 buf[l1 + l2 + 1] = '\0';
9868 /* ANSI variant takes additional second argument */
9869 va_start(args, pat2);
9873 msv = vmess(buf, &args);
9875 message = SvPV_const(msv,l1);
9878 Copy(message, buf, l1 , char);
9879 buf[l1-1] = '\0'; /* Overwrite \n */
9880 Perl_croak(aTHX_ "%s", buf);
9883 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9885 #ifndef PERL_IN_XSUB_RE
9887 Perl_save_re_context(pTHX)
9891 struct re_save_state *state;
9893 SAVEVPTR(PL_curcop);
9894 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9896 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9897 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9898 SSPUSHINT(SAVEt_RE_STATE);
9900 Copy(&PL_reg_state, state, 1, struct re_save_state);
9902 PL_reg_start_tmp = 0;
9903 PL_reg_start_tmpl = 0;
9904 PL_reg_oldsaved = NULL;
9905 PL_reg_oldsavedlen = 0;
9907 PL_reg_leftiter = 0;
9908 PL_reg_poscache = NULL;
9909 PL_reg_poscache_size = 0;
9910 #ifdef PERL_OLD_COPY_ON_WRITE
9914 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9916 const REGEXP * const rx = PM_GETRE(PL_curpm);
9919 for (i = 1; i <= RX_NPARENS(rx); i++) {
9920 char digits[TYPE_CHARS(long)];
9921 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9922 GV *const *const gvp
9923 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9926 GV * const gv = *gvp;
9927 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9937 clear_re(pTHX_ void *r)
9940 ReREFCNT_dec((REGEXP *)r);
9946 S_put_byte(pTHX_ SV *sv, int c)
9948 PERL_ARGS_ASSERT_PUT_BYTE;
9950 /* Our definition of isPRINT() ignores locales, so only bytes that are
9951 not part of UTF-8 are considered printable. I assume that the same
9952 holds for UTF-EBCDIC.
9953 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9954 which Wikipedia says:
9956 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9957 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9958 identical, to the ASCII delete (DEL) or rubout control character.
9959 ) So the old condition can be simplified to !isPRINT(c) */
9961 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9963 const char string = c;
9964 if (c == '-' || c == ']' || c == '\\' || c == '^')
9965 sv_catpvs(sv, "\\");
9966 sv_catpvn(sv, &string, 1);
9971 #define CLEAR_OPTSTART \
9972 if (optstart) STMT_START { \
9973 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9977 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9979 STATIC const regnode *
9980 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9981 const regnode *last, const regnode *plast,
9982 SV* sv, I32 indent, U32 depth)
9985 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9986 register const regnode *next;
9987 const regnode *optstart= NULL;
9990 GET_RE_DEBUG_FLAGS_DECL;
9992 PERL_ARGS_ASSERT_DUMPUNTIL;
9994 #ifdef DEBUG_DUMPUNTIL
9995 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9996 last ? last-start : 0,plast ? plast-start : 0);
9999 if (plast && plast < last)
10002 while (PL_regkind[op] != END && (!last || node < last)) {
10003 /* While that wasn't END last time... */
10006 if (op == CLOSE || op == WHILEM)
10008 next = regnext((regnode *)node);
10011 if (OP(node) == OPTIMIZED) {
10012 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10019 regprop(r, sv, node);
10020 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10021 (int)(2*indent + 1), "", SvPVX_const(sv));
10023 if (OP(node) != OPTIMIZED) {
10024 if (next == NULL) /* Next ptr. */
10025 PerlIO_printf(Perl_debug_log, " (0)");
10026 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10027 PerlIO_printf(Perl_debug_log, " (FAIL)");
10029 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10030 (void)PerlIO_putc(Perl_debug_log, '\n');
10034 if (PL_regkind[(U8)op] == BRANCHJ) {
10037 register const regnode *nnode = (OP(next) == LONGJMP
10038 ? regnext((regnode *)next)
10040 if (last && nnode > last)
10042 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10045 else if (PL_regkind[(U8)op] == BRANCH) {
10047 DUMPUNTIL(NEXTOPER(node), next);
10049 else if ( PL_regkind[(U8)op] == TRIE ) {
10050 const regnode *this_trie = node;
10051 const char op = OP(node);
10052 const U32 n = ARG(node);
10053 const reg_ac_data * const ac = op>=AHOCORASICK ?
10054 (reg_ac_data *)ri->data->data[n] :
10056 const reg_trie_data * const trie =
10057 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10059 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10061 const regnode *nextbranch= NULL;
10064 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10065 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10067 PerlIO_printf(Perl_debug_log, "%*s%s ",
10068 (int)(2*(indent+3)), "",
10069 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10070 PL_colors[0], PL_colors[1],
10071 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10072 PERL_PV_PRETTY_ELLIPSES |
10073 PERL_PV_PRETTY_LTGT
10078 U16 dist= trie->jump[word_idx+1];
10079 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10080 (UV)((dist ? this_trie + dist : next) - start));
10083 nextbranch= this_trie + trie->jump[0];
10084 DUMPUNTIL(this_trie + dist, nextbranch);
10086 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10087 nextbranch= regnext((regnode *)nextbranch);
10089 PerlIO_printf(Perl_debug_log, "\n");
10092 if (last && next > last)
10097 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10098 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10099 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10101 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10103 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10105 else if ( op == PLUS || op == STAR) {
10106 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10108 else if (op == ANYOF) {
10109 /* arglen 1 + class block */
10110 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10111 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10112 node = NEXTOPER(node);
10114 else if (PL_regkind[(U8)op] == EXACT) {
10115 /* Literal string, where present. */
10116 node += NODE_SZ_STR(node) - 1;
10117 node = NEXTOPER(node);
10120 node = NEXTOPER(node);
10121 node += regarglen[(U8)op];
10123 if (op == CURLYX || op == OPEN)
10127 #ifdef DEBUG_DUMPUNTIL
10128 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10133 #endif /* DEBUGGING */
10137 * c-indentation-style: bsd
10138 * c-basic-offset: 4
10139 * indent-tabs-mode: t
10142 * ex: set ts=8 sts=4 sw=4 noet: