5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *paren_names; /* Paren names */
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
144 const char *lastparse;
146 AV *paren_name_list; /* idx -> name */
147 #define RExC_lastparse (pRExC_state->lastparse)
148 #define RExC_lastnum (pRExC_state->lastnum)
149 #define RExC_paren_name_list (pRExC_state->paren_name_list)
153 #define RExC_flags (pRExC_state->flags)
154 #define RExC_precomp (pRExC_state->precomp)
155 #define RExC_rx_sv (pRExC_state->rx_sv)
156 #define RExC_rx (pRExC_state->rx)
157 #define RExC_rxi (pRExC_state->rxi)
158 #define RExC_start (pRExC_state->start)
159 #define RExC_end (pRExC_state->end)
160 #define RExC_parse (pRExC_state->parse)
161 #define RExC_whilem_seen (pRExC_state->whilem_seen)
162 #ifdef RE_TRACK_PATTERN_OFFSETS
163 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165 #define RExC_emit (pRExC_state->emit)
166 #define RExC_emit_start (pRExC_state->emit_start)
167 #define RExC_emit_bound (pRExC_state->emit_bound)
168 #define RExC_naughty (pRExC_state->naughty)
169 #define RExC_sawback (pRExC_state->sawback)
170 #define RExC_seen (pRExC_state->seen)
171 #define RExC_size (pRExC_state->size)
172 #define RExC_npar (pRExC_state->npar)
173 #define RExC_nestroot (pRExC_state->nestroot)
174 #define RExC_extralen (pRExC_state->extralen)
175 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176 #define RExC_seen_evals (pRExC_state->seen_evals)
177 #define RExC_utf8 (pRExC_state->utf8)
178 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
179 #define RExC_open_parens (pRExC_state->open_parens)
180 #define RExC_close_parens (pRExC_state->close_parens)
181 #define RExC_opend (pRExC_state->opend)
182 #define RExC_paren_names (pRExC_state->paren_names)
183 #define RExC_recurse (pRExC_state->recurse)
184 #define RExC_recurse_count (pRExC_state->recurse_count)
187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
192 #undef SPSTART /* dratted cpp namespace... */
195 * Flags to be passed up and down.
197 #define WORST 0 /* Worst case. */
198 #define HASWIDTH 0x01 /* Known to match non-null strings. */
199 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200 #define SPSTART 0x04 /* Starts with * or +. */
201 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
202 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
206 /* whether trie related optimizations are enabled */
207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208 #define TRIE_STUDY_OPT
209 #define FULL_TRIE_STUDY
215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216 #define PBITVAL(paren) (1 << ((paren) & 7))
217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222 /* About scan_data_t.
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
238 The strings can be composites, for instance
242 will result in a composite fixed substring 'foo'.
244 For each string some basic information is maintained:
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
297 typedef struct scan_data_t {
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
303 I32 last_end; /* min value, <0 unless valid. */
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
319 struct regnode_charclass_class *start_class;
323 * Forward declarations for pregcomp()'s friends.
326 static const scan_data_t zero_scan_data =
327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
329 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330 #define SF_BEFORE_SEOL 0x0001
331 #define SF_BEFORE_MEOL 0x0002
332 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336 # define SF_FIX_SHIFT_EOL (0+2)
337 # define SF_FL_SHIFT_EOL (0+4)
339 # define SF_FIX_SHIFT_EOL (+2)
340 # define SF_FL_SHIFT_EOL (+4)
343 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
348 #define SF_IS_INF 0x0040
349 #define SF_HAS_PAR 0x0080
350 #define SF_IN_PAR 0x0100
351 #define SF_HAS_EVAL 0x0200
352 #define SCF_DO_SUBSTR 0x0400
353 #define SCF_DO_STCLASS_AND 0x0800
354 #define SCF_DO_STCLASS_OR 0x1000
355 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356 #define SCF_WHILEM_VISITED_POS 0x2000
358 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
359 #define SCF_SEEN_ACCEPT 0x8000
361 #define UTF (RExC_utf8 != 0)
362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
365 #define OOB_UNICODE 12345678
366 #define OOB_NAMEDCLASS -1
368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372 /* length of regex to show in messages that don't mark a position within */
373 #define RegexLengthToShowInErrorMessages 127
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
380 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
381 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 #define _FAIL(code) STMT_START { \
391 const char *ellipses = ""; \
392 IV len = RExC_end - RExC_precomp; \
395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
404 #define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
408 #define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
415 #define Simple_vFAIL(m) STMT_START { \
416 const IV offset = RExC_parse - RExC_precomp; \
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
424 #define vFAIL(m) STMT_START { \
426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
431 * Like Simple_vFAIL(), but accepts two arguments.
433 #define Simple_vFAIL2(m,a1) STMT_START { \
434 const IV offset = RExC_parse - RExC_precomp; \
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
442 #define vFAIL2(m,a1) STMT_START { \
444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
445 Simple_vFAIL2(m, a1); \
450 * Like Simple_vFAIL(), but accepts three arguments.
452 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
453 const IV offset = RExC_parse - RExC_precomp; \
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
461 #define vFAIL3(m,a1,a2) STMT_START { \
463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
464 Simple_vFAIL3(m, a1, a2); \
468 * Like Simple_vFAIL(), but accepts four arguments.
470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
476 #define ckWARNreg(loc,m) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 (int)offset, RExC_precomp, RExC_precomp + offset); \
482 #define ckWARNregdep(loc,m) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
486 (int)offset, RExC_precomp, RExC_precomp + offset); \
489 #define ckWARN2reg(loc, m, a1) STMT_START { \
490 const IV offset = loc - RExC_precomp; \
491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
495 #define vWARN3(loc, m, a1, a2) STMT_START { \
496 const IV offset = loc - RExC_precomp; \
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
501 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
507 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
508 const IV offset = loc - RExC_precomp; \
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
513 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
519 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
520 const IV offset = loc - RExC_precomp; \
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
526 /* Allow for side effects in s */
527 #define REGC(c,s) STMT_START { \
528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
531 /* Macros for recording node offsets. 20001227 mjd@plover.com
532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
533 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
534 * Element 0 holds the number n.
535 * Position is 1 indexed.
537 #ifndef RE_TRACK_PATTERN_OFFSETS
538 #define Set_Node_Offset_To_R(node,byte)
539 #define Set_Node_Offset(node,byte)
540 #define Set_Cur_Node_Offset
541 #define Set_Node_Length_To_R(node,len)
542 #define Set_Node_Length(node,len)
543 #define Set_Node_Cur_Length(node)
544 #define Node_Offset(n)
545 #define Node_Length(n)
546 #define Set_Node_Offset_Length(node,offset,len)
547 #define ProgLen(ri) ri->u.proglen
548 #define SetProgLen(ri,x) ri->u.proglen = x
550 #define ProgLen(ri) ri->u.offsets[0]
551 #define SetProgLen(ri,x) ri->u.offsets[0] = x
552 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
555 __LINE__, (int)(node), (int)(byte))); \
557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
559 RExC_offsets[2*(node)-1] = (byte); \
564 #define Set_Node_Offset(node,byte) \
565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
568 #define Set_Node_Length_To_R(node,len) STMT_START { \
570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
571 __LINE__, (int)(node), (int)(len))); \
573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
575 RExC_offsets[2*(node)] = (len); \
580 #define Set_Node_Length(node,len) \
581 Set_Node_Length_To_R((node)-RExC_emit_start, len)
582 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583 #define Set_Node_Cur_Length(node) \
584 Set_Node_Length(node, RExC_parse - parse_start)
586 /* Get offsets and lengths */
587 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
590 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
596 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597 #define EXPERIMENTAL_INPLACESCAN
598 #endif /*RE_TRACK_PATTERN_OFFSETS*/
600 #define DEBUG_STUDYDATA(str,data,depth) \
601 DEBUG_OPTIMISE_MORE_r(if(data){ \
602 PerlIO_printf(Perl_debug_log, \
603 "%*s" str "Pos:%"IVdf"/%"IVdf \
604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
605 (int)(depth)*2, "", \
606 (IV)((data)->pos_min), \
607 (IV)((data)->pos_delta), \
608 (UV)((data)->flags), \
609 (IV)((data)->whilem_c), \
610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
611 is_inf ? "INF " : "" \
613 if ((data)->last_found) \
614 PerlIO_printf(Perl_debug_log, \
615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
617 SvPVX_const((data)->last_found), \
618 (IV)((data)->last_end), \
619 (IV)((data)->last_start_min), \
620 (IV)((data)->last_start_max), \
621 ((data)->longest && \
622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
623 SvPVX_const((data)->longest_fixed), \
624 (IV)((data)->offset_fixed), \
625 ((data)->longest && \
626 (data)->longest==&((data)->longest_float)) ? "*" : "", \
627 SvPVX_const((data)->longest_float), \
628 (IV)((data)->offset_float_min), \
629 (IV)((data)->offset_float_max) \
631 PerlIO_printf(Perl_debug_log,"\n"); \
634 static void clear_re(pTHX_ void *r);
636 /* Mark that we cannot extend a found fixed substring at this point.
637 Update the longest found anchored substring and the longest found
638 floating substrings if needed. */
641 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
643 const STRLEN l = CHR_SVLEN(data->last_found);
644 const STRLEN old_l = CHR_SVLEN(*data->longest);
645 GET_RE_DEBUG_FLAGS_DECL;
647 PERL_ARGS_ASSERT_SCAN_COMMIT;
649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
650 SvSetMagicSV(*data->longest, data->last_found);
651 if (*data->longest == data->longest_fixed) {
652 data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 if (data->flags & SF_BEFORE_EOL)
655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
657 data->flags &= ~SF_FIX_BEFORE_EOL;
658 data->minlen_fixed=minlenp;
659 data->lookbehind_fixed=0;
661 else { /* *data->longest == data->longest_float */
662 data->offset_float_min = l ? data->last_start_min : data->pos_min;
663 data->offset_float_max = (l
664 ? data->last_start_max
665 : data->pos_min + data->pos_delta);
666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
667 data->offset_float_max = I32_MAX;
668 if (data->flags & SF_BEFORE_EOL)
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
672 data->flags &= ~SF_FL_BEFORE_EOL;
673 data->minlen_float=minlenp;
674 data->lookbehind_float=0;
677 SvCUR_set(data->last_found, 0);
679 SV * const sv = data->last_found;
680 if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
687 data->flags &= ~SF_BEFORE_EOL;
688 DEBUG_STUDYDATA("commit: ",data,0);
691 /* Can match anything (initialization) */
693 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
695 PERL_ARGS_ASSERT_CL_ANYTHING;
697 ANYOF_CLASS_ZERO(cl);
698 ANYOF_BITMAP_SETALL(cl);
699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
701 cl->flags |= ANYOF_LOCALE;
704 /* Can match anything (initialization) */
706 S_cl_is_anything(const struct regnode_charclass_class *cl)
710 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
712 for (value = 0; value <= ANYOF_MAX; value += 2)
713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
715 if (!(cl->flags & ANYOF_UNICODE_ALL))
717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
722 /* Can match anything (initialization) */
724 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
726 PERL_ARGS_ASSERT_CL_INIT;
728 Zero(cl, 1, struct regnode_charclass_class);
730 cl_anything(pRExC_state, cl);
734 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
736 PERL_ARGS_ASSERT_CL_INIT_ZERO;
738 Zero(cl, 1, struct regnode_charclass_class);
740 cl_anything(pRExC_state, cl);
742 cl->flags |= ANYOF_LOCALE;
745 /* 'And' a given class with another one. Can create false positives */
746 /* We assume that cl is not inverted */
748 S_cl_and(struct regnode_charclass_class *cl,
749 const struct regnode_charclass_class *and_with)
751 PERL_ARGS_ASSERT_CL_AND;
753 assert(and_with->type == ANYOF);
754 if (!(and_with->flags & ANYOF_CLASS)
755 && !(cl->flags & ANYOF_CLASS)
756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 && !(and_with->flags & ANYOF_FOLD)
758 && !(cl->flags & ANYOF_FOLD)) {
761 if (and_with->flags & ANYOF_INVERT)
762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 cl->bitmap[i] &= ~and_with->bitmap[i];
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= and_with->bitmap[i];
767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768 if (!(and_with->flags & ANYOF_EOS))
769 cl->flags &= ~ANYOF_EOS;
771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 !(and_with->flags & ANYOF_INVERT)) {
773 cl->flags &= ~ANYOF_UNICODE_ALL;
774 cl->flags |= ANYOF_UNICODE;
775 ARG_SET(cl, ARG(and_with));
777 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 !(and_with->flags & ANYOF_INVERT))
779 cl->flags &= ~ANYOF_UNICODE_ALL;
780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 !(and_with->flags & ANYOF_INVERT))
782 cl->flags &= ~ANYOF_UNICODE;
785 /* 'OR' a given class with another one. Can create false positives */
786 /* We assume that cl is not inverted */
788 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
790 PERL_ARGS_ASSERT_CL_OR;
792 if (or_with->flags & ANYOF_INVERT) {
794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 * <= (B1 | !B2) | (CL1 | !CL2)
796 * which is wasteful if CL2 is small, but we ignore CL2:
797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 * XXXX Can we handle case-fold? Unclear:
799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 && !(or_with->flags & ANYOF_FOLD)
804 && !(cl->flags & ANYOF_FOLD) ) {
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] |= ~or_with->bitmap[i];
809 } /* XXXX: logic is complicated otherwise */
811 cl_anything(pRExC_state, cl);
814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
816 && (!(or_with->flags & ANYOF_FOLD)
817 || (cl->flags & ANYOF_FOLD)) ) {
820 /* OR char bitmap and class bitmap separately */
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] |= or_with->bitmap[i];
823 if (or_with->flags & ANYOF_CLASS) {
824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 cl->classflags[i] |= or_with->classflags[i];
826 cl->flags |= ANYOF_CLASS;
829 else { /* XXXX: logic is complicated, leave it along for a moment. */
830 cl_anything(pRExC_state, cl);
833 if (or_with->flags & ANYOF_EOS)
834 cl->flags |= ANYOF_EOS;
836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 ARG(cl) != ARG(or_with)) {
838 cl->flags |= ANYOF_UNICODE_ALL;
839 cl->flags &= ~ANYOF_UNICODE;
841 if (or_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags |= ANYOF_UNICODE_ALL;
843 cl->flags &= ~ANYOF_UNICODE;
847 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
849 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
855 dump_trie(trie,widecharmap,revcharmap)
856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
859 These routines dump out a trie in a somewhat readable format.
860 The _interim_ variants are used for debugging the interim
861 tables that are used to generate the final compressed
862 representation which is what dump_trie expects.
864 Part of the reason for their existance is to provide a form
865 of documentation as to how the different representations function.
870 Dumps the final compressed table form of the trie to Perl_debug_log.
871 Used for debugging make_trie().
875 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 AV *revcharmap, U32 depth)
879 SV *sv=sv_newmortal();
880 int colwidth= widecharmap ? 6 : 4;
881 GET_RE_DEBUG_FLAGS_DECL;
883 PERL_ARGS_ASSERT_DUMP_TRIE;
885 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
886 (int)depth * 2 + 2,"",
887 "Match","Base","Ofs" );
889 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
890 SV ** const tmp = av_fetch( revcharmap, state, 0);
892 PerlIO_printf( Perl_debug_log, "%*s",
894 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
895 PL_colors[0], PL_colors[1],
896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
897 PERL_PV_ESCAPE_FIRSTCHAR
902 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
903 (int)depth * 2 + 2,"");
905 for( state = 0 ; state < trie->uniquecharcount ; state++ )
906 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
907 PerlIO_printf( Perl_debug_log, "\n");
909 for( state = 1 ; state < trie->statecount ; state++ ) {
910 const U32 base = trie->states[ state ].trans.base;
912 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
914 if ( trie->states[ state ].wordnum ) {
915 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
917 PerlIO_printf( Perl_debug_log, "%6s", "" );
920 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
925 while( ( base + ofs < trie->uniquecharcount ) ||
926 ( base + ofs - trie->uniquecharcount < trie->lasttrans
927 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
930 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
932 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
933 if ( ( base + ofs >= trie->uniquecharcount ) &&
934 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
935 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
937 PerlIO_printf( Perl_debug_log, "%*"UVXf,
939 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
941 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
945 PerlIO_printf( Perl_debug_log, "]");
948 PerlIO_printf( Perl_debug_log, "\n" );
952 Dumps a fully constructed but uncompressed trie in list form.
953 List tries normally only are used for construction when the number of
954 possible chars (trie->uniquecharcount) is very high.
955 Used for debugging make_trie().
958 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
959 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 SV *sv=sv_newmortal();
964 int colwidth= widecharmap ? 6 : 4;
965 GET_RE_DEBUG_FLAGS_DECL;
967 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
969 /* print out the table precompression. */
970 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
971 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
972 "------:-----+-----------------\n" );
974 for( state=1 ; state < next_alloc ; state ++ ) {
977 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
978 (int)depth * 2 + 2,"", (UV)state );
979 if ( ! trie->states[ state ].wordnum ) {
980 PerlIO_printf( Perl_debug_log, "%5s| ","");
982 PerlIO_printf( Perl_debug_log, "W%4x| ",
983 trie->states[ state ].wordnum
986 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
987 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
989 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
991 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
992 PL_colors[0], PL_colors[1],
993 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
994 PERL_PV_ESCAPE_FIRSTCHAR
996 TRIE_LIST_ITEM(state,charid).forid,
997 (UV)TRIE_LIST_ITEM(state,charid).newstate
1000 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1001 (int)((depth * 2) + 14), "");
1004 PerlIO_printf( Perl_debug_log, "\n");
1009 Dumps a fully constructed but uncompressed trie in table form.
1010 This is the normal DFA style state transition table, with a few
1011 twists to facilitate compression later.
1012 Used for debugging make_trie().
1015 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1016 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1021 SV *sv=sv_newmortal();
1022 int colwidth= widecharmap ? 6 : 4;
1023 GET_RE_DEBUG_FLAGS_DECL;
1025 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1028 print out the table precompression so that we can do a visual check
1029 that they are identical.
1032 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1034 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1035 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1037 PerlIO_printf( Perl_debug_log, "%*s",
1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1048 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1050 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1051 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1054 PerlIO_printf( Perl_debug_log, "\n" );
1056 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1058 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1059 (int)depth * 2 + 2,"",
1060 (UV)TRIE_NODENUM( state ) );
1062 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1063 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1065 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1067 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1069 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1070 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1073 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1080 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1081 startbranch: the first branch in the whole branch sequence
1082 first : start branch of sequence of branch-exact nodes.
1083 May be the same as startbranch
1084 last : Thing following the last branch.
1085 May be the same as tail.
1086 tail : item following the branch sequence
1087 count : words in the sequence
1088 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1089 depth : indent depth
1091 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1093 A trie is an N'ary tree where the branches are determined by digital
1094 decomposition of the key. IE, at the root node you look up the 1st character and
1095 follow that branch repeat until you find the end of the branches. Nodes can be
1096 marked as "accepting" meaning they represent a complete word. Eg:
1100 would convert into the following structure. Numbers represent states, letters
1101 following numbers represent valid transitions on the letter from that state, if
1102 the number is in square brackets it represents an accepting state, otherwise it
1103 will be in parenthesis.
1105 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1109 (1) +-i->(6)-+-s->[7]
1111 +-s->(3)-+-h->(4)-+-e->[5]
1113 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1115 This shows that when matching against the string 'hers' we will begin at state 1
1116 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1117 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1118 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1119 single traverse. We store a mapping from accepting to state to which word was
1120 matched, and then when we have multiple possibilities we try to complete the
1121 rest of the regex in the order in which they occured in the alternation.
1123 The only prior NFA like behaviour that would be changed by the TRIE support is
1124 the silent ignoring of duplicate alternations which are of the form:
1126 / (DUPE|DUPE) X? (?{ ... }) Y /x
1128 Thus EVAL blocks follwing a trie may be called a different number of times with
1129 and without the optimisation. With the optimisations dupes will be silently
1130 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1131 the following demonstrates:
1133 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1135 which prints out 'word' three times, but
1137 'words'=~/(word|word|word)(?{ print $1 })S/
1139 which doesnt print it out at all. This is due to other optimisations kicking in.
1141 Example of what happens on a structural level:
1143 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1145 1: CURLYM[1] {1,32767}(18)
1156 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1157 and should turn into:
1159 1: CURLYM[1] {1,32767}(18)
1161 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1169 Cases where tail != last would be like /(?foo|bar)baz/:
1179 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1180 and would end up looking like:
1183 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1190 d = uvuni_to_utf8_flags(d, uv, 0);
1192 is the recommended Unicode-aware way of saying
1197 #define TRIE_STORE_REVCHAR \
1200 SV *zlopp = newSV(2); \
1201 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1202 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1203 SvCUR_set(zlopp, kapow - flrbbbbb); \
1206 av_push(revcharmap, zlopp); \
1208 char ooooff = (char)uvc; \
1209 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1213 #define TRIE_READ_CHAR STMT_START { \
1217 if ( foldlen > 0 ) { \
1218 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1223 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1224 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1225 foldlen -= UNISKIP( uvc ); \
1226 scan = foldbuf + UNISKIP( uvc ); \
1229 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1239 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1240 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1241 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1242 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1244 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1245 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1246 TRIE_LIST_CUR( state )++; \
1249 #define TRIE_LIST_NEW(state) STMT_START { \
1250 Newxz( trie->states[ state ].trans.list, \
1251 4, reg_trie_trans_le ); \
1252 TRIE_LIST_CUR( state ) = 1; \
1253 TRIE_LIST_LEN( state ) = 4; \
1256 #define TRIE_HANDLE_WORD(state) STMT_START { \
1257 U16 dupe= trie->states[ state ].wordnum; \
1258 regnode * const noper_next = regnext( noper ); \
1260 if (trie->wordlen) \
1261 trie->wordlen[ curword ] = wordlen; \
1263 /* store the word for dumping */ \
1265 if (OP(noper) != NOTHING) \
1266 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1268 tmp = newSVpvn_utf8( "", 0, UTF ); \
1269 av_push( trie_words, tmp ); \
1274 if ( noper_next < tail ) { \
1276 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1277 trie->jump[curword] = (U16)(noper_next - convert); \
1279 jumper = noper_next; \
1281 nextbranch= regnext(cur); \
1285 /* So it's a dupe. This means we need to maintain a */\
1286 /* linked-list from the first to the next. */\
1287 /* we only allocate the nextword buffer when there */\
1288 /* a dupe, so first time we have to do the allocation */\
1289 if (!trie->nextword) \
1290 trie->nextword = (U16 *) \
1291 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1292 while ( trie->nextword[dupe] ) \
1293 dupe= trie->nextword[dupe]; \
1294 trie->nextword[dupe]= curword; \
1296 /* we haven't inserted this word yet. */ \
1297 trie->states[ state ].wordnum = curword; \
1302 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1303 ( ( base + charid >= ucharcount \
1304 && base + charid < ubound \
1305 && state == trie->trans[ base - ucharcount + charid ].check \
1306 && trie->trans[ base - ucharcount + charid ].next ) \
1307 ? trie->trans[ base - ucharcount + charid ].next \
1308 : ( state==1 ? special : 0 ) \
1312 #define MADE_JUMP_TRIE 2
1313 #define MADE_EXACT_TRIE 4
1316 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1319 /* first pass, loop through and scan words */
1320 reg_trie_data *trie;
1321 HV *widecharmap = NULL;
1322 AV *revcharmap = newAV();
1324 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1329 regnode *jumper = NULL;
1330 regnode *nextbranch = NULL;
1331 regnode *convert = NULL;
1332 /* we just use folder as a flag in utf8 */
1333 const U8 * const folder = ( flags == EXACTF
1335 : ( flags == EXACTFL
1342 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1343 AV *trie_words = NULL;
1344 /* along with revcharmap, this only used during construction but both are
1345 * useful during debugging so we store them in the struct when debugging.
1348 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1349 STRLEN trie_charcount=0;
1351 SV *re_trie_maxbuff;
1352 GET_RE_DEBUG_FLAGS_DECL;
1354 PERL_ARGS_ASSERT_MAKE_TRIE;
1356 PERL_UNUSED_ARG(depth);
1359 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1361 trie->startstate = 1;
1362 trie->wordcount = word_count;
1363 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1364 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1365 if (!(UTF && folder))
1366 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1368 trie_words = newAV();
1371 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1372 if (!SvIOK(re_trie_maxbuff)) {
1373 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1376 PerlIO_printf( Perl_debug_log,
1377 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1378 (int)depth * 2 + 2, "",
1379 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1380 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1384 /* Find the node we are going to overwrite */
1385 if ( first == startbranch && OP( last ) != BRANCH ) {
1386 /* whole branch chain */
1389 /* branch sub-chain */
1390 convert = NEXTOPER( first );
1393 /* -- First loop and Setup --
1395 We first traverse the branches and scan each word to determine if it
1396 contains widechars, and how many unique chars there are, this is
1397 important as we have to build a table with at least as many columns as we
1400 We use an array of integers to represent the character codes 0..255
1401 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1402 native representation of the character value as the key and IV's for the
1405 *TODO* If we keep track of how many times each character is used we can
1406 remap the columns so that the table compression later on is more
1407 efficient in terms of memory by ensuring most common value is in the
1408 middle and the least common are on the outside. IMO this would be better
1409 than a most to least common mapping as theres a decent chance the most
1410 common letter will share a node with the least common, meaning the node
1411 will not be compressable. With a middle is most common approach the worst
1412 case is when we have the least common nodes twice.
1416 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1417 regnode * const noper = NEXTOPER( cur );
1418 const U8 *uc = (U8*)STRING( noper );
1419 const U8 * const e = uc + STR_LEN( noper );
1421 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1422 const U8 *scan = (U8*)NULL;
1423 U32 wordlen = 0; /* required init */
1425 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1427 if (OP(noper) == NOTHING) {
1431 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1432 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1433 regardless of encoding */
1435 for ( ; uc < e ; uc += len ) {
1436 TRIE_CHARCOUNT(trie)++;
1440 if ( !trie->charmap[ uvc ] ) {
1441 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1443 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1447 /* store the codepoint in the bitmap, and if its ascii
1448 also store its folded equivelent. */
1449 TRIE_BITMAP_SET(trie,uvc);
1451 /* store the folded codepoint */
1452 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1455 /* store first byte of utf8 representation of
1456 codepoints in the 127 < uvc < 256 range */
1457 if (127 < uvc && uvc < 192) {
1458 TRIE_BITMAP_SET(trie,194);
1459 } else if (191 < uvc ) {
1460 TRIE_BITMAP_SET(trie,195);
1461 /* && uvc < 256 -- we know uvc is < 256 already */
1464 set_bit = 0; /* We've done our bit :-) */
1469 widecharmap = newHV();
1471 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1474 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1476 if ( !SvTRUE( *svpp ) ) {
1477 sv_setiv( *svpp, ++trie->uniquecharcount );
1482 if( cur == first ) {
1485 } else if (chars < trie->minlen) {
1487 } else if (chars > trie->maxlen) {
1491 } /* end first pass */
1492 DEBUG_TRIE_COMPILE_r(
1493 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1494 (int)depth * 2 + 2,"",
1495 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1496 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1497 (int)trie->minlen, (int)trie->maxlen )
1499 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1502 We now know what we are dealing with in terms of unique chars and
1503 string sizes so we can calculate how much memory a naive
1504 representation using a flat table will take. If it's over a reasonable
1505 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1506 conservative but potentially much slower representation using an array
1509 At the end we convert both representations into the same compressed
1510 form that will be used in regexec.c for matching with. The latter
1511 is a form that cannot be used to construct with but has memory
1512 properties similar to the list form and access properties similar
1513 to the table form making it both suitable for fast searches and
1514 small enough that its feasable to store for the duration of a program.
1516 See the comment in the code where the compressed table is produced
1517 inplace from the flat tabe representation for an explanation of how
1518 the compression works.
1523 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1525 Second Pass -- Array Of Lists Representation
1527 Each state will be represented by a list of charid:state records
1528 (reg_trie_trans_le) the first such element holds the CUR and LEN
1529 points of the allocated array. (See defines above).
1531 We build the initial structure using the lists, and then convert
1532 it into the compressed table form which allows faster lookups
1533 (but cant be modified once converted).
1536 STRLEN transcount = 1;
1538 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1539 "%*sCompiling trie using list compiler\n",
1540 (int)depth * 2 + 2, ""));
1542 trie->states = (reg_trie_state *)
1543 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1544 sizeof(reg_trie_state) );
1548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1550 regnode * const noper = NEXTOPER( cur );
1551 U8 *uc = (U8*)STRING( noper );
1552 const U8 * const e = uc + STR_LEN( noper );
1553 U32 state = 1; /* required init */
1554 U16 charid = 0; /* sanity init */
1555 U8 *scan = (U8*)NULL; /* sanity init */
1556 STRLEN foldlen = 0; /* required init */
1557 U32 wordlen = 0; /* required init */
1558 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1560 if (OP(noper) != NOTHING) {
1561 for ( ; uc < e ; uc += len ) {
1566 charid = trie->charmap[ uvc ];
1568 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1572 charid=(U16)SvIV( *svpp );
1575 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1582 if ( !trie->states[ state ].trans.list ) {
1583 TRIE_LIST_NEW( state );
1585 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1586 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1587 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1592 newstate = next_alloc++;
1593 TRIE_LIST_PUSH( state, charid, newstate );
1598 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1602 TRIE_HANDLE_WORD(state);
1604 } /* end second pass */
1606 /* next alloc is the NEXT state to be allocated */
1607 trie->statecount = next_alloc;
1608 trie->states = (reg_trie_state *)
1609 PerlMemShared_realloc( trie->states,
1611 * sizeof(reg_trie_state) );
1613 /* and now dump it out before we compress it */
1614 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1615 revcharmap, next_alloc,
1619 trie->trans = (reg_trie_trans *)
1620 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1627 for( state=1 ; state < next_alloc ; state ++ ) {
1631 DEBUG_TRIE_COMPILE_MORE_r(
1632 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1636 if (trie->states[state].trans.list) {
1637 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1641 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1642 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1643 if ( forid < minid ) {
1645 } else if ( forid > maxid ) {
1649 if ( transcount < tp + maxid - minid + 1) {
1651 trie->trans = (reg_trie_trans *)
1652 PerlMemShared_realloc( trie->trans,
1654 * sizeof(reg_trie_trans) );
1655 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1657 base = trie->uniquecharcount + tp - minid;
1658 if ( maxid == minid ) {
1660 for ( ; zp < tp ; zp++ ) {
1661 if ( ! trie->trans[ zp ].next ) {
1662 base = trie->uniquecharcount + zp - minid;
1663 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1664 trie->trans[ zp ].check = state;
1670 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1671 trie->trans[ tp ].check = state;
1676 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1677 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1678 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1679 trie->trans[ tid ].check = state;
1681 tp += ( maxid - minid + 1 );
1683 Safefree(trie->states[ state ].trans.list);
1686 DEBUG_TRIE_COMPILE_MORE_r(
1687 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1690 trie->states[ state ].trans.base=base;
1692 trie->lasttrans = tp + 1;
1696 Second Pass -- Flat Table Representation.
1698 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1699 We know that we will need Charcount+1 trans at most to store the data
1700 (one row per char at worst case) So we preallocate both structures
1701 assuming worst case.
1703 We then construct the trie using only the .next slots of the entry
1706 We use the .check field of the first entry of the node temporarily to
1707 make compression both faster and easier by keeping track of how many non
1708 zero fields are in the node.
1710 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1713 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1714 number representing the first entry of the node, and state as a
1715 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1716 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1717 are 2 entrys per node. eg:
1725 The table is internally in the right hand, idx form. However as we also
1726 have to deal with the states array which is indexed by nodenum we have to
1727 use TRIE_NODENUM() to convert.
1730 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1731 "%*sCompiling trie using table compiler\n",
1732 (int)depth * 2 + 2, ""));
1734 trie->trans = (reg_trie_trans *)
1735 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1736 * trie->uniquecharcount + 1,
1737 sizeof(reg_trie_trans) );
1738 trie->states = (reg_trie_state *)
1739 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1740 sizeof(reg_trie_state) );
1741 next_alloc = trie->uniquecharcount + 1;
1744 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1746 regnode * const noper = NEXTOPER( cur );
1747 const U8 *uc = (U8*)STRING( noper );
1748 const U8 * const e = uc + STR_LEN( noper );
1750 U32 state = 1; /* required init */
1752 U16 charid = 0; /* sanity init */
1753 U32 accept_state = 0; /* sanity init */
1754 U8 *scan = (U8*)NULL; /* sanity init */
1756 STRLEN foldlen = 0; /* required init */
1757 U32 wordlen = 0; /* required init */
1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1760 if ( OP(noper) != NOTHING ) {
1761 for ( ; uc < e ; uc += len ) {
1766 charid = trie->charmap[ uvc ];
1768 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1769 charid = svpp ? (U16)SvIV(*svpp) : 0;
1773 if ( !trie->trans[ state + charid ].next ) {
1774 trie->trans[ state + charid ].next = next_alloc;
1775 trie->trans[ state ].check++;
1776 next_alloc += trie->uniquecharcount;
1778 state = trie->trans[ state + charid ].next;
1780 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1782 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1785 accept_state = TRIE_NODENUM( state );
1786 TRIE_HANDLE_WORD(accept_state);
1788 } /* end second pass */
1790 /* and now dump it out before we compress it */
1791 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1793 next_alloc, depth+1));
1797 * Inplace compress the table.*
1799 For sparse data sets the table constructed by the trie algorithm will
1800 be mostly 0/FAIL transitions or to put it another way mostly empty.
1801 (Note that leaf nodes will not contain any transitions.)
1803 This algorithm compresses the tables by eliminating most such
1804 transitions, at the cost of a modest bit of extra work during lookup:
1806 - Each states[] entry contains a .base field which indicates the
1807 index in the state[] array wheres its transition data is stored.
1809 - If .base is 0 there are no valid transitions from that node.
1811 - If .base is nonzero then charid is added to it to find an entry in
1814 -If trans[states[state].base+charid].check!=state then the
1815 transition is taken to be a 0/Fail transition. Thus if there are fail
1816 transitions at the front of the node then the .base offset will point
1817 somewhere inside the previous nodes data (or maybe even into a node
1818 even earlier), but the .check field determines if the transition is
1822 The following process inplace converts the table to the compressed
1823 table: We first do not compress the root node 1,and mark its all its
1824 .check pointers as 1 and set its .base pointer as 1 as well. This
1825 allows to do a DFA construction from the compressed table later, and
1826 ensures that any .base pointers we calculate later are greater than
1829 - We set 'pos' to indicate the first entry of the second node.
1831 - We then iterate over the columns of the node, finding the first and
1832 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1833 and set the .check pointers accordingly, and advance pos
1834 appropriately and repreat for the next node. Note that when we copy
1835 the next pointers we have to convert them from the original
1836 NODEIDX form to NODENUM form as the former is not valid post
1839 - If a node has no transitions used we mark its base as 0 and do not
1840 advance the pos pointer.
1842 - If a node only has one transition we use a second pointer into the
1843 structure to fill in allocated fail transitions from other states.
1844 This pointer is independent of the main pointer and scans forward
1845 looking for null transitions that are allocated to a state. When it
1846 finds one it writes the single transition into the "hole". If the
1847 pointer doesnt find one the single transition is appended as normal.
1849 - Once compressed we can Renew/realloc the structures to release the
1852 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1853 specifically Fig 3.47 and the associated pseudocode.
1857 const U32 laststate = TRIE_NODENUM( next_alloc );
1860 trie->statecount = laststate;
1862 for ( state = 1 ; state < laststate ; state++ ) {
1864 const U32 stateidx = TRIE_NODEIDX( state );
1865 const U32 o_used = trie->trans[ stateidx ].check;
1866 U32 used = trie->trans[ stateidx ].check;
1867 trie->trans[ stateidx ].check = 0;
1869 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1870 if ( flag || trie->trans[ stateidx + charid ].next ) {
1871 if ( trie->trans[ stateidx + charid ].next ) {
1873 for ( ; zp < pos ; zp++ ) {
1874 if ( ! trie->trans[ zp ].next ) {
1878 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1879 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ zp ].check = state;
1881 if ( ++zp > pos ) pos = zp;
1888 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1890 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1891 trie->trans[ pos ].check = state;
1896 trie->lasttrans = pos + 1;
1897 trie->states = (reg_trie_state *)
1898 PerlMemShared_realloc( trie->states, laststate
1899 * sizeof(reg_trie_state) );
1900 DEBUG_TRIE_COMPILE_MORE_r(
1901 PerlIO_printf( Perl_debug_log,
1902 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1903 (int)depth * 2 + 2,"",
1904 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1907 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1910 } /* end table compress */
1912 DEBUG_TRIE_COMPILE_MORE_r(
1913 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1914 (int)depth * 2 + 2, "",
1915 (UV)trie->statecount,
1916 (UV)trie->lasttrans)
1918 /* resize the trans array to remove unused space */
1919 trie->trans = (reg_trie_trans *)
1920 PerlMemShared_realloc( trie->trans, trie->lasttrans
1921 * sizeof(reg_trie_trans) );
1923 /* and now dump out the compressed format */
1924 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1926 { /* Modify the program and insert the new TRIE node*/
1927 U8 nodetype =(U8)(flags & 0xFF);
1931 regnode *optimize = NULL;
1932 #ifdef RE_TRACK_PATTERN_OFFSETS
1935 U32 mjd_nodelen = 0;
1936 #endif /* RE_TRACK_PATTERN_OFFSETS */
1937 #endif /* DEBUGGING */
1939 This means we convert either the first branch or the first Exact,
1940 depending on whether the thing following (in 'last') is a branch
1941 or not and whther first is the startbranch (ie is it a sub part of
1942 the alternation or is it the whole thing.)
1943 Assuming its a sub part we conver the EXACT otherwise we convert
1944 the whole branch sequence, including the first.
1946 /* Find the node we are going to overwrite */
1947 if ( first != startbranch || OP( last ) == BRANCH ) {
1948 /* branch sub-chain */
1949 NEXT_OFF( first ) = (U16)(last - first);
1950 #ifdef RE_TRACK_PATTERN_OFFSETS
1952 mjd_offset= Node_Offset((convert));
1953 mjd_nodelen= Node_Length((convert));
1956 /* whole branch chain */
1958 #ifdef RE_TRACK_PATTERN_OFFSETS
1961 const regnode *nop = NEXTOPER( convert );
1962 mjd_offset= Node_Offset((nop));
1963 mjd_nodelen= Node_Length((nop));
1967 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1968 (int)depth * 2 + 2, "",
1969 (UV)mjd_offset, (UV)mjd_nodelen)
1972 /* But first we check to see if there is a common prefix we can
1973 split out as an EXACT and put in front of the TRIE node. */
1974 trie->startstate= 1;
1975 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1977 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1981 const U32 base = trie->states[ state ].trans.base;
1983 if ( trie->states[state].wordnum )
1986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1987 if ( ( base + ofs >= trie->uniquecharcount ) &&
1988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1991 if ( ++count > 1 ) {
1992 SV **tmp = av_fetch( revcharmap, ofs, 0);
1993 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1994 if ( state == 1 ) break;
1996 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1998 PerlIO_printf(Perl_debug_log,
1999 "%*sNew Start State=%"UVuf" Class: [",
2000 (int)depth * 2 + 2, "",
2003 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2004 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2006 TRIE_BITMAP_SET(trie,*ch);
2008 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2010 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2014 TRIE_BITMAP_SET(trie,*ch);
2016 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2023 SV **tmp = av_fetch( revcharmap, idx, 0);
2025 char *ch = SvPV( *tmp, len );
2027 SV *sv=sv_newmortal();
2028 PerlIO_printf( Perl_debug_log,
2029 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2030 (int)depth * 2 + 2, "",
2032 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2033 PL_colors[0], PL_colors[1],
2034 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2035 PERL_PV_ESCAPE_FIRSTCHAR
2040 OP( convert ) = nodetype;
2041 str=STRING(convert);
2044 STR_LEN(convert) += len;
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2056 regnode *n = convert+NODE_SZ_STR(convert);
2057 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2058 trie->startstate = state;
2059 trie->minlen -= (state - 1);
2060 trie->maxlen -= (state - 1);
2062 /* At least the UNICOS C compiler choked on this
2063 * being argument to DEBUG_r(), so let's just have
2066 #ifdef PERL_EXT_RE_BUILD
2072 regnode *fix = convert;
2073 U32 word = trie->wordcount;
2075 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2076 while( ++fix < n ) {
2077 Set_Node_Offset_Length(fix, 0, 0);
2080 SV ** const tmp = av_fetch( trie_words, word, 0 );
2082 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2083 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2093 NEXT_OFF(convert) = (U16)(tail - convert);
2094 DEBUG_r(optimize= n);
2100 if ( trie->maxlen ) {
2101 NEXT_OFF( convert ) = (U16)(tail - convert);
2102 ARG_SET( convert, data_slot );
2103 /* Store the offset to the first unabsorbed branch in
2104 jump[0], which is otherwise unused by the jump logic.
2105 We use this when dumping a trie and during optimisation. */
2107 trie->jump[0] = (U16)(nextbranch - convert);
2110 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2111 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2113 OP( convert ) = TRIEC;
2114 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2115 PerlMemShared_free(trie->bitmap);
2118 OP( convert ) = TRIE;
2120 /* store the type in the flags */
2121 convert->flags = nodetype;
2125 + regarglen[ OP( convert ) ];
2127 /* XXX We really should free up the resource in trie now,
2128 as we won't use them - (which resources?) dmq */
2130 /* needed for dumping*/
2131 DEBUG_r(if (optimize) {
2132 regnode *opt = convert;
2134 while ( ++opt < optimize) {
2135 Set_Node_Offset_Length(opt,0,0);
2138 Try to clean up some of the debris left after the
2141 while( optimize < jumper ) {
2142 mjd_nodelen += Node_Length((optimize));
2143 OP( optimize ) = OPTIMIZED;
2144 Set_Node_Offset_Length(optimize,0,0);
2147 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2149 } /* end node insert */
2150 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2151 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2153 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2154 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2156 SvREFCNT_dec(revcharmap);
2160 : trie->startstate>1
2166 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2168 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2170 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2171 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2174 We find the fail state for each state in the trie, this state is the longest proper
2175 suffix of the current states 'word' that is also a proper prefix of another word in our
2176 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2177 the DFA not to have to restart after its tried and failed a word at a given point, it
2178 simply continues as though it had been matching the other word in the first place.
2180 'abcdgu'=~/abcdefg|cdgu/
2181 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2182 fail, which would bring use to the state representing 'd' in the second word where we would
2183 try 'g' and succeed, prodceding to match 'cdgu'.
2185 /* add a fail transition */
2186 const U32 trie_offset = ARG(source);
2187 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2189 const U32 ucharcount = trie->uniquecharcount;
2190 const U32 numstates = trie->statecount;
2191 const U32 ubound = trie->lasttrans + ucharcount;
2195 U32 base = trie->states[ 1 ].trans.base;
2198 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2199 GET_RE_DEBUG_FLAGS_DECL;
2201 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2203 PERL_UNUSED_ARG(depth);
2207 ARG_SET( stclass, data_slot );
2208 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2209 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2210 aho->trie=trie_offset;
2211 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2212 Copy( trie->states, aho->states, numstates, reg_trie_state );
2213 Newxz( q, numstates, U32);
2214 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2217 /* initialize fail[0..1] to be 1 so that we always have
2218 a valid final fail state */
2219 fail[ 0 ] = fail[ 1 ] = 1;
2221 for ( charid = 0; charid < ucharcount ; charid++ ) {
2222 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2224 q[ q_write ] = newstate;
2225 /* set to point at the root */
2226 fail[ q[ q_write++ ] ]=1;
2229 while ( q_read < q_write) {
2230 const U32 cur = q[ q_read++ % numstates ];
2231 base = trie->states[ cur ].trans.base;
2233 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2234 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2236 U32 fail_state = cur;
2239 fail_state = fail[ fail_state ];
2240 fail_base = aho->states[ fail_state ].trans.base;
2241 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2243 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2244 fail[ ch_state ] = fail_state;
2245 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2247 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2249 q[ q_write++ % numstates] = ch_state;
2253 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2254 when we fail in state 1, this allows us to use the
2255 charclass scan to find a valid start char. This is based on the principle
2256 that theres a good chance the string being searched contains lots of stuff
2257 that cant be a start char.
2259 fail[ 0 ] = fail[ 1 ] = 0;
2260 DEBUG_TRIE_COMPILE_r({
2261 PerlIO_printf(Perl_debug_log,
2262 "%*sStclass Failtable (%"UVuf" states): 0",
2263 (int)(depth * 2), "", (UV)numstates
2265 for( q_read=1; q_read<numstates; q_read++ ) {
2266 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2268 PerlIO_printf(Perl_debug_log, "\n");
2271 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2276 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2277 * These need to be revisited when a newer toolchain becomes available.
2279 #if defined(__sparc64__) && defined(__GNUC__)
2280 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2281 # undef SPARC64_GCC_WORKAROUND
2282 # define SPARC64_GCC_WORKAROUND 1
2286 #define DEBUG_PEEP(str,scan,depth) \
2287 DEBUG_OPTIMISE_r({if (scan){ \
2288 SV * const mysv=sv_newmortal(); \
2289 regnode *Next = regnext(scan); \
2290 regprop(RExC_rx, mysv, scan); \
2291 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2292 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2293 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2300 #define JOIN_EXACT(scan,min,flags) \
2301 if (PL_regkind[OP(scan)] == EXACT) \
2302 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2305 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2306 /* Merge several consecutive EXACTish nodes into one. */
2307 regnode *n = regnext(scan);
2309 regnode *next = scan + NODE_SZ_STR(scan);
2313 regnode *stop = scan;
2314 GET_RE_DEBUG_FLAGS_DECL;
2316 PERL_UNUSED_ARG(depth);
2319 PERL_ARGS_ASSERT_JOIN_EXACT;
2320 #ifndef EXPERIMENTAL_INPLACESCAN
2321 PERL_UNUSED_ARG(flags);
2322 PERL_UNUSED_ARG(val);
2324 DEBUG_PEEP("join",scan,depth);
2326 /* Skip NOTHING, merge EXACT*. */
2328 ( PL_regkind[OP(n)] == NOTHING ||
2329 (stringok && (OP(n) == OP(scan))))
2331 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2333 if (OP(n) == TAIL || n > next)
2335 if (PL_regkind[OP(n)] == NOTHING) {
2336 DEBUG_PEEP("skip:",n,depth);
2337 NEXT_OFF(scan) += NEXT_OFF(n);
2338 next = n + NODE_STEP_REGNODE;
2345 else if (stringok) {
2346 const unsigned int oldl = STR_LEN(scan);
2347 regnode * const nnext = regnext(n);
2349 DEBUG_PEEP("merg",n,depth);
2352 if (oldl + STR_LEN(n) > U8_MAX)
2354 NEXT_OFF(scan) += NEXT_OFF(n);
2355 STR_LEN(scan) += STR_LEN(n);
2356 next = n + NODE_SZ_STR(n);
2357 /* Now we can overwrite *n : */
2358 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2366 #ifdef EXPERIMENTAL_INPLACESCAN
2367 if (flags && !NEXT_OFF(n)) {
2368 DEBUG_PEEP("atch", val, depth);
2369 if (reg_off_by_arg[OP(n)]) {
2370 ARG_SET(n, val - n);
2373 NEXT_OFF(n) = val - n;
2380 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2382 Two problematic code points in Unicode casefolding of EXACT nodes:
2384 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2385 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2391 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2392 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2394 This means that in case-insensitive matching (or "loose matching",
2395 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2396 length of the above casefolded versions) can match a target string
2397 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2398 This would rather mess up the minimum length computation.
2400 What we'll do is to look for the tail four bytes, and then peek
2401 at the preceding two bytes to see whether we need to decrease
2402 the minimum length by four (six minus two).
2404 Thanks to the design of UTF-8, there cannot be false matches:
2405 A sequence of valid UTF-8 bytes cannot be a subsequence of
2406 another valid sequence of UTF-8 bytes.
2409 char * const s0 = STRING(scan), *s, *t;
2410 char * const s1 = s0 + STR_LEN(scan) - 1;
2411 char * const s2 = s1 - 4;
2412 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2413 const char t0[] = "\xaf\x49\xaf\x42";
2415 const char t0[] = "\xcc\x88\xcc\x81";
2417 const char * const t1 = t0 + 3;
2420 s < s2 && (t = ninstr(s, s1, t0, t1));
2423 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2424 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2426 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2427 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2435 n = scan + NODE_SZ_STR(scan);
2437 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2444 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2448 /* REx optimizer. Converts nodes into quickier variants "in place".
2449 Finds fixed substrings. */
2451 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2452 to the position after last scanned or to NULL. */
2454 #define INIT_AND_WITHP \
2455 assert(!and_withp); \
2456 Newx(and_withp,1,struct regnode_charclass_class); \
2457 SAVEFREEPV(and_withp)
2459 /* this is a chain of data about sub patterns we are processing that
2460 need to be handled seperately/specially in study_chunk. Its so
2461 we can simulate recursion without losing state. */
2463 typedef struct scan_frame {
2464 regnode *last; /* last node to process in this frame */
2465 regnode *next; /* next node to process when last is reached */
2466 struct scan_frame *prev; /*previous frame*/
2467 I32 stop; /* what stopparen do we use */
2471 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2473 #define CASE_SYNST_FNC(nAmE) \
2475 if (flags & SCF_DO_STCLASS_AND) { \
2476 for (value = 0; value < 256; value++) \
2477 if (!is_ ## nAmE ## _cp(value)) \
2478 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2481 for (value = 0; value < 256; value++) \
2482 if (is_ ## nAmE ## _cp(value)) \
2483 ANYOF_BITMAP_SET(data->start_class, value); \
2487 if (flags & SCF_DO_STCLASS_AND) { \
2488 for (value = 0; value < 256; value++) \
2489 if (is_ ## nAmE ## _cp(value)) \
2490 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2493 for (value = 0; value < 256; value++) \
2494 if (!is_ ## nAmE ## _cp(value)) \
2495 ANYOF_BITMAP_SET(data->start_class, value); \
2502 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2503 I32 *minlenp, I32 *deltap,
2508 struct regnode_charclass_class *and_withp,
2509 U32 flags, U32 depth)
2510 /* scanp: Start here (read-write). */
2511 /* deltap: Write maxlen-minlen here. */
2512 /* last: Stop before this one. */
2513 /* data: string data about the pattern */
2514 /* stopparen: treat close N as END */
2515 /* recursed: which subroutines have we recursed into */
2516 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2519 I32 min = 0, pars = 0, code;
2520 regnode *scan = *scanp, *next;
2522 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2523 int is_inf_internal = 0; /* The studied chunk is infinite */
2524 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2525 scan_data_t data_fake;
2526 SV *re_trie_maxbuff = NULL;
2527 regnode *first_non_open = scan;
2528 I32 stopmin = I32_MAX;
2529 scan_frame *frame = NULL;
2530 GET_RE_DEBUG_FLAGS_DECL;
2532 PERL_ARGS_ASSERT_STUDY_CHUNK;
2535 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2539 while (first_non_open && OP(first_non_open) == OPEN)
2540 first_non_open=regnext(first_non_open);
2545 while ( scan && OP(scan) != END && scan < last ){
2546 /* Peephole optimizer: */
2547 DEBUG_STUDYDATA("Peep:", data,depth);
2548 DEBUG_PEEP("Peep",scan,depth);
2549 JOIN_EXACT(scan,&min,0);
2551 /* Follow the next-chain of the current node and optimize
2552 away all the NOTHINGs from it. */
2553 if (OP(scan) != CURLYX) {
2554 const int max = (reg_off_by_arg[OP(scan)]
2556 /* I32 may be smaller than U16 on CRAYs! */
2557 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2558 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2562 /* Skip NOTHING and LONGJMP. */
2563 while ((n = regnext(n))
2564 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2565 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2566 && off + noff < max)
2568 if (reg_off_by_arg[OP(scan)])
2571 NEXT_OFF(scan) = off;
2576 /* The principal pseudo-switch. Cannot be a switch, since we
2577 look into several different things. */
2578 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2579 || OP(scan) == IFTHEN) {
2580 next = regnext(scan);
2582 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2584 if (OP(next) == code || code == IFTHEN) {
2585 /* NOTE - There is similar code to this block below for handling
2586 TRIE nodes on a re-study. If you change stuff here check there
2588 I32 max1 = 0, min1 = I32_MAX, num = 0;
2589 struct regnode_charclass_class accum;
2590 regnode * const startbranch=scan;
2592 if (flags & SCF_DO_SUBSTR)
2593 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2594 if (flags & SCF_DO_STCLASS)
2595 cl_init_zero(pRExC_state, &accum);
2597 while (OP(scan) == code) {
2598 I32 deltanext, minnext, f = 0, fake;
2599 struct regnode_charclass_class this_class;
2602 data_fake.flags = 0;
2604 data_fake.whilem_c = data->whilem_c;
2605 data_fake.last_closep = data->last_closep;
2608 data_fake.last_closep = &fake;
2610 data_fake.pos_delta = delta;
2611 next = regnext(scan);
2612 scan = NEXTOPER(scan);
2614 scan = NEXTOPER(scan);
2615 if (flags & SCF_DO_STCLASS) {
2616 cl_init(pRExC_state, &this_class);
2617 data_fake.start_class = &this_class;
2618 f = SCF_DO_STCLASS_AND;
2620 if (flags & SCF_WHILEM_VISITED_POS)
2621 f |= SCF_WHILEM_VISITED_POS;
2623 /* we suppose the run is continuous, last=next...*/
2624 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2626 stopparen, recursed, NULL, f,depth+1);
2629 if (max1 < minnext + deltanext)
2630 max1 = minnext + deltanext;
2631 if (deltanext == I32_MAX)
2632 is_inf = is_inf_internal = 1;
2634 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2636 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2637 if ( stopmin > minnext)
2638 stopmin = min + min1;
2639 flags &= ~SCF_DO_SUBSTR;
2641 data->flags |= SCF_SEEN_ACCEPT;
2644 if (data_fake.flags & SF_HAS_EVAL)
2645 data->flags |= SF_HAS_EVAL;
2646 data->whilem_c = data_fake.whilem_c;
2648 if (flags & SCF_DO_STCLASS)
2649 cl_or(pRExC_state, &accum, &this_class);
2651 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2653 if (flags & SCF_DO_SUBSTR) {
2654 data->pos_min += min1;
2655 data->pos_delta += max1 - min1;
2656 if (max1 != min1 || is_inf)
2657 data->longest = &(data->longest_float);
2660 delta += max1 - min1;
2661 if (flags & SCF_DO_STCLASS_OR) {
2662 cl_or(pRExC_state, data->start_class, &accum);
2664 cl_and(data->start_class, and_withp);
2665 flags &= ~SCF_DO_STCLASS;
2668 else if (flags & SCF_DO_STCLASS_AND) {
2670 cl_and(data->start_class, &accum);
2671 flags &= ~SCF_DO_STCLASS;
2674 /* Switch to OR mode: cache the old value of
2675 * data->start_class */
2677 StructCopy(data->start_class, and_withp,
2678 struct regnode_charclass_class);
2679 flags &= ~SCF_DO_STCLASS_AND;
2680 StructCopy(&accum, data->start_class,
2681 struct regnode_charclass_class);
2682 flags |= SCF_DO_STCLASS_OR;
2683 data->start_class->flags |= ANYOF_EOS;
2687 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2690 Assuming this was/is a branch we are dealing with: 'scan' now
2691 points at the item that follows the branch sequence, whatever
2692 it is. We now start at the beginning of the sequence and look
2699 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2701 If we can find such a subseqence we need to turn the first
2702 element into a trie and then add the subsequent branch exact
2703 strings to the trie.
2707 1. patterns where the whole set of branch can be converted.
2709 2. patterns where only a subset can be converted.
2711 In case 1 we can replace the whole set with a single regop
2712 for the trie. In case 2 we need to keep the start and end
2715 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2716 becomes BRANCH TRIE; BRANCH X;
2718 There is an additional case, that being where there is a
2719 common prefix, which gets split out into an EXACT like node
2720 preceding the TRIE node.
2722 If x(1..n)==tail then we can do a simple trie, if not we make
2723 a "jump" trie, such that when we match the appropriate word
2724 we "jump" to the appopriate tail node. Essentailly we turn
2725 a nested if into a case structure of sorts.
2730 if (!re_trie_maxbuff) {
2731 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2732 if (!SvIOK(re_trie_maxbuff))
2733 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2735 if ( SvIV(re_trie_maxbuff)>=0 ) {
2737 regnode *first = (regnode *)NULL;
2738 regnode *last = (regnode *)NULL;
2739 regnode *tail = scan;
2744 SV * const mysv = sv_newmortal(); /* for dumping */
2746 /* var tail is used because there may be a TAIL
2747 regop in the way. Ie, the exacts will point to the
2748 thing following the TAIL, but the last branch will
2749 point at the TAIL. So we advance tail. If we
2750 have nested (?:) we may have to move through several
2754 while ( OP( tail ) == TAIL ) {
2755 /* this is the TAIL generated by (?:) */
2756 tail = regnext( tail );
2761 regprop(RExC_rx, mysv, tail );
2762 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2763 (int)depth * 2 + 2, "",
2764 "Looking for TRIE'able sequences. Tail node is: ",
2765 SvPV_nolen_const( mysv )
2771 step through the branches, cur represents each
2772 branch, noper is the first thing to be matched
2773 as part of that branch and noper_next is the
2774 regnext() of that node. if noper is an EXACT
2775 and noper_next is the same as scan (our current
2776 position in the regex) then the EXACT branch is
2777 a possible optimization target. Once we have
2778 two or more consequetive such branches we can
2779 create a trie of the EXACT's contents and stich
2780 it in place. If the sequence represents all of
2781 the branches we eliminate the whole thing and
2782 replace it with a single TRIE. If it is a
2783 subsequence then we need to stitch it in. This
2784 means the first branch has to remain, and needs
2785 to be repointed at the item on the branch chain
2786 following the last branch optimized. This could
2787 be either a BRANCH, in which case the
2788 subsequence is internal, or it could be the
2789 item following the branch sequence in which
2790 case the subsequence is at the end.
2794 /* dont use tail as the end marker for this traverse */
2795 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2796 regnode * const noper = NEXTOPER( cur );
2797 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2798 regnode * const noper_next = regnext( noper );
2802 regprop(RExC_rx, mysv, cur);
2803 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2804 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2806 regprop(RExC_rx, mysv, noper);
2807 PerlIO_printf( Perl_debug_log, " -> %s",
2808 SvPV_nolen_const(mysv));
2811 regprop(RExC_rx, mysv, noper_next );
2812 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2813 SvPV_nolen_const(mysv));
2815 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2816 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2818 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2819 : PL_regkind[ OP( noper ) ] == EXACT )
2820 || OP(noper) == NOTHING )
2822 && noper_next == tail
2827 if ( !first || optype == NOTHING ) {
2828 if (!first) first = cur;
2829 optype = OP( noper );
2835 Currently we do not believe that the trie logic can
2836 handle case insensitive matching properly when the
2837 pattern is not unicode (thus forcing unicode semantics).
2839 If/when this is fixed the following define can be swapped
2840 in below to fully enable trie logic.
2842 #define TRIE_TYPE_IS_SAFE 1
2845 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2847 if ( last && TRIE_TYPE_IS_SAFE ) {
2848 make_trie( pRExC_state,
2849 startbranch, first, cur, tail, count,
2852 if ( PL_regkind[ OP( noper ) ] == EXACT
2854 && noper_next == tail
2859 optype = OP( noper );
2869 regprop(RExC_rx, mysv, cur);
2870 PerlIO_printf( Perl_debug_log,
2871 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2872 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2876 if ( last && TRIE_TYPE_IS_SAFE ) {
2877 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2878 #ifdef TRIE_STUDY_OPT
2879 if ( ((made == MADE_EXACT_TRIE &&
2880 startbranch == first)
2881 || ( first_non_open == first )) &&
2883 flags |= SCF_TRIE_RESTUDY;
2884 if ( startbranch == first
2887 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2897 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2898 scan = NEXTOPER(NEXTOPER(scan));
2899 } else /* single branch is optimized. */
2900 scan = NEXTOPER(scan);
2902 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2903 scan_frame *newframe = NULL;
2908 if (OP(scan) != SUSPEND) {
2909 /* set the pointer */
2910 if (OP(scan) == GOSUB) {
2912 RExC_recurse[ARG2L(scan)] = scan;
2913 start = RExC_open_parens[paren-1];
2914 end = RExC_close_parens[paren-1];
2917 start = RExC_rxi->program + 1;
2921 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2922 SAVEFREEPV(recursed);
2924 if (!PAREN_TEST(recursed,paren+1)) {
2925 PAREN_SET(recursed,paren+1);
2926 Newx(newframe,1,scan_frame);
2928 if (flags & SCF_DO_SUBSTR) {
2929 SCAN_COMMIT(pRExC_state,data,minlenp);
2930 data->longest = &(data->longest_float);
2932 is_inf = is_inf_internal = 1;
2933 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2934 cl_anything(pRExC_state, data->start_class);
2935 flags &= ~SCF_DO_STCLASS;
2938 Newx(newframe,1,scan_frame);
2941 end = regnext(scan);
2946 SAVEFREEPV(newframe);
2947 newframe->next = regnext(scan);
2948 newframe->last = last;
2949 newframe->stop = stopparen;
2950 newframe->prev = frame;
2960 else if (OP(scan) == EXACT) {
2961 I32 l = STR_LEN(scan);
2964 const U8 * const s = (U8*)STRING(scan);
2965 l = utf8_length(s, s + l);
2966 uc = utf8_to_uvchr(s, NULL);
2968 uc = *((U8*)STRING(scan));
2971 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2972 /* The code below prefers earlier match for fixed
2973 offset, later match for variable offset. */
2974 if (data->last_end == -1) { /* Update the start info. */
2975 data->last_start_min = data->pos_min;
2976 data->last_start_max = is_inf
2977 ? I32_MAX : data->pos_min + data->pos_delta;
2979 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2981 SvUTF8_on(data->last_found);
2983 SV * const sv = data->last_found;
2984 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2985 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2986 if (mg && mg->mg_len >= 0)
2987 mg->mg_len += utf8_length((U8*)STRING(scan),
2988 (U8*)STRING(scan)+STR_LEN(scan));
2990 data->last_end = data->pos_min + l;
2991 data->pos_min += l; /* As in the first entry. */
2992 data->flags &= ~SF_BEFORE_EOL;
2994 if (flags & SCF_DO_STCLASS_AND) {
2995 /* Check whether it is compatible with what we know already! */
2999 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3000 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3001 && (!(data->start_class->flags & ANYOF_FOLD)
3002 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3005 ANYOF_CLASS_ZERO(data->start_class);
3006 ANYOF_BITMAP_ZERO(data->start_class);
3008 ANYOF_BITMAP_SET(data->start_class, uc);
3009 data->start_class->flags &= ~ANYOF_EOS;
3011 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3013 else if (flags & SCF_DO_STCLASS_OR) {
3014 /* false positive possible if the class is case-folded */
3016 ANYOF_BITMAP_SET(data->start_class, uc);
3018 data->start_class->flags |= ANYOF_UNICODE_ALL;
3019 data->start_class->flags &= ~ANYOF_EOS;
3020 cl_and(data->start_class, and_withp);
3022 flags &= ~SCF_DO_STCLASS;
3024 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3025 I32 l = STR_LEN(scan);
3026 UV uc = *((U8*)STRING(scan));
3028 /* Search for fixed substrings supports EXACT only. */
3029 if (flags & SCF_DO_SUBSTR) {
3031 SCAN_COMMIT(pRExC_state, data, minlenp);
3034 const U8 * const s = (U8 *)STRING(scan);
3035 l = utf8_length(s, s + l);
3036 uc = utf8_to_uvchr(s, NULL);
3039 if (flags & SCF_DO_SUBSTR)
3041 if (flags & SCF_DO_STCLASS_AND) {
3042 /* Check whether it is compatible with what we know already! */
3046 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3047 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3048 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3050 ANYOF_CLASS_ZERO(data->start_class);
3051 ANYOF_BITMAP_ZERO(data->start_class);
3053 ANYOF_BITMAP_SET(data->start_class, uc);
3054 data->start_class->flags &= ~ANYOF_EOS;
3055 data->start_class->flags |= ANYOF_FOLD;
3056 if (OP(scan) == EXACTFL)
3057 data->start_class->flags |= ANYOF_LOCALE;
3060 else if (flags & SCF_DO_STCLASS_OR) {
3061 if (data->start_class->flags & ANYOF_FOLD) {
3062 /* false positive possible if the class is case-folded.
3063 Assume that the locale settings are the same... */
3065 ANYOF_BITMAP_SET(data->start_class, uc);
3066 data->start_class->flags &= ~ANYOF_EOS;
3068 cl_and(data->start_class, and_withp);
3070 flags &= ~SCF_DO_STCLASS;
3072 else if (strchr((const char*)PL_varies,OP(scan))) {
3073 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3074 I32 f = flags, pos_before = 0;
3075 regnode * const oscan = scan;
3076 struct regnode_charclass_class this_class;
3077 struct regnode_charclass_class *oclass = NULL;
3078 I32 next_is_eval = 0;
3080 switch (PL_regkind[OP(scan)]) {
3081 case WHILEM: /* End of (?:...)* . */
3082 scan = NEXTOPER(scan);
3085 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3086 next = NEXTOPER(scan);
3087 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3089 maxcount = REG_INFTY;
3090 next = regnext(scan);
3091 scan = NEXTOPER(scan);
3095 if (flags & SCF_DO_SUBSTR)
3100 if (flags & SCF_DO_STCLASS) {
3102 maxcount = REG_INFTY;
3103 next = regnext(scan);
3104 scan = NEXTOPER(scan);
3107 is_inf = is_inf_internal = 1;
3108 scan = regnext(scan);
3109 if (flags & SCF_DO_SUBSTR) {
3110 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3111 data->longest = &(data->longest_float);
3113 goto optimize_curly_tail;
3115 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3116 && (scan->flags == stopparen))
3121 mincount = ARG1(scan);
3122 maxcount = ARG2(scan);
3124 next = regnext(scan);
3125 if (OP(scan) == CURLYX) {
3126 I32 lp = (data ? *(data->last_closep) : 0);
3127 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3129 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3130 next_is_eval = (OP(scan) == EVAL);
3132 if (flags & SCF_DO_SUBSTR) {
3133 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3134 pos_before = data->pos_min;
3138 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3140 data->flags |= SF_IS_INF;
3142 if (flags & SCF_DO_STCLASS) {
3143 cl_init(pRExC_state, &this_class);
3144 oclass = data->start_class;
3145 data->start_class = &this_class;
3146 f |= SCF_DO_STCLASS_AND;
3147 f &= ~SCF_DO_STCLASS_OR;
3149 /* These are the cases when once a subexpression
3150 fails at a particular position, it cannot succeed
3151 even after backtracking at the enclosing scope.
3153 XXXX what if minimal match and we are at the
3154 initial run of {n,m}? */
3155 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3156 f &= ~SCF_WHILEM_VISITED_POS;
3158 /* This will finish on WHILEM, setting scan, or on NULL: */
3159 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3160 last, data, stopparen, recursed, NULL,
3162 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3164 if (flags & SCF_DO_STCLASS)
3165 data->start_class = oclass;
3166 if (mincount == 0 || minnext == 0) {
3167 if (flags & SCF_DO_STCLASS_OR) {
3168 cl_or(pRExC_state, data->start_class, &this_class);
3170 else if (flags & SCF_DO_STCLASS_AND) {
3171 /* Switch to OR mode: cache the old value of
3172 * data->start_class */
3174 StructCopy(data->start_class, and_withp,
3175 struct regnode_charclass_class);
3176 flags &= ~SCF_DO_STCLASS_AND;
3177 StructCopy(&this_class, data->start_class,
3178 struct regnode_charclass_class);
3179 flags |= SCF_DO_STCLASS_OR;
3180 data->start_class->flags |= ANYOF_EOS;
3182 } else { /* Non-zero len */
3183 if (flags & SCF_DO_STCLASS_OR) {
3184 cl_or(pRExC_state, data->start_class, &this_class);
3185 cl_and(data->start_class, and_withp);
3187 else if (flags & SCF_DO_STCLASS_AND)
3188 cl_and(data->start_class, &this_class);
3189 flags &= ~SCF_DO_STCLASS;
3191 if (!scan) /* It was not CURLYX, but CURLY. */
3193 if ( /* ? quantifier ok, except for (?{ ... }) */
3194 (next_is_eval || !(mincount == 0 && maxcount == 1))
3195 && (minnext == 0) && (deltanext == 0)
3196 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3197 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3199 ckWARNreg(RExC_parse,
3200 "Quantifier unexpected on zero-length expression");
3203 min += minnext * mincount;
3204 is_inf_internal |= ((maxcount == REG_INFTY
3205 && (minnext + deltanext) > 0)
3206 || deltanext == I32_MAX);
3207 is_inf |= is_inf_internal;
3208 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3210 /* Try powerful optimization CURLYX => CURLYN. */
3211 if ( OP(oscan) == CURLYX && data
3212 && data->flags & SF_IN_PAR
3213 && !(data->flags & SF_HAS_EVAL)
3214 && !deltanext && minnext == 1 ) {
3215 /* Try to optimize to CURLYN. */
3216 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3217 regnode * const nxt1 = nxt;
3224 if (!strchr((const char*)PL_simple,OP(nxt))
3225 && !(PL_regkind[OP(nxt)] == EXACT
3226 && STR_LEN(nxt) == 1))
3232 if (OP(nxt) != CLOSE)
3234 if (RExC_open_parens) {
3235 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3236 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3238 /* Now we know that nxt2 is the only contents: */
3239 oscan->flags = (U8)ARG(nxt);
3241 OP(nxt1) = NOTHING; /* was OPEN. */
3244 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3245 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3246 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3247 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3248 OP(nxt + 1) = OPTIMIZED; /* was count. */
3249 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3254 /* Try optimization CURLYX => CURLYM. */
3255 if ( OP(oscan) == CURLYX && data
3256 && !(data->flags & SF_HAS_PAR)
3257 && !(data->flags & SF_HAS_EVAL)
3258 && !deltanext /* atom is fixed width */
3259 && minnext != 0 /* CURLYM can't handle zero width */
3261 /* XXXX How to optimize if data == 0? */
3262 /* Optimize to a simpler form. */
3263 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3267 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3268 && (OP(nxt2) != WHILEM))
3270 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3271 /* Need to optimize away parenths. */
3272 if (data->flags & SF_IN_PAR) {
3273 /* Set the parenth number. */
3274 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3276 if (OP(nxt) != CLOSE)
3277 FAIL("Panic opt close");
3278 oscan->flags = (U8)ARG(nxt);
3279 if (RExC_open_parens) {
3280 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3281 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3283 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3284 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3287 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3288 OP(nxt + 1) = OPTIMIZED; /* was count. */
3289 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3290 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3293 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3294 regnode *nnxt = regnext(nxt1);
3297 if (reg_off_by_arg[OP(nxt1)])
3298 ARG_SET(nxt1, nxt2 - nxt1);
3299 else if (nxt2 - nxt1 < U16_MAX)
3300 NEXT_OFF(nxt1) = nxt2 - nxt1;
3302 OP(nxt) = NOTHING; /* Cannot beautify */
3307 /* Optimize again: */
3308 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3309 NULL, stopparen, recursed, NULL, 0,depth+1);
3314 else if ((OP(oscan) == CURLYX)
3315 && (flags & SCF_WHILEM_VISITED_POS)
3316 /* See the comment on a similar expression above.
3317 However, this time it not a subexpression
3318 we care about, but the expression itself. */
3319 && (maxcount == REG_INFTY)
3320 && data && ++data->whilem_c < 16) {
3321 /* This stays as CURLYX, we can put the count/of pair. */
3322 /* Find WHILEM (as in regexec.c) */
3323 regnode *nxt = oscan + NEXT_OFF(oscan);
3325 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3327 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3328 | (RExC_whilem_seen << 4)); /* On WHILEM */
3330 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3332 if (flags & SCF_DO_SUBSTR) {
3333 SV *last_str = NULL;
3334 int counted = mincount != 0;
3336 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3337 #if defined(SPARC64_GCC_WORKAROUND)
3340 const char *s = NULL;
3343 if (pos_before >= data->last_start_min)
3346 b = data->last_start_min;
3349 s = SvPV_const(data->last_found, l);
3350 old = b - data->last_start_min;
3353 I32 b = pos_before >= data->last_start_min
3354 ? pos_before : data->last_start_min;
3356 const char * const s = SvPV_const(data->last_found, l);
3357 I32 old = b - data->last_start_min;
3361 old = utf8_hop((U8*)s, old) - (U8*)s;
3364 /* Get the added string: */
3365 last_str = newSVpvn_utf8(s + old, l, UTF);
3366 if (deltanext == 0 && pos_before == b) {
3367 /* What was added is a constant string */
3369 SvGROW(last_str, (mincount * l) + 1);
3370 repeatcpy(SvPVX(last_str) + l,
3371 SvPVX_const(last_str), l, mincount - 1);
3372 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3373 /* Add additional parts. */
3374 SvCUR_set(data->last_found,
3375 SvCUR(data->last_found) - l);
3376 sv_catsv(data->last_found, last_str);
3378 SV * sv = data->last_found;
3380 SvUTF8(sv) && SvMAGICAL(sv) ?
3381 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3382 if (mg && mg->mg_len >= 0)
3383 mg->mg_len += CHR_SVLEN(last_str) - l;
3385 data->last_end += l * (mincount - 1);
3388 /* start offset must point into the last copy */
3389 data->last_start_min += minnext * (mincount - 1);
3390 data->last_start_max += is_inf ? I32_MAX
3391 : (maxcount - 1) * (minnext + data->pos_delta);
3394 /* It is counted once already... */
3395 data->pos_min += minnext * (mincount - counted);
3396 data->pos_delta += - counted * deltanext +
3397 (minnext + deltanext) * maxcount - minnext * mincount;
3398 if (mincount != maxcount) {
3399 /* Cannot extend fixed substrings found inside
3401 SCAN_COMMIT(pRExC_state,data,minlenp);
3402 if (mincount && last_str) {
3403 SV * const sv = data->last_found;
3404 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3405 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3409 sv_setsv(sv, last_str);
3410 data->last_end = data->pos_min;
3411 data->last_start_min =
3412 data->pos_min - CHR_SVLEN(last_str);
3413 data->last_start_max = is_inf
3415 : data->pos_min + data->pos_delta
3416 - CHR_SVLEN(last_str);
3418 data->longest = &(data->longest_float);
3420 SvREFCNT_dec(last_str);
3422 if (data && (fl & SF_HAS_EVAL))
3423 data->flags |= SF_HAS_EVAL;
3424 optimize_curly_tail:
3425 if (OP(oscan) != CURLYX) {
3426 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3428 NEXT_OFF(oscan) += NEXT_OFF(next);
3431 default: /* REF and CLUMP only? */
3432 if (flags & SCF_DO_SUBSTR) {
3433 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3434 data->longest = &(data->longest_float);
3436 is_inf = is_inf_internal = 1;
3437 if (flags & SCF_DO_STCLASS_OR)
3438 cl_anything(pRExC_state, data->start_class);
3439 flags &= ~SCF_DO_STCLASS;
3443 else if (OP(scan) == LNBREAK) {
3444 if (flags & SCF_DO_STCLASS) {
3446 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3447 if (flags & SCF_DO_STCLASS_AND) {
3448 for (value = 0; value < 256; value++)
3449 if (!is_VERTWS_cp(value))
3450 ANYOF_BITMAP_CLEAR(data->start_class, value);
3453 for (value = 0; value < 256; value++)
3454 if (is_VERTWS_cp(value))
3455 ANYOF_BITMAP_SET(data->start_class, value);
3457 if (flags & SCF_DO_STCLASS_OR)
3458 cl_and(data->start_class, and_withp);
3459 flags &= ~SCF_DO_STCLASS;
3463 if (flags & SCF_DO_SUBSTR) {
3464 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3466 data->pos_delta += 1;
3467 data->longest = &(data->longest_float);
3471 else if (OP(scan) == FOLDCHAR) {
3472 int d = ARG(scan)==0xDF ? 1 : 2;
3473 flags &= ~SCF_DO_STCLASS;
3476 if (flags & SCF_DO_SUBSTR) {
3477 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3479 data->pos_delta += d;
3480 data->longest = &(data->longest_float);
3483 else if (strchr((const char*)PL_simple,OP(scan))) {
3486 if (flags & SCF_DO_SUBSTR) {
3487 SCAN_COMMIT(pRExC_state,data,minlenp);
3491 if (flags & SCF_DO_STCLASS) {
3492 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3494 /* Some of the logic below assumes that switching
3495 locale on will only add false positives. */
3496 switch (PL_regkind[OP(scan)]) {
3500 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3501 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3502 cl_anything(pRExC_state, data->start_class);
3505 if (OP(scan) == SANY)
3507 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3508 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3509 || (data->start_class->flags & ANYOF_CLASS));
3510 cl_anything(pRExC_state, data->start_class);
3512 if (flags & SCF_DO_STCLASS_AND || !value)
3513 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3516 if (flags & SCF_DO_STCLASS_AND)
3517 cl_and(data->start_class,
3518 (struct regnode_charclass_class*)scan);
3520 cl_or(pRExC_state, data->start_class,
3521 (struct regnode_charclass_class*)scan);
3524 if (flags & SCF_DO_STCLASS_AND) {
3525 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3527 for (value = 0; value < 256; value++)
3528 if (!isALNUM(value))
3529 ANYOF_BITMAP_CLEAR(data->start_class, value);
3533 if (data->start_class->flags & ANYOF_LOCALE)
3534 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3536 for (value = 0; value < 256; value++)
3538 ANYOF_BITMAP_SET(data->start_class, value);
3543 if (flags & SCF_DO_STCLASS_AND) {
3544 if (data->start_class->flags & ANYOF_LOCALE)
3545 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3548 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3549 data->start_class->flags |= ANYOF_LOCALE;
3553 if (flags & SCF_DO_STCLASS_AND) {
3554 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3555 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3556 for (value = 0; value < 256; value++)
3558 ANYOF_BITMAP_CLEAR(data->start_class, value);
3562 if (data->start_class->flags & ANYOF_LOCALE)
3563 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3565 for (value = 0; value < 256; value++)
3566 if (!isALNUM(value))
3567 ANYOF_BITMAP_SET(data->start_class, value);
3572 if (flags & SCF_DO_STCLASS_AND) {
3573 if (data->start_class->flags & ANYOF_LOCALE)
3574 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3577 data->start_class->flags |= ANYOF_LOCALE;
3578 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3582 if (flags & SCF_DO_STCLASS_AND) {
3583 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3584 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3585 for (value = 0; value < 256; value++)
3586 if (!isSPACE(value))
3587 ANYOF_BITMAP_CLEAR(data->start_class, value);
3591 if (data->start_class->flags & ANYOF_LOCALE)
3592 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3594 for (value = 0; value < 256; value++)
3596 ANYOF_BITMAP_SET(data->start_class, value);
3601 if (flags & SCF_DO_STCLASS_AND) {
3602 if (data->start_class->flags & ANYOF_LOCALE)
3603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3606 data->start_class->flags |= ANYOF_LOCALE;
3607 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3611 if (flags & SCF_DO_STCLASS_AND) {
3612 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3613 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3614 for (value = 0; value < 256; value++)
3616 ANYOF_BITMAP_CLEAR(data->start_class, value);
3620 if (data->start_class->flags & ANYOF_LOCALE)
3621 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3623 for (value = 0; value < 256; value++)
3624 if (!isSPACE(value))
3625 ANYOF_BITMAP_SET(data->start_class, value);
3630 if (flags & SCF_DO_STCLASS_AND) {
3631 if (data->start_class->flags & ANYOF_LOCALE) {
3632 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3633 for (value = 0; value < 256; value++)
3634 if (!isSPACE(value))
3635 ANYOF_BITMAP_CLEAR(data->start_class, value);
3639 data->start_class->flags |= ANYOF_LOCALE;
3640 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3644 if (flags & SCF_DO_STCLASS_AND) {
3645 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3646 for (value = 0; value < 256; value++)
3647 if (!isDIGIT(value))
3648 ANYOF_BITMAP_CLEAR(data->start_class, value);
3651 if (data->start_class->flags & ANYOF_LOCALE)
3652 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3654 for (value = 0; value < 256; value++)
3656 ANYOF_BITMAP_SET(data->start_class, value);
3661 if (flags & SCF_DO_STCLASS_AND) {
3662 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3663 for (value = 0; value < 256; value++)
3665 ANYOF_BITMAP_CLEAR(data->start_class, value);
3668 if (data->start_class->flags & ANYOF_LOCALE)
3669 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3671 for (value = 0; value < 256; value++)
3672 if (!isDIGIT(value))
3673 ANYOF_BITMAP_SET(data->start_class, value);
3677 CASE_SYNST_FNC(VERTWS);
3678 CASE_SYNST_FNC(HORIZWS);
3681 if (flags & SCF_DO_STCLASS_OR)
3682 cl_and(data->start_class, and_withp);
3683 flags &= ~SCF_DO_STCLASS;
3686 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3687 data->flags |= (OP(scan) == MEOL
3691 else if ( PL_regkind[OP(scan)] == BRANCHJ
3692 /* Lookbehind, or need to calculate parens/evals/stclass: */
3693 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3694 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3695 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3696 || OP(scan) == UNLESSM )
3698 /* Negative Lookahead/lookbehind
3699 In this case we can't do fixed string optimisation.
3702 I32 deltanext, minnext, fake = 0;
3704 struct regnode_charclass_class intrnl;
3707 data_fake.flags = 0;
3709 data_fake.whilem_c = data->whilem_c;
3710 data_fake.last_closep = data->last_closep;
3713 data_fake.last_closep = &fake;
3714 data_fake.pos_delta = delta;
3715 if ( flags & SCF_DO_STCLASS && !scan->flags
3716 && OP(scan) == IFMATCH ) { /* Lookahead */
3717 cl_init(pRExC_state, &intrnl);
3718 data_fake.start_class = &intrnl;
3719 f |= SCF_DO_STCLASS_AND;
3721 if (flags & SCF_WHILEM_VISITED_POS)
3722 f |= SCF_WHILEM_VISITED_POS;
3723 next = regnext(scan);
3724 nscan = NEXTOPER(NEXTOPER(scan));
3725 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3726 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3729 FAIL("Variable length lookbehind not implemented");
3731 else if (minnext > (I32)U8_MAX) {
3732 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3734 scan->flags = (U8)minnext;
3737 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3739 if (data_fake.flags & SF_HAS_EVAL)
3740 data->flags |= SF_HAS_EVAL;
3741 data->whilem_c = data_fake.whilem_c;
3743 if (f & SCF_DO_STCLASS_AND) {
3744 if (flags & SCF_DO_STCLASS_OR) {
3745 /* OR before, AND after: ideally we would recurse with
3746 * data_fake to get the AND applied by study of the
3747 * remainder of the pattern, and then derecurse;
3748 * *** HACK *** for now just treat as "no information".
3749 * See [perl #56690].
3751 cl_init(pRExC_state, data->start_class);
3753 /* AND before and after: combine and continue */
3754 const int was = (data->start_class->flags & ANYOF_EOS);
3756 cl_and(data->start_class, &intrnl);
3758 data->start_class->flags |= ANYOF_EOS;
3762 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3764 /* Positive Lookahead/lookbehind
3765 In this case we can do fixed string optimisation,
3766 but we must be careful about it. Note in the case of
3767 lookbehind the positions will be offset by the minimum
3768 length of the pattern, something we won't know about
3769 until after the recurse.
3771 I32 deltanext, fake = 0;
3773 struct regnode_charclass_class intrnl;
3775 /* We use SAVEFREEPV so that when the full compile
3776 is finished perl will clean up the allocated
3777 minlens when its all done. This was we don't
3778 have to worry about freeing them when we know
3779 they wont be used, which would be a pain.
3782 Newx( minnextp, 1, I32 );
3783 SAVEFREEPV(minnextp);
3786 StructCopy(data, &data_fake, scan_data_t);
3787 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3790 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3791 data_fake.last_found=newSVsv(data->last_found);
3795 data_fake.last_closep = &fake;
3796 data_fake.flags = 0;
3797 data_fake.pos_delta = delta;
3799 data_fake.flags |= SF_IS_INF;
3800 if ( flags & SCF_DO_STCLASS && !scan->flags
3801 && OP(scan) == IFMATCH ) { /* Lookahead */
3802 cl_init(pRExC_state, &intrnl);
3803 data_fake.start_class = &intrnl;
3804 f |= SCF_DO_STCLASS_AND;
3806 if (flags & SCF_WHILEM_VISITED_POS)
3807 f |= SCF_WHILEM_VISITED_POS;
3808 next = regnext(scan);
3809 nscan = NEXTOPER(NEXTOPER(scan));
3811 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3812 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3815 FAIL("Variable length lookbehind not implemented");
3817 else if (*minnextp > (I32)U8_MAX) {
3818 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3820 scan->flags = (U8)*minnextp;
3825 if (f & SCF_DO_STCLASS_AND) {
3826 const int was = (data->start_class->flags & ANYOF_EOS);
3828 cl_and(data->start_class, &intrnl);
3830 data->start_class->flags |= ANYOF_EOS;
3833 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3835 if (data_fake.flags & SF_HAS_EVAL)
3836 data->flags |= SF_HAS_EVAL;
3837 data->whilem_c = data_fake.whilem_c;
3838 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3839 if (RExC_rx->minlen<*minnextp)
3840 RExC_rx->minlen=*minnextp;
3841 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3842 SvREFCNT_dec(data_fake.last_found);
3844 if ( data_fake.minlen_fixed != minlenp )
3846 data->offset_fixed= data_fake.offset_fixed;
3847 data->minlen_fixed= data_fake.minlen_fixed;
3848 data->lookbehind_fixed+= scan->flags;
3850 if ( data_fake.minlen_float != minlenp )
3852 data->minlen_float= data_fake.minlen_float;
3853 data->offset_float_min=data_fake.offset_float_min;
3854 data->offset_float_max=data_fake.offset_float_max;
3855 data->lookbehind_float+= scan->flags;
3864 else if (OP(scan) == OPEN) {
3865 if (stopparen != (I32)ARG(scan))
3868 else if (OP(scan) == CLOSE) {
3869 if (stopparen == (I32)ARG(scan)) {
3872 if ((I32)ARG(scan) == is_par) {
3873 next = regnext(scan);
3875 if ( next && (OP(next) != WHILEM) && next < last)
3876 is_par = 0; /* Disable optimization */
3879 *(data->last_closep) = ARG(scan);
3881 else if (OP(scan) == EVAL) {
3883 data->flags |= SF_HAS_EVAL;
3885 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3886 if (flags & SCF_DO_SUBSTR) {
3887 SCAN_COMMIT(pRExC_state,data,minlenp);
3888 flags &= ~SCF_DO_SUBSTR;
3890 if (data && OP(scan)==ACCEPT) {
3891 data->flags |= SCF_SEEN_ACCEPT;
3896 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3898 if (flags & SCF_DO_SUBSTR) {
3899 SCAN_COMMIT(pRExC_state,data,minlenp);
3900 data->longest = &(data->longest_float);
3902 is_inf = is_inf_internal = 1;
3903 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3904 cl_anything(pRExC_state, data->start_class);
3905 flags &= ~SCF_DO_STCLASS;
3907 else if (OP(scan) == GPOS) {
3908 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3909 !(delta || is_inf || (data && data->pos_delta)))
3911 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3912 RExC_rx->extflags |= RXf_ANCH_GPOS;
3913 if (RExC_rx->gofs < (U32)min)
3914 RExC_rx->gofs = min;
3916 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3920 #ifdef TRIE_STUDY_OPT
3921 #ifdef FULL_TRIE_STUDY
3922 else if (PL_regkind[OP(scan)] == TRIE) {
3923 /* NOTE - There is similar code to this block above for handling
3924 BRANCH nodes on the initial study. If you change stuff here
3926 regnode *trie_node= scan;
3927 regnode *tail= regnext(scan);
3928 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3929 I32 max1 = 0, min1 = I32_MAX;
3930 struct regnode_charclass_class accum;
3932 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3933 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3934 if (flags & SCF_DO_STCLASS)
3935 cl_init_zero(pRExC_state, &accum);
3941 const regnode *nextbranch= NULL;
3944 for ( word=1 ; word <= trie->wordcount ; word++)
3946 I32 deltanext=0, minnext=0, f = 0, fake;
3947 struct regnode_charclass_class this_class;
3949 data_fake.flags = 0;
3951 data_fake.whilem_c = data->whilem_c;
3952 data_fake.last_closep = data->last_closep;
3955 data_fake.last_closep = &fake;
3956 data_fake.pos_delta = delta;
3957 if (flags & SCF_DO_STCLASS) {
3958 cl_init(pRExC_state, &this_class);
3959 data_fake.start_class = &this_class;
3960 f = SCF_DO_STCLASS_AND;
3962 if (flags & SCF_WHILEM_VISITED_POS)
3963 f |= SCF_WHILEM_VISITED_POS;
3965 if (trie->jump[word]) {
3967 nextbranch = trie_node + trie->jump[0];
3968 scan= trie_node + trie->jump[word];
3969 /* We go from the jump point to the branch that follows
3970 it. Note this means we need the vestigal unused branches
3971 even though they arent otherwise used.
3973 minnext = study_chunk(pRExC_state, &scan, minlenp,
3974 &deltanext, (regnode *)nextbranch, &data_fake,
3975 stopparen, recursed, NULL, f,depth+1);
3977 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3978 nextbranch= regnext((regnode*)nextbranch);
3980 if (min1 > (I32)(minnext + trie->minlen))
3981 min1 = minnext + trie->minlen;
3982 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3983 max1 = minnext + deltanext + trie->maxlen;
3984 if (deltanext == I32_MAX)
3985 is_inf = is_inf_internal = 1;
3987 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3989 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3990 if ( stopmin > min + min1)
3991 stopmin = min + min1;
3992 flags &= ~SCF_DO_SUBSTR;
3994 data->flags |= SCF_SEEN_ACCEPT;
3997 if (data_fake.flags & SF_HAS_EVAL)
3998 data->flags |= SF_HAS_EVAL;
3999 data->whilem_c = data_fake.whilem_c;
4001 if (flags & SCF_DO_STCLASS)
4002 cl_or(pRExC_state, &accum, &this_class);
4005 if (flags & SCF_DO_SUBSTR) {
4006 data->pos_min += min1;
4007 data->pos_delta += max1 - min1;
4008 if (max1 != min1 || is_inf)
4009 data->longest = &(data->longest_float);
4012 delta += max1 - min1;
4013 if (flags & SCF_DO_STCLASS_OR) {
4014 cl_or(pRExC_state, data->start_class, &accum);
4016 cl_and(data->start_class, and_withp);
4017 flags &= ~SCF_DO_STCLASS;
4020 else if (flags & SCF_DO_STCLASS_AND) {
4022 cl_and(data->start_class, &accum);
4023 flags &= ~SCF_DO_STCLASS;
4026 /* Switch to OR mode: cache the old value of
4027 * data->start_class */
4029 StructCopy(data->start_class, and_withp,
4030 struct regnode_charclass_class);
4031 flags &= ~SCF_DO_STCLASS_AND;
4032 StructCopy(&accum, data->start_class,
4033 struct regnode_charclass_class);
4034 flags |= SCF_DO_STCLASS_OR;
4035 data->start_class->flags |= ANYOF_EOS;
4042 else if (PL_regkind[OP(scan)] == TRIE) {
4043 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4046 min += trie->minlen;
4047 delta += (trie->maxlen - trie->minlen);
4048 flags &= ~SCF_DO_STCLASS; /* xxx */
4049 if (flags & SCF_DO_SUBSTR) {
4050 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4051 data->pos_min += trie->minlen;
4052 data->pos_delta += (trie->maxlen - trie->minlen);
4053 if (trie->maxlen != trie->minlen)
4054 data->longest = &(data->longest_float);
4056 if (trie->jump) /* no more substrings -- for now /grr*/
4057 flags &= ~SCF_DO_SUBSTR;
4059 #endif /* old or new */
4060 #endif /* TRIE_STUDY_OPT */
4062 /* Else: zero-length, ignore. */
4063 scan = regnext(scan);
4068 stopparen = frame->stop;
4069 frame = frame->prev;
4070 goto fake_study_recurse;
4075 DEBUG_STUDYDATA("pre-fin:",data,depth);
4078 *deltap = is_inf_internal ? I32_MAX : delta;
4079 if (flags & SCF_DO_SUBSTR && is_inf)
4080 data->pos_delta = I32_MAX - data->pos_min;
4081 if (is_par > (I32)U8_MAX)
4083 if (is_par && pars==1 && data) {
4084 data->flags |= SF_IN_PAR;
4085 data->flags &= ~SF_HAS_PAR;
4087 else if (pars && data) {
4088 data->flags |= SF_HAS_PAR;
4089 data->flags &= ~SF_IN_PAR;
4091 if (flags & SCF_DO_STCLASS_OR)
4092 cl_and(data->start_class, and_withp);
4093 if (flags & SCF_TRIE_RESTUDY)
4094 data->flags |= SCF_TRIE_RESTUDY;
4096 DEBUG_STUDYDATA("post-fin:",data,depth);
4098 return min < stopmin ? min : stopmin;
4102 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4104 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4106 PERL_ARGS_ASSERT_ADD_DATA;
4108 Renewc(RExC_rxi->data,
4109 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4110 char, struct reg_data);
4112 Renew(RExC_rxi->data->what, count + n, U8);
4114 Newx(RExC_rxi->data->what, n, U8);
4115 RExC_rxi->data->count = count + n;
4116 Copy(s, RExC_rxi->data->what + count, n, U8);
4120 /*XXX: todo make this not included in a non debugging perl */
4121 #ifndef PERL_IN_XSUB_RE
4123 Perl_reginitcolors(pTHX)
4126 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4128 char *t = savepv(s);
4132 t = strchr(t, '\t');
4138 PL_colors[i] = t = (char *)"";
4143 PL_colors[i++] = (char *)"";
4150 #ifdef TRIE_STUDY_OPT
4151 #define CHECK_RESTUDY_GOTO \
4153 (data.flags & SCF_TRIE_RESTUDY) \
4157 #define CHECK_RESTUDY_GOTO
4161 - pregcomp - compile a regular expression into internal code
4163 * We can't allocate space until we know how big the compiled form will be,
4164 * but we can't compile it (and thus know how big it is) until we've got a
4165 * place to put the code. So we cheat: we compile it twice, once with code
4166 * generation turned off and size counting turned on, and once "for real".
4167 * This also means that we don't allocate space until we are sure that the
4168 * thing really will compile successfully, and we never have to move the
4169 * code and thus invalidate pointers into it. (Note that it has to be in
4170 * one piece because free() must be able to free it all.) [NB: not true in perl]
4172 * Beware that the optimization-preparation code in here knows about some
4173 * of the structure of the compiled regexp. [I'll say.]
4178 #ifndef PERL_IN_XSUB_RE
4179 #define RE_ENGINE_PTR &reh_regexp_engine
4181 extern const struct regexp_engine my_reg_engine;
4182 #define RE_ENGINE_PTR &my_reg_engine
4185 #ifndef PERL_IN_XSUB_RE
4187 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4190 HV * const table = GvHV(PL_hintgv);
4192 PERL_ARGS_ASSERT_PREGCOMP;
4194 /* Dispatch a request to compile a regexp to correct
4197 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4198 GET_RE_DEBUG_FLAGS_DECL;
4199 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4200 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4202 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4205 return CALLREGCOMP_ENG(eng, pattern, flags);
4208 return Perl_re_compile(aTHX_ pattern, flags);
4213 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4218 register regexp_internal *ri;
4220 char *exp = SvPV(pattern, plen);
4221 char* xend = exp + plen;
4228 RExC_state_t RExC_state;
4229 RExC_state_t * const pRExC_state = &RExC_state;
4230 #ifdef TRIE_STUDY_OPT
4232 RExC_state_t copyRExC_state;
4234 GET_RE_DEBUG_FLAGS_DECL;
4236 PERL_ARGS_ASSERT_RE_COMPILE;
4238 DEBUG_r(if (!PL_colorset) reginitcolors());
4240 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4243 SV *dsv= sv_newmortal();
4244 RE_PV_QUOTED_DECL(s, RExC_utf8,
4245 dsv, exp, plen, 60);
4246 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4247 PL_colors[4],PL_colors[5],s);
4252 RExC_flags = pm_flags;
4256 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4257 RExC_seen_evals = 0;
4260 /* First pass: determine size, legality. */
4268 RExC_emit = &PL_regdummy;
4269 RExC_whilem_seen = 0;
4270 RExC_open_parens = NULL;
4271 RExC_close_parens = NULL;
4273 RExC_paren_names = NULL;
4275 RExC_paren_name_list = NULL;
4277 RExC_recurse = NULL;
4278 RExC_recurse_count = 0;
4280 #if 0 /* REGC() is (currently) a NOP at the first pass.
4281 * Clever compilers notice this and complain. --jhi */
4282 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4284 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4285 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4286 RExC_precomp = NULL;
4289 if (RExC_utf8 && !RExC_orig_utf8) {
4290 /* It's possible to write a regexp in ascii that represents Unicode
4291 codepoints outside of the byte range, such as via \x{100}. If we
4292 detect such a sequence we have to convert the entire pattern to utf8
4293 and then recompile, as our sizing calculation will have been based
4294 on 1 byte == 1 character, but we will need to use utf8 to encode
4295 at least some part of the pattern, and therefore must convert the whole
4297 XXX: somehow figure out how to make this less expensive...
4300 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4301 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4302 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4304 RExC_orig_utf8 = RExC_utf8;
4306 goto redo_first_pass;
4309 PerlIO_printf(Perl_debug_log,
4310 "Required size %"IVdf" nodes\n"
4311 "Starting second pass (creation)\n",
4314 RExC_lastparse=NULL;
4316 /* Small enough for pointer-storage convention?
4317 If extralen==0, this means that we will not need long jumps. */
4318 if (RExC_size >= 0x10000L && RExC_extralen)
4319 RExC_size += RExC_extralen;
4322 if (RExC_whilem_seen > 15)
4323 RExC_whilem_seen = 15;
4325 /* Allocate space and zero-initialize. Note, the two step process
4326 of zeroing when in debug mode, thus anything assigned has to
4327 happen after that */
4328 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4329 r = (struct regexp*)SvANY(rx);
4330 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4331 char, regexp_internal);
4332 if ( r == NULL || ri == NULL )
4333 FAIL("Regexp out of space");
4335 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4336 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4338 /* bulk initialize base fields with 0. */
4339 Zero(ri, sizeof(regexp_internal), char);
4342 /* non-zero initialization begins here */
4344 r->engine= RE_ENGINE_PTR;
4345 r->extflags = pm_flags;
4347 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4348 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4349 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4350 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4351 >> RXf_PMf_STD_PMMOD_SHIFT);
4352 const char *fptr = STD_PAT_MODS; /*"msix"*/
4354 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4355 + (sizeof(STD_PAT_MODS) - 1)
4356 + (sizeof("(?:)") - 1);
4358 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4359 SvCUR_set(rx, wraplen);
4361 SvFLAGS(rx) |= SvUTF8(pattern);
4364 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4366 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4367 char *colon = r + 1;
4370 while((ch = *fptr++)) {
4384 Copy(RExC_precomp, p, plen, char);
4385 assert ((RX_WRAPPED(rx) - p) < 16);
4386 r->pre_prefix = p - RX_WRAPPED(rx);
4395 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4397 if (RExC_seen & REG_SEEN_RECURSE) {
4398 Newxz(RExC_open_parens, RExC_npar,regnode *);
4399 SAVEFREEPV(RExC_open_parens);
4400 Newxz(RExC_close_parens,RExC_npar,regnode *);
4401 SAVEFREEPV(RExC_close_parens);
4404 /* Useful during FAIL. */
4405 #ifdef RE_TRACK_PATTERN_OFFSETS
4406 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4407 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4408 "%s %"UVuf" bytes for offset annotations.\n",
4409 ri->u.offsets ? "Got" : "Couldn't get",
4410 (UV)((2*RExC_size+1) * sizeof(U32))));
4412 SetProgLen(ri,RExC_size);
4416 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4418 /* Second pass: emit code. */
4419 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4424 RExC_emit_start = ri->program;
4425 RExC_emit = ri->program;
4426 RExC_emit_bound = ri->program + RExC_size + 1;
4428 /* Store the count of eval-groups for security checks: */
4429 RExC_rx->seen_evals = RExC_seen_evals;
4430 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4431 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4435 /* XXXX To minimize changes to RE engine we always allocate
4436 3-units-long substrs field. */
4437 Newx(r->substrs, 1, struct reg_substr_data);
4438 if (RExC_recurse_count) {
4439 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4440 SAVEFREEPV(RExC_recurse);
4444 r->minlen = minlen = sawplus = sawopen = 0;
4445 Zero(r->substrs, 1, struct reg_substr_data);
4447 #ifdef TRIE_STUDY_OPT
4449 StructCopy(&zero_scan_data, &data, scan_data_t);
4450 copyRExC_state = RExC_state;
4453 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4455 RExC_state = copyRExC_state;
4456 if (seen & REG_TOP_LEVEL_BRANCHES)
4457 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4459 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4460 if (data.last_found) {
4461 SvREFCNT_dec(data.longest_fixed);
4462 SvREFCNT_dec(data.longest_float);
4463 SvREFCNT_dec(data.last_found);
4465 StructCopy(&zero_scan_data, &data, scan_data_t);
4468 StructCopy(&zero_scan_data, &data, scan_data_t);
4471 /* Dig out information for optimizations. */
4472 r->extflags = RExC_flags; /* was pm_op */
4473 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4476 SvUTF8_on(rx); /* Unicode in it? */
4477 ri->regstclass = NULL;
4478 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4479 r->intflags |= PREGf_NAUGHTY;
4480 scan = ri->program + 1; /* First BRANCH. */
4482 /* testing for BRANCH here tells us whether there is "must appear"
4483 data in the pattern. If there is then we can use it for optimisations */
4484 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4486 STRLEN longest_float_length, longest_fixed_length;
4487 struct regnode_charclass_class ch_class; /* pointed to by data */
4489 I32 last_close = 0; /* pointed to by data */
4490 regnode *first= scan;
4491 regnode *first_next= regnext(first);
4494 * Skip introductions and multiplicators >= 1
4495 * so that we can extract the 'meat' of the pattern that must
4496 * match in the large if() sequence following.
4497 * NOTE that EXACT is NOT covered here, as it is normally
4498 * picked up by the optimiser separately.
4500 * This is unfortunate as the optimiser isnt handling lookahead
4501 * properly currently.
4504 while ((OP(first) == OPEN && (sawopen = 1)) ||
4505 /* An OR of *one* alternative - should not happen now. */
4506 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4507 /* for now we can't handle lookbehind IFMATCH*/
4508 (OP(first) == IFMATCH && !first->flags) ||
4509 (OP(first) == PLUS) ||
4510 (OP(first) == MINMOD) ||
4511 /* An {n,m} with n>0 */
4512 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4513 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4516 * the only op that could be a regnode is PLUS, all the rest
4517 * will be regnode_1 or regnode_2.
4520 if (OP(first) == PLUS)
4523 first += regarglen[OP(first)];
4525 first = NEXTOPER(first);
4526 first_next= regnext(first);
4529 /* Starting-point info. */
4531 DEBUG_PEEP("first:",first,0);
4532 /* Ignore EXACT as we deal with it later. */
4533 if (PL_regkind[OP(first)] == EXACT) {
4534 if (OP(first) == EXACT)
4535 NOOP; /* Empty, get anchored substr later. */
4536 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4537 ri->regstclass = first;
4540 else if (PL_regkind[OP(first)] == TRIE &&
4541 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4544 /* this can happen only on restudy */
4545 if ( OP(first) == TRIE ) {
4546 struct regnode_1 *trieop = (struct regnode_1 *)
4547 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4548 StructCopy(first,trieop,struct regnode_1);
4549 trie_op=(regnode *)trieop;
4551 struct regnode_charclass *trieop = (struct regnode_charclass *)
4552 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4553 StructCopy(first,trieop,struct regnode_charclass);
4554 trie_op=(regnode *)trieop;
4557 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4558 ri->regstclass = trie_op;
4561 else if (strchr((const char*)PL_simple,OP(first)))
4562 ri->regstclass = first;
4563 else if (PL_regkind[OP(first)] == BOUND ||
4564 PL_regkind[OP(first)] == NBOUND)
4565 ri->regstclass = first;
4566 else if (PL_regkind[OP(first)] == BOL) {
4567 r->extflags |= (OP(first) == MBOL
4569 : (OP(first) == SBOL
4572 first = NEXTOPER(first);
4575 else if (OP(first) == GPOS) {
4576 r->extflags |= RXf_ANCH_GPOS;
4577 first = NEXTOPER(first);
4580 else if ((!sawopen || !RExC_sawback) &&
4581 (OP(first) == STAR &&
4582 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4583 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4585 /* turn .* into ^.* with an implied $*=1 */
4587 (OP(NEXTOPER(first)) == REG_ANY)
4590 r->extflags |= type;
4591 r->intflags |= PREGf_IMPLICIT;
4592 first = NEXTOPER(first);
4595 if (sawplus && (!sawopen || !RExC_sawback)
4596 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4597 /* x+ must match at the 1st pos of run of x's */
4598 r->intflags |= PREGf_SKIP;
4600 /* Scan is after the zeroth branch, first is atomic matcher. */
4601 #ifdef TRIE_STUDY_OPT
4604 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4605 (IV)(first - scan + 1))
4609 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4610 (IV)(first - scan + 1))
4616 * If there's something expensive in the r.e., find the
4617 * longest literal string that must appear and make it the
4618 * regmust. Resolve ties in favor of later strings, since
4619 * the regstart check works with the beginning of the r.e.
4620 * and avoiding duplication strengthens checking. Not a
4621 * strong reason, but sufficient in the absence of others.
4622 * [Now we resolve ties in favor of the earlier string if
4623 * it happens that c_offset_min has been invalidated, since the
4624 * earlier string may buy us something the later one won't.]
4627 data.longest_fixed = newSVpvs("");
4628 data.longest_float = newSVpvs("");
4629 data.last_found = newSVpvs("");
4630 data.longest = &(data.longest_fixed);
4632 if (!ri->regstclass) {
4633 cl_init(pRExC_state, &ch_class);
4634 data.start_class = &ch_class;
4635 stclass_flag = SCF_DO_STCLASS_AND;
4636 } else /* XXXX Check for BOUND? */
4638 data.last_closep = &last_close;
4640 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4641 &data, -1, NULL, NULL,
4642 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4648 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4649 && data.last_start_min == 0 && data.last_end > 0
4650 && !RExC_seen_zerolen
4651 && !(RExC_seen & REG_SEEN_VERBARG)
4652 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4653 r->extflags |= RXf_CHECK_ALL;
4654 scan_commit(pRExC_state, &data,&minlen,0);
4655 SvREFCNT_dec(data.last_found);
4657 /* Note that code very similar to this but for anchored string
4658 follows immediately below, changes may need to be made to both.
4661 longest_float_length = CHR_SVLEN(data.longest_float);
4662 if (longest_float_length
4663 || (data.flags & SF_FL_BEFORE_EOL
4664 && (!(data.flags & SF_FL_BEFORE_MEOL)
4665 || (RExC_flags & RXf_PMf_MULTILINE))))
4669 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4670 && data.offset_fixed == data.offset_float_min
4671 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4672 goto remove_float; /* As in (a)+. */
4674 /* copy the information about the longest float from the reg_scan_data
4675 over to the program. */
4676 if (SvUTF8(data.longest_float)) {
4677 r->float_utf8 = data.longest_float;
4678 r->float_substr = NULL;
4680 r->float_substr = data.longest_float;
4681 r->float_utf8 = NULL;
4683 /* float_end_shift is how many chars that must be matched that
4684 follow this item. We calculate it ahead of time as once the
4685 lookbehind offset is added in we lose the ability to correctly
4687 ml = data.minlen_float ? *(data.minlen_float)
4688 : (I32)longest_float_length;
4689 r->float_end_shift = ml - data.offset_float_min
4690 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4691 + data.lookbehind_float;
4692 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4693 r->float_max_offset = data.offset_float_max;
4694 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4695 r->float_max_offset -= data.lookbehind_float;
4697 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4698 && (!(data.flags & SF_FL_BEFORE_MEOL)
4699 || (RExC_flags & RXf_PMf_MULTILINE)));
4700 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4704 r->float_substr = r->float_utf8 = NULL;
4705 SvREFCNT_dec(data.longest_float);
4706 longest_float_length = 0;
4709 /* Note that code very similar to this but for floating string
4710 is immediately above, changes may need to be made to both.
4713 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4714 if (longest_fixed_length
4715 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4716 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4717 || (RExC_flags & RXf_PMf_MULTILINE))))
4721 /* copy the information about the longest fixed
4722 from the reg_scan_data over to the program. */
4723 if (SvUTF8(data.longest_fixed)) {
4724 r->anchored_utf8 = data.longest_fixed;
4725 r->anchored_substr = NULL;
4727 r->anchored_substr = data.longest_fixed;
4728 r->anchored_utf8 = NULL;
4730 /* fixed_end_shift is how many chars that must be matched that
4731 follow this item. We calculate it ahead of time as once the
4732 lookbehind offset is added in we lose the ability to correctly
4734 ml = data.minlen_fixed ? *(data.minlen_fixed)
4735 : (I32)longest_fixed_length;
4736 r->anchored_end_shift = ml - data.offset_fixed
4737 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4738 + data.lookbehind_fixed;
4739 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4741 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4742 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4743 || (RExC_flags & RXf_PMf_MULTILINE)));
4744 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4747 r->anchored_substr = r->anchored_utf8 = NULL;
4748 SvREFCNT_dec(data.longest_fixed);
4749 longest_fixed_length = 0;
4752 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4753 ri->regstclass = NULL;
4754 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4756 && !(data.start_class->flags & ANYOF_EOS)
4757 && !cl_is_anything(data.start_class))
4759 const U32 n = add_data(pRExC_state, 1, "f");
4761 Newx(RExC_rxi->data->data[n], 1,
4762 struct regnode_charclass_class);
4763 StructCopy(data.start_class,
4764 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4765 struct regnode_charclass_class);
4766 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4767 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4768 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4769 regprop(r, sv, (regnode*)data.start_class);
4770 PerlIO_printf(Perl_debug_log,
4771 "synthetic stclass \"%s\".\n",
4772 SvPVX_const(sv));});
4775 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4776 if (longest_fixed_length > longest_float_length) {
4777 r->check_end_shift = r->anchored_end_shift;
4778 r->check_substr = r->anchored_substr;
4779 r->check_utf8 = r->anchored_utf8;
4780 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4781 if (r->extflags & RXf_ANCH_SINGLE)
4782 r->extflags |= RXf_NOSCAN;
4785 r->check_end_shift = r->float_end_shift;
4786 r->check_substr = r->float_substr;
4787 r->check_utf8 = r->float_utf8;
4788 r->check_offset_min = r->float_min_offset;
4789 r->check_offset_max = r->float_max_offset;
4791 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4792 This should be changed ASAP! */
4793 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4794 r->extflags |= RXf_USE_INTUIT;
4795 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4796 r->extflags |= RXf_INTUIT_TAIL;
4798 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4799 if ( (STRLEN)minlen < longest_float_length )
4800 minlen= longest_float_length;
4801 if ( (STRLEN)minlen < longest_fixed_length )
4802 minlen= longest_fixed_length;
4806 /* Several toplevels. Best we can is to set minlen. */
4808 struct regnode_charclass_class ch_class;
4811 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4813 scan = ri->program + 1;
4814 cl_init(pRExC_state, &ch_class);
4815 data.start_class = &ch_class;
4816 data.last_closep = &last_close;
4819 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4820 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4824 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4825 = r->float_substr = r->float_utf8 = NULL;
4826 if (!(data.start_class->flags & ANYOF_EOS)
4827 && !cl_is_anything(data.start_class))
4829 const U32 n = add_data(pRExC_state, 1, "f");
4831 Newx(RExC_rxi->data->data[n], 1,
4832 struct regnode_charclass_class);
4833 StructCopy(data.start_class,
4834 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4835 struct regnode_charclass_class);
4836 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4837 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4838 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4839 regprop(r, sv, (regnode*)data.start_class);
4840 PerlIO_printf(Perl_debug_log,
4841 "synthetic stclass \"%s\".\n",
4842 SvPVX_const(sv));});
4846 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4847 the "real" pattern. */
4849 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4850 (IV)minlen, (IV)r->minlen);
4852 r->minlenret = minlen;
4853 if (r->minlen < minlen)
4856 if (RExC_seen & REG_SEEN_GPOS)
4857 r->extflags |= RXf_GPOS_SEEN;
4858 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4859 r->extflags |= RXf_LOOKBEHIND_SEEN;
4860 if (RExC_seen & REG_SEEN_EVAL)
4861 r->extflags |= RXf_EVAL_SEEN;
4862 if (RExC_seen & REG_SEEN_CANY)
4863 r->extflags |= RXf_CANY_SEEN;
4864 if (RExC_seen & REG_SEEN_VERBARG)
4865 r->intflags |= PREGf_VERBARG_SEEN;
4866 if (RExC_seen & REG_SEEN_CUTGROUP)
4867 r->intflags |= PREGf_CUTGROUP_SEEN;
4868 if (RExC_paren_names)
4869 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4871 RXp_PAREN_NAMES(r) = NULL;
4873 #ifdef STUPID_PATTERN_CHECKS
4874 if (RX_PRELEN(rx) == 0)
4875 r->extflags |= RXf_NULL;
4876 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4877 /* XXX: this should happen BEFORE we compile */
4878 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4879 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4880 r->extflags |= RXf_WHITE;
4881 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4882 r->extflags |= RXf_START_ONLY;
4884 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4885 /* XXX: this should happen BEFORE we compile */
4886 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4888 regnode *first = ri->program + 1;
4890 U8 nop = OP(NEXTOPER(first));
4892 if (PL_regkind[fop] == NOTHING && nop == END)
4893 r->extflags |= RXf_NULL;
4894 else if (PL_regkind[fop] == BOL && nop == END)
4895 r->extflags |= RXf_START_ONLY;
4896 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4897 r->extflags |= RXf_WHITE;
4901 if (RExC_paren_names) {
4902 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4903 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4906 ri->name_list_idx = 0;
4908 if (RExC_recurse_count) {
4909 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4910 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4911 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4914 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4915 /* assume we don't need to swap parens around before we match */
4918 PerlIO_printf(Perl_debug_log,"Final program:\n");
4921 #ifdef RE_TRACK_PATTERN_OFFSETS
4922 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4923 const U32 len = ri->u.offsets[0];
4925 GET_RE_DEBUG_FLAGS_DECL;
4926 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4927 for (i = 1; i <= len; i++) {
4928 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4929 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4930 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4932 PerlIO_printf(Perl_debug_log, "\n");
4938 #undef RE_ENGINE_PTR
4942 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4945 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4947 PERL_UNUSED_ARG(value);
4949 if (flags & RXapif_FETCH) {
4950 return reg_named_buff_fetch(rx, key, flags);
4951 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4952 Perl_croak(aTHX_ "%s", PL_no_modify);
4954 } else if (flags & RXapif_EXISTS) {
4955 return reg_named_buff_exists(rx, key, flags)
4958 } else if (flags & RXapif_REGNAMES) {
4959 return reg_named_buff_all(rx, flags);
4960 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4961 return reg_named_buff_scalar(rx, flags);
4963 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4969 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4972 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4973 PERL_UNUSED_ARG(lastkey);
4975 if (flags & RXapif_FIRSTKEY)
4976 return reg_named_buff_firstkey(rx, flags);
4977 else if (flags & RXapif_NEXTKEY)
4978 return reg_named_buff_nextkey(rx, flags);
4980 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4986 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4989 AV *retarray = NULL;
4991 struct regexp *const rx = (struct regexp *)SvANY(r);
4993 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4995 if (flags & RXapif_ALL)
4998 if (rx && RXp_PAREN_NAMES(rx)) {
4999 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5002 SV* sv_dat=HeVAL(he_str);
5003 I32 *nums=(I32*)SvPVX(sv_dat);
5004 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5005 if ((I32)(rx->nparens) >= nums[i]
5006 && rx->offs[nums[i]].start != -1
5007 && rx->offs[nums[i]].end != -1)
5010 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5014 ret = newSVsv(&PL_sv_undef);
5017 av_push(retarray, ret);
5020 return newRV_noinc(MUTABLE_SV(retarray));
5027 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5030 struct regexp *const rx = (struct regexp *)SvANY(r);
5032 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5034 if (rx && RXp_PAREN_NAMES(rx)) {
5035 if (flags & RXapif_ALL) {
5036 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5038 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5052 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5054 struct regexp *const rx = (struct regexp *)SvANY(r);
5056 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5058 if ( rx && RXp_PAREN_NAMES(rx) ) {
5059 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5061 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5068 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5070 struct regexp *const rx = (struct regexp *)SvANY(r);
5071 GET_RE_DEBUG_FLAGS_DECL;
5073 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5075 if (rx && RXp_PAREN_NAMES(rx)) {
5076 HV *hv = RXp_PAREN_NAMES(rx);
5078 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5081 SV* sv_dat = HeVAL(temphe);
5082 I32 *nums = (I32*)SvPVX(sv_dat);
5083 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5084 if ((I32)(rx->lastparen) >= nums[i] &&
5085 rx->offs[nums[i]].start != -1 &&
5086 rx->offs[nums[i]].end != -1)
5092 if (parno || flags & RXapif_ALL) {
5093 return newSVhek(HeKEY_hek(temphe));
5101 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5106 struct regexp *const rx = (struct regexp *)SvANY(r);
5108 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5110 if (rx && RXp_PAREN_NAMES(rx)) {
5111 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5112 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5113 } else if (flags & RXapif_ONE) {
5114 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5115 av = MUTABLE_AV(SvRV(ret));
5116 length = av_len(av);
5118 return newSViv(length + 1);
5120 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5124 return &PL_sv_undef;
5128 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5130 struct regexp *const rx = (struct regexp *)SvANY(r);
5133 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5135 if (rx && RXp_PAREN_NAMES(rx)) {
5136 HV *hv= RXp_PAREN_NAMES(rx);
5138 (void)hv_iterinit(hv);
5139 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5142 SV* sv_dat = HeVAL(temphe);
5143 I32 *nums = (I32*)SvPVX(sv_dat);
5144 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5145 if ((I32)(rx->lastparen) >= nums[i] &&
5146 rx->offs[nums[i]].start != -1 &&
5147 rx->offs[nums[i]].end != -1)
5153 if (parno || flags & RXapif_ALL) {
5154 av_push(av, newSVhek(HeKEY_hek(temphe)));
5159 return newRV_noinc(MUTABLE_SV(av));
5163 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5166 struct regexp *const rx = (struct regexp *)SvANY(r);
5171 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5174 sv_setsv(sv,&PL_sv_undef);
5178 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5180 i = rx->offs[0].start;
5184 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5186 s = rx->subbeg + rx->offs[0].end;
5187 i = rx->sublen - rx->offs[0].end;
5190 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5191 (s1 = rx->offs[paren].start) != -1 &&
5192 (t1 = rx->offs[paren].end) != -1)
5196 s = rx->subbeg + s1;
5198 sv_setsv(sv,&PL_sv_undef);
5201 assert(rx->sublen >= (s - rx->subbeg) + i );
5203 const int oldtainted = PL_tainted;
5205 sv_setpvn(sv, s, i);
5206 PL_tainted = oldtainted;
5207 if ( (rx->extflags & RXf_CANY_SEEN)
5208 ? (RXp_MATCH_UTF8(rx)
5209 && (!i || is_utf8_string((U8*)s, i)))
5210 : (RXp_MATCH_UTF8(rx)) )
5217 if (RXp_MATCH_TAINTED(rx)) {
5218 if (SvTYPE(sv) >= SVt_PVMG) {
5219 MAGIC* const mg = SvMAGIC(sv);
5222 SvMAGIC_set(sv, mg->mg_moremagic);
5224 if ((mgt = SvMAGIC(sv))) {
5225 mg->mg_moremagic = mgt;
5226 SvMAGIC_set(sv, mg);
5236 sv_setsv(sv,&PL_sv_undef);
5242 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5243 SV const * const value)
5245 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5247 PERL_UNUSED_ARG(rx);
5248 PERL_UNUSED_ARG(paren);
5249 PERL_UNUSED_ARG(value);
5252 Perl_croak(aTHX_ "%s", PL_no_modify);
5256 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5259 struct regexp *const rx = (struct regexp *)SvANY(r);
5263 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5265 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5267 /* $` / ${^PREMATCH} */
5268 case RX_BUFF_IDX_PREMATCH:
5269 if (rx->offs[0].start != -1) {
5270 i = rx->offs[0].start;
5278 /* $' / ${^POSTMATCH} */
5279 case RX_BUFF_IDX_POSTMATCH:
5280 if (rx->offs[0].end != -1) {
5281 i = rx->sublen - rx->offs[0].end;
5283 s1 = rx->offs[0].end;
5289 /* $& / ${^MATCH}, $1, $2, ... */
5291 if (paren <= (I32)rx->nparens &&
5292 (s1 = rx->offs[paren].start) != -1 &&
5293 (t1 = rx->offs[paren].end) != -1)
5298 if (ckWARN(WARN_UNINITIALIZED))
5299 report_uninit((const SV *)sv);
5304 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5305 const char * const s = rx->subbeg + s1;
5310 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5317 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5319 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5320 PERL_UNUSED_ARG(rx);
5324 return newSVpvs("Regexp");
5327 /* Scans the name of a named buffer from the pattern.
5328 * If flags is REG_RSN_RETURN_NULL returns null.
5329 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5330 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5331 * to the parsed name as looked up in the RExC_paren_names hash.
5332 * If there is an error throws a vFAIL().. type exception.
5335 #define REG_RSN_RETURN_NULL 0
5336 #define REG_RSN_RETURN_NAME 1
5337 #define REG_RSN_RETURN_DATA 2
5340 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5342 char *name_start = RExC_parse;
5344 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5346 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5347 /* skip IDFIRST by using do...while */
5350 RExC_parse += UTF8SKIP(RExC_parse);
5351 } while (isALNUM_utf8((U8*)RExC_parse));
5355 } while (isALNUM(*RExC_parse));
5360 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5361 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5362 if ( flags == REG_RSN_RETURN_NAME)
5364 else if (flags==REG_RSN_RETURN_DATA) {
5367 if ( ! sv_name ) /* should not happen*/
5368 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5369 if (RExC_paren_names)
5370 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5372 sv_dat = HeVAL(he_str);
5374 vFAIL("Reference to nonexistent named group");
5378 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5385 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5386 int rem=(int)(RExC_end - RExC_parse); \
5395 if (RExC_lastparse!=RExC_parse) \
5396 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5399 iscut ? "..." : "<" \
5402 PerlIO_printf(Perl_debug_log,"%16s",""); \
5405 num = RExC_size + 1; \
5407 num=REG_NODE_NUM(RExC_emit); \
5408 if (RExC_lastnum!=num) \
5409 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5411 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5412 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5413 (int)((depth*2)), "", \
5417 RExC_lastparse=RExC_parse; \
5422 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5423 DEBUG_PARSE_MSG((funcname)); \
5424 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5426 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5427 DEBUG_PARSE_MSG((funcname)); \
5428 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5431 - reg - regular expression, i.e. main body or parenthesized thing
5433 * Caller must absorb opening parenthesis.
5435 * Combining parenthesis handling with the base level of regular expression
5436 * is a trifle forced, but the need to tie the tails of the branches to what
5437 * follows makes it hard to avoid.
5439 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5441 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5443 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5447 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5448 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5451 register regnode *ret; /* Will be the head of the group. */
5452 register regnode *br;
5453 register regnode *lastbr;
5454 register regnode *ender = NULL;
5455 register I32 parno = 0;
5457 U32 oregflags = RExC_flags;
5458 bool have_branch = 0;
5460 I32 freeze_paren = 0;
5461 I32 after_freeze = 0;
5463 /* for (?g), (?gc), and (?o) warnings; warning
5464 about (?c) will warn about (?g) -- japhy */
5466 #define WASTED_O 0x01
5467 #define WASTED_G 0x02
5468 #define WASTED_C 0x04
5469 #define WASTED_GC (0x02|0x04)
5470 I32 wastedflags = 0x00;
5472 char * parse_start = RExC_parse; /* MJD */
5473 char * const oregcomp_parse = RExC_parse;
5475 GET_RE_DEBUG_FLAGS_DECL;
5477 PERL_ARGS_ASSERT_REG;
5478 DEBUG_PARSE("reg ");
5480 *flagp = 0; /* Tentatively. */
5483 /* Make an OPEN node, if parenthesized. */
5485 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5486 char *start_verb = RExC_parse;
5487 STRLEN verb_len = 0;
5488 char *start_arg = NULL;
5489 unsigned char op = 0;
5491 int internal_argval = 0; /* internal_argval is only useful if !argok */
5492 while ( *RExC_parse && *RExC_parse != ')' ) {
5493 if ( *RExC_parse == ':' ) {
5494 start_arg = RExC_parse + 1;
5500 verb_len = RExC_parse - start_verb;
5503 while ( *RExC_parse && *RExC_parse != ')' )
5505 if ( *RExC_parse != ')' )
5506 vFAIL("Unterminated verb pattern argument");
5507 if ( RExC_parse == start_arg )
5510 if ( *RExC_parse != ')' )
5511 vFAIL("Unterminated verb pattern");
5514 switch ( *start_verb ) {
5515 case 'A': /* (*ACCEPT) */
5516 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5518 internal_argval = RExC_nestroot;
5521 case 'C': /* (*COMMIT) */
5522 if ( memEQs(start_verb,verb_len,"COMMIT") )
5525 case 'F': /* (*FAIL) */
5526 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5531 case ':': /* (*:NAME) */
5532 case 'M': /* (*MARK:NAME) */
5533 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5538 case 'P': /* (*PRUNE) */
5539 if ( memEQs(start_verb,verb_len,"PRUNE") )
5542 case 'S': /* (*SKIP) */
5543 if ( memEQs(start_verb,verb_len,"SKIP") )
5546 case 'T': /* (*THEN) */
5547 /* [19:06] <TimToady> :: is then */
5548 if ( memEQs(start_verb,verb_len,"THEN") ) {
5550 RExC_seen |= REG_SEEN_CUTGROUP;
5556 vFAIL3("Unknown verb pattern '%.*s'",
5557 verb_len, start_verb);
5560 if ( start_arg && internal_argval ) {
5561 vFAIL3("Verb pattern '%.*s' may not have an argument",
5562 verb_len, start_verb);
5563 } else if ( argok < 0 && !start_arg ) {
5564 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5565 verb_len, start_verb);
5567 ret = reganode(pRExC_state, op, internal_argval);
5568 if ( ! internal_argval && ! SIZE_ONLY ) {
5570 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5571 ARG(ret) = add_data( pRExC_state, 1, "S" );
5572 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5579 if (!internal_argval)
5580 RExC_seen |= REG_SEEN_VERBARG;
5581 } else if ( start_arg ) {
5582 vFAIL3("Verb pattern '%.*s' may not have an argument",
5583 verb_len, start_verb);
5585 ret = reg_node(pRExC_state, op);
5587 nextchar(pRExC_state);
5590 if (*RExC_parse == '?') { /* (?...) */
5591 bool is_logical = 0;
5592 const char * const seqstart = RExC_parse;
5595 paren = *RExC_parse++;
5596 ret = NULL; /* For look-ahead/behind. */
5599 case 'P': /* (?P...) variants for those used to PCRE/Python */
5600 paren = *RExC_parse++;
5601 if ( paren == '<') /* (?P<...>) named capture */
5603 else if (paren == '>') { /* (?P>name) named recursion */
5604 goto named_recursion;
5606 else if (paren == '=') { /* (?P=...) named backref */
5607 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5608 you change this make sure you change that */
5609 char* name_start = RExC_parse;
5611 SV *sv_dat = reg_scan_name(pRExC_state,
5612 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5613 if (RExC_parse == name_start || *RExC_parse != ')')
5614 vFAIL2("Sequence %.3s... not terminated",parse_start);
5617 num = add_data( pRExC_state, 1, "S" );
5618 RExC_rxi->data->data[num]=(void*)sv_dat;
5619 SvREFCNT_inc_simple_void(sv_dat);
5622 ret = reganode(pRExC_state,
5623 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5627 Set_Node_Offset(ret, parse_start+1);
5628 Set_Node_Cur_Length(ret); /* MJD */
5630 nextchar(pRExC_state);
5634 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5636 case '<': /* (?<...) */
5637 if (*RExC_parse == '!')
5639 else if (*RExC_parse != '=')
5645 case '\'': /* (?'...') */
5646 name_start= RExC_parse;
5647 svname = reg_scan_name(pRExC_state,
5648 SIZE_ONLY ? /* reverse test from the others */
5649 REG_RSN_RETURN_NAME :
5650 REG_RSN_RETURN_NULL);
5651 if (RExC_parse == name_start) {
5653 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5656 if (*RExC_parse != paren)
5657 vFAIL2("Sequence (?%c... not terminated",
5658 paren=='>' ? '<' : paren);
5662 if (!svname) /* shouldnt happen */
5664 "panic: reg_scan_name returned NULL");
5665 if (!RExC_paren_names) {
5666 RExC_paren_names= newHV();
5667 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5669 RExC_paren_name_list= newAV();
5670 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5673 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5675 sv_dat = HeVAL(he_str);
5677 /* croak baby croak */
5679 "panic: paren_name hash element allocation failed");
5680 } else if ( SvPOK(sv_dat) ) {
5681 /* (?|...) can mean we have dupes so scan to check
5682 its already been stored. Maybe a flag indicating
5683 we are inside such a construct would be useful,
5684 but the arrays are likely to be quite small, so
5685 for now we punt -- dmq */
5686 IV count = SvIV(sv_dat);
5687 I32 *pv = (I32*)SvPVX(sv_dat);
5689 for ( i = 0 ; i < count ; i++ ) {
5690 if ( pv[i] == RExC_npar ) {
5696 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5697 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5698 pv[count] = RExC_npar;
5699 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5702 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5703 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5705 SvIV_set(sv_dat, 1);
5708 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5709 SvREFCNT_dec(svname);
5712 /*sv_dump(sv_dat);*/
5714 nextchar(pRExC_state);
5716 goto capturing_parens;
5718 RExC_seen |= REG_SEEN_LOOKBEHIND;
5720 case '=': /* (?=...) */
5721 RExC_seen_zerolen++;
5723 case '!': /* (?!...) */
5724 RExC_seen_zerolen++;
5725 if (*RExC_parse == ')') {
5726 ret=reg_node(pRExC_state, OPFAIL);
5727 nextchar(pRExC_state);
5731 case '|': /* (?|...) */
5732 /* branch reset, behave like a (?:...) except that
5733 buffers in alternations share the same numbers */
5735 after_freeze = freeze_paren = RExC_npar;
5737 case ':': /* (?:...) */
5738 case '>': /* (?>...) */
5740 case '$': /* (?$...) */
5741 case '@': /* (?@...) */
5742 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5744 case '#': /* (?#...) */
5745 while (*RExC_parse && *RExC_parse != ')')
5747 if (*RExC_parse != ')')
5748 FAIL("Sequence (?#... not terminated");
5749 nextchar(pRExC_state);
5752 case '0' : /* (?0) */
5753 case 'R' : /* (?R) */
5754 if (*RExC_parse != ')')
5755 FAIL("Sequence (?R) not terminated");
5756 ret = reg_node(pRExC_state, GOSTART);
5757 *flagp |= POSTPONED;
5758 nextchar(pRExC_state);
5761 { /* named and numeric backreferences */
5763 case '&': /* (?&NAME) */
5764 parse_start = RExC_parse - 1;
5767 SV *sv_dat = reg_scan_name(pRExC_state,
5768 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5769 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5771 goto gen_recurse_regop;
5774 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5776 vFAIL("Illegal pattern");
5778 goto parse_recursion;
5780 case '-': /* (?-1) */
5781 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5782 RExC_parse--; /* rewind to let it be handled later */
5786 case '1': case '2': case '3': case '4': /* (?1) */
5787 case '5': case '6': case '7': case '8': case '9':
5790 num = atoi(RExC_parse);
5791 parse_start = RExC_parse - 1; /* MJD */
5792 if (*RExC_parse == '-')
5794 while (isDIGIT(*RExC_parse))
5796 if (*RExC_parse!=')')
5797 vFAIL("Expecting close bracket");
5800 if ( paren == '-' ) {
5802 Diagram of capture buffer numbering.
5803 Top line is the normal capture buffer numbers
5804 Botton line is the negative indexing as from
5808 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5812 num = RExC_npar + num;
5815 vFAIL("Reference to nonexistent group");
5817 } else if ( paren == '+' ) {
5818 num = RExC_npar + num - 1;
5821 ret = reganode(pRExC_state, GOSUB, num);
5823 if (num > (I32)RExC_rx->nparens) {
5825 vFAIL("Reference to nonexistent group");
5827 ARG2L_SET( ret, RExC_recurse_count++);
5829 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5830 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5834 RExC_seen |= REG_SEEN_RECURSE;
5835 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5836 Set_Node_Offset(ret, parse_start); /* MJD */
5838 *flagp |= POSTPONED;
5839 nextchar(pRExC_state);
5841 } /* named and numeric backreferences */
5844 case '?': /* (??...) */
5846 if (*RExC_parse != '{') {
5848 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5851 *flagp |= POSTPONED;
5852 paren = *RExC_parse++;
5854 case '{': /* (?{...}) */
5859 char *s = RExC_parse;
5861 RExC_seen_zerolen++;
5862 RExC_seen |= REG_SEEN_EVAL;
5863 while (count && (c = *RExC_parse)) {
5874 if (*RExC_parse != ')') {
5876 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5880 OP_4tree *sop, *rop;
5881 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5884 Perl_save_re_context(aTHX);
5885 rop = sv_compile_2op(sv, &sop, "re", &pad);
5886 sop->op_private |= OPpREFCOUNTED;
5887 /* re_dup will OpREFCNT_inc */
5888 OpREFCNT_set(sop, 1);
5891 n = add_data(pRExC_state, 3, "nop");
5892 RExC_rxi->data->data[n] = (void*)rop;
5893 RExC_rxi->data->data[n+1] = (void*)sop;
5894 RExC_rxi->data->data[n+2] = (void*)pad;
5897 else { /* First pass */
5898 if (PL_reginterp_cnt < ++RExC_seen_evals
5900 /* No compiled RE interpolated, has runtime
5901 components ===> unsafe. */
5902 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5903 if (PL_tainting && PL_tainted)
5904 FAIL("Eval-group in insecure regular expression");
5905 #if PERL_VERSION > 8
5906 if (IN_PERL_COMPILETIME)
5911 nextchar(pRExC_state);
5913 ret = reg_node(pRExC_state, LOGICAL);
5916 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5917 /* deal with the length of this later - MJD */
5920 ret = reganode(pRExC_state, EVAL, n);
5921 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5922 Set_Node_Offset(ret, parse_start);
5925 case '(': /* (?(?{...})...) and (?(?=...)...) */
5928 if (RExC_parse[0] == '?') { /* (?(?...)) */
5929 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5930 || RExC_parse[1] == '<'
5931 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5934 ret = reg_node(pRExC_state, LOGICAL);
5937 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5941 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5942 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5944 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5945 char *name_start= RExC_parse++;
5947 SV *sv_dat=reg_scan_name(pRExC_state,
5948 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5949 if (RExC_parse == name_start || *RExC_parse != ch)
5950 vFAIL2("Sequence (?(%c... not terminated",
5951 (ch == '>' ? '<' : ch));
5954 num = add_data( pRExC_state, 1, "S" );
5955 RExC_rxi->data->data[num]=(void*)sv_dat;
5956 SvREFCNT_inc_simple_void(sv_dat);
5958 ret = reganode(pRExC_state,NGROUPP,num);
5959 goto insert_if_check_paren;
5961 else if (RExC_parse[0] == 'D' &&
5962 RExC_parse[1] == 'E' &&
5963 RExC_parse[2] == 'F' &&
5964 RExC_parse[3] == 'I' &&
5965 RExC_parse[4] == 'N' &&
5966 RExC_parse[5] == 'E')
5968 ret = reganode(pRExC_state,DEFINEP,0);
5971 goto insert_if_check_paren;
5973 else if (RExC_parse[0] == 'R') {
5976 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5977 parno = atoi(RExC_parse++);
5978 while (isDIGIT(*RExC_parse))
5980 } else if (RExC_parse[0] == '&') {
5983 sv_dat = reg_scan_name(pRExC_state,
5984 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5985 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5987 ret = reganode(pRExC_state,INSUBP,parno);
5988 goto insert_if_check_paren;
5990 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5993 parno = atoi(RExC_parse++);
5995 while (isDIGIT(*RExC_parse))
5997 ret = reganode(pRExC_state, GROUPP, parno);
5999 insert_if_check_paren:
6000 if ((c = *nextchar(pRExC_state)) != ')')
6001 vFAIL("Switch condition not recognized");
6003 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6004 br = regbranch(pRExC_state, &flags, 1,depth+1);
6006 br = reganode(pRExC_state, LONGJMP, 0);
6008 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6009 c = *nextchar(pRExC_state);
6014 vFAIL("(?(DEFINE)....) does not allow branches");
6015 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6016 regbranch(pRExC_state, &flags, 1,depth+1);
6017 REGTAIL(pRExC_state, ret, lastbr);
6020 c = *nextchar(pRExC_state);
6025 vFAIL("Switch (?(condition)... contains too many branches");
6026 ender = reg_node(pRExC_state, TAIL);
6027 REGTAIL(pRExC_state, br, ender);
6029 REGTAIL(pRExC_state, lastbr, ender);
6030 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6033 REGTAIL(pRExC_state, ret, ender);
6034 RExC_size++; /* XXX WHY do we need this?!!
6035 For large programs it seems to be required
6036 but I can't figure out why. -- dmq*/
6040 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6044 RExC_parse--; /* for vFAIL to print correctly */
6045 vFAIL("Sequence (? incomplete");
6049 parse_flags: /* (?i) */
6051 U32 posflags = 0, negflags = 0;
6052 U32 *flagsp = &posflags;
6054 while (*RExC_parse) {
6055 /* && strchr("iogcmsx", *RExC_parse) */
6056 /* (?g), (?gc) and (?o) are useless here
6057 and must be globally applied -- japhy */
6058 switch (*RExC_parse) {
6059 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6060 case ONCE_PAT_MOD: /* 'o' */
6061 case GLOBAL_PAT_MOD: /* 'g' */
6062 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6063 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6064 if (! (wastedflags & wflagbit) ) {
6065 wastedflags |= wflagbit;
6068 "Useless (%s%c) - %suse /%c modifier",
6069 flagsp == &negflags ? "?-" : "?",
6071 flagsp == &negflags ? "don't " : "",
6078 case CONTINUE_PAT_MOD: /* 'c' */
6079 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6080 if (! (wastedflags & WASTED_C) ) {
6081 wastedflags |= WASTED_GC;
6084 "Useless (%sc) - %suse /gc modifier",
6085 flagsp == &negflags ? "?-" : "?",
6086 flagsp == &negflags ? "don't " : ""
6091 case KEEPCOPY_PAT_MOD: /* 'p' */
6092 if (flagsp == &negflags) {
6094 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6096 *flagsp |= RXf_PMf_KEEPCOPY;
6100 if (flagsp == &negflags) {
6102 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6106 wastedflags = 0; /* reset so (?g-c) warns twice */
6112 RExC_flags |= posflags;
6113 RExC_flags &= ~negflags;
6115 oregflags |= posflags;
6116 oregflags &= ~negflags;
6118 nextchar(pRExC_state);
6129 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6134 }} /* one for the default block, one for the switch */
6141 ret = reganode(pRExC_state, OPEN, parno);
6144 RExC_nestroot = parno;
6145 if (RExC_seen & REG_SEEN_RECURSE
6146 && !RExC_open_parens[parno-1])
6148 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6149 "Setting open paren #%"IVdf" to %d\n",
6150 (IV)parno, REG_NODE_NUM(ret)));
6151 RExC_open_parens[parno-1]= ret;
6154 Set_Node_Length(ret, 1); /* MJD */
6155 Set_Node_Offset(ret, RExC_parse); /* MJD */
6163 /* Pick up the branches, linking them together. */
6164 parse_start = RExC_parse; /* MJD */
6165 br = regbranch(pRExC_state, &flags, 1,depth+1);
6168 if (RExC_npar > after_freeze)
6169 after_freeze = RExC_npar;
6170 RExC_npar = freeze_paren;
6173 /* branch_len = (paren != 0); */
6177 if (*RExC_parse == '|') {
6178 if (!SIZE_ONLY && RExC_extralen) {
6179 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6182 reginsert(pRExC_state, BRANCH, br, depth+1);
6183 Set_Node_Length(br, paren != 0);
6184 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6188 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6190 else if (paren == ':') {
6191 *flagp |= flags&SIMPLE;
6193 if (is_open) { /* Starts with OPEN. */
6194 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6196 else if (paren != '?') /* Not Conditional */
6198 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6200 while (*RExC_parse == '|') {
6201 if (!SIZE_ONLY && RExC_extralen) {
6202 ender = reganode(pRExC_state, LONGJMP,0);
6203 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6206 RExC_extralen += 2; /* Account for LONGJMP. */
6207 nextchar(pRExC_state);
6209 if (RExC_npar > after_freeze)
6210 after_freeze = RExC_npar;
6211 RExC_npar = freeze_paren;
6213 br = regbranch(pRExC_state, &flags, 0, depth+1);
6217 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6219 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6222 if (have_branch || paren != ':') {
6223 /* Make a closing node, and hook it on the end. */
6226 ender = reg_node(pRExC_state, TAIL);
6229 ender = reganode(pRExC_state, CLOSE, parno);
6230 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6231 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6232 "Setting close paren #%"IVdf" to %d\n",
6233 (IV)parno, REG_NODE_NUM(ender)));
6234 RExC_close_parens[parno-1]= ender;
6235 if (RExC_nestroot == parno)
6238 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6239 Set_Node_Length(ender,1); /* MJD */
6245 *flagp &= ~HASWIDTH;
6248 ender = reg_node(pRExC_state, SUCCEED);
6251 ender = reg_node(pRExC_state, END);
6253 assert(!RExC_opend); /* there can only be one! */
6258 REGTAIL(pRExC_state, lastbr, ender);
6260 if (have_branch && !SIZE_ONLY) {
6262 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6264 /* Hook the tails of the branches to the closing node. */
6265 for (br = ret; br; br = regnext(br)) {
6266 const U8 op = PL_regkind[OP(br)];
6268 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6270 else if (op == BRANCHJ) {
6271 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6279 static const char parens[] = "=!<,>";
6281 if (paren && (p = strchr(parens, paren))) {
6282 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6283 int flag = (p - parens) > 1;
6286 node = SUSPEND, flag = 0;
6287 reginsert(pRExC_state, node,ret, depth+1);
6288 Set_Node_Cur_Length(ret);
6289 Set_Node_Offset(ret, parse_start + 1);
6291 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6295 /* Check for proper termination. */
6297 RExC_flags = oregflags;
6298 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6299 RExC_parse = oregcomp_parse;
6300 vFAIL("Unmatched (");
6303 else if (!paren && RExC_parse < RExC_end) {
6304 if (*RExC_parse == ')') {
6306 vFAIL("Unmatched )");
6309 FAIL("Junk on end of regexp"); /* "Can't happen". */
6313 RExC_npar = after_freeze;
6318 - regbranch - one alternative of an | operator
6320 * Implements the concatenation operator.
6323 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6326 register regnode *ret;
6327 register regnode *chain = NULL;
6328 register regnode *latest;
6329 I32 flags = 0, c = 0;
6330 GET_RE_DEBUG_FLAGS_DECL;
6332 PERL_ARGS_ASSERT_REGBRANCH;
6334 DEBUG_PARSE("brnc");
6339 if (!SIZE_ONLY && RExC_extralen)
6340 ret = reganode(pRExC_state, BRANCHJ,0);
6342 ret = reg_node(pRExC_state, BRANCH);
6343 Set_Node_Length(ret, 1);
6347 if (!first && SIZE_ONLY)
6348 RExC_extralen += 1; /* BRANCHJ */
6350 *flagp = WORST; /* Tentatively. */
6353 nextchar(pRExC_state);
6354 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6356 latest = regpiece(pRExC_state, &flags,depth+1);
6357 if (latest == NULL) {
6358 if (flags & TRYAGAIN)
6362 else if (ret == NULL)
6364 *flagp |= flags&(HASWIDTH|POSTPONED);
6365 if (chain == NULL) /* First piece. */
6366 *flagp |= flags&SPSTART;
6369 REGTAIL(pRExC_state, chain, latest);
6374 if (chain == NULL) { /* Loop ran zero times. */
6375 chain = reg_node(pRExC_state, NOTHING);
6380 *flagp |= flags&SIMPLE;
6387 - regpiece - something followed by possible [*+?]
6389 * Note that the branching code sequences used for ? and the general cases
6390 * of * and + are somewhat optimized: they use the same NOTHING node as
6391 * both the endmarker for their branch list and the body of the last branch.
6392 * It might seem that this node could be dispensed with entirely, but the
6393 * endmarker role is not redundant.
6396 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6399 register regnode *ret;
6401 register char *next;
6403 const char * const origparse = RExC_parse;
6405 I32 max = REG_INFTY;
6407 const char *maxpos = NULL;
6408 GET_RE_DEBUG_FLAGS_DECL;
6410 PERL_ARGS_ASSERT_REGPIECE;
6412 DEBUG_PARSE("piec");
6414 ret = regatom(pRExC_state, &flags,depth+1);
6416 if (flags & TRYAGAIN)
6423 if (op == '{' && regcurly(RExC_parse)) {
6425 parse_start = RExC_parse; /* MJD */
6426 next = RExC_parse + 1;
6427 while (isDIGIT(*next) || *next == ',') {
6436 if (*next == '}') { /* got one */
6440 min = atoi(RExC_parse);
6444 maxpos = RExC_parse;
6446 if (!max && *maxpos != '0')
6447 max = REG_INFTY; /* meaning "infinity" */
6448 else if (max >= REG_INFTY)
6449 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6451 nextchar(pRExC_state);
6454 if ((flags&SIMPLE)) {
6455 RExC_naughty += 2 + RExC_naughty / 2;
6456 reginsert(pRExC_state, CURLY, ret, depth+1);
6457 Set_Node_Offset(ret, parse_start+1); /* MJD */
6458 Set_Node_Cur_Length(ret);
6461 regnode * const w = reg_node(pRExC_state, WHILEM);
6464 REGTAIL(pRExC_state, ret, w);
6465 if (!SIZE_ONLY && RExC_extralen) {
6466 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6467 reginsert(pRExC_state, NOTHING,ret, depth+1);
6468 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6470 reginsert(pRExC_state, CURLYX,ret, depth+1);
6472 Set_Node_Offset(ret, parse_start+1);
6473 Set_Node_Length(ret,
6474 op == '{' ? (RExC_parse - parse_start) : 1);
6476 if (!SIZE_ONLY && RExC_extralen)
6477 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6478 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6480 RExC_whilem_seen++, RExC_extralen += 3;
6481 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6490 vFAIL("Can't do {n,m} with n > m");
6492 ARG1_SET(ret, (U16)min);
6493 ARG2_SET(ret, (U16)max);
6505 #if 0 /* Now runtime fix should be reliable. */
6507 /* if this is reinstated, don't forget to put this back into perldiag:
6509 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6511 (F) The part of the regexp subject to either the * or + quantifier
6512 could match an empty string. The {#} shows in the regular
6513 expression about where the problem was discovered.
6517 if (!(flags&HASWIDTH) && op != '?')
6518 vFAIL("Regexp *+ operand could be empty");
6521 parse_start = RExC_parse;
6522 nextchar(pRExC_state);
6524 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6526 if (op == '*' && (flags&SIMPLE)) {
6527 reginsert(pRExC_state, STAR, ret, depth+1);
6531 else if (op == '*') {
6535 else if (op == '+' && (flags&SIMPLE)) {
6536 reginsert(pRExC_state, PLUS, ret, depth+1);
6540 else if (op == '+') {
6544 else if (op == '?') {
6549 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6550 ckWARN3reg(RExC_parse,
6551 "%.*s matches null string many times",
6552 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6556 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6557 nextchar(pRExC_state);
6558 reginsert(pRExC_state, MINMOD, ret, depth+1);
6559 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6561 #ifndef REG_ALLOW_MINMOD_SUSPEND
6564 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6566 nextchar(pRExC_state);
6567 ender = reg_node(pRExC_state, SUCCEED);
6568 REGTAIL(pRExC_state, ret, ender);
6569 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6571 ender = reg_node(pRExC_state, TAIL);
6572 REGTAIL(pRExC_state, ret, ender);
6576 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6578 vFAIL("Nested quantifiers");
6585 /* reg_namedseq(pRExC_state,UVp)
6587 This is expected to be called by a parser routine that has
6588 recognized '\N' and needs to handle the rest. RExC_parse is
6589 expected to point at the first char following the N at the time
6592 The \N may be inside (indicated by valuep not being NULL) or outside a
6595 \N may begin either a named sequence, or if outside a character class, mean
6596 to match a non-newline. For non single-quoted regexes, the tokenizer has
6597 attempted to decide which, and in the case of a named sequence converted it
6598 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6599 where c1... are the characters in the sequence. For single-quoted regexes,
6600 the tokenizer passes the \N sequence through unchanged; this code will not
6601 attempt to determine this nor expand those. The net effect is that if the
6602 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6603 signals that this \N occurrence means to match a non-newline.
6605 Only the \N{U+...} form should occur in a character class, for the same
6606 reason that '.' inside a character class means to just match a period: it
6607 just doesn't make sense.
6609 If valuep is non-null then it is assumed that we are parsing inside
6610 of a charclass definition and the first codepoint in the resolved
6611 string is returned via *valuep and the routine will return NULL.
6612 In this mode if a multichar string is returned from the charnames
6613 handler, a warning will be issued, and only the first char in the
6614 sequence will be examined. If the string returned is zero length
6615 then the value of *valuep is undefined and NON-NULL will
6616 be returned to indicate failure. (This will NOT be a valid pointer
6619 If valuep is null then it is assumed that we are parsing normal text and a
6620 new EXACT node is inserted into the program containing the resolved string,
6621 and a pointer to the new node is returned. But if the string is zero length
6622 a NOTHING node is emitted instead.
6624 On success RExC_parse is set to the char following the endbrace.
6625 Parsing failures will generate a fatal error via vFAIL(...)
6628 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6630 char * endbrace; /* '}' following the name */
6631 regnode *ret = NULL;
6633 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6637 GET_RE_DEBUG_FLAGS_DECL;
6639 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6643 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6644 * modifier. The other meaning does not */
6645 p = (RExC_flags & RXf_PMf_EXTENDED)
6646 ? regwhite( pRExC_state, RExC_parse )
6649 /* Disambiguate between \N meaning a named character versus \N meaning
6650 * [^\n]. The former is assumed when it can't be the latter. */
6651 if (*p != '{' || regcurly(p)) {
6654 /* no bare \N in a charclass */
6655 vFAIL("\\N in a character class must be a named character: \\N{...}");
6657 nextchar(pRExC_state);
6658 ret = reg_node(pRExC_state, REG_ANY);
6659 *flagp |= HASWIDTH|SIMPLE;
6662 Set_Node_Length(ret, 1); /* MJD */
6666 /* Here, we have decided it should be a named sequence */
6668 /* The test above made sure that the next real character is a '{', but
6669 * under the /x modifier, it could be separated by space (or a comment and
6670 * \n) and this is not allowed (for consistency with \x{...} and the
6671 * tokenizer handling of \N{NAME}). */
6672 if (*RExC_parse != '{') {
6673 vFAIL("Missing braces on \\N{}");
6676 RExC_parse++; /* Skip past the '{' */
6678 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6679 || ! (endbrace == RExC_parse /* nothing between the {} */
6680 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6681 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6683 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6684 vFAIL("\\N{NAME} must be resolved by the lexer");
6687 if (endbrace == RExC_parse) { /* empty: \N{} */
6689 RExC_parse = endbrace + 1;
6690 return reg_node(pRExC_state,NOTHING);
6694 ckWARNreg(RExC_parse,
6695 "Ignoring zero length \\N{} in character class"
6697 RExC_parse = endbrace + 1;
6700 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6703 RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6704 RExC_parse += 2; /* Skip past the 'U+' */
6706 if (valuep) { /* In a bracketed char class */
6707 /* We only pay attention to the first char of
6708 multichar strings being returned. I kinda wonder
6709 if this makes sense as it does change the behaviour
6710 from earlier versions, OTOH that behaviour was broken
6711 as well. XXX Solution is to recharacterize as
6712 [rest-of-class]|multi1|multi2... */
6714 STRLEN length_of_hex;
6715 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6716 | PERL_SCAN_DISALLOW_PREFIX
6717 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6719 char * endchar = strchr(RExC_parse, '.');
6721 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6723 else endchar = endbrace;
6725 length_of_hex = (STRLEN)(endchar - RExC_parse);
6726 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6728 /* The tokenizer should have guaranteed validity, but it's possible to
6729 * bypass it by using single quoting, so check */
6730 if (length_of_hex == 0
6731 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6733 RExC_parse += length_of_hex; /* Includes all the valid */
6734 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6735 ? UTF8SKIP(RExC_parse)
6737 /* Guard against malformed utf8 */
6738 if (RExC_parse >= endchar) RExC_parse = endchar;
6739 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6742 RExC_parse = endbrace + 1;
6743 if (endchar == endbrace) return NULL;
6745 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6747 else { /* Not a char class */
6748 char *s; /* String to put in generated EXACT node */
6749 STRLEN len = 0; /* Its current length */
6750 char *endchar; /* Points to '.' or '}' ending cur char in the input
6753 ret = reg_node(pRExC_state,
6754 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6757 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6758 * the input which is of the form now 'c1.c2.c3...}' until find the
6759 * ending brace or exeed length 255. The characters that exceed this
6760 * limit are dropped. The limit could be relaxed should it become
6761 * desirable by reparsing this as (?:\N{NAME}), so could generate
6762 * multiple EXACT nodes, as is done for just regular input. But this
6763 * is primarily a named character, and not intended to be a huge long
6764 * string, so 255 bytes should be good enough */
6766 STRLEN length_of_hex;
6767 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6768 | PERL_SCAN_DISALLOW_PREFIX
6769 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6770 UV cp; /* Ord of current character */
6772 /* Code points are separated by dots. If none, there is only one
6773 * code point, and is terminated by the brace */
6774 endchar = strchr(RExC_parse, '.');
6775 if (! endchar) endchar = endbrace;
6777 /* The values are Unicode even on EBCDIC machines */
6778 length_of_hex = (STRLEN)(endchar - RExC_parse);
6779 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6780 if ( length_of_hex == 0
6781 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6783 RExC_parse += length_of_hex; /* Includes all the valid */
6784 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6785 ? UTF8SKIP(RExC_parse)
6787 /* Guard against malformed utf8 */
6788 if (RExC_parse >= endchar) RExC_parse = endchar;
6789 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6792 if (! FOLD) { /* Not folding, just append to the string */
6795 /* Quit before adding this character if would exceed limit */
6796 if (len + UNISKIP(cp) > U8_MAX) break;
6798 unilen = reguni(pRExC_state, cp, s);
6803 } else { /* Folding, output the folded equivalent */
6804 STRLEN foldlen,numlen;
6805 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6806 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6808 /* Quit before exceeding size limit */
6809 if (len + foldlen > U8_MAX) break;
6811 for (foldbuf = tmpbuf;
6815 cp = utf8_to_uvchr(foldbuf, &numlen);
6817 const STRLEN unilen = reguni(pRExC_state, cp, s);
6820 /* In EBCDIC the numlen and unilen can differ. */
6822 if (numlen >= foldlen)
6826 break; /* "Can't happen." */
6830 /* Point to the beginning of the next character in the sequence. */
6831 RExC_parse = endchar + 1;
6833 /* Quit if no more characters */
6834 if (RExC_parse >= endbrace) break;
6839 if (RExC_parse < endbrace) {
6840 ckWARNreg(RExC_parse - 1,
6841 "Using just the first characters returned by \\N{}");
6844 RExC_size += STR_SZ(len);
6847 RExC_emit += STR_SZ(len);
6850 RExC_parse = endbrace + 1;
6852 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6853 with malformed in t/re/pat_advanced.t */
6855 Set_Node_Cur_Length(ret); /* MJD */
6856 nextchar(pRExC_state);
6866 * It returns the code point in utf8 for the value in *encp.
6867 * value: a code value in the source encoding
6868 * encp: a pointer to an Encode object
6870 * If the result from Encode is not a single character,
6871 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6874 S_reg_recode(pTHX_ const char value, SV **encp)
6877 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6878 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6879 const STRLEN newlen = SvCUR(sv);
6880 UV uv = UNICODE_REPLACEMENT;
6882 PERL_ARGS_ASSERT_REG_RECODE;
6886 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6889 if (!newlen || numlen != newlen) {
6890 uv = UNICODE_REPLACEMENT;
6898 - regatom - the lowest level
6900 Try to identify anything special at the start of the pattern. If there
6901 is, then handle it as required. This may involve generating a single regop,
6902 such as for an assertion; or it may involve recursing, such as to
6903 handle a () structure.
6905 If the string doesn't start with something special then we gobble up
6906 as much literal text as we can.
6908 Once we have been able to handle whatever type of thing started the
6909 sequence, we return.
6911 Note: we have to be careful with escapes, as they can be both literal
6912 and special, and in the case of \10 and friends can either, depending
6913 on context. Specifically there are two seperate switches for handling
6914 escape sequences, with the one for handling literal escapes requiring
6915 a dummy entry for all of the special escapes that are actually handled
6920 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6923 register regnode *ret = NULL;
6925 char *parse_start = RExC_parse;
6926 GET_RE_DEBUG_FLAGS_DECL;
6927 DEBUG_PARSE("atom");
6928 *flagp = WORST; /* Tentatively. */
6930 PERL_ARGS_ASSERT_REGATOM;
6933 switch ((U8)*RExC_parse) {
6935 RExC_seen_zerolen++;
6936 nextchar(pRExC_state);
6937 if (RExC_flags & RXf_PMf_MULTILINE)
6938 ret = reg_node(pRExC_state, MBOL);
6939 else if (RExC_flags & RXf_PMf_SINGLELINE)
6940 ret = reg_node(pRExC_state, SBOL);
6942 ret = reg_node(pRExC_state, BOL);
6943 Set_Node_Length(ret, 1); /* MJD */
6946 nextchar(pRExC_state);
6948 RExC_seen_zerolen++;
6949 if (RExC_flags & RXf_PMf_MULTILINE)
6950 ret = reg_node(pRExC_state, MEOL);
6951 else if (RExC_flags & RXf_PMf_SINGLELINE)
6952 ret = reg_node(pRExC_state, SEOL);
6954 ret = reg_node(pRExC_state, EOL);
6955 Set_Node_Length(ret, 1); /* MJD */
6958 nextchar(pRExC_state);
6959 if (RExC_flags & RXf_PMf_SINGLELINE)
6960 ret = reg_node(pRExC_state, SANY);
6962 ret = reg_node(pRExC_state, REG_ANY);
6963 *flagp |= HASWIDTH|SIMPLE;
6965 Set_Node_Length(ret, 1); /* MJD */
6969 char * const oregcomp_parse = ++RExC_parse;
6970 ret = regclass(pRExC_state,depth+1);
6971 if (*RExC_parse != ']') {
6972 RExC_parse = oregcomp_parse;
6973 vFAIL("Unmatched [");
6975 nextchar(pRExC_state);
6976 *flagp |= HASWIDTH|SIMPLE;
6977 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6981 nextchar(pRExC_state);
6982 ret = reg(pRExC_state, 1, &flags,depth+1);
6984 if (flags & TRYAGAIN) {
6985 if (RExC_parse == RExC_end) {
6986 /* Make parent create an empty node if needed. */
6994 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6998 if (flags & TRYAGAIN) {
7002 vFAIL("Internal urp");
7003 /* Supposed to be caught earlier. */
7006 if (!regcurly(RExC_parse)) {
7015 vFAIL("Quantifier follows nothing");
7023 len=0; /* silence a spurious compiler warning */
7024 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7025 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7026 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7027 ret = reganode(pRExC_state, FOLDCHAR, cp);
7028 Set_Node_Length(ret, 1); /* MJD */
7029 nextchar(pRExC_state); /* kill whitespace under /x */
7037 This switch handles escape sequences that resolve to some kind
7038 of special regop and not to literal text. Escape sequnces that
7039 resolve to literal text are handled below in the switch marked
7042 Every entry in this switch *must* have a corresponding entry
7043 in the literal escape switch. However, the opposite is not
7044 required, as the default for this switch is to jump to the
7045 literal text handling code.
7047 switch ((U8)*++RExC_parse) {
7052 /* Special Escapes */
7054 RExC_seen_zerolen++;
7055 ret = reg_node(pRExC_state, SBOL);
7057 goto finish_meta_pat;
7059 ret = reg_node(pRExC_state, GPOS);
7060 RExC_seen |= REG_SEEN_GPOS;
7062 goto finish_meta_pat;
7064 RExC_seen_zerolen++;
7065 ret = reg_node(pRExC_state, KEEPS);
7067 /* XXX:dmq : disabling in-place substitution seems to
7068 * be necessary here to avoid cases of memory corruption, as
7069 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7071 RExC_seen |= REG_SEEN_LOOKBEHIND;
7072 goto finish_meta_pat;
7074 ret = reg_node(pRExC_state, SEOL);
7076 RExC_seen_zerolen++; /* Do not optimize RE away */
7077 goto finish_meta_pat;
7079 ret = reg_node(pRExC_state, EOS);
7081 RExC_seen_zerolen++; /* Do not optimize RE away */
7082 goto finish_meta_pat;
7084 ret = reg_node(pRExC_state, CANY);
7085 RExC_seen |= REG_SEEN_CANY;
7086 *flagp |= HASWIDTH|SIMPLE;
7087 goto finish_meta_pat;
7089 ret = reg_node(pRExC_state, CLUMP);
7091 goto finish_meta_pat;
7093 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7094 *flagp |= HASWIDTH|SIMPLE;
7095 goto finish_meta_pat;
7097 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7098 *flagp |= HASWIDTH|SIMPLE;
7099 goto finish_meta_pat;
7101 RExC_seen_zerolen++;
7102 RExC_seen |= REG_SEEN_LOOKBEHIND;
7103 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7105 goto finish_meta_pat;
7107 RExC_seen_zerolen++;
7108 RExC_seen |= REG_SEEN_LOOKBEHIND;
7109 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7111 goto finish_meta_pat;
7113 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7114 *flagp |= HASWIDTH|SIMPLE;
7115 goto finish_meta_pat;
7117 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7118 *flagp |= HASWIDTH|SIMPLE;
7119 goto finish_meta_pat;
7121 ret = reg_node(pRExC_state, DIGIT);
7122 *flagp |= HASWIDTH|SIMPLE;
7123 goto finish_meta_pat;
7125 ret = reg_node(pRExC_state, NDIGIT);
7126 *flagp |= HASWIDTH|SIMPLE;
7127 goto finish_meta_pat;
7129 ret = reg_node(pRExC_state, LNBREAK);
7130 *flagp |= HASWIDTH|SIMPLE;
7131 goto finish_meta_pat;
7133 ret = reg_node(pRExC_state, HORIZWS);
7134 *flagp |= HASWIDTH|SIMPLE;
7135 goto finish_meta_pat;
7137 ret = reg_node(pRExC_state, NHORIZWS);
7138 *flagp |= HASWIDTH|SIMPLE;
7139 goto finish_meta_pat;
7141 ret = reg_node(pRExC_state, VERTWS);
7142 *flagp |= HASWIDTH|SIMPLE;
7143 goto finish_meta_pat;
7145 ret = reg_node(pRExC_state, NVERTWS);
7146 *flagp |= HASWIDTH|SIMPLE;
7148 nextchar(pRExC_state);
7149 Set_Node_Length(ret, 2); /* MJD */
7154 char* const oldregxend = RExC_end;
7156 char* parse_start = RExC_parse - 2;
7159 if (RExC_parse[1] == '{') {
7160 /* a lovely hack--pretend we saw [\pX] instead */
7161 RExC_end = strchr(RExC_parse, '}');
7163 const U8 c = (U8)*RExC_parse;
7165 RExC_end = oldregxend;
7166 vFAIL2("Missing right brace on \\%c{}", c);
7171 RExC_end = RExC_parse + 2;
7172 if (RExC_end > oldregxend)
7173 RExC_end = oldregxend;
7177 ret = regclass(pRExC_state,depth+1);
7179 RExC_end = oldregxend;
7182 Set_Node_Offset(ret, parse_start + 2);
7183 Set_Node_Cur_Length(ret);
7184 nextchar(pRExC_state);
7185 *flagp |= HASWIDTH|SIMPLE;
7189 /* Handle \N and \N{NAME} here and not below because it can be
7190 multicharacter. join_exact() will join them up later on.
7191 Also this makes sure that things like /\N{BLAH}+/ and
7192 \N{BLAH} being multi char Just Happen. dmq*/
7194 ret= reg_namedseq(pRExC_state, NULL, flagp);
7196 case 'k': /* Handle \k<NAME> and \k'NAME' */
7199 char ch= RExC_parse[1];
7200 if (ch != '<' && ch != '\'' && ch != '{') {
7202 vFAIL2("Sequence %.2s... not terminated",parse_start);
7204 /* this pretty much dupes the code for (?P=...) in reg(), if
7205 you change this make sure you change that */
7206 char* name_start = (RExC_parse += 2);
7208 SV *sv_dat = reg_scan_name(pRExC_state,
7209 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7210 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7211 if (RExC_parse == name_start || *RExC_parse != ch)
7212 vFAIL2("Sequence %.3s... not terminated",parse_start);
7215 num = add_data( pRExC_state, 1, "S" );
7216 RExC_rxi->data->data[num]=(void*)sv_dat;
7217 SvREFCNT_inc_simple_void(sv_dat);
7221 ret = reganode(pRExC_state,
7222 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7226 /* override incorrect value set in reganode MJD */
7227 Set_Node_Offset(ret, parse_start+1);
7228 Set_Node_Cur_Length(ret); /* MJD */
7229 nextchar(pRExC_state);
7235 case '1': case '2': case '3': case '4':
7236 case '5': case '6': case '7': case '8': case '9':
7239 bool isg = *RExC_parse == 'g';
7244 if (*RExC_parse == '{') {
7248 if (*RExC_parse == '-') {
7252 if (hasbrace && !isDIGIT(*RExC_parse)) {
7253 if (isrel) RExC_parse--;
7255 goto parse_named_seq;
7257 num = atoi(RExC_parse);
7258 if (isg && num == 0)
7259 vFAIL("Reference to invalid group 0");
7261 num = RExC_npar - num;
7263 vFAIL("Reference to nonexistent or unclosed group");
7265 if (!isg && num > 9 && num >= RExC_npar)
7268 char * const parse_start = RExC_parse - 1; /* MJD */
7269 while (isDIGIT(*RExC_parse))
7271 if (parse_start == RExC_parse - 1)
7272 vFAIL("Unterminated \\g... pattern");
7274 if (*RExC_parse != '}')
7275 vFAIL("Unterminated \\g{...} pattern");
7279 if (num > (I32)RExC_rx->nparens)
7280 vFAIL("Reference to nonexistent group");
7283 ret = reganode(pRExC_state,
7284 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7288 /* override incorrect value set in reganode MJD */
7289 Set_Node_Offset(ret, parse_start+1);
7290 Set_Node_Cur_Length(ret); /* MJD */
7292 nextchar(pRExC_state);
7297 if (RExC_parse >= RExC_end)
7298 FAIL("Trailing \\");
7301 /* Do not generate "unrecognized" warnings here, we fall
7302 back into the quick-grab loop below */
7309 if (RExC_flags & RXf_PMf_EXTENDED) {
7310 if ( reg_skipcomment( pRExC_state ) )
7317 register STRLEN len;
7322 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7324 parse_start = RExC_parse - 1;
7330 ret = reg_node(pRExC_state,
7331 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7333 for (len = 0, p = RExC_parse - 1;
7334 len < 127 && p < RExC_end;
7337 char * const oldp = p;
7339 if (RExC_flags & RXf_PMf_EXTENDED)
7340 p = regwhite( pRExC_state, p );
7345 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7346 goto normal_default;
7356 /* Literal Escapes Switch
7358 This switch is meant to handle escape sequences that
7359 resolve to a literal character.
7361 Every escape sequence that represents something
7362 else, like an assertion or a char class, is handled
7363 in the switch marked 'Special Escapes' above in this
7364 routine, but also has an entry here as anything that
7365 isn't explicitly mentioned here will be treated as
7366 an unescaped equivalent literal.
7370 /* These are all the special escapes. */
7374 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7375 goto normal_default;
7376 case 'A': /* Start assertion */
7377 case 'b': case 'B': /* Word-boundary assertion*/
7378 case 'C': /* Single char !DANGEROUS! */
7379 case 'd': case 'D': /* digit class */
7380 case 'g': case 'G': /* generic-backref, pos assertion */
7381 case 'h': case 'H': /* HORIZWS */
7382 case 'k': case 'K': /* named backref, keep marker */
7383 case 'N': /* named char sequence */
7384 case 'p': case 'P': /* Unicode property */
7385 case 'R': /* LNBREAK */
7386 case 's': case 'S': /* space class */
7387 case 'v': case 'V': /* VERTWS */
7388 case 'w': case 'W': /* word class */
7389 case 'X': /* eXtended Unicode "combining character sequence" */
7390 case 'z': case 'Z': /* End of line/string assertion */
7394 /* Anything after here is an escape that resolves to a
7395 literal. (Except digits, which may or may not)
7414 ender = ASCII_TO_NATIVE('\033');
7418 ender = ASCII_TO_NATIVE('\007');
7423 char* const e = strchr(p, '}');
7427 vFAIL("Missing right brace on \\x{}");
7430 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7431 | PERL_SCAN_DISALLOW_PREFIX;
7432 STRLEN numlen = e - p - 1;
7433 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7440 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7442 ender = grok_hex(p, &numlen, &flags, NULL);
7445 if (PL_encoding && ender < 0x100)
7446 goto recode_encoding;
7450 ender = UCHARAT(p++);
7451 ender = toCTRL(ender);
7453 case '0': case '1': case '2': case '3':case '4':
7454 case '5': case '6': case '7': case '8':case '9':
7456 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7459 ender = grok_oct(p, &numlen, &flags, NULL);
7461 /* An octal above 0xff is interpreted differently
7462 * depending on if the re is in utf8 or not. If it
7463 * is in utf8, the value will be itself, otherwise
7464 * it is interpreted as modulo 0x100. It has been
7465 * decided to discourage the use of octal above the
7466 * single-byte range. For now, warn only when
7467 * it ends up modulo */
7468 if (SIZE_ONLY && ender >= 0x100
7469 && ! UTF && ! PL_encoding) {
7470 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7478 if (PL_encoding && ender < 0x100)
7479 goto recode_encoding;
7483 SV* enc = PL_encoding;
7484 ender = reg_recode((const char)(U8)ender, &enc);
7485 if (!enc && SIZE_ONLY)
7486 ckWARNreg(p, "Invalid escape in the specified encoding");
7492 FAIL("Trailing \\");
7495 if (!SIZE_ONLY&& isALPHA(*p))
7496 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7497 goto normal_default;
7502 if (UTF8_IS_START(*p) && UTF) {
7504 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7505 &numlen, UTF8_ALLOW_DEFAULT);
7512 if ( RExC_flags & RXf_PMf_EXTENDED)
7513 p = regwhite( pRExC_state, p );
7515 /* Prime the casefolded buffer. */
7516 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7518 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7523 /* Emit all the Unicode characters. */
7525 for (foldbuf = tmpbuf;
7527 foldlen -= numlen) {
7528 ender = utf8_to_uvchr(foldbuf, &numlen);
7530 const STRLEN unilen = reguni(pRExC_state, ender, s);
7533 /* In EBCDIC the numlen
7534 * and unilen can differ. */
7536 if (numlen >= foldlen)
7540 break; /* "Can't happen." */
7544 const STRLEN unilen = reguni(pRExC_state, ender, s);
7553 REGC((char)ender, s++);
7559 /* Emit all the Unicode characters. */
7561 for (foldbuf = tmpbuf;
7563 foldlen -= numlen) {
7564 ender = utf8_to_uvchr(foldbuf, &numlen);
7566 const STRLEN unilen = reguni(pRExC_state, ender, s);
7569 /* In EBCDIC the numlen
7570 * and unilen can differ. */
7572 if (numlen >= foldlen)
7580 const STRLEN unilen = reguni(pRExC_state, ender, s);
7589 REGC((char)ender, s++);
7593 Set_Node_Cur_Length(ret); /* MJD */
7594 nextchar(pRExC_state);
7596 /* len is STRLEN which is unsigned, need to copy to signed */
7599 vFAIL("Internal disaster");
7603 if (len == 1 && UNI_IS_INVARIANT(ender))
7607 RExC_size += STR_SZ(len);
7610 RExC_emit += STR_SZ(len);
7620 S_regwhite( RExC_state_t *pRExC_state, char *p )
7622 const char *e = RExC_end;
7624 PERL_ARGS_ASSERT_REGWHITE;
7629 else if (*p == '#') {
7638 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7646 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7647 Character classes ([:foo:]) can also be negated ([:^foo:]).
7648 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7649 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7650 but trigger failures because they are currently unimplemented. */
7652 #define POSIXCC_DONE(c) ((c) == ':')
7653 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7654 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7657 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7660 I32 namedclass = OOB_NAMEDCLASS;
7662 PERL_ARGS_ASSERT_REGPPOSIXCC;
7664 if (value == '[' && RExC_parse + 1 < RExC_end &&
7665 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7666 POSIXCC(UCHARAT(RExC_parse))) {
7667 const char c = UCHARAT(RExC_parse);
7668 char* const s = RExC_parse++;
7670 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7672 if (RExC_parse == RExC_end)
7673 /* Grandfather lone [:, [=, [. */
7676 const char* const t = RExC_parse++; /* skip over the c */
7679 if (UCHARAT(RExC_parse) == ']') {
7680 const char *posixcc = s + 1;
7681 RExC_parse++; /* skip over the ending ] */
7684 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7685 const I32 skip = t - posixcc;
7687 /* Initially switch on the length of the name. */
7690 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7691 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7694 /* Names all of length 5. */
7695 /* alnum alpha ascii blank cntrl digit graph lower
7696 print punct space upper */
7697 /* Offset 4 gives the best switch position. */
7698 switch (posixcc[4]) {
7700 if (memEQ(posixcc, "alph", 4)) /* alpha */
7701 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7704 if (memEQ(posixcc, "spac", 4)) /* space */
7705 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7708 if (memEQ(posixcc, "grap", 4)) /* graph */
7709 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7712 if (memEQ(posixcc, "asci", 4)) /* ascii */
7713 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7716 if (memEQ(posixcc, "blan", 4)) /* blank */
7717 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7720 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7721 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7724 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7725 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7728 if (memEQ(posixcc, "lowe", 4)) /* lower */
7729 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7730 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7731 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7734 if (memEQ(posixcc, "digi", 4)) /* digit */
7735 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7736 else if (memEQ(posixcc, "prin", 4)) /* print */
7737 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7738 else if (memEQ(posixcc, "punc", 4)) /* punct */
7739 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7744 if (memEQ(posixcc, "xdigit", 6))
7745 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7749 if (namedclass == OOB_NAMEDCLASS)
7750 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7752 assert (posixcc[skip] == ':');
7753 assert (posixcc[skip+1] == ']');
7754 } else if (!SIZE_ONLY) {
7755 /* [[=foo=]] and [[.foo.]] are still future. */
7757 /* adjust RExC_parse so the warning shows after
7759 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7761 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7764 /* Maternal grandfather:
7765 * "[:" ending in ":" but not in ":]" */
7775 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7779 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7781 if (POSIXCC(UCHARAT(RExC_parse))) {
7782 const char *s = RExC_parse;
7783 const char c = *s++;
7787 if (*s && c == *s && s[1] == ']') {
7789 "POSIX syntax [%c %c] belongs inside character classes",
7792 /* [[=foo=]] and [[.foo.]] are still future. */
7793 if (POSIXCC_NOTYET(c)) {
7794 /* adjust RExC_parse so the error shows after
7796 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7798 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7805 #define _C_C_T_(NAME,TEST,WORD) \
7808 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7810 for (value = 0; value < 256; value++) \
7812 ANYOF_BITMAP_SET(ret, value); \
7817 case ANYOF_N##NAME: \
7819 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7821 for (value = 0; value < 256; value++) \
7823 ANYOF_BITMAP_SET(ret, value); \
7829 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7831 for (value = 0; value < 256; value++) \
7833 ANYOF_BITMAP_SET(ret, value); \
7837 case ANYOF_N##NAME: \
7838 for (value = 0; value < 256; value++) \
7840 ANYOF_BITMAP_SET(ret, value); \
7846 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7847 so that it is possible to override the option here without having to
7848 rebuild the entire core. as we are required to do if we change regcomp.h
7849 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7851 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7852 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7855 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7856 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7858 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7862 parse a class specification and produce either an ANYOF node that
7863 matches the pattern or if the pattern matches a single char only and
7864 that char is < 256 and we are case insensitive then we produce an
7869 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7872 register UV nextvalue;
7873 register IV prevvalue = OOB_UNICODE;
7874 register IV range = 0;
7875 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7876 register regnode *ret;
7879 char *rangebegin = NULL;
7880 bool need_class = 0;
7883 bool optimize_invert = TRUE;
7884 AV* unicode_alternate = NULL;
7886 UV literal_endpoint = 0;
7888 UV stored = 0; /* number of chars stored in the class */
7890 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7891 case we need to change the emitted regop to an EXACT. */
7892 const char * orig_parse = RExC_parse;
7893 GET_RE_DEBUG_FLAGS_DECL;
7895 PERL_ARGS_ASSERT_REGCLASS;
7897 PERL_UNUSED_ARG(depth);
7900 DEBUG_PARSE("clas");
7902 /* Assume we are going to generate an ANYOF node. */
7903 ret = reganode(pRExC_state, ANYOF, 0);
7906 ANYOF_FLAGS(ret) = 0;
7908 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7912 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7916 RExC_size += ANYOF_SKIP;
7917 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7920 RExC_emit += ANYOF_SKIP;
7922 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7924 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7925 ANYOF_BITMAP_ZERO(ret);
7926 listsv = newSVpvs("# comment\n");
7929 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7931 if (!SIZE_ONLY && POSIXCC(nextvalue))
7932 checkposixcc(pRExC_state);
7934 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7935 if (UCHARAT(RExC_parse) == ']')
7939 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7943 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7946 rangebegin = RExC_parse;
7948 value = utf8n_to_uvchr((U8*)RExC_parse,
7949 RExC_end - RExC_parse,
7950 &numlen, UTF8_ALLOW_DEFAULT);
7951 RExC_parse += numlen;
7954 value = UCHARAT(RExC_parse++);
7956 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7957 if (value == '[' && POSIXCC(nextvalue))
7958 namedclass = regpposixcc(pRExC_state, value);
7959 else if (value == '\\') {
7961 value = utf8n_to_uvchr((U8*)RExC_parse,
7962 RExC_end - RExC_parse,
7963 &numlen, UTF8_ALLOW_DEFAULT);
7964 RExC_parse += numlen;
7967 value = UCHARAT(RExC_parse++);
7968 /* Some compilers cannot handle switching on 64-bit integer
7969 * values, therefore value cannot be an UV. Yes, this will
7970 * be a problem later if we want switch on Unicode.
7971 * A similar issue a little bit later when switching on
7972 * namedclass. --jhi */
7973 switch ((I32)value) {
7974 case 'w': namedclass = ANYOF_ALNUM; break;
7975 case 'W': namedclass = ANYOF_NALNUM; break;
7976 case 's': namedclass = ANYOF_SPACE; break;
7977 case 'S': namedclass = ANYOF_NSPACE; break;
7978 case 'd': namedclass = ANYOF_DIGIT; break;
7979 case 'D': namedclass = ANYOF_NDIGIT; break;
7980 case 'v': namedclass = ANYOF_VERTWS; break;
7981 case 'V': namedclass = ANYOF_NVERTWS; break;
7982 case 'h': namedclass = ANYOF_HORIZWS; break;
7983 case 'H': namedclass = ANYOF_NHORIZWS; break;
7984 case 'N': /* Handle \N{NAME} in class */
7986 /* We only pay attention to the first char of
7987 multichar strings being returned. I kinda wonder
7988 if this makes sense as it does change the behaviour
7989 from earlier versions, OTOH that behaviour was broken
7991 UV v; /* value is register so we cant & it /grrr */
7992 if (reg_namedseq(pRExC_state, &v, NULL)) {
8002 if (RExC_parse >= RExC_end)
8003 vFAIL2("Empty \\%c{}", (U8)value);
8004 if (*RExC_parse == '{') {
8005 const U8 c = (U8)value;
8006 e = strchr(RExC_parse++, '}');
8008 vFAIL2("Missing right brace on \\%c{}", c);
8009 while (isSPACE(UCHARAT(RExC_parse)))
8011 if (e == RExC_parse)
8012 vFAIL2("Empty \\%c{}", c);
8014 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8022 if (UCHARAT(RExC_parse) == '^') {
8025 value = value == 'p' ? 'P' : 'p'; /* toggle */
8026 while (isSPACE(UCHARAT(RExC_parse))) {
8031 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8032 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8035 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8036 namedclass = ANYOF_MAX; /* no official name, but it's named */
8039 case 'n': value = '\n'; break;
8040 case 'r': value = '\r'; break;
8041 case 't': value = '\t'; break;
8042 case 'f': value = '\f'; break;
8043 case 'b': value = '\b'; break;
8044 case 'e': value = ASCII_TO_NATIVE('\033');break;
8045 case 'a': value = ASCII_TO_NATIVE('\007');break;
8047 if (*RExC_parse == '{') {
8048 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8049 | PERL_SCAN_DISALLOW_PREFIX;
8050 char * const e = strchr(RExC_parse++, '}');
8052 vFAIL("Missing right brace on \\x{}");
8054 numlen = e - RExC_parse;
8055 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8059 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8061 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8062 RExC_parse += numlen;
8064 if (PL_encoding && value < 0x100)
8065 goto recode_encoding;
8068 value = UCHARAT(RExC_parse++);
8069 value = toCTRL(value);
8071 case '0': case '1': case '2': case '3': case '4':
8072 case '5': case '6': case '7': case '8': case '9':
8076 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8077 RExC_parse += numlen;
8078 if (PL_encoding && value < 0x100)
8079 goto recode_encoding;
8084 SV* enc = PL_encoding;
8085 value = reg_recode((const char)(U8)value, &enc);
8086 if (!enc && SIZE_ONLY)
8087 ckWARNreg(RExC_parse,
8088 "Invalid escape in the specified encoding");
8092 if (!SIZE_ONLY && isALPHA(value))
8093 ckWARN2reg(RExC_parse,
8094 "Unrecognized escape \\%c in character class passed through",
8098 } /* end of \blah */
8104 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8106 if (!SIZE_ONLY && !need_class)
8107 ANYOF_CLASS_ZERO(ret);
8111 /* a bad range like a-\d, a-[:digit:] ? */
8115 RExC_parse >= rangebegin ?
8116 RExC_parse - rangebegin : 0;
8117 ckWARN4reg(RExC_parse,
8118 "False [] range \"%*.*s\"",
8121 if (prevvalue < 256) {
8122 ANYOF_BITMAP_SET(ret, prevvalue);
8123 ANYOF_BITMAP_SET(ret, '-');
8126 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8127 Perl_sv_catpvf(aTHX_ listsv,
8128 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8132 range = 0; /* this was not a true range */
8138 const char *what = NULL;
8141 if (namedclass > OOB_NAMEDCLASS)
8142 optimize_invert = FALSE;
8143 /* Possible truncation here but in some 64-bit environments
8144 * the compiler gets heartburn about switch on 64-bit values.
8145 * A similar issue a little earlier when switching on value.
8147 switch ((I32)namedclass) {
8149 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8150 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8151 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8152 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8153 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8154 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8155 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8156 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8157 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8158 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8159 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8160 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8161 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8163 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8164 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8166 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8167 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8168 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8171 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8174 for (value = 0; value < 128; value++)
8175 ANYOF_BITMAP_SET(ret, value);
8177 for (value = 0; value < 256; value++) {
8179 ANYOF_BITMAP_SET(ret, value);
8188 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8191 for (value = 128; value < 256; value++)
8192 ANYOF_BITMAP_SET(ret, value);
8194 for (value = 0; value < 256; value++) {
8195 if (!isASCII(value))
8196 ANYOF_BITMAP_SET(ret, value);
8205 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8207 /* consecutive digits assumed */
8208 for (value = '0'; value <= '9'; value++)
8209 ANYOF_BITMAP_SET(ret, value);
8212 what = POSIX_CC_UNI_NAME("Digit");
8216 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8218 /* consecutive digits assumed */
8219 for (value = 0; value < '0'; value++)
8220 ANYOF_BITMAP_SET(ret, value);
8221 for (value = '9' + 1; value < 256; value++)
8222 ANYOF_BITMAP_SET(ret, value);
8225 what = POSIX_CC_UNI_NAME("Digit");
8228 /* this is to handle \p and \P */
8231 vFAIL("Invalid [::] class");
8235 /* Strings such as "+utf8::isWord\n" */
8236 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8239 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8242 } /* end of namedclass \blah */
8245 if (prevvalue > (IV)value) /* b-a */ {
8246 const int w = RExC_parse - rangebegin;
8247 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8248 range = 0; /* not a valid range */
8252 prevvalue = value; /* save the beginning of the range */
8253 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8254 RExC_parse[1] != ']') {
8257 /* a bad range like \w-, [:word:]- ? */
8258 if (namedclass > OOB_NAMEDCLASS) {
8259 if (ckWARN(WARN_REGEXP)) {
8261 RExC_parse >= rangebegin ?
8262 RExC_parse - rangebegin : 0;
8264 "False [] range \"%*.*s\"",
8268 ANYOF_BITMAP_SET(ret, '-');
8270 range = 1; /* yeah, it's a range! */
8271 continue; /* but do it the next time */
8275 /* now is the next time */
8276 /*stored += (value - prevvalue + 1);*/
8278 if (prevvalue < 256) {
8279 const IV ceilvalue = value < 256 ? value : 255;
8282 /* In EBCDIC [\x89-\x91] should include
8283 * the \x8e but [i-j] should not. */
8284 if (literal_endpoint == 2 &&
8285 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8286 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8288 if (isLOWER(prevvalue)) {
8289 for (i = prevvalue; i <= ceilvalue; i++)
8290 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8292 ANYOF_BITMAP_SET(ret, i);
8295 for (i = prevvalue; i <= ceilvalue; i++)
8296 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8298 ANYOF_BITMAP_SET(ret, i);
8304 for (i = prevvalue; i <= ceilvalue; i++) {
8305 if (!ANYOF_BITMAP_TEST(ret,i)) {
8307 ANYOF_BITMAP_SET(ret, i);
8311 if (value > 255 || UTF) {
8312 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8313 const UV natvalue = NATIVE_TO_UNI(value);
8314 stored+=2; /* can't optimize this class */
8315 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8316 if (prevnatvalue < natvalue) { /* what about > ? */
8317 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8318 prevnatvalue, natvalue);
8320 else if (prevnatvalue == natvalue) {
8321 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8323 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8325 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8327 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8328 if (RExC_precomp[0] == ':' &&
8329 RExC_precomp[1] == '[' &&
8330 (f == 0xDF || f == 0x92)) {
8331 f = NATIVE_TO_UNI(f);
8334 /* If folding and foldable and a single
8335 * character, insert also the folded version
8336 * to the charclass. */
8338 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8339 if ((RExC_precomp[0] == ':' &&
8340 RExC_precomp[1] == '[' &&
8342 (value == 0xFB05 || value == 0xFB06))) ?
8343 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8344 foldlen == (STRLEN)UNISKIP(f) )
8346 if (foldlen == (STRLEN)UNISKIP(f))
8348 Perl_sv_catpvf(aTHX_ listsv,
8351 /* Any multicharacter foldings
8352 * require the following transform:
8353 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8354 * where E folds into "pq" and F folds
8355 * into "rst", all other characters
8356 * fold to single characters. We save
8357 * away these multicharacter foldings,
8358 * to be later saved as part of the
8359 * additional "s" data. */
8362 if (!unicode_alternate)
8363 unicode_alternate = newAV();
8364 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8366 av_push(unicode_alternate, sv);
8370 /* If folding and the value is one of the Greek
8371 * sigmas insert a few more sigmas to make the
8372 * folding rules of the sigmas to work right.
8373 * Note that not all the possible combinations
8374 * are handled here: some of them are handled
8375 * by the standard folding rules, and some of
8376 * them (literal or EXACTF cases) are handled
8377 * during runtime in regexec.c:S_find_byclass(). */
8378 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8379 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8380 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8381 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8382 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8384 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8385 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8386 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8391 literal_endpoint = 0;
8395 range = 0; /* this range (if it was one) is done now */
8399 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8401 RExC_size += ANYOF_CLASS_ADD_SKIP;
8403 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8409 /****** !SIZE_ONLY AFTER HERE *********/
8411 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8412 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8414 /* optimize single char class to an EXACT node
8415 but *only* when its not a UTF/high char */
8416 const char * cur_parse= RExC_parse;
8417 RExC_emit = (regnode *)orig_emit;
8418 RExC_parse = (char *)orig_parse;
8419 ret = reg_node(pRExC_state,
8420 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8421 RExC_parse = (char *)cur_parse;
8422 *STRING(ret)= (char)value;
8424 RExC_emit += STR_SZ(1);
8425 SvREFCNT_dec(listsv);
8428 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8429 if ( /* If the only flag is folding (plus possibly inversion). */
8430 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8432 for (value = 0; value < 256; ++value) {
8433 if (ANYOF_BITMAP_TEST(ret, value)) {
8434 UV fold = PL_fold[value];
8437 ANYOF_BITMAP_SET(ret, fold);
8440 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8443 /* optimize inverted simple patterns (e.g. [^a-z]) */
8444 if (optimize_invert &&
8445 /* If the only flag is inversion. */
8446 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8447 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8448 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8449 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8452 AV * const av = newAV();
8454 /* The 0th element stores the character class description
8455 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8456 * to initialize the appropriate swash (which gets stored in
8457 * the 1st element), and also useful for dumping the regnode.
8458 * The 2nd element stores the multicharacter foldings,
8459 * used later (regexec.c:S_reginclass()). */
8460 av_store(av, 0, listsv);
8461 av_store(av, 1, NULL);
8462 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8463 rv = newRV_noinc(MUTABLE_SV(av));
8464 n = add_data(pRExC_state, 1, "s");
8465 RExC_rxi->data->data[n] = (void*)rv;
8473 /* reg_skipcomment()
8475 Absorbs an /x style # comments from the input stream.
8476 Returns true if there is more text remaining in the stream.
8477 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8478 terminates the pattern without including a newline.
8480 Note its the callers responsibility to ensure that we are
8486 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8490 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8492 while (RExC_parse < RExC_end)
8493 if (*RExC_parse++ == '\n') {
8498 /* we ran off the end of the pattern without ending
8499 the comment, so we have to add an \n when wrapping */
8500 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8508 Advance that parse position, and optionall absorbs
8509 "whitespace" from the inputstream.
8511 Without /x "whitespace" means (?#...) style comments only,
8512 with /x this means (?#...) and # comments and whitespace proper.
8514 Returns the RExC_parse point from BEFORE the scan occurs.
8516 This is the /x friendly way of saying RExC_parse++.
8520 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8522 char* const retval = RExC_parse++;
8524 PERL_ARGS_ASSERT_NEXTCHAR;
8527 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8528 RExC_parse[2] == '#') {
8529 while (*RExC_parse != ')') {
8530 if (RExC_parse == RExC_end)
8531 FAIL("Sequence (?#... not terminated");
8537 if (RExC_flags & RXf_PMf_EXTENDED) {
8538 if (isSPACE(*RExC_parse)) {
8542 else if (*RExC_parse == '#') {
8543 if ( reg_skipcomment( pRExC_state ) )
8552 - reg_node - emit a node
8554 STATIC regnode * /* Location. */
8555 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8558 register regnode *ptr;
8559 regnode * const ret = RExC_emit;
8560 GET_RE_DEBUG_FLAGS_DECL;
8562 PERL_ARGS_ASSERT_REG_NODE;
8565 SIZE_ALIGN(RExC_size);
8569 if (RExC_emit >= RExC_emit_bound)
8570 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8572 NODE_ALIGN_FILL(ret);
8574 FILL_ADVANCE_NODE(ptr, op);
8575 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8576 #ifdef RE_TRACK_PATTERN_OFFSETS
8577 if (RExC_offsets) { /* MJD */
8578 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8579 "reg_node", __LINE__,
8581 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8582 ? "Overwriting end of array!\n" : "OK",
8583 (UV)(RExC_emit - RExC_emit_start),
8584 (UV)(RExC_parse - RExC_start),
8585 (UV)RExC_offsets[0]));
8586 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8594 - reganode - emit a node with an argument
8596 STATIC regnode * /* Location. */
8597 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8600 register regnode *ptr;
8601 regnode * const ret = RExC_emit;
8602 GET_RE_DEBUG_FLAGS_DECL;
8604 PERL_ARGS_ASSERT_REGANODE;
8607 SIZE_ALIGN(RExC_size);
8612 assert(2==regarglen[op]+1);
8614 Anything larger than this has to allocate the extra amount.
8615 If we changed this to be:
8617 RExC_size += (1 + regarglen[op]);
8619 then it wouldn't matter. Its not clear what side effect
8620 might come from that so its not done so far.
8625 if (RExC_emit >= RExC_emit_bound)
8626 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8628 NODE_ALIGN_FILL(ret);
8630 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8631 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8632 #ifdef RE_TRACK_PATTERN_OFFSETS
8633 if (RExC_offsets) { /* MJD */
8634 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8638 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8639 "Overwriting end of array!\n" : "OK",
8640 (UV)(RExC_emit - RExC_emit_start),
8641 (UV)(RExC_parse - RExC_start),
8642 (UV)RExC_offsets[0]));
8643 Set_Cur_Node_Offset;
8651 - reguni - emit (if appropriate) a Unicode character
8654 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8658 PERL_ARGS_ASSERT_REGUNI;
8660 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8664 - reginsert - insert an operator in front of already-emitted operand
8666 * Means relocating the operand.
8669 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8672 register regnode *src;
8673 register regnode *dst;
8674 register regnode *place;
8675 const int offset = regarglen[(U8)op];
8676 const int size = NODE_STEP_REGNODE + offset;
8677 GET_RE_DEBUG_FLAGS_DECL;
8679 PERL_ARGS_ASSERT_REGINSERT;
8680 PERL_UNUSED_ARG(depth);
8681 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8682 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8691 if (RExC_open_parens) {
8693 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8694 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8695 if ( RExC_open_parens[paren] >= opnd ) {
8696 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8697 RExC_open_parens[paren] += size;
8699 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8701 if ( RExC_close_parens[paren] >= opnd ) {
8702 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8703 RExC_close_parens[paren] += size;
8705 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8710 while (src > opnd) {
8711 StructCopy(--src, --dst, regnode);
8712 #ifdef RE_TRACK_PATTERN_OFFSETS
8713 if (RExC_offsets) { /* MJD 20010112 */
8714 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8718 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8719 ? "Overwriting end of array!\n" : "OK",
8720 (UV)(src - RExC_emit_start),
8721 (UV)(dst - RExC_emit_start),
8722 (UV)RExC_offsets[0]));
8723 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8724 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8730 place = opnd; /* Op node, where operand used to be. */
8731 #ifdef RE_TRACK_PATTERN_OFFSETS
8732 if (RExC_offsets) { /* MJD */
8733 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8737 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8738 ? "Overwriting end of array!\n" : "OK",
8739 (UV)(place - RExC_emit_start),
8740 (UV)(RExC_parse - RExC_start),
8741 (UV)RExC_offsets[0]));
8742 Set_Node_Offset(place, RExC_parse);
8743 Set_Node_Length(place, 1);
8746 src = NEXTOPER(place);
8747 FILL_ADVANCE_NODE(place, op);
8748 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8749 Zero(src, offset, regnode);
8753 - regtail - set the next-pointer at the end of a node chain of p to val.
8754 - SEE ALSO: regtail_study
8756 /* TODO: All three parms should be const */
8758 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8761 register regnode *scan;
8762 GET_RE_DEBUG_FLAGS_DECL;
8764 PERL_ARGS_ASSERT_REGTAIL;
8766 PERL_UNUSED_ARG(depth);
8772 /* Find last node. */
8775 regnode * const temp = regnext(scan);
8777 SV * const mysv=sv_newmortal();
8778 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8779 regprop(RExC_rx, mysv, scan);
8780 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8781 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8782 (temp == NULL ? "->" : ""),
8783 (temp == NULL ? PL_reg_name[OP(val)] : "")
8791 if (reg_off_by_arg[OP(scan)]) {
8792 ARG_SET(scan, val - scan);
8795 NEXT_OFF(scan) = val - scan;
8801 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8802 - Look for optimizable sequences at the same time.
8803 - currently only looks for EXACT chains.
8805 This is expermental code. The idea is to use this routine to perform
8806 in place optimizations on branches and groups as they are constructed,
8807 with the long term intention of removing optimization from study_chunk so
8808 that it is purely analytical.
8810 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8811 to control which is which.
8814 /* TODO: All four parms should be const */
8817 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8820 register regnode *scan;
8822 #ifdef EXPERIMENTAL_INPLACESCAN
8825 GET_RE_DEBUG_FLAGS_DECL;
8827 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8833 /* Find last node. */
8837 regnode * const temp = regnext(scan);
8838 #ifdef EXPERIMENTAL_INPLACESCAN
8839 if (PL_regkind[OP(scan)] == EXACT)
8840 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8848 if( exact == PSEUDO )
8850 else if ( exact != OP(scan) )
8859 SV * const mysv=sv_newmortal();
8860 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8861 regprop(RExC_rx, mysv, scan);
8862 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8863 SvPV_nolen_const(mysv),
8865 PL_reg_name[exact]);
8872 SV * const mysv_val=sv_newmortal();
8873 DEBUG_PARSE_MSG("");
8874 regprop(RExC_rx, mysv_val, val);
8875 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8876 SvPV_nolen_const(mysv_val),
8877 (IV)REG_NODE_NUM(val),
8881 if (reg_off_by_arg[OP(scan)]) {
8882 ARG_SET(scan, val - scan);
8885 NEXT_OFF(scan) = val - scan;
8893 - regcurly - a little FSA that accepts {\d+,?\d*}
8895 #ifndef PERL_IN_XSUB_RE
8897 Perl_regcurly(register const char *s)
8899 PERL_ARGS_ASSERT_REGCURLY;
8918 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8922 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8927 for (bit=0; bit<32; bit++) {
8928 if (flags & (1<<bit)) {
8930 PerlIO_printf(Perl_debug_log, "%s",lead);
8931 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8936 PerlIO_printf(Perl_debug_log, "\n");
8938 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8944 Perl_regdump(pTHX_ const regexp *r)
8948 SV * const sv = sv_newmortal();
8949 SV *dsv= sv_newmortal();
8951 GET_RE_DEBUG_FLAGS_DECL;
8953 PERL_ARGS_ASSERT_REGDUMP;
8955 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8957 /* Header fields of interest. */
8958 if (r->anchored_substr) {
8959 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8960 RE_SV_DUMPLEN(r->anchored_substr), 30);
8961 PerlIO_printf(Perl_debug_log,
8962 "anchored %s%s at %"IVdf" ",
8963 s, RE_SV_TAIL(r->anchored_substr),
8964 (IV)r->anchored_offset);
8965 } else if (r->anchored_utf8) {
8966 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8967 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8968 PerlIO_printf(Perl_debug_log,
8969 "anchored utf8 %s%s at %"IVdf" ",
8970 s, RE_SV_TAIL(r->anchored_utf8),
8971 (IV)r->anchored_offset);
8973 if (r->float_substr) {
8974 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8975 RE_SV_DUMPLEN(r->float_substr), 30);
8976 PerlIO_printf(Perl_debug_log,
8977 "floating %s%s at %"IVdf"..%"UVuf" ",
8978 s, RE_SV_TAIL(r->float_substr),
8979 (IV)r->float_min_offset, (UV)r->float_max_offset);
8980 } else if (r->float_utf8) {
8981 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8982 RE_SV_DUMPLEN(r->float_utf8), 30);
8983 PerlIO_printf(Perl_debug_log,
8984 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8985 s, RE_SV_TAIL(r->float_utf8),
8986 (IV)r->float_min_offset, (UV)r->float_max_offset);
8988 if (r->check_substr || r->check_utf8)
8989 PerlIO_printf(Perl_debug_log,
8991 (r->check_substr == r->float_substr
8992 && r->check_utf8 == r->float_utf8
8993 ? "(checking floating" : "(checking anchored"));
8994 if (r->extflags & RXf_NOSCAN)
8995 PerlIO_printf(Perl_debug_log, " noscan");
8996 if (r->extflags & RXf_CHECK_ALL)
8997 PerlIO_printf(Perl_debug_log, " isall");
8998 if (r->check_substr || r->check_utf8)
8999 PerlIO_printf(Perl_debug_log, ") ");
9001 if (ri->regstclass) {
9002 regprop(r, sv, ri->regstclass);
9003 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9005 if (r->extflags & RXf_ANCH) {
9006 PerlIO_printf(Perl_debug_log, "anchored");
9007 if (r->extflags & RXf_ANCH_BOL)
9008 PerlIO_printf(Perl_debug_log, "(BOL)");
9009 if (r->extflags & RXf_ANCH_MBOL)
9010 PerlIO_printf(Perl_debug_log, "(MBOL)");
9011 if (r->extflags & RXf_ANCH_SBOL)
9012 PerlIO_printf(Perl_debug_log, "(SBOL)");
9013 if (r->extflags & RXf_ANCH_GPOS)
9014 PerlIO_printf(Perl_debug_log, "(GPOS)");
9015 PerlIO_putc(Perl_debug_log, ' ');
9017 if (r->extflags & RXf_GPOS_SEEN)
9018 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9019 if (r->intflags & PREGf_SKIP)
9020 PerlIO_printf(Perl_debug_log, "plus ");
9021 if (r->intflags & PREGf_IMPLICIT)
9022 PerlIO_printf(Perl_debug_log, "implicit ");
9023 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9024 if (r->extflags & RXf_EVAL_SEEN)
9025 PerlIO_printf(Perl_debug_log, "with eval ");
9026 PerlIO_printf(Perl_debug_log, "\n");
9027 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9029 PERL_ARGS_ASSERT_REGDUMP;
9030 PERL_UNUSED_CONTEXT;
9032 #endif /* DEBUGGING */
9036 - regprop - printable representation of opcode
9038 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9041 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9042 if (flags & ANYOF_INVERT) \
9043 /*make sure the invert info is in each */ \
9044 sv_catpvs(sv, "^"); \
9050 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9055 RXi_GET_DECL(prog,progi);
9056 GET_RE_DEBUG_FLAGS_DECL;
9058 PERL_ARGS_ASSERT_REGPROP;
9062 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9063 /* It would be nice to FAIL() here, but this may be called from
9064 regexec.c, and it would be hard to supply pRExC_state. */
9065 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9066 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9068 k = PL_regkind[OP(o)];
9072 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9073 * is a crude hack but it may be the best for now since
9074 * we have no flag "this EXACTish node was UTF-8"
9076 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9077 PERL_PV_ESCAPE_UNI_DETECT |
9078 PERL_PV_PRETTY_ELLIPSES |
9079 PERL_PV_PRETTY_LTGT |
9080 PERL_PV_PRETTY_NOCLEAR
9082 } else if (k == TRIE) {
9083 /* print the details of the trie in dumpuntil instead, as
9084 * progi->data isn't available here */
9085 const char op = OP(o);
9086 const U32 n = ARG(o);
9087 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9088 (reg_ac_data *)progi->data->data[n] :
9090 const reg_trie_data * const trie
9091 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9093 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9094 DEBUG_TRIE_COMPILE_r(
9095 Perl_sv_catpvf(aTHX_ sv,
9096 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9097 (UV)trie->startstate,
9098 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9099 (UV)trie->wordcount,
9102 (UV)TRIE_CHARCOUNT(trie),
9103 (UV)trie->uniquecharcount
9106 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9108 int rangestart = -1;
9109 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9111 for (i = 0; i <= 256; i++) {
9112 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9113 if (rangestart == -1)
9115 } else if (rangestart != -1) {
9116 if (i <= rangestart + 3)
9117 for (; rangestart < i; rangestart++)
9118 put_byte(sv, rangestart);
9120 put_byte(sv, rangestart);
9122 put_byte(sv, i - 1);
9130 } else if (k == CURLY) {
9131 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9132 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9133 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9135 else if (k == WHILEM && o->flags) /* Ordinal/of */
9136 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9137 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9138 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9139 if ( RXp_PAREN_NAMES(prog) ) {
9140 if ( k != REF || OP(o) < NREF) {
9141 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9142 SV **name= av_fetch(list, ARG(o), 0 );
9144 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9147 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9148 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9149 I32 *nums=(I32*)SvPVX(sv_dat);
9150 SV **name= av_fetch(list, nums[0], 0 );
9153 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9154 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9155 (n ? "," : ""), (IV)nums[n]);
9157 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9161 } else if (k == GOSUB)
9162 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9163 else if (k == VERB) {
9165 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9166 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9167 } else if (k == LOGICAL)
9168 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9169 else if (k == FOLDCHAR)
9170 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9171 else if (k == ANYOF) {
9172 int i, rangestart = -1;
9173 const U8 flags = ANYOF_FLAGS(o);
9176 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9177 static const char * const anyofs[] = {
9210 if (flags & ANYOF_LOCALE)
9211 sv_catpvs(sv, "{loc}");
9212 if (flags & ANYOF_FOLD)
9213 sv_catpvs(sv, "{i}");
9214 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9215 if (flags & ANYOF_INVERT)
9218 /* output what the standard cp 0-255 bitmap matches */
9219 for (i = 0; i <= 256; i++) {
9220 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9221 if (rangestart == -1)
9223 } else if (rangestart != -1) {
9224 if (i <= rangestart + 3)
9225 for (; rangestart < i; rangestart++)
9226 put_byte(sv, rangestart);
9228 put_byte(sv, rangestart);
9230 put_byte(sv, i - 1);
9237 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9238 /* output any special charclass tests (used mostly under use locale) */
9239 if (o->flags & ANYOF_CLASS)
9240 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9241 if (ANYOF_CLASS_TEST(o,i)) {
9242 sv_catpv(sv, anyofs[i]);
9246 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9248 /* output information about the unicode matching */
9249 if (flags & ANYOF_UNICODE)
9250 sv_catpvs(sv, "{unicode}");
9251 else if (flags & ANYOF_UNICODE_ALL)
9252 sv_catpvs(sv, "{unicode_all}");
9256 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9260 U8 s[UTF8_MAXBYTES_CASE+1];
9262 for (i = 0; i <= 256; i++) { /* just the first 256 */
9263 uvchr_to_utf8(s, i);
9265 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9266 if (rangestart == -1)
9268 } else if (rangestart != -1) {
9269 if (i <= rangestart + 3)
9270 for (; rangestart < i; rangestart++) {
9271 const U8 * const e = uvchr_to_utf8(s,rangestart);
9273 for(p = s; p < e; p++)
9277 const U8 *e = uvchr_to_utf8(s,rangestart);
9279 for (p = s; p < e; p++)
9282 e = uvchr_to_utf8(s, i-1);
9283 for (p = s; p < e; p++)
9290 sv_catpvs(sv, "..."); /* et cetera */
9294 char *s = savesvpv(lv);
9295 char * const origs = s;
9297 while (*s && *s != '\n')
9301 const char * const t = ++s;
9319 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9321 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9322 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9324 PERL_UNUSED_CONTEXT;
9325 PERL_UNUSED_ARG(sv);
9327 PERL_UNUSED_ARG(prog);
9328 #endif /* DEBUGGING */
9332 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9333 { /* Assume that RE_INTUIT is set */
9335 struct regexp *const prog = (struct regexp *)SvANY(r);
9336 GET_RE_DEBUG_FLAGS_DECL;
9338 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9339 PERL_UNUSED_CONTEXT;
9343 const char * const s = SvPV_nolen_const(prog->check_substr
9344 ? prog->check_substr : prog->check_utf8);
9346 if (!PL_colorset) reginitcolors();
9347 PerlIO_printf(Perl_debug_log,
9348 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9350 prog->check_substr ? "" : "utf8 ",
9351 PL_colors[5],PL_colors[0],
9354 (strlen(s) > 60 ? "..." : ""));
9357 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9363 handles refcounting and freeing the perl core regexp structure. When
9364 it is necessary to actually free the structure the first thing it
9365 does is call the 'free' method of the regexp_engine associated to to
9366 the regexp, allowing the handling of the void *pprivate; member
9367 first. (This routine is not overridable by extensions, which is why
9368 the extensions free is called first.)
9370 See regdupe and regdupe_internal if you change anything here.
9372 #ifndef PERL_IN_XSUB_RE
9374 Perl_pregfree(pTHX_ REGEXP *r)
9380 Perl_pregfree2(pTHX_ REGEXP *rx)
9383 struct regexp *const r = (struct regexp *)SvANY(rx);
9384 GET_RE_DEBUG_FLAGS_DECL;
9386 PERL_ARGS_ASSERT_PREGFREE2;
9389 ReREFCNT_dec(r->mother_re);
9391 CALLREGFREE_PVT(rx); /* free the private data */
9392 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9395 SvREFCNT_dec(r->anchored_substr);
9396 SvREFCNT_dec(r->anchored_utf8);
9397 SvREFCNT_dec(r->float_substr);
9398 SvREFCNT_dec(r->float_utf8);
9399 Safefree(r->substrs);
9401 RX_MATCH_COPY_FREE(rx);
9402 #ifdef PERL_OLD_COPY_ON_WRITE
9403 SvREFCNT_dec(r->saved_copy);
9410 This is a hacky workaround to the structural issue of match results
9411 being stored in the regexp structure which is in turn stored in
9412 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9413 could be PL_curpm in multiple contexts, and could require multiple
9414 result sets being associated with the pattern simultaneously, such
9415 as when doing a recursive match with (??{$qr})
9417 The solution is to make a lightweight copy of the regexp structure
9418 when a qr// is returned from the code executed by (??{$qr}) this
9419 lightweight copy doesnt actually own any of its data except for
9420 the starp/end and the actual regexp structure itself.
9426 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9429 struct regexp *const r = (struct regexp *)SvANY(rx);
9430 register const I32 npar = r->nparens+1;
9432 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9435 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9436 ret = (struct regexp *)SvANY(ret_x);
9438 (void)ReREFCNT_inc(rx);
9439 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9440 by pointing directly at the buffer, but flagging that the allocated
9441 space in the copy is zero. As we've just done a struct copy, it's now
9442 a case of zero-ing that, rather than copying the current length. */
9443 SvPV_set(ret_x, RX_WRAPPED(rx));
9444 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9445 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9446 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9447 SvLEN_set(ret_x, 0);
9448 SvSTASH_set(ret_x, NULL);
9449 SvMAGIC_set(ret_x, NULL);
9450 Newx(ret->offs, npar, regexp_paren_pair);
9451 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9453 Newx(ret->substrs, 1, struct reg_substr_data);
9454 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9456 SvREFCNT_inc_void(ret->anchored_substr);
9457 SvREFCNT_inc_void(ret->anchored_utf8);
9458 SvREFCNT_inc_void(ret->float_substr);
9459 SvREFCNT_inc_void(ret->float_utf8);
9461 /* check_substr and check_utf8, if non-NULL, point to either their
9462 anchored or float namesakes, and don't hold a second reference. */
9464 RX_MATCH_COPIED_off(ret_x);
9465 #ifdef PERL_OLD_COPY_ON_WRITE
9466 ret->saved_copy = NULL;
9468 ret->mother_re = rx;
9474 /* regfree_internal()
9476 Free the private data in a regexp. This is overloadable by
9477 extensions. Perl takes care of the regexp structure in pregfree(),
9478 this covers the *pprivate pointer which technically perldoesnt
9479 know about, however of course we have to handle the
9480 regexp_internal structure when no extension is in use.
9482 Note this is called before freeing anything in the regexp
9487 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9490 struct regexp *const r = (struct regexp *)SvANY(rx);
9492 GET_RE_DEBUG_FLAGS_DECL;
9494 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9500 SV *dsv= sv_newmortal();
9501 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9502 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9503 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9504 PL_colors[4],PL_colors[5],s);
9507 #ifdef RE_TRACK_PATTERN_OFFSETS
9509 Safefree(ri->u.offsets); /* 20010421 MJD */
9512 int n = ri->data->count;
9513 PAD* new_comppad = NULL;
9518 /* If you add a ->what type here, update the comment in regcomp.h */
9519 switch (ri->data->what[n]) {
9523 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9526 Safefree(ri->data->data[n]);
9529 new_comppad = MUTABLE_AV(ri->data->data[n]);
9532 if (new_comppad == NULL)
9533 Perl_croak(aTHX_ "panic: pregfree comppad");
9534 PAD_SAVE_LOCAL(old_comppad,
9535 /* Watch out for global destruction's random ordering. */
9536 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9539 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9542 op_free((OP_4tree*)ri->data->data[n]);
9544 PAD_RESTORE_LOCAL(old_comppad);
9545 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9551 { /* Aho Corasick add-on structure for a trie node.
9552 Used in stclass optimization only */
9554 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9556 refcount = --aho->refcount;
9559 PerlMemShared_free(aho->states);
9560 PerlMemShared_free(aho->fail);
9561 /* do this last!!!! */
9562 PerlMemShared_free(ri->data->data[n]);
9563 PerlMemShared_free(ri->regstclass);
9569 /* trie structure. */
9571 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9573 refcount = --trie->refcount;
9576 PerlMemShared_free(trie->charmap);
9577 PerlMemShared_free(trie->states);
9578 PerlMemShared_free(trie->trans);
9580 PerlMemShared_free(trie->bitmap);
9582 PerlMemShared_free(trie->wordlen);
9584 PerlMemShared_free(trie->jump);
9586 PerlMemShared_free(trie->nextword);
9587 /* do this last!!!! */
9588 PerlMemShared_free(ri->data->data[n]);
9593 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9596 Safefree(ri->data->what);
9603 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9604 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9605 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9606 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9609 re_dup - duplicate a regexp.
9611 This routine is expected to clone a given regexp structure. It is only
9612 compiled under USE_ITHREADS.
9614 After all of the core data stored in struct regexp is duplicated
9615 the regexp_engine.dupe method is used to copy any private data
9616 stored in the *pprivate pointer. This allows extensions to handle
9617 any duplication it needs to do.
9619 See pregfree() and regfree_internal() if you change anything here.
9621 #if defined(USE_ITHREADS)
9622 #ifndef PERL_IN_XSUB_RE
9624 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9628 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9629 struct regexp *ret = (struct regexp *)SvANY(dstr);
9631 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9633 npar = r->nparens+1;
9634 Newx(ret->offs, npar, regexp_paren_pair);
9635 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9637 /* no need to copy these */
9638 Newx(ret->swap, npar, regexp_paren_pair);
9642 /* Do it this way to avoid reading from *r after the StructCopy().
9643 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9644 cache, it doesn't matter. */
9645 const bool anchored = r->check_substr
9646 ? r->check_substr == r->anchored_substr
9647 : r->check_utf8 == r->anchored_utf8;
9648 Newx(ret->substrs, 1, struct reg_substr_data);
9649 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9651 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9652 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9653 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9654 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9656 /* check_substr and check_utf8, if non-NULL, point to either their
9657 anchored or float namesakes, and don't hold a second reference. */
9659 if (ret->check_substr) {
9661 assert(r->check_utf8 == r->anchored_utf8);
9662 ret->check_substr = ret->anchored_substr;
9663 ret->check_utf8 = ret->anchored_utf8;
9665 assert(r->check_substr == r->float_substr);
9666 assert(r->check_utf8 == r->float_utf8);
9667 ret->check_substr = ret->float_substr;
9668 ret->check_utf8 = ret->float_utf8;
9670 } else if (ret->check_utf8) {
9672 ret->check_utf8 = ret->anchored_utf8;
9674 ret->check_utf8 = ret->float_utf8;
9679 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9682 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9684 if (RX_MATCH_COPIED(dstr))
9685 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9688 #ifdef PERL_OLD_COPY_ON_WRITE
9689 ret->saved_copy = NULL;
9692 if (ret->mother_re) {
9693 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9694 /* Our storage points directly to our mother regexp, but that's
9695 1: a buffer in a different thread
9696 2: something we no longer hold a reference on
9697 so we need to copy it locally. */
9698 /* Note we need to sue SvCUR() on our mother_re, because it, in
9699 turn, may well be pointing to its own mother_re. */
9700 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9701 SvCUR(ret->mother_re)+1));
9702 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9704 ret->mother_re = NULL;
9708 #endif /* PERL_IN_XSUB_RE */
9713 This is the internal complement to regdupe() which is used to copy
9714 the structure pointed to by the *pprivate pointer in the regexp.
9715 This is the core version of the extension overridable cloning hook.
9716 The regexp structure being duplicated will be copied by perl prior
9717 to this and will be provided as the regexp *r argument, however
9718 with the /old/ structures pprivate pointer value. Thus this routine
9719 may override any copying normally done by perl.
9721 It returns a pointer to the new regexp_internal structure.
9725 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9728 struct regexp *const r = (struct regexp *)SvANY(rx);
9729 regexp_internal *reti;
9733 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9735 npar = r->nparens+1;
9738 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9739 Copy(ri->program, reti->program, len+1, regnode);
9742 reti->regstclass = NULL;
9746 const int count = ri->data->count;
9749 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9750 char, struct reg_data);
9751 Newx(d->what, count, U8);
9754 for (i = 0; i < count; i++) {
9755 d->what[i] = ri->data->what[i];
9756 switch (d->what[i]) {
9757 /* legal options are one of: sSfpontTu
9758 see also regcomp.h and pregfree() */
9761 case 'p': /* actually an AV, but the dup function is identical. */
9762 case 'u': /* actually an HV, but the dup function is identical. */
9763 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9766 /* This is cheating. */
9767 Newx(d->data[i], 1, struct regnode_charclass_class);
9768 StructCopy(ri->data->data[i], d->data[i],
9769 struct regnode_charclass_class);
9770 reti->regstclass = (regnode*)d->data[i];
9773 /* Compiled op trees are readonly and in shared memory,
9774 and can thus be shared without duplication. */
9776 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9780 /* Trie stclasses are readonly and can thus be shared
9781 * without duplication. We free the stclass in pregfree
9782 * when the corresponding reg_ac_data struct is freed.
9784 reti->regstclass= ri->regstclass;
9788 ((reg_trie_data*)ri->data->data[i])->refcount++;
9792 d->data[i] = ri->data->data[i];
9795 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9804 reti->name_list_idx = ri->name_list_idx;
9806 #ifdef RE_TRACK_PATTERN_OFFSETS
9807 if (ri->u.offsets) {
9808 Newx(reti->u.offsets, 2*len+1, U32);
9809 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9812 SetProgLen(reti,len);
9818 #endif /* USE_ITHREADS */
9820 #ifndef PERL_IN_XSUB_RE
9823 - regnext - dig the "next" pointer out of a node
9826 Perl_regnext(pTHX_ register regnode *p)
9829 register I32 offset;
9834 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9843 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9846 STRLEN l1 = strlen(pat1);
9847 STRLEN l2 = strlen(pat2);
9850 const char *message;
9852 PERL_ARGS_ASSERT_RE_CROAK2;
9858 Copy(pat1, buf, l1 , char);
9859 Copy(pat2, buf + l1, l2 , char);
9860 buf[l1 + l2] = '\n';
9861 buf[l1 + l2 + 1] = '\0';
9863 /* ANSI variant takes additional second argument */
9864 va_start(args, pat2);
9868 msv = vmess(buf, &args);
9870 message = SvPV_const(msv,l1);
9873 Copy(message, buf, l1 , char);
9874 buf[l1-1] = '\0'; /* Overwrite \n */
9875 Perl_croak(aTHX_ "%s", buf);
9878 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9880 #ifndef PERL_IN_XSUB_RE
9882 Perl_save_re_context(pTHX)
9886 struct re_save_state *state;
9888 SAVEVPTR(PL_curcop);
9889 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9891 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9892 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9893 SSPUSHINT(SAVEt_RE_STATE);
9895 Copy(&PL_reg_state, state, 1, struct re_save_state);
9897 PL_reg_start_tmp = 0;
9898 PL_reg_start_tmpl = 0;
9899 PL_reg_oldsaved = NULL;
9900 PL_reg_oldsavedlen = 0;
9902 PL_reg_leftiter = 0;
9903 PL_reg_poscache = NULL;
9904 PL_reg_poscache_size = 0;
9905 #ifdef PERL_OLD_COPY_ON_WRITE
9909 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9911 const REGEXP * const rx = PM_GETRE(PL_curpm);
9914 for (i = 1; i <= RX_NPARENS(rx); i++) {
9915 char digits[TYPE_CHARS(long)];
9916 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9917 GV *const *const gvp
9918 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9921 GV * const gv = *gvp;
9922 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9932 clear_re(pTHX_ void *r)
9935 ReREFCNT_dec((REGEXP *)r);
9941 S_put_byte(pTHX_ SV *sv, int c)
9943 PERL_ARGS_ASSERT_PUT_BYTE;
9945 /* Our definition of isPRINT() ignores locales, so only bytes that are
9946 not part of UTF-8 are considered printable. I assume that the same
9947 holds for UTF-EBCDIC.
9948 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9949 which Wikipedia says:
9951 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9952 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9953 identical, to the ASCII delete (DEL) or rubout control character.
9954 ) So the old condition can be simplified to !isPRINT(c) */
9956 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9958 const char string = c;
9959 if (c == '-' || c == ']' || c == '\\' || c == '^')
9960 sv_catpvs(sv, "\\");
9961 sv_catpvn(sv, &string, 1);
9966 #define CLEAR_OPTSTART \
9967 if (optstart) STMT_START { \
9968 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9972 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9974 STATIC const regnode *
9975 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9976 const regnode *last, const regnode *plast,
9977 SV* sv, I32 indent, U32 depth)
9980 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9981 register const regnode *next;
9982 const regnode *optstart= NULL;
9985 GET_RE_DEBUG_FLAGS_DECL;
9987 PERL_ARGS_ASSERT_DUMPUNTIL;
9989 #ifdef DEBUG_DUMPUNTIL
9990 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9991 last ? last-start : 0,plast ? plast-start : 0);
9994 if (plast && plast < last)
9997 while (PL_regkind[op] != END && (!last || node < last)) {
9998 /* While that wasn't END last time... */
10001 if (op == CLOSE || op == WHILEM)
10003 next = regnext((regnode *)node);
10006 if (OP(node) == OPTIMIZED) {
10007 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10014 regprop(r, sv, node);
10015 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10016 (int)(2*indent + 1), "", SvPVX_const(sv));
10018 if (OP(node) != OPTIMIZED) {
10019 if (next == NULL) /* Next ptr. */
10020 PerlIO_printf(Perl_debug_log, " (0)");
10021 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10022 PerlIO_printf(Perl_debug_log, " (FAIL)");
10024 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10025 (void)PerlIO_putc(Perl_debug_log, '\n');
10029 if (PL_regkind[(U8)op] == BRANCHJ) {
10032 register const regnode *nnode = (OP(next) == LONGJMP
10033 ? regnext((regnode *)next)
10035 if (last && nnode > last)
10037 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10040 else if (PL_regkind[(U8)op] == BRANCH) {
10042 DUMPUNTIL(NEXTOPER(node), next);
10044 else if ( PL_regkind[(U8)op] == TRIE ) {
10045 const regnode *this_trie = node;
10046 const char op = OP(node);
10047 const U32 n = ARG(node);
10048 const reg_ac_data * const ac = op>=AHOCORASICK ?
10049 (reg_ac_data *)ri->data->data[n] :
10051 const reg_trie_data * const trie =
10052 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10054 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10056 const regnode *nextbranch= NULL;
10059 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10060 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10062 PerlIO_printf(Perl_debug_log, "%*s%s ",
10063 (int)(2*(indent+3)), "",
10064 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10065 PL_colors[0], PL_colors[1],
10066 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10067 PERL_PV_PRETTY_ELLIPSES |
10068 PERL_PV_PRETTY_LTGT
10073 U16 dist= trie->jump[word_idx+1];
10074 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10075 (UV)((dist ? this_trie + dist : next) - start));
10078 nextbranch= this_trie + trie->jump[0];
10079 DUMPUNTIL(this_trie + dist, nextbranch);
10081 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10082 nextbranch= regnext((regnode *)nextbranch);
10084 PerlIO_printf(Perl_debug_log, "\n");
10087 if (last && next > last)
10092 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10093 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10094 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10096 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10098 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10100 else if ( op == PLUS || op == STAR) {
10101 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10103 else if (op == ANYOF) {
10104 /* arglen 1 + class block */
10105 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10106 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10107 node = NEXTOPER(node);
10109 else if (PL_regkind[(U8)op] == EXACT) {
10110 /* Literal string, where present. */
10111 node += NODE_SZ_STR(node) - 1;
10112 node = NEXTOPER(node);
10115 node = NEXTOPER(node);
10116 node += regarglen[(U8)op];
10118 if (op == CURLYX || op == OPEN)
10122 #ifdef DEBUG_DUMPUNTIL
10123 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10128 #endif /* DEBUGGING */
10132 * c-indentation-style: bsd
10133 * c-basic-offset: 4
10134 * indent-tabs-mode: t
10137 * ex: set ts=8 sts=4 sw=4 noet: