5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *paren_names; /* Paren names */
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
144 const char *lastparse;
146 AV *paren_name_list; /* idx -> name */
147 #define RExC_lastparse (pRExC_state->lastparse)
148 #define RExC_lastnum (pRExC_state->lastnum)
149 #define RExC_paren_name_list (pRExC_state->paren_name_list)
153 #define RExC_flags (pRExC_state->flags)
154 #define RExC_precomp (pRExC_state->precomp)
155 #define RExC_rx_sv (pRExC_state->rx_sv)
156 #define RExC_rx (pRExC_state->rx)
157 #define RExC_rxi (pRExC_state->rxi)
158 #define RExC_start (pRExC_state->start)
159 #define RExC_end (pRExC_state->end)
160 #define RExC_parse (pRExC_state->parse)
161 #define RExC_whilem_seen (pRExC_state->whilem_seen)
162 #ifdef RE_TRACK_PATTERN_OFFSETS
163 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165 #define RExC_emit (pRExC_state->emit)
166 #define RExC_emit_start (pRExC_state->emit_start)
167 #define RExC_emit_bound (pRExC_state->emit_bound)
168 #define RExC_naughty (pRExC_state->naughty)
169 #define RExC_sawback (pRExC_state->sawback)
170 #define RExC_seen (pRExC_state->seen)
171 #define RExC_size (pRExC_state->size)
172 #define RExC_npar (pRExC_state->npar)
173 #define RExC_nestroot (pRExC_state->nestroot)
174 #define RExC_extralen (pRExC_state->extralen)
175 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176 #define RExC_seen_evals (pRExC_state->seen_evals)
177 #define RExC_utf8 (pRExC_state->utf8)
178 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
179 #define RExC_open_parens (pRExC_state->open_parens)
180 #define RExC_close_parens (pRExC_state->close_parens)
181 #define RExC_opend (pRExC_state->opend)
182 #define RExC_paren_names (pRExC_state->paren_names)
183 #define RExC_recurse (pRExC_state->recurse)
184 #define RExC_recurse_count (pRExC_state->recurse_count)
187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
192 #undef SPSTART /* dratted cpp namespace... */
195 * Flags to be passed up and down.
197 #define WORST 0 /* Worst case. */
198 #define HASWIDTH 0x01 /* Known to match non-null strings. */
199 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200 #define SPSTART 0x04 /* Starts with * or +. */
201 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
202 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
206 /* whether trie related optimizations are enabled */
207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208 #define TRIE_STUDY_OPT
209 #define FULL_TRIE_STUDY
215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216 #define PBITVAL(paren) (1 << ((paren) & 7))
217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222 /* About scan_data_t.
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
238 The strings can be composites, for instance
242 will result in a composite fixed substring 'foo'.
244 For each string some basic information is maintained:
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
297 typedef struct scan_data_t {
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
303 I32 last_end; /* min value, <0 unless valid. */
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
319 struct regnode_charclass_class *start_class;
323 * Forward declarations for pregcomp()'s friends.
326 static const scan_data_t zero_scan_data =
327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
329 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330 #define SF_BEFORE_SEOL 0x0001
331 #define SF_BEFORE_MEOL 0x0002
332 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336 # define SF_FIX_SHIFT_EOL (0+2)
337 # define SF_FL_SHIFT_EOL (0+4)
339 # define SF_FIX_SHIFT_EOL (+2)
340 # define SF_FL_SHIFT_EOL (+4)
343 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
348 #define SF_IS_INF 0x0040
349 #define SF_HAS_PAR 0x0080
350 #define SF_IN_PAR 0x0100
351 #define SF_HAS_EVAL 0x0200
352 #define SCF_DO_SUBSTR 0x0400
353 #define SCF_DO_STCLASS_AND 0x0800
354 #define SCF_DO_STCLASS_OR 0x1000
355 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356 #define SCF_WHILEM_VISITED_POS 0x2000
358 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
359 #define SCF_SEEN_ACCEPT 0x8000
361 #define UTF (RExC_utf8 != 0)
362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
365 #define OOB_UNICODE 12345678
366 #define OOB_NAMEDCLASS -1
368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372 /* length of regex to show in messages that don't mark a position within */
373 #define RegexLengthToShowInErrorMessages 127
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
380 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
381 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 #define _FAIL(code) STMT_START { \
391 const char *ellipses = ""; \
392 IV len = RExC_end - RExC_precomp; \
395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
404 #define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
408 #define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
415 #define Simple_vFAIL(m) STMT_START { \
416 const IV offset = RExC_parse - RExC_precomp; \
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
424 #define vFAIL(m) STMT_START { \
426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
431 * Like Simple_vFAIL(), but accepts two arguments.
433 #define Simple_vFAIL2(m,a1) STMT_START { \
434 const IV offset = RExC_parse - RExC_precomp; \
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
442 #define vFAIL2(m,a1) STMT_START { \
444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
445 Simple_vFAIL2(m, a1); \
450 * Like Simple_vFAIL(), but accepts three arguments.
452 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
453 const IV offset = RExC_parse - RExC_precomp; \
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
461 #define vFAIL3(m,a1,a2) STMT_START { \
463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
464 Simple_vFAIL3(m, a1, a2); \
468 * Like Simple_vFAIL(), but accepts four arguments.
470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
476 #define ckWARNreg(loc,m) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 (int)offset, RExC_precomp, RExC_precomp + offset); \
482 #define ckWARNregdep(loc,m) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
486 (int)offset, RExC_precomp, RExC_precomp + offset); \
489 #define ckWARN2reg(loc, m, a1) STMT_START { \
490 const IV offset = loc - RExC_precomp; \
491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
495 #define vWARN3(loc, m, a1, a2) STMT_START { \
496 const IV offset = loc - RExC_precomp; \
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
501 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
507 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
508 const IV offset = loc - RExC_precomp; \
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
513 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
519 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
520 const IV offset = loc - RExC_precomp; \
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
526 /* Allow for side effects in s */
527 #define REGC(c,s) STMT_START { \
528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
531 /* Macros for recording node offsets. 20001227 mjd@plover.com
532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
533 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
534 * Element 0 holds the number n.
535 * Position is 1 indexed.
537 #ifndef RE_TRACK_PATTERN_OFFSETS
538 #define Set_Node_Offset_To_R(node,byte)
539 #define Set_Node_Offset(node,byte)
540 #define Set_Cur_Node_Offset
541 #define Set_Node_Length_To_R(node,len)
542 #define Set_Node_Length(node,len)
543 #define Set_Node_Cur_Length(node)
544 #define Node_Offset(n)
545 #define Node_Length(n)
546 #define Set_Node_Offset_Length(node,offset,len)
547 #define ProgLen(ri) ri->u.proglen
548 #define SetProgLen(ri,x) ri->u.proglen = x
550 #define ProgLen(ri) ri->u.offsets[0]
551 #define SetProgLen(ri,x) ri->u.offsets[0] = x
552 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
555 __LINE__, (int)(node), (int)(byte))); \
557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
559 RExC_offsets[2*(node)-1] = (byte); \
564 #define Set_Node_Offset(node,byte) \
565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
568 #define Set_Node_Length_To_R(node,len) STMT_START { \
570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
571 __LINE__, (int)(node), (int)(len))); \
573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
575 RExC_offsets[2*(node)] = (len); \
580 #define Set_Node_Length(node,len) \
581 Set_Node_Length_To_R((node)-RExC_emit_start, len)
582 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583 #define Set_Node_Cur_Length(node) \
584 Set_Node_Length(node, RExC_parse - parse_start)
586 /* Get offsets and lengths */
587 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
590 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
596 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597 #define EXPERIMENTAL_INPLACESCAN
598 #endif /*RE_TRACK_PATTERN_OFFSETS*/
600 #define DEBUG_STUDYDATA(str,data,depth) \
601 DEBUG_OPTIMISE_MORE_r(if(data){ \
602 PerlIO_printf(Perl_debug_log, \
603 "%*s" str "Pos:%"IVdf"/%"IVdf \
604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
605 (int)(depth)*2, "", \
606 (IV)((data)->pos_min), \
607 (IV)((data)->pos_delta), \
608 (UV)((data)->flags), \
609 (IV)((data)->whilem_c), \
610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
611 is_inf ? "INF " : "" \
613 if ((data)->last_found) \
614 PerlIO_printf(Perl_debug_log, \
615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
617 SvPVX_const((data)->last_found), \
618 (IV)((data)->last_end), \
619 (IV)((data)->last_start_min), \
620 (IV)((data)->last_start_max), \
621 ((data)->longest && \
622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
623 SvPVX_const((data)->longest_fixed), \
624 (IV)((data)->offset_fixed), \
625 ((data)->longest && \
626 (data)->longest==&((data)->longest_float)) ? "*" : "", \
627 SvPVX_const((data)->longest_float), \
628 (IV)((data)->offset_float_min), \
629 (IV)((data)->offset_float_max) \
631 PerlIO_printf(Perl_debug_log,"\n"); \
634 static void clear_re(pTHX_ void *r);
636 /* Mark that we cannot extend a found fixed substring at this point.
637 Update the longest found anchored substring and the longest found
638 floating substrings if needed. */
641 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
643 const STRLEN l = CHR_SVLEN(data->last_found);
644 const STRLEN old_l = CHR_SVLEN(*data->longest);
645 GET_RE_DEBUG_FLAGS_DECL;
647 PERL_ARGS_ASSERT_SCAN_COMMIT;
649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
650 SvSetMagicSV(*data->longest, data->last_found);
651 if (*data->longest == data->longest_fixed) {
652 data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 if (data->flags & SF_BEFORE_EOL)
655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
657 data->flags &= ~SF_FIX_BEFORE_EOL;
658 data->minlen_fixed=minlenp;
659 data->lookbehind_fixed=0;
661 else { /* *data->longest == data->longest_float */
662 data->offset_float_min = l ? data->last_start_min : data->pos_min;
663 data->offset_float_max = (l
664 ? data->last_start_max
665 : data->pos_min + data->pos_delta);
666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
667 data->offset_float_max = I32_MAX;
668 if (data->flags & SF_BEFORE_EOL)
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
672 data->flags &= ~SF_FL_BEFORE_EOL;
673 data->minlen_float=minlenp;
674 data->lookbehind_float=0;
677 SvCUR_set(data->last_found, 0);
679 SV * const sv = data->last_found;
680 if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
687 data->flags &= ~SF_BEFORE_EOL;
688 DEBUG_STUDYDATA("commit: ",data,0);
691 /* Can match anything (initialization) */
693 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
695 PERL_ARGS_ASSERT_CL_ANYTHING;
697 ANYOF_CLASS_ZERO(cl);
698 ANYOF_BITMAP_SETALL(cl);
699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
701 cl->flags |= ANYOF_LOCALE;
704 /* Can match anything (initialization) */
706 S_cl_is_anything(const struct regnode_charclass_class *cl)
710 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
712 for (value = 0; value <= ANYOF_MAX; value += 2)
713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
715 if (!(cl->flags & ANYOF_UNICODE_ALL))
717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
722 /* Can match anything (initialization) */
724 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
726 PERL_ARGS_ASSERT_CL_INIT;
728 Zero(cl, 1, struct regnode_charclass_class);
730 cl_anything(pRExC_state, cl);
734 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
736 PERL_ARGS_ASSERT_CL_INIT_ZERO;
738 Zero(cl, 1, struct regnode_charclass_class);
740 cl_anything(pRExC_state, cl);
742 cl->flags |= ANYOF_LOCALE;
745 /* 'And' a given class with another one. Can create false positives */
746 /* We assume that cl is not inverted */
748 S_cl_and(struct regnode_charclass_class *cl,
749 const struct regnode_charclass_class *and_with)
751 PERL_ARGS_ASSERT_CL_AND;
753 assert(and_with->type == ANYOF);
754 if (!(and_with->flags & ANYOF_CLASS)
755 && !(cl->flags & ANYOF_CLASS)
756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 && !(and_with->flags & ANYOF_FOLD)
758 && !(cl->flags & ANYOF_FOLD)) {
761 if (and_with->flags & ANYOF_INVERT)
762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 cl->bitmap[i] &= ~and_with->bitmap[i];
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= and_with->bitmap[i];
767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768 if (!(and_with->flags & ANYOF_EOS))
769 cl->flags &= ~ANYOF_EOS;
771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 !(and_with->flags & ANYOF_INVERT)) {
773 cl->flags &= ~ANYOF_UNICODE_ALL;
774 cl->flags |= ANYOF_UNICODE;
775 ARG_SET(cl, ARG(and_with));
777 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 !(and_with->flags & ANYOF_INVERT))
779 cl->flags &= ~ANYOF_UNICODE_ALL;
780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 !(and_with->flags & ANYOF_INVERT))
782 cl->flags &= ~ANYOF_UNICODE;
785 /* 'OR' a given class with another one. Can create false positives */
786 /* We assume that cl is not inverted */
788 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
790 PERL_ARGS_ASSERT_CL_OR;
792 if (or_with->flags & ANYOF_INVERT) {
794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 * <= (B1 | !B2) | (CL1 | !CL2)
796 * which is wasteful if CL2 is small, but we ignore CL2:
797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 * XXXX Can we handle case-fold? Unclear:
799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 && !(or_with->flags & ANYOF_FOLD)
804 && !(cl->flags & ANYOF_FOLD) ) {
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] |= ~or_with->bitmap[i];
809 } /* XXXX: logic is complicated otherwise */
811 cl_anything(pRExC_state, cl);
814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
816 && (!(or_with->flags & ANYOF_FOLD)
817 || (cl->flags & ANYOF_FOLD)) ) {
820 /* OR char bitmap and class bitmap separately */
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] |= or_with->bitmap[i];
823 if (or_with->flags & ANYOF_CLASS) {
824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 cl->classflags[i] |= or_with->classflags[i];
826 cl->flags |= ANYOF_CLASS;
829 else { /* XXXX: logic is complicated, leave it along for a moment. */
830 cl_anything(pRExC_state, cl);
833 if (or_with->flags & ANYOF_EOS)
834 cl->flags |= ANYOF_EOS;
836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 ARG(cl) != ARG(or_with)) {
838 cl->flags |= ANYOF_UNICODE_ALL;
839 cl->flags &= ~ANYOF_UNICODE;
841 if (or_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags |= ANYOF_UNICODE_ALL;
843 cl->flags &= ~ANYOF_UNICODE;
847 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
849 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
855 dump_trie(trie,widecharmap,revcharmap)
856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
859 These routines dump out a trie in a somewhat readable format.
860 The _interim_ variants are used for debugging the interim
861 tables that are used to generate the final compressed
862 representation which is what dump_trie expects.
864 Part of the reason for their existance is to provide a form
865 of documentation as to how the different representations function.
870 Dumps the final compressed table form of the trie to Perl_debug_log.
871 Used for debugging make_trie().
875 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 AV *revcharmap, U32 depth)
879 SV *sv=sv_newmortal();
880 int colwidth= widecharmap ? 6 : 4;
881 GET_RE_DEBUG_FLAGS_DECL;
883 PERL_ARGS_ASSERT_DUMP_TRIE;
885 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
886 (int)depth * 2 + 2,"",
887 "Match","Base","Ofs" );
889 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
890 SV ** const tmp = av_fetch( revcharmap, state, 0);
892 PerlIO_printf( Perl_debug_log, "%*s",
894 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
895 PL_colors[0], PL_colors[1],
896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
897 PERL_PV_ESCAPE_FIRSTCHAR
902 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
903 (int)depth * 2 + 2,"");
905 for( state = 0 ; state < trie->uniquecharcount ; state++ )
906 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
907 PerlIO_printf( Perl_debug_log, "\n");
909 for( state = 1 ; state < trie->statecount ; state++ ) {
910 const U32 base = trie->states[ state ].trans.base;
912 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
914 if ( trie->states[ state ].wordnum ) {
915 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
917 PerlIO_printf( Perl_debug_log, "%6s", "" );
920 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
925 while( ( base + ofs < trie->uniquecharcount ) ||
926 ( base + ofs - trie->uniquecharcount < trie->lasttrans
927 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
930 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
932 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
933 if ( ( base + ofs >= trie->uniquecharcount ) &&
934 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
935 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
937 PerlIO_printf( Perl_debug_log, "%*"UVXf,
939 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
941 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
945 PerlIO_printf( Perl_debug_log, "]");
948 PerlIO_printf( Perl_debug_log, "\n" );
952 Dumps a fully constructed but uncompressed trie in list form.
953 List tries normally only are used for construction when the number of
954 possible chars (trie->uniquecharcount) is very high.
955 Used for debugging make_trie().
958 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
959 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 SV *sv=sv_newmortal();
964 int colwidth= widecharmap ? 6 : 4;
965 GET_RE_DEBUG_FLAGS_DECL;
967 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
969 /* print out the table precompression. */
970 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
971 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
972 "------:-----+-----------------\n" );
974 for( state=1 ; state < next_alloc ; state ++ ) {
977 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
978 (int)depth * 2 + 2,"", (UV)state );
979 if ( ! trie->states[ state ].wordnum ) {
980 PerlIO_printf( Perl_debug_log, "%5s| ","");
982 PerlIO_printf( Perl_debug_log, "W%4x| ",
983 trie->states[ state ].wordnum
986 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
987 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
989 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
991 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
992 PL_colors[0], PL_colors[1],
993 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
994 PERL_PV_ESCAPE_FIRSTCHAR
996 TRIE_LIST_ITEM(state,charid).forid,
997 (UV)TRIE_LIST_ITEM(state,charid).newstate
1000 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1001 (int)((depth * 2) + 14), "");
1004 PerlIO_printf( Perl_debug_log, "\n");
1009 Dumps a fully constructed but uncompressed trie in table form.
1010 This is the normal DFA style state transition table, with a few
1011 twists to facilitate compression later.
1012 Used for debugging make_trie().
1015 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1016 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1021 SV *sv=sv_newmortal();
1022 int colwidth= widecharmap ? 6 : 4;
1023 GET_RE_DEBUG_FLAGS_DECL;
1025 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1028 print out the table precompression so that we can do a visual check
1029 that they are identical.
1032 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1034 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1035 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1037 PerlIO_printf( Perl_debug_log, "%*s",
1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1048 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1050 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1051 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1054 PerlIO_printf( Perl_debug_log, "\n" );
1056 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1058 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1059 (int)depth * 2 + 2,"",
1060 (UV)TRIE_NODENUM( state ) );
1062 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1063 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1065 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1067 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1069 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1070 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1073 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1080 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1081 startbranch: the first branch in the whole branch sequence
1082 first : start branch of sequence of branch-exact nodes.
1083 May be the same as startbranch
1084 last : Thing following the last branch.
1085 May be the same as tail.
1086 tail : item following the branch sequence
1087 count : words in the sequence
1088 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1089 depth : indent depth
1091 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1093 A trie is an N'ary tree where the branches are determined by digital
1094 decomposition of the key. IE, at the root node you look up the 1st character and
1095 follow that branch repeat until you find the end of the branches. Nodes can be
1096 marked as "accepting" meaning they represent a complete word. Eg:
1100 would convert into the following structure. Numbers represent states, letters
1101 following numbers represent valid transitions on the letter from that state, if
1102 the number is in square brackets it represents an accepting state, otherwise it
1103 will be in parenthesis.
1105 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1109 (1) +-i->(6)-+-s->[7]
1111 +-s->(3)-+-h->(4)-+-e->[5]
1113 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1115 This shows that when matching against the string 'hers' we will begin at state 1
1116 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1117 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1118 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1119 single traverse. We store a mapping from accepting to state to which word was
1120 matched, and then when we have multiple possibilities we try to complete the
1121 rest of the regex in the order in which they occured in the alternation.
1123 The only prior NFA like behaviour that would be changed by the TRIE support is
1124 the silent ignoring of duplicate alternations which are of the form:
1126 / (DUPE|DUPE) X? (?{ ... }) Y /x
1128 Thus EVAL blocks follwing a trie may be called a different number of times with
1129 and without the optimisation. With the optimisations dupes will be silently
1130 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1131 the following demonstrates:
1133 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1135 which prints out 'word' three times, but
1137 'words'=~/(word|word|word)(?{ print $1 })S/
1139 which doesnt print it out at all. This is due to other optimisations kicking in.
1141 Example of what happens on a structural level:
1143 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1145 1: CURLYM[1] {1,32767}(18)
1156 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1157 and should turn into:
1159 1: CURLYM[1] {1,32767}(18)
1161 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1169 Cases where tail != last would be like /(?foo|bar)baz/:
1179 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1180 and would end up looking like:
1183 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1190 d = uvuni_to_utf8_flags(d, uv, 0);
1192 is the recommended Unicode-aware way of saying
1197 #define TRIE_STORE_REVCHAR \
1200 SV *zlopp = newSV(2); \
1201 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1202 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1203 SvCUR_set(zlopp, kapow - flrbbbbb); \
1206 av_push(revcharmap, zlopp); \
1208 char ooooff = (char)uvc; \
1209 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1213 #define TRIE_READ_CHAR STMT_START { \
1217 if ( foldlen > 0 ) { \
1218 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1223 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1224 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1225 foldlen -= UNISKIP( uvc ); \
1226 scan = foldbuf + UNISKIP( uvc ); \
1229 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1239 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1240 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1241 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1242 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1244 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1245 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1246 TRIE_LIST_CUR( state )++; \
1249 #define TRIE_LIST_NEW(state) STMT_START { \
1250 Newxz( trie->states[ state ].trans.list, \
1251 4, reg_trie_trans_le ); \
1252 TRIE_LIST_CUR( state ) = 1; \
1253 TRIE_LIST_LEN( state ) = 4; \
1256 #define TRIE_HANDLE_WORD(state) STMT_START { \
1257 U16 dupe= trie->states[ state ].wordnum; \
1258 regnode * const noper_next = regnext( noper ); \
1260 if (trie->wordlen) \
1261 trie->wordlen[ curword ] = wordlen; \
1263 /* store the word for dumping */ \
1265 if (OP(noper) != NOTHING) \
1266 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1268 tmp = newSVpvn_utf8( "", 0, UTF ); \
1269 av_push( trie_words, tmp ); \
1274 if ( noper_next < tail ) { \
1276 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1277 trie->jump[curword] = (U16)(noper_next - convert); \
1279 jumper = noper_next; \
1281 nextbranch= regnext(cur); \
1285 /* So it's a dupe. This means we need to maintain a */\
1286 /* linked-list from the first to the next. */\
1287 /* we only allocate the nextword buffer when there */\
1288 /* a dupe, so first time we have to do the allocation */\
1289 if (!trie->nextword) \
1290 trie->nextword = (U16 *) \
1291 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1292 while ( trie->nextword[dupe] ) \
1293 dupe= trie->nextword[dupe]; \
1294 trie->nextword[dupe]= curword; \
1296 /* we haven't inserted this word yet. */ \
1297 trie->states[ state ].wordnum = curword; \
1302 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1303 ( ( base + charid >= ucharcount \
1304 && base + charid < ubound \
1305 && state == trie->trans[ base - ucharcount + charid ].check \
1306 && trie->trans[ base - ucharcount + charid ].next ) \
1307 ? trie->trans[ base - ucharcount + charid ].next \
1308 : ( state==1 ? special : 0 ) \
1312 #define MADE_JUMP_TRIE 2
1313 #define MADE_EXACT_TRIE 4
1316 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1319 /* first pass, loop through and scan words */
1320 reg_trie_data *trie;
1321 HV *widecharmap = NULL;
1322 AV *revcharmap = newAV();
1324 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1329 regnode *jumper = NULL;
1330 regnode *nextbranch = NULL;
1331 regnode *convert = NULL;
1332 /* we just use folder as a flag in utf8 */
1333 const U8 * const folder = ( flags == EXACTF
1335 : ( flags == EXACTFL
1342 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1343 AV *trie_words = NULL;
1344 /* along with revcharmap, this only used during construction but both are
1345 * useful during debugging so we store them in the struct when debugging.
1348 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1349 STRLEN trie_charcount=0;
1351 SV *re_trie_maxbuff;
1352 GET_RE_DEBUG_FLAGS_DECL;
1354 PERL_ARGS_ASSERT_MAKE_TRIE;
1356 PERL_UNUSED_ARG(depth);
1359 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1361 trie->startstate = 1;
1362 trie->wordcount = word_count;
1363 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1364 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1365 if (!(UTF && folder))
1366 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1368 trie_words = newAV();
1371 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1372 if (!SvIOK(re_trie_maxbuff)) {
1373 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1376 PerlIO_printf( Perl_debug_log,
1377 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1378 (int)depth * 2 + 2, "",
1379 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1380 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1384 /* Find the node we are going to overwrite */
1385 if ( first == startbranch && OP( last ) != BRANCH ) {
1386 /* whole branch chain */
1389 /* branch sub-chain */
1390 convert = NEXTOPER( first );
1393 /* -- First loop and Setup --
1395 We first traverse the branches and scan each word to determine if it
1396 contains widechars, and how many unique chars there are, this is
1397 important as we have to build a table with at least as many columns as we
1400 We use an array of integers to represent the character codes 0..255
1401 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1402 native representation of the character value as the key and IV's for the
1405 *TODO* If we keep track of how many times each character is used we can
1406 remap the columns so that the table compression later on is more
1407 efficient in terms of memory by ensuring most common value is in the
1408 middle and the least common are on the outside. IMO this would be better
1409 than a most to least common mapping as theres a decent chance the most
1410 common letter will share a node with the least common, meaning the node
1411 will not be compressable. With a middle is most common approach the worst
1412 case is when we have the least common nodes twice.
1416 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1417 regnode * const noper = NEXTOPER( cur );
1418 const U8 *uc = (U8*)STRING( noper );
1419 const U8 * const e = uc + STR_LEN( noper );
1421 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1422 const U8 *scan = (U8*)NULL;
1423 U32 wordlen = 0; /* required init */
1425 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1427 if (OP(noper) == NOTHING) {
1431 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1432 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1433 regardless of encoding */
1435 for ( ; uc < e ; uc += len ) {
1436 TRIE_CHARCOUNT(trie)++;
1440 if ( !trie->charmap[ uvc ] ) {
1441 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1443 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1447 /* store the codepoint in the bitmap, and if its ascii
1448 also store its folded equivelent. */
1449 TRIE_BITMAP_SET(trie,uvc);
1451 /* store the folded codepoint */
1452 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1455 /* store first byte of utf8 representation of
1456 codepoints in the 127 < uvc < 256 range */
1457 if (127 < uvc && uvc < 192) {
1458 TRIE_BITMAP_SET(trie,194);
1459 } else if (191 < uvc ) {
1460 TRIE_BITMAP_SET(trie,195);
1461 /* && uvc < 256 -- we know uvc is < 256 already */
1464 set_bit = 0; /* We've done our bit :-) */
1469 widecharmap = newHV();
1471 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1474 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1476 if ( !SvTRUE( *svpp ) ) {
1477 sv_setiv( *svpp, ++trie->uniquecharcount );
1482 if( cur == first ) {
1485 } else if (chars < trie->minlen) {
1487 } else if (chars > trie->maxlen) {
1491 } /* end first pass */
1492 DEBUG_TRIE_COMPILE_r(
1493 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1494 (int)depth * 2 + 2,"",
1495 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1496 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1497 (int)trie->minlen, (int)trie->maxlen )
1499 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1502 We now know what we are dealing with in terms of unique chars and
1503 string sizes so we can calculate how much memory a naive
1504 representation using a flat table will take. If it's over a reasonable
1505 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1506 conservative but potentially much slower representation using an array
1509 At the end we convert both representations into the same compressed
1510 form that will be used in regexec.c for matching with. The latter
1511 is a form that cannot be used to construct with but has memory
1512 properties similar to the list form and access properties similar
1513 to the table form making it both suitable for fast searches and
1514 small enough that its feasable to store for the duration of a program.
1516 See the comment in the code where the compressed table is produced
1517 inplace from the flat tabe representation for an explanation of how
1518 the compression works.
1523 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1525 Second Pass -- Array Of Lists Representation
1527 Each state will be represented by a list of charid:state records
1528 (reg_trie_trans_le) the first such element holds the CUR and LEN
1529 points of the allocated array. (See defines above).
1531 We build the initial structure using the lists, and then convert
1532 it into the compressed table form which allows faster lookups
1533 (but cant be modified once converted).
1536 STRLEN transcount = 1;
1538 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1539 "%*sCompiling trie using list compiler\n",
1540 (int)depth * 2 + 2, ""));
1542 trie->states = (reg_trie_state *)
1543 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1544 sizeof(reg_trie_state) );
1548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1550 regnode * const noper = NEXTOPER( cur );
1551 U8 *uc = (U8*)STRING( noper );
1552 const U8 * const e = uc + STR_LEN( noper );
1553 U32 state = 1; /* required init */
1554 U16 charid = 0; /* sanity init */
1555 U8 *scan = (U8*)NULL; /* sanity init */
1556 STRLEN foldlen = 0; /* required init */
1557 U32 wordlen = 0; /* required init */
1558 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1560 if (OP(noper) != NOTHING) {
1561 for ( ; uc < e ; uc += len ) {
1566 charid = trie->charmap[ uvc ];
1568 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1572 charid=(U16)SvIV( *svpp );
1575 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1582 if ( !trie->states[ state ].trans.list ) {
1583 TRIE_LIST_NEW( state );
1585 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1586 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1587 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1592 newstate = next_alloc++;
1593 TRIE_LIST_PUSH( state, charid, newstate );
1598 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1602 TRIE_HANDLE_WORD(state);
1604 } /* end second pass */
1606 /* next alloc is the NEXT state to be allocated */
1607 trie->statecount = next_alloc;
1608 trie->states = (reg_trie_state *)
1609 PerlMemShared_realloc( trie->states,
1611 * sizeof(reg_trie_state) );
1613 /* and now dump it out before we compress it */
1614 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1615 revcharmap, next_alloc,
1619 trie->trans = (reg_trie_trans *)
1620 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1627 for( state=1 ; state < next_alloc ; state ++ ) {
1631 DEBUG_TRIE_COMPILE_MORE_r(
1632 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1636 if (trie->states[state].trans.list) {
1637 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1641 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1642 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1643 if ( forid < minid ) {
1645 } else if ( forid > maxid ) {
1649 if ( transcount < tp + maxid - minid + 1) {
1651 trie->trans = (reg_trie_trans *)
1652 PerlMemShared_realloc( trie->trans,
1654 * sizeof(reg_trie_trans) );
1655 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1657 base = trie->uniquecharcount + tp - minid;
1658 if ( maxid == minid ) {
1660 for ( ; zp < tp ; zp++ ) {
1661 if ( ! trie->trans[ zp ].next ) {
1662 base = trie->uniquecharcount + zp - minid;
1663 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1664 trie->trans[ zp ].check = state;
1670 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1671 trie->trans[ tp ].check = state;
1676 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1677 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1678 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1679 trie->trans[ tid ].check = state;
1681 tp += ( maxid - minid + 1 );
1683 Safefree(trie->states[ state ].trans.list);
1686 DEBUG_TRIE_COMPILE_MORE_r(
1687 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1690 trie->states[ state ].trans.base=base;
1692 trie->lasttrans = tp + 1;
1696 Second Pass -- Flat Table Representation.
1698 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1699 We know that we will need Charcount+1 trans at most to store the data
1700 (one row per char at worst case) So we preallocate both structures
1701 assuming worst case.
1703 We then construct the trie using only the .next slots of the entry
1706 We use the .check field of the first entry of the node temporarily to
1707 make compression both faster and easier by keeping track of how many non
1708 zero fields are in the node.
1710 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1713 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1714 number representing the first entry of the node, and state as a
1715 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1716 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1717 are 2 entrys per node. eg:
1725 The table is internally in the right hand, idx form. However as we also
1726 have to deal with the states array which is indexed by nodenum we have to
1727 use TRIE_NODENUM() to convert.
1730 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1731 "%*sCompiling trie using table compiler\n",
1732 (int)depth * 2 + 2, ""));
1734 trie->trans = (reg_trie_trans *)
1735 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1736 * trie->uniquecharcount + 1,
1737 sizeof(reg_trie_trans) );
1738 trie->states = (reg_trie_state *)
1739 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1740 sizeof(reg_trie_state) );
1741 next_alloc = trie->uniquecharcount + 1;
1744 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1746 regnode * const noper = NEXTOPER( cur );
1747 const U8 *uc = (U8*)STRING( noper );
1748 const U8 * const e = uc + STR_LEN( noper );
1750 U32 state = 1; /* required init */
1752 U16 charid = 0; /* sanity init */
1753 U32 accept_state = 0; /* sanity init */
1754 U8 *scan = (U8*)NULL; /* sanity init */
1756 STRLEN foldlen = 0; /* required init */
1757 U32 wordlen = 0; /* required init */
1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1760 if ( OP(noper) != NOTHING ) {
1761 for ( ; uc < e ; uc += len ) {
1766 charid = trie->charmap[ uvc ];
1768 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1769 charid = svpp ? (U16)SvIV(*svpp) : 0;
1773 if ( !trie->trans[ state + charid ].next ) {
1774 trie->trans[ state + charid ].next = next_alloc;
1775 trie->trans[ state ].check++;
1776 next_alloc += trie->uniquecharcount;
1778 state = trie->trans[ state + charid ].next;
1780 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1782 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1785 accept_state = TRIE_NODENUM( state );
1786 TRIE_HANDLE_WORD(accept_state);
1788 } /* end second pass */
1790 /* and now dump it out before we compress it */
1791 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1793 next_alloc, depth+1));
1797 * Inplace compress the table.*
1799 For sparse data sets the table constructed by the trie algorithm will
1800 be mostly 0/FAIL transitions or to put it another way mostly empty.
1801 (Note that leaf nodes will not contain any transitions.)
1803 This algorithm compresses the tables by eliminating most such
1804 transitions, at the cost of a modest bit of extra work during lookup:
1806 - Each states[] entry contains a .base field which indicates the
1807 index in the state[] array wheres its transition data is stored.
1809 - If .base is 0 there are no valid transitions from that node.
1811 - If .base is nonzero then charid is added to it to find an entry in
1814 -If trans[states[state].base+charid].check!=state then the
1815 transition is taken to be a 0/Fail transition. Thus if there are fail
1816 transitions at the front of the node then the .base offset will point
1817 somewhere inside the previous nodes data (or maybe even into a node
1818 even earlier), but the .check field determines if the transition is
1822 The following process inplace converts the table to the compressed
1823 table: We first do not compress the root node 1,and mark its all its
1824 .check pointers as 1 and set its .base pointer as 1 as well. This
1825 allows to do a DFA construction from the compressed table later, and
1826 ensures that any .base pointers we calculate later are greater than
1829 - We set 'pos' to indicate the first entry of the second node.
1831 - We then iterate over the columns of the node, finding the first and
1832 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1833 and set the .check pointers accordingly, and advance pos
1834 appropriately and repreat for the next node. Note that when we copy
1835 the next pointers we have to convert them from the original
1836 NODEIDX form to NODENUM form as the former is not valid post
1839 - If a node has no transitions used we mark its base as 0 and do not
1840 advance the pos pointer.
1842 - If a node only has one transition we use a second pointer into the
1843 structure to fill in allocated fail transitions from other states.
1844 This pointer is independent of the main pointer and scans forward
1845 looking for null transitions that are allocated to a state. When it
1846 finds one it writes the single transition into the "hole". If the
1847 pointer doesnt find one the single transition is appended as normal.
1849 - Once compressed we can Renew/realloc the structures to release the
1852 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1853 specifically Fig 3.47 and the associated pseudocode.
1857 const U32 laststate = TRIE_NODENUM( next_alloc );
1860 trie->statecount = laststate;
1862 for ( state = 1 ; state < laststate ; state++ ) {
1864 const U32 stateidx = TRIE_NODEIDX( state );
1865 const U32 o_used = trie->trans[ stateidx ].check;
1866 U32 used = trie->trans[ stateidx ].check;
1867 trie->trans[ stateidx ].check = 0;
1869 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1870 if ( flag || trie->trans[ stateidx + charid ].next ) {
1871 if ( trie->trans[ stateidx + charid ].next ) {
1873 for ( ; zp < pos ; zp++ ) {
1874 if ( ! trie->trans[ zp ].next ) {
1878 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1879 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ zp ].check = state;
1881 if ( ++zp > pos ) pos = zp;
1888 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1890 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1891 trie->trans[ pos ].check = state;
1896 trie->lasttrans = pos + 1;
1897 trie->states = (reg_trie_state *)
1898 PerlMemShared_realloc( trie->states, laststate
1899 * sizeof(reg_trie_state) );
1900 DEBUG_TRIE_COMPILE_MORE_r(
1901 PerlIO_printf( Perl_debug_log,
1902 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1903 (int)depth * 2 + 2,"",
1904 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1907 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1910 } /* end table compress */
1912 DEBUG_TRIE_COMPILE_MORE_r(
1913 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1914 (int)depth * 2 + 2, "",
1915 (UV)trie->statecount,
1916 (UV)trie->lasttrans)
1918 /* resize the trans array to remove unused space */
1919 trie->trans = (reg_trie_trans *)
1920 PerlMemShared_realloc( trie->trans, trie->lasttrans
1921 * sizeof(reg_trie_trans) );
1923 /* and now dump out the compressed format */
1924 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1926 { /* Modify the program and insert the new TRIE node*/
1927 U8 nodetype =(U8)(flags & 0xFF);
1931 regnode *optimize = NULL;
1932 #ifdef RE_TRACK_PATTERN_OFFSETS
1935 U32 mjd_nodelen = 0;
1936 #endif /* RE_TRACK_PATTERN_OFFSETS */
1937 #endif /* DEBUGGING */
1939 This means we convert either the first branch or the first Exact,
1940 depending on whether the thing following (in 'last') is a branch
1941 or not and whther first is the startbranch (ie is it a sub part of
1942 the alternation or is it the whole thing.)
1943 Assuming its a sub part we conver the EXACT otherwise we convert
1944 the whole branch sequence, including the first.
1946 /* Find the node we are going to overwrite */
1947 if ( first != startbranch || OP( last ) == BRANCH ) {
1948 /* branch sub-chain */
1949 NEXT_OFF( first ) = (U16)(last - first);
1950 #ifdef RE_TRACK_PATTERN_OFFSETS
1952 mjd_offset= Node_Offset((convert));
1953 mjd_nodelen= Node_Length((convert));
1956 /* whole branch chain */
1958 #ifdef RE_TRACK_PATTERN_OFFSETS
1961 const regnode *nop = NEXTOPER( convert );
1962 mjd_offset= Node_Offset((nop));
1963 mjd_nodelen= Node_Length((nop));
1967 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1968 (int)depth * 2 + 2, "",
1969 (UV)mjd_offset, (UV)mjd_nodelen)
1972 /* But first we check to see if there is a common prefix we can
1973 split out as an EXACT and put in front of the TRIE node. */
1974 trie->startstate= 1;
1975 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1977 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1981 const U32 base = trie->states[ state ].trans.base;
1983 if ( trie->states[state].wordnum )
1986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1987 if ( ( base + ofs >= trie->uniquecharcount ) &&
1988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1991 if ( ++count > 1 ) {
1992 SV **tmp = av_fetch( revcharmap, ofs, 0);
1993 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1994 if ( state == 1 ) break;
1996 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1998 PerlIO_printf(Perl_debug_log,
1999 "%*sNew Start State=%"UVuf" Class: [",
2000 (int)depth * 2 + 2, "",
2003 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2004 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2006 TRIE_BITMAP_SET(trie,*ch);
2008 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2010 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2014 TRIE_BITMAP_SET(trie,*ch);
2016 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2023 SV **tmp = av_fetch( revcharmap, idx, 0);
2025 char *ch = SvPV( *tmp, len );
2027 SV *sv=sv_newmortal();
2028 PerlIO_printf( Perl_debug_log,
2029 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2030 (int)depth * 2 + 2, "",
2032 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2033 PL_colors[0], PL_colors[1],
2034 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2035 PERL_PV_ESCAPE_FIRSTCHAR
2040 OP( convert ) = nodetype;
2041 str=STRING(convert);
2044 STR_LEN(convert) += len;
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2056 regnode *n = convert+NODE_SZ_STR(convert);
2057 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2058 trie->startstate = state;
2059 trie->minlen -= (state - 1);
2060 trie->maxlen -= (state - 1);
2062 /* At least the UNICOS C compiler choked on this
2063 * being argument to DEBUG_r(), so let's just have
2066 #ifdef PERL_EXT_RE_BUILD
2072 regnode *fix = convert;
2073 U32 word = trie->wordcount;
2075 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2076 while( ++fix < n ) {
2077 Set_Node_Offset_Length(fix, 0, 0);
2080 SV ** const tmp = av_fetch( trie_words, word, 0 );
2082 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2083 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2093 NEXT_OFF(convert) = (U16)(tail - convert);
2094 DEBUG_r(optimize= n);
2100 if ( trie->maxlen ) {
2101 NEXT_OFF( convert ) = (U16)(tail - convert);
2102 ARG_SET( convert, data_slot );
2103 /* Store the offset to the first unabsorbed branch in
2104 jump[0], which is otherwise unused by the jump logic.
2105 We use this when dumping a trie and during optimisation. */
2107 trie->jump[0] = (U16)(nextbranch - convert);
2110 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2111 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2113 OP( convert ) = TRIEC;
2114 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2115 PerlMemShared_free(trie->bitmap);
2118 OP( convert ) = TRIE;
2120 /* store the type in the flags */
2121 convert->flags = nodetype;
2125 + regarglen[ OP( convert ) ];
2127 /* XXX We really should free up the resource in trie now,
2128 as we won't use them - (which resources?) dmq */
2130 /* needed for dumping*/
2131 DEBUG_r(if (optimize) {
2132 regnode *opt = convert;
2134 while ( ++opt < optimize) {
2135 Set_Node_Offset_Length(opt,0,0);
2138 Try to clean up some of the debris left after the
2141 while( optimize < jumper ) {
2142 mjd_nodelen += Node_Length((optimize));
2143 OP( optimize ) = OPTIMIZED;
2144 Set_Node_Offset_Length(optimize,0,0);
2147 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2149 } /* end node insert */
2150 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2151 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2153 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2154 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2156 SvREFCNT_dec(revcharmap);
2160 : trie->startstate>1
2166 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2168 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2170 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2171 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2174 We find the fail state for each state in the trie, this state is the longest proper
2175 suffix of the current states 'word' that is also a proper prefix of another word in our
2176 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2177 the DFA not to have to restart after its tried and failed a word at a given point, it
2178 simply continues as though it had been matching the other word in the first place.
2180 'abcdgu'=~/abcdefg|cdgu/
2181 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2182 fail, which would bring use to the state representing 'd' in the second word where we would
2183 try 'g' and succeed, prodceding to match 'cdgu'.
2185 /* add a fail transition */
2186 const U32 trie_offset = ARG(source);
2187 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2189 const U32 ucharcount = trie->uniquecharcount;
2190 const U32 numstates = trie->statecount;
2191 const U32 ubound = trie->lasttrans + ucharcount;
2195 U32 base = trie->states[ 1 ].trans.base;
2198 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2199 GET_RE_DEBUG_FLAGS_DECL;
2201 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2203 PERL_UNUSED_ARG(depth);
2207 ARG_SET( stclass, data_slot );
2208 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2209 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2210 aho->trie=trie_offset;
2211 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2212 Copy( trie->states, aho->states, numstates, reg_trie_state );
2213 Newxz( q, numstates, U32);
2214 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2217 /* initialize fail[0..1] to be 1 so that we always have
2218 a valid final fail state */
2219 fail[ 0 ] = fail[ 1 ] = 1;
2221 for ( charid = 0; charid < ucharcount ; charid++ ) {
2222 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2224 q[ q_write ] = newstate;
2225 /* set to point at the root */
2226 fail[ q[ q_write++ ] ]=1;
2229 while ( q_read < q_write) {
2230 const U32 cur = q[ q_read++ % numstates ];
2231 base = trie->states[ cur ].trans.base;
2233 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2234 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2236 U32 fail_state = cur;
2239 fail_state = fail[ fail_state ];
2240 fail_base = aho->states[ fail_state ].trans.base;
2241 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2243 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2244 fail[ ch_state ] = fail_state;
2245 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2247 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2249 q[ q_write++ % numstates] = ch_state;
2253 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2254 when we fail in state 1, this allows us to use the
2255 charclass scan to find a valid start char. This is based on the principle
2256 that theres a good chance the string being searched contains lots of stuff
2257 that cant be a start char.
2259 fail[ 0 ] = fail[ 1 ] = 0;
2260 DEBUG_TRIE_COMPILE_r({
2261 PerlIO_printf(Perl_debug_log,
2262 "%*sStclass Failtable (%"UVuf" states): 0",
2263 (int)(depth * 2), "", (UV)numstates
2265 for( q_read=1; q_read<numstates; q_read++ ) {
2266 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2268 PerlIO_printf(Perl_debug_log, "\n");
2271 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2276 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2277 * These need to be revisited when a newer toolchain becomes available.
2279 #if defined(__sparc64__) && defined(__GNUC__)
2280 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2281 # undef SPARC64_GCC_WORKAROUND
2282 # define SPARC64_GCC_WORKAROUND 1
2286 #define DEBUG_PEEP(str,scan,depth) \
2287 DEBUG_OPTIMISE_r({if (scan){ \
2288 SV * const mysv=sv_newmortal(); \
2289 regnode *Next = regnext(scan); \
2290 regprop(RExC_rx, mysv, scan); \
2291 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2292 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2293 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2300 #define JOIN_EXACT(scan,min,flags) \
2301 if (PL_regkind[OP(scan)] == EXACT) \
2302 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2305 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2306 /* Merge several consecutive EXACTish nodes into one. */
2307 regnode *n = regnext(scan);
2309 regnode *next = scan + NODE_SZ_STR(scan);
2313 regnode *stop = scan;
2314 GET_RE_DEBUG_FLAGS_DECL;
2316 PERL_UNUSED_ARG(depth);
2319 PERL_ARGS_ASSERT_JOIN_EXACT;
2320 #ifndef EXPERIMENTAL_INPLACESCAN
2321 PERL_UNUSED_ARG(flags);
2322 PERL_UNUSED_ARG(val);
2324 DEBUG_PEEP("join",scan,depth);
2326 /* Skip NOTHING, merge EXACT*. */
2328 ( PL_regkind[OP(n)] == NOTHING ||
2329 (stringok && (OP(n) == OP(scan))))
2331 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2333 if (OP(n) == TAIL || n > next)
2335 if (PL_regkind[OP(n)] == NOTHING) {
2336 DEBUG_PEEP("skip:",n,depth);
2337 NEXT_OFF(scan) += NEXT_OFF(n);
2338 next = n + NODE_STEP_REGNODE;
2345 else if (stringok) {
2346 const unsigned int oldl = STR_LEN(scan);
2347 regnode * const nnext = regnext(n);
2349 DEBUG_PEEP("merg",n,depth);
2352 if (oldl + STR_LEN(n) > U8_MAX)
2354 NEXT_OFF(scan) += NEXT_OFF(n);
2355 STR_LEN(scan) += STR_LEN(n);
2356 next = n + NODE_SZ_STR(n);
2357 /* Now we can overwrite *n : */
2358 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2366 #ifdef EXPERIMENTAL_INPLACESCAN
2367 if (flags && !NEXT_OFF(n)) {
2368 DEBUG_PEEP("atch", val, depth);
2369 if (reg_off_by_arg[OP(n)]) {
2370 ARG_SET(n, val - n);
2373 NEXT_OFF(n) = val - n;
2380 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2382 Two problematic code points in Unicode casefolding of EXACT nodes:
2384 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2385 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2391 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2392 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2394 This means that in case-insensitive matching (or "loose matching",
2395 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2396 length of the above casefolded versions) can match a target string
2397 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2398 This would rather mess up the minimum length computation.
2400 What we'll do is to look for the tail four bytes, and then peek
2401 at the preceding two bytes to see whether we need to decrease
2402 the minimum length by four (six minus two).
2404 Thanks to the design of UTF-8, there cannot be false matches:
2405 A sequence of valid UTF-8 bytes cannot be a subsequence of
2406 another valid sequence of UTF-8 bytes.
2409 char * const s0 = STRING(scan), *s, *t;
2410 char * const s1 = s0 + STR_LEN(scan) - 1;
2411 char * const s2 = s1 - 4;
2412 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2413 const char t0[] = "\xaf\x49\xaf\x42";
2415 const char t0[] = "\xcc\x88\xcc\x81";
2417 const char * const t1 = t0 + 3;
2420 s < s2 && (t = ninstr(s, s1, t0, t1));
2423 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2424 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2426 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2427 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2435 n = scan + NODE_SZ_STR(scan);
2437 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2444 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2448 /* REx optimizer. Converts nodes into quickier variants "in place".
2449 Finds fixed substrings. */
2451 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2452 to the position after last scanned or to NULL. */
2454 #define INIT_AND_WITHP \
2455 assert(!and_withp); \
2456 Newx(and_withp,1,struct regnode_charclass_class); \
2457 SAVEFREEPV(and_withp)
2459 /* this is a chain of data about sub patterns we are processing that
2460 need to be handled seperately/specially in study_chunk. Its so
2461 we can simulate recursion without losing state. */
2463 typedef struct scan_frame {
2464 regnode *last; /* last node to process in this frame */
2465 regnode *next; /* next node to process when last is reached */
2466 struct scan_frame *prev; /*previous frame*/
2467 I32 stop; /* what stopparen do we use */
2471 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2473 #define CASE_SYNST_FNC(nAmE) \
2475 if (flags & SCF_DO_STCLASS_AND) { \
2476 for (value = 0; value < 256; value++) \
2477 if (!is_ ## nAmE ## _cp(value)) \
2478 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2481 for (value = 0; value < 256; value++) \
2482 if (is_ ## nAmE ## _cp(value)) \
2483 ANYOF_BITMAP_SET(data->start_class, value); \
2487 if (flags & SCF_DO_STCLASS_AND) { \
2488 for (value = 0; value < 256; value++) \
2489 if (is_ ## nAmE ## _cp(value)) \
2490 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2493 for (value = 0; value < 256; value++) \
2494 if (!is_ ## nAmE ## _cp(value)) \
2495 ANYOF_BITMAP_SET(data->start_class, value); \
2502 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2503 I32 *minlenp, I32 *deltap,
2508 struct regnode_charclass_class *and_withp,
2509 U32 flags, U32 depth)
2510 /* scanp: Start here (read-write). */
2511 /* deltap: Write maxlen-minlen here. */
2512 /* last: Stop before this one. */
2513 /* data: string data about the pattern */
2514 /* stopparen: treat close N as END */
2515 /* recursed: which subroutines have we recursed into */
2516 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2519 I32 min = 0, pars = 0, code;
2520 regnode *scan = *scanp, *next;
2522 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2523 int is_inf_internal = 0; /* The studied chunk is infinite */
2524 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2525 scan_data_t data_fake;
2526 SV *re_trie_maxbuff = NULL;
2527 regnode *first_non_open = scan;
2528 I32 stopmin = I32_MAX;
2529 scan_frame *frame = NULL;
2530 GET_RE_DEBUG_FLAGS_DECL;
2532 PERL_ARGS_ASSERT_STUDY_CHUNK;
2535 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2539 while (first_non_open && OP(first_non_open) == OPEN)
2540 first_non_open=regnext(first_non_open);
2545 while ( scan && OP(scan) != END && scan < last ){
2546 /* Peephole optimizer: */
2547 DEBUG_STUDYDATA("Peep:", data,depth);
2548 DEBUG_PEEP("Peep",scan,depth);
2549 JOIN_EXACT(scan,&min,0);
2551 /* Follow the next-chain of the current node and optimize
2552 away all the NOTHINGs from it. */
2553 if (OP(scan) != CURLYX) {
2554 const int max = (reg_off_by_arg[OP(scan)]
2556 /* I32 may be smaller than U16 on CRAYs! */
2557 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2558 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2562 /* Skip NOTHING and LONGJMP. */
2563 while ((n = regnext(n))
2564 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2565 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2566 && off + noff < max)
2568 if (reg_off_by_arg[OP(scan)])
2571 NEXT_OFF(scan) = off;
2576 /* The principal pseudo-switch. Cannot be a switch, since we
2577 look into several different things. */
2578 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2579 || OP(scan) == IFTHEN) {
2580 next = regnext(scan);
2582 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2584 if (OP(next) == code || code == IFTHEN) {
2585 /* NOTE - There is similar code to this block below for handling
2586 TRIE nodes on a re-study. If you change stuff here check there
2588 I32 max1 = 0, min1 = I32_MAX, num = 0;
2589 struct regnode_charclass_class accum;
2590 regnode * const startbranch=scan;
2592 if (flags & SCF_DO_SUBSTR)
2593 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2594 if (flags & SCF_DO_STCLASS)
2595 cl_init_zero(pRExC_state, &accum);
2597 while (OP(scan) == code) {
2598 I32 deltanext, minnext, f = 0, fake;
2599 struct regnode_charclass_class this_class;
2602 data_fake.flags = 0;
2604 data_fake.whilem_c = data->whilem_c;
2605 data_fake.last_closep = data->last_closep;
2608 data_fake.last_closep = &fake;
2610 data_fake.pos_delta = delta;
2611 next = regnext(scan);
2612 scan = NEXTOPER(scan);
2614 scan = NEXTOPER(scan);
2615 if (flags & SCF_DO_STCLASS) {
2616 cl_init(pRExC_state, &this_class);
2617 data_fake.start_class = &this_class;
2618 f = SCF_DO_STCLASS_AND;
2620 if (flags & SCF_WHILEM_VISITED_POS)
2621 f |= SCF_WHILEM_VISITED_POS;
2623 /* we suppose the run is continuous, last=next...*/
2624 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2626 stopparen, recursed, NULL, f,depth+1);
2629 if (max1 < minnext + deltanext)
2630 max1 = minnext + deltanext;
2631 if (deltanext == I32_MAX)
2632 is_inf = is_inf_internal = 1;
2634 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2636 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2637 if ( stopmin > minnext)
2638 stopmin = min + min1;
2639 flags &= ~SCF_DO_SUBSTR;
2641 data->flags |= SCF_SEEN_ACCEPT;
2644 if (data_fake.flags & SF_HAS_EVAL)
2645 data->flags |= SF_HAS_EVAL;
2646 data->whilem_c = data_fake.whilem_c;
2648 if (flags & SCF_DO_STCLASS)
2649 cl_or(pRExC_state, &accum, &this_class);
2651 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2653 if (flags & SCF_DO_SUBSTR) {
2654 data->pos_min += min1;
2655 data->pos_delta += max1 - min1;
2656 if (max1 != min1 || is_inf)
2657 data->longest = &(data->longest_float);
2660 delta += max1 - min1;
2661 if (flags & SCF_DO_STCLASS_OR) {
2662 cl_or(pRExC_state, data->start_class, &accum);
2664 cl_and(data->start_class, and_withp);
2665 flags &= ~SCF_DO_STCLASS;
2668 else if (flags & SCF_DO_STCLASS_AND) {
2670 cl_and(data->start_class, &accum);
2671 flags &= ~SCF_DO_STCLASS;
2674 /* Switch to OR mode: cache the old value of
2675 * data->start_class */
2677 StructCopy(data->start_class, and_withp,
2678 struct regnode_charclass_class);
2679 flags &= ~SCF_DO_STCLASS_AND;
2680 StructCopy(&accum, data->start_class,
2681 struct regnode_charclass_class);
2682 flags |= SCF_DO_STCLASS_OR;
2683 data->start_class->flags |= ANYOF_EOS;
2687 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2690 Assuming this was/is a branch we are dealing with: 'scan' now
2691 points at the item that follows the branch sequence, whatever
2692 it is. We now start at the beginning of the sequence and look
2699 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2701 If we can find such a subseqence we need to turn the first
2702 element into a trie and then add the subsequent branch exact
2703 strings to the trie.
2707 1. patterns where the whole set of branch can be converted.
2709 2. patterns where only a subset can be converted.
2711 In case 1 we can replace the whole set with a single regop
2712 for the trie. In case 2 we need to keep the start and end
2715 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2716 becomes BRANCH TRIE; BRANCH X;
2718 There is an additional case, that being where there is a
2719 common prefix, which gets split out into an EXACT like node
2720 preceding the TRIE node.
2722 If x(1..n)==tail then we can do a simple trie, if not we make
2723 a "jump" trie, such that when we match the appropriate word
2724 we "jump" to the appopriate tail node. Essentailly we turn
2725 a nested if into a case structure of sorts.
2730 if (!re_trie_maxbuff) {
2731 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2732 if (!SvIOK(re_trie_maxbuff))
2733 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2735 if ( SvIV(re_trie_maxbuff)>=0 ) {
2737 regnode *first = (regnode *)NULL;
2738 regnode *last = (regnode *)NULL;
2739 regnode *tail = scan;
2744 SV * const mysv = sv_newmortal(); /* for dumping */
2746 /* var tail is used because there may be a TAIL
2747 regop in the way. Ie, the exacts will point to the
2748 thing following the TAIL, but the last branch will
2749 point at the TAIL. So we advance tail. If we
2750 have nested (?:) we may have to move through several
2754 while ( OP( tail ) == TAIL ) {
2755 /* this is the TAIL generated by (?:) */
2756 tail = regnext( tail );
2761 regprop(RExC_rx, mysv, tail );
2762 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2763 (int)depth * 2 + 2, "",
2764 "Looking for TRIE'able sequences. Tail node is: ",
2765 SvPV_nolen_const( mysv )
2771 step through the branches, cur represents each
2772 branch, noper is the first thing to be matched
2773 as part of that branch and noper_next is the
2774 regnext() of that node. if noper is an EXACT
2775 and noper_next is the same as scan (our current
2776 position in the regex) then the EXACT branch is
2777 a possible optimization target. Once we have
2778 two or more consequetive such branches we can
2779 create a trie of the EXACT's contents and stich
2780 it in place. If the sequence represents all of
2781 the branches we eliminate the whole thing and
2782 replace it with a single TRIE. If it is a
2783 subsequence then we need to stitch it in. This
2784 means the first branch has to remain, and needs
2785 to be repointed at the item on the branch chain
2786 following the last branch optimized. This could
2787 be either a BRANCH, in which case the
2788 subsequence is internal, or it could be the
2789 item following the branch sequence in which
2790 case the subsequence is at the end.
2794 /* dont use tail as the end marker for this traverse */
2795 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2796 regnode * const noper = NEXTOPER( cur );
2797 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2798 regnode * const noper_next = regnext( noper );
2802 regprop(RExC_rx, mysv, cur);
2803 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2804 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2806 regprop(RExC_rx, mysv, noper);
2807 PerlIO_printf( Perl_debug_log, " -> %s",
2808 SvPV_nolen_const(mysv));
2811 regprop(RExC_rx, mysv, noper_next );
2812 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2813 SvPV_nolen_const(mysv));
2815 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2816 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2818 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2819 : PL_regkind[ OP( noper ) ] == EXACT )
2820 || OP(noper) == NOTHING )
2822 && noper_next == tail
2827 if ( !first || optype == NOTHING ) {
2828 if (!first) first = cur;
2829 optype = OP( noper );
2835 Currently we do not believe that the trie logic can
2836 handle case insensitive matching properly when the
2837 pattern is not unicode (thus forcing unicode semantics).
2839 If/when this is fixed the following define can be swapped
2840 in below to fully enable trie logic.
2842 #define TRIE_TYPE_IS_SAFE 1
2845 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2847 if ( last && TRIE_TYPE_IS_SAFE ) {
2848 make_trie( pRExC_state,
2849 startbranch, first, cur, tail, count,
2852 if ( PL_regkind[ OP( noper ) ] == EXACT
2854 && noper_next == tail
2859 optype = OP( noper );
2869 regprop(RExC_rx, mysv, cur);
2870 PerlIO_printf( Perl_debug_log,
2871 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2872 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2876 if ( last && TRIE_TYPE_IS_SAFE ) {
2877 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2878 #ifdef TRIE_STUDY_OPT
2879 if ( ((made == MADE_EXACT_TRIE &&
2880 startbranch == first)
2881 || ( first_non_open == first )) &&
2883 flags |= SCF_TRIE_RESTUDY;
2884 if ( startbranch == first
2887 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2897 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2898 scan = NEXTOPER(NEXTOPER(scan));
2899 } else /* single branch is optimized. */
2900 scan = NEXTOPER(scan);
2902 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2903 scan_frame *newframe = NULL;
2908 if (OP(scan) != SUSPEND) {
2909 /* set the pointer */
2910 if (OP(scan) == GOSUB) {
2912 RExC_recurse[ARG2L(scan)] = scan;
2913 start = RExC_open_parens[paren-1];
2914 end = RExC_close_parens[paren-1];
2917 start = RExC_rxi->program + 1;
2921 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2922 SAVEFREEPV(recursed);
2924 if (!PAREN_TEST(recursed,paren+1)) {
2925 PAREN_SET(recursed,paren+1);
2926 Newx(newframe,1,scan_frame);
2928 if (flags & SCF_DO_SUBSTR) {
2929 SCAN_COMMIT(pRExC_state,data,minlenp);
2930 data->longest = &(data->longest_float);
2932 is_inf = is_inf_internal = 1;
2933 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2934 cl_anything(pRExC_state, data->start_class);
2935 flags &= ~SCF_DO_STCLASS;
2938 Newx(newframe,1,scan_frame);
2941 end = regnext(scan);
2946 SAVEFREEPV(newframe);
2947 newframe->next = regnext(scan);
2948 newframe->last = last;
2949 newframe->stop = stopparen;
2950 newframe->prev = frame;
2960 else if (OP(scan) == EXACT) {
2961 I32 l = STR_LEN(scan);
2964 const U8 * const s = (U8*)STRING(scan);
2965 l = utf8_length(s, s + l);
2966 uc = utf8_to_uvchr(s, NULL);
2968 uc = *((U8*)STRING(scan));
2971 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2972 /* The code below prefers earlier match for fixed
2973 offset, later match for variable offset. */
2974 if (data->last_end == -1) { /* Update the start info. */
2975 data->last_start_min = data->pos_min;
2976 data->last_start_max = is_inf
2977 ? I32_MAX : data->pos_min + data->pos_delta;
2979 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2981 SvUTF8_on(data->last_found);
2983 SV * const sv = data->last_found;
2984 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2985 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2986 if (mg && mg->mg_len >= 0)
2987 mg->mg_len += utf8_length((U8*)STRING(scan),
2988 (U8*)STRING(scan)+STR_LEN(scan));
2990 data->last_end = data->pos_min + l;
2991 data->pos_min += l; /* As in the first entry. */
2992 data->flags &= ~SF_BEFORE_EOL;
2994 if (flags & SCF_DO_STCLASS_AND) {
2995 /* Check whether it is compatible with what we know already! */
2999 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3000 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3001 && (!(data->start_class->flags & ANYOF_FOLD)
3002 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3005 ANYOF_CLASS_ZERO(data->start_class);
3006 ANYOF_BITMAP_ZERO(data->start_class);
3008 ANYOF_BITMAP_SET(data->start_class, uc);
3009 data->start_class->flags &= ~ANYOF_EOS;
3011 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3013 else if (flags & SCF_DO_STCLASS_OR) {
3014 /* false positive possible if the class is case-folded */
3016 ANYOF_BITMAP_SET(data->start_class, uc);
3018 data->start_class->flags |= ANYOF_UNICODE_ALL;
3019 data->start_class->flags &= ~ANYOF_EOS;
3020 cl_and(data->start_class, and_withp);
3022 flags &= ~SCF_DO_STCLASS;
3024 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3025 I32 l = STR_LEN(scan);
3026 UV uc = *((U8*)STRING(scan));
3028 /* Search for fixed substrings supports EXACT only. */
3029 if (flags & SCF_DO_SUBSTR) {
3031 SCAN_COMMIT(pRExC_state, data, minlenp);
3034 const U8 * const s = (U8 *)STRING(scan);
3035 l = utf8_length(s, s + l);
3036 uc = utf8_to_uvchr(s, NULL);
3039 if (flags & SCF_DO_SUBSTR)
3041 if (flags & SCF_DO_STCLASS_AND) {
3042 /* Check whether it is compatible with what we know already! */
3046 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3047 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3048 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3050 ANYOF_CLASS_ZERO(data->start_class);
3051 ANYOF_BITMAP_ZERO(data->start_class);
3053 ANYOF_BITMAP_SET(data->start_class, uc);
3054 data->start_class->flags &= ~ANYOF_EOS;
3055 data->start_class->flags |= ANYOF_FOLD;
3056 if (OP(scan) == EXACTFL)
3057 data->start_class->flags |= ANYOF_LOCALE;
3060 else if (flags & SCF_DO_STCLASS_OR) {
3061 if (data->start_class->flags & ANYOF_FOLD) {
3062 /* false positive possible if the class is case-folded.
3063 Assume that the locale settings are the same... */
3065 ANYOF_BITMAP_SET(data->start_class, uc);
3066 data->start_class->flags &= ~ANYOF_EOS;
3068 cl_and(data->start_class, and_withp);
3070 flags &= ~SCF_DO_STCLASS;
3072 else if (strchr((const char*)PL_varies,OP(scan))) {
3073 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3074 I32 f = flags, pos_before = 0;
3075 regnode * const oscan = scan;
3076 struct regnode_charclass_class this_class;
3077 struct regnode_charclass_class *oclass = NULL;
3078 I32 next_is_eval = 0;
3080 switch (PL_regkind[OP(scan)]) {
3081 case WHILEM: /* End of (?:...)* . */
3082 scan = NEXTOPER(scan);
3085 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3086 next = NEXTOPER(scan);
3087 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3089 maxcount = REG_INFTY;
3090 next = regnext(scan);
3091 scan = NEXTOPER(scan);
3095 if (flags & SCF_DO_SUBSTR)
3100 if (flags & SCF_DO_STCLASS) {
3102 maxcount = REG_INFTY;
3103 next = regnext(scan);
3104 scan = NEXTOPER(scan);
3107 is_inf = is_inf_internal = 1;
3108 scan = regnext(scan);
3109 if (flags & SCF_DO_SUBSTR) {
3110 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3111 data->longest = &(data->longest_float);
3113 goto optimize_curly_tail;
3115 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3116 && (scan->flags == stopparen))
3121 mincount = ARG1(scan);
3122 maxcount = ARG2(scan);
3124 next = regnext(scan);
3125 if (OP(scan) == CURLYX) {
3126 I32 lp = (data ? *(data->last_closep) : 0);
3127 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3129 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3130 next_is_eval = (OP(scan) == EVAL);
3132 if (flags & SCF_DO_SUBSTR) {
3133 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3134 pos_before = data->pos_min;
3138 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3140 data->flags |= SF_IS_INF;
3142 if (flags & SCF_DO_STCLASS) {
3143 cl_init(pRExC_state, &this_class);
3144 oclass = data->start_class;
3145 data->start_class = &this_class;
3146 f |= SCF_DO_STCLASS_AND;
3147 f &= ~SCF_DO_STCLASS_OR;
3149 /* These are the cases when once a subexpression
3150 fails at a particular position, it cannot succeed
3151 even after backtracking at the enclosing scope.
3153 XXXX what if minimal match and we are at the
3154 initial run of {n,m}? */
3155 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3156 f &= ~SCF_WHILEM_VISITED_POS;
3158 /* This will finish on WHILEM, setting scan, or on NULL: */
3159 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3160 last, data, stopparen, recursed, NULL,
3162 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3164 if (flags & SCF_DO_STCLASS)
3165 data->start_class = oclass;
3166 if (mincount == 0 || minnext == 0) {
3167 if (flags & SCF_DO_STCLASS_OR) {
3168 cl_or(pRExC_state, data->start_class, &this_class);
3170 else if (flags & SCF_DO_STCLASS_AND) {
3171 /* Switch to OR mode: cache the old value of
3172 * data->start_class */
3174 StructCopy(data->start_class, and_withp,
3175 struct regnode_charclass_class);
3176 flags &= ~SCF_DO_STCLASS_AND;
3177 StructCopy(&this_class, data->start_class,
3178 struct regnode_charclass_class);
3179 flags |= SCF_DO_STCLASS_OR;
3180 data->start_class->flags |= ANYOF_EOS;
3182 } else { /* Non-zero len */
3183 if (flags & SCF_DO_STCLASS_OR) {
3184 cl_or(pRExC_state, data->start_class, &this_class);
3185 cl_and(data->start_class, and_withp);
3187 else if (flags & SCF_DO_STCLASS_AND)
3188 cl_and(data->start_class, &this_class);
3189 flags &= ~SCF_DO_STCLASS;
3191 if (!scan) /* It was not CURLYX, but CURLY. */
3193 if ( /* ? quantifier ok, except for (?{ ... }) */
3194 (next_is_eval || !(mincount == 0 && maxcount == 1))
3195 && (minnext == 0) && (deltanext == 0)
3196 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3197 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3199 ckWARNreg(RExC_parse,
3200 "Quantifier unexpected on zero-length expression");
3203 min += minnext * mincount;
3204 is_inf_internal |= ((maxcount == REG_INFTY
3205 && (minnext + deltanext) > 0)
3206 || deltanext == I32_MAX);
3207 is_inf |= is_inf_internal;
3208 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3210 /* Try powerful optimization CURLYX => CURLYN. */
3211 if ( OP(oscan) == CURLYX && data
3212 && data->flags & SF_IN_PAR
3213 && !(data->flags & SF_HAS_EVAL)
3214 && !deltanext && minnext == 1 ) {
3215 /* Try to optimize to CURLYN. */
3216 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3217 regnode * const nxt1 = nxt;
3224 if (!strchr((const char*)PL_simple,OP(nxt))
3225 && !(PL_regkind[OP(nxt)] == EXACT
3226 && STR_LEN(nxt) == 1))
3232 if (OP(nxt) != CLOSE)
3234 if (RExC_open_parens) {
3235 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3236 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3238 /* Now we know that nxt2 is the only contents: */
3239 oscan->flags = (U8)ARG(nxt);
3241 OP(nxt1) = NOTHING; /* was OPEN. */
3244 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3245 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3246 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3247 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3248 OP(nxt + 1) = OPTIMIZED; /* was count. */
3249 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3254 /* Try optimization CURLYX => CURLYM. */
3255 if ( OP(oscan) == CURLYX && data
3256 && !(data->flags & SF_HAS_PAR)
3257 && !(data->flags & SF_HAS_EVAL)
3258 && !deltanext /* atom is fixed width */
3259 && minnext != 0 /* CURLYM can't handle zero width */
3261 /* XXXX How to optimize if data == 0? */
3262 /* Optimize to a simpler form. */
3263 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3267 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3268 && (OP(nxt2) != WHILEM))
3270 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3271 /* Need to optimize away parenths. */
3272 if (data->flags & SF_IN_PAR) {
3273 /* Set the parenth number. */
3274 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3276 if (OP(nxt) != CLOSE)
3277 FAIL("Panic opt close");
3278 oscan->flags = (U8)ARG(nxt);
3279 if (RExC_open_parens) {
3280 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3281 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3283 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3284 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3287 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3288 OP(nxt + 1) = OPTIMIZED; /* was count. */
3289 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3290 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3293 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3294 regnode *nnxt = regnext(nxt1);
3297 if (reg_off_by_arg[OP(nxt1)])
3298 ARG_SET(nxt1, nxt2 - nxt1);
3299 else if (nxt2 - nxt1 < U16_MAX)
3300 NEXT_OFF(nxt1) = nxt2 - nxt1;
3302 OP(nxt) = NOTHING; /* Cannot beautify */
3307 /* Optimize again: */
3308 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3309 NULL, stopparen, recursed, NULL, 0,depth+1);
3314 else if ((OP(oscan) == CURLYX)
3315 && (flags & SCF_WHILEM_VISITED_POS)
3316 /* See the comment on a similar expression above.
3317 However, this time it not a subexpression
3318 we care about, but the expression itself. */
3319 && (maxcount == REG_INFTY)
3320 && data && ++data->whilem_c < 16) {
3321 /* This stays as CURLYX, we can put the count/of pair. */
3322 /* Find WHILEM (as in regexec.c) */
3323 regnode *nxt = oscan + NEXT_OFF(oscan);
3325 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3327 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3328 | (RExC_whilem_seen << 4)); /* On WHILEM */
3330 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3332 if (flags & SCF_DO_SUBSTR) {
3333 SV *last_str = NULL;
3334 int counted = mincount != 0;
3336 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3337 #if defined(SPARC64_GCC_WORKAROUND)
3340 const char *s = NULL;
3343 if (pos_before >= data->last_start_min)
3346 b = data->last_start_min;
3349 s = SvPV_const(data->last_found, l);
3350 old = b - data->last_start_min;
3353 I32 b = pos_before >= data->last_start_min
3354 ? pos_before : data->last_start_min;
3356 const char * const s = SvPV_const(data->last_found, l);
3357 I32 old = b - data->last_start_min;
3361 old = utf8_hop((U8*)s, old) - (U8*)s;
3364 /* Get the added string: */
3365 last_str = newSVpvn_utf8(s + old, l, UTF);
3366 if (deltanext == 0 && pos_before == b) {
3367 /* What was added is a constant string */
3369 SvGROW(last_str, (mincount * l) + 1);
3370 repeatcpy(SvPVX(last_str) + l,
3371 SvPVX_const(last_str), l, mincount - 1);
3372 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3373 /* Add additional parts. */
3374 SvCUR_set(data->last_found,
3375 SvCUR(data->last_found) - l);
3376 sv_catsv(data->last_found, last_str);
3378 SV * sv = data->last_found;
3380 SvUTF8(sv) && SvMAGICAL(sv) ?
3381 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3382 if (mg && mg->mg_len >= 0)
3383 mg->mg_len += CHR_SVLEN(last_str) - l;
3385 data->last_end += l * (mincount - 1);
3388 /* start offset must point into the last copy */
3389 data->last_start_min += minnext * (mincount - 1);
3390 data->last_start_max += is_inf ? I32_MAX
3391 : (maxcount - 1) * (minnext + data->pos_delta);
3394 /* It is counted once already... */
3395 data->pos_min += minnext * (mincount - counted);
3396 data->pos_delta += - counted * deltanext +
3397 (minnext + deltanext) * maxcount - minnext * mincount;
3398 if (mincount != maxcount) {
3399 /* Cannot extend fixed substrings found inside
3401 SCAN_COMMIT(pRExC_state,data,minlenp);
3402 if (mincount && last_str) {
3403 SV * const sv = data->last_found;
3404 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3405 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3409 sv_setsv(sv, last_str);
3410 data->last_end = data->pos_min;
3411 data->last_start_min =
3412 data->pos_min - CHR_SVLEN(last_str);
3413 data->last_start_max = is_inf
3415 : data->pos_min + data->pos_delta
3416 - CHR_SVLEN(last_str);
3418 data->longest = &(data->longest_float);
3420 SvREFCNT_dec(last_str);
3422 if (data && (fl & SF_HAS_EVAL))
3423 data->flags |= SF_HAS_EVAL;
3424 optimize_curly_tail:
3425 if (OP(oscan) != CURLYX) {
3426 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3428 NEXT_OFF(oscan) += NEXT_OFF(next);
3431 default: /* REF and CLUMP only? */
3432 if (flags & SCF_DO_SUBSTR) {
3433 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3434 data->longest = &(data->longest_float);
3436 is_inf = is_inf_internal = 1;
3437 if (flags & SCF_DO_STCLASS_OR)
3438 cl_anything(pRExC_state, data->start_class);
3439 flags &= ~SCF_DO_STCLASS;
3443 else if (OP(scan) == LNBREAK) {
3444 if (flags & SCF_DO_STCLASS) {
3446 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3447 if (flags & SCF_DO_STCLASS_AND) {
3448 for (value = 0; value < 256; value++)
3449 if (!is_VERTWS_cp(value))
3450 ANYOF_BITMAP_CLEAR(data->start_class, value);
3453 for (value = 0; value < 256; value++)
3454 if (is_VERTWS_cp(value))
3455 ANYOF_BITMAP_SET(data->start_class, value);
3457 if (flags & SCF_DO_STCLASS_OR)
3458 cl_and(data->start_class, and_withp);
3459 flags &= ~SCF_DO_STCLASS;
3463 if (flags & SCF_DO_SUBSTR) {
3464 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3466 data->pos_delta += 1;
3467 data->longest = &(data->longest_float);
3471 else if (OP(scan) == FOLDCHAR) {
3472 int d = ARG(scan)==0xDF ? 1 : 2;
3473 flags &= ~SCF_DO_STCLASS;
3476 if (flags & SCF_DO_SUBSTR) {
3477 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3479 data->pos_delta += d;
3480 data->longest = &(data->longest_float);
3483 else if (strchr((const char*)PL_simple,OP(scan))) {
3486 if (flags & SCF_DO_SUBSTR) {
3487 SCAN_COMMIT(pRExC_state,data,minlenp);
3491 if (flags & SCF_DO_STCLASS) {
3492 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3494 /* Some of the logic below assumes that switching
3495 locale on will only add false positives. */
3496 switch (PL_regkind[OP(scan)]) {
3500 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3501 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3502 cl_anything(pRExC_state, data->start_class);
3505 if (OP(scan) == SANY)
3507 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3508 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3509 || (data->start_class->flags & ANYOF_CLASS));
3510 cl_anything(pRExC_state, data->start_class);
3512 if (flags & SCF_DO_STCLASS_AND || !value)
3513 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3516 if (flags & SCF_DO_STCLASS_AND)
3517 cl_and(data->start_class,
3518 (struct regnode_charclass_class*)scan);
3520 cl_or(pRExC_state, data->start_class,
3521 (struct regnode_charclass_class*)scan);
3524 if (flags & SCF_DO_STCLASS_AND) {
3525 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3527 for (value = 0; value < 256; value++)
3528 if (!isALNUM(value))
3529 ANYOF_BITMAP_CLEAR(data->start_class, value);
3533 if (data->start_class->flags & ANYOF_LOCALE)
3534 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3536 for (value = 0; value < 256; value++)
3538 ANYOF_BITMAP_SET(data->start_class, value);
3543 if (flags & SCF_DO_STCLASS_AND) {
3544 if (data->start_class->flags & ANYOF_LOCALE)
3545 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3548 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3549 data->start_class->flags |= ANYOF_LOCALE;
3553 if (flags & SCF_DO_STCLASS_AND) {
3554 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3555 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3556 for (value = 0; value < 256; value++)
3558 ANYOF_BITMAP_CLEAR(data->start_class, value);
3562 if (data->start_class->flags & ANYOF_LOCALE)
3563 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3565 for (value = 0; value < 256; value++)
3566 if (!isALNUM(value))
3567 ANYOF_BITMAP_SET(data->start_class, value);
3572 if (flags & SCF_DO_STCLASS_AND) {
3573 if (data->start_class->flags & ANYOF_LOCALE)
3574 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3577 data->start_class->flags |= ANYOF_LOCALE;
3578 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3582 if (flags & SCF_DO_STCLASS_AND) {
3583 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3584 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3585 for (value = 0; value < 256; value++)
3586 if (!isSPACE(value))
3587 ANYOF_BITMAP_CLEAR(data->start_class, value);
3591 if (data->start_class->flags & ANYOF_LOCALE)
3592 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3594 for (value = 0; value < 256; value++)
3596 ANYOF_BITMAP_SET(data->start_class, value);
3601 if (flags & SCF_DO_STCLASS_AND) {
3602 if (data->start_class->flags & ANYOF_LOCALE)
3603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3606 data->start_class->flags |= ANYOF_LOCALE;
3607 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3611 if (flags & SCF_DO_STCLASS_AND) {
3612 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3613 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3614 for (value = 0; value < 256; value++)
3616 ANYOF_BITMAP_CLEAR(data->start_class, value);
3620 if (data->start_class->flags & ANYOF_LOCALE)
3621 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3623 for (value = 0; value < 256; value++)
3624 if (!isSPACE(value))
3625 ANYOF_BITMAP_SET(data->start_class, value);
3630 if (flags & SCF_DO_STCLASS_AND) {
3631 if (data->start_class->flags & ANYOF_LOCALE) {
3632 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3633 for (value = 0; value < 256; value++)
3634 if (!isSPACE(value))
3635 ANYOF_BITMAP_CLEAR(data->start_class, value);
3639 data->start_class->flags |= ANYOF_LOCALE;
3640 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3644 if (flags & SCF_DO_STCLASS_AND) {
3645 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3646 for (value = 0; value < 256; value++)
3647 if (!isDIGIT(value))
3648 ANYOF_BITMAP_CLEAR(data->start_class, value);
3651 if (data->start_class->flags & ANYOF_LOCALE)
3652 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3654 for (value = 0; value < 256; value++)
3656 ANYOF_BITMAP_SET(data->start_class, value);
3661 if (flags & SCF_DO_STCLASS_AND) {
3662 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3663 for (value = 0; value < 256; value++)
3665 ANYOF_BITMAP_CLEAR(data->start_class, value);
3668 if (data->start_class->flags & ANYOF_LOCALE)
3669 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3671 for (value = 0; value < 256; value++)
3672 if (!isDIGIT(value))
3673 ANYOF_BITMAP_SET(data->start_class, value);
3677 CASE_SYNST_FNC(VERTWS);
3678 CASE_SYNST_FNC(HORIZWS);
3681 if (flags & SCF_DO_STCLASS_OR)
3682 cl_and(data->start_class, and_withp);
3683 flags &= ~SCF_DO_STCLASS;
3686 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3687 data->flags |= (OP(scan) == MEOL
3691 else if ( PL_regkind[OP(scan)] == BRANCHJ
3692 /* Lookbehind, or need to calculate parens/evals/stclass: */
3693 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3694 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3695 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3696 || OP(scan) == UNLESSM )
3698 /* Negative Lookahead/lookbehind
3699 In this case we can't do fixed string optimisation.
3702 I32 deltanext, minnext, fake = 0;
3704 struct regnode_charclass_class intrnl;
3707 data_fake.flags = 0;
3709 data_fake.whilem_c = data->whilem_c;
3710 data_fake.last_closep = data->last_closep;
3713 data_fake.last_closep = &fake;
3714 data_fake.pos_delta = delta;
3715 if ( flags & SCF_DO_STCLASS && !scan->flags
3716 && OP(scan) == IFMATCH ) { /* Lookahead */
3717 cl_init(pRExC_state, &intrnl);
3718 data_fake.start_class = &intrnl;
3719 f |= SCF_DO_STCLASS_AND;
3721 if (flags & SCF_WHILEM_VISITED_POS)
3722 f |= SCF_WHILEM_VISITED_POS;
3723 next = regnext(scan);
3724 nscan = NEXTOPER(NEXTOPER(scan));
3725 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3726 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3729 FAIL("Variable length lookbehind not implemented");
3731 else if (minnext > (I32)U8_MAX) {
3732 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3734 scan->flags = (U8)minnext;
3737 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3739 if (data_fake.flags & SF_HAS_EVAL)
3740 data->flags |= SF_HAS_EVAL;
3741 data->whilem_c = data_fake.whilem_c;
3743 if (f & SCF_DO_STCLASS_AND) {
3744 if (flags & SCF_DO_STCLASS_OR) {
3745 /* OR before, AND after: ideally we would recurse with
3746 * data_fake to get the AND applied by study of the
3747 * remainder of the pattern, and then derecurse;
3748 * *** HACK *** for now just treat as "no information".
3749 * See [perl #56690].
3751 cl_init(pRExC_state, data->start_class);
3753 /* AND before and after: combine and continue */
3754 const int was = (data->start_class->flags & ANYOF_EOS);
3756 cl_and(data->start_class, &intrnl);
3758 data->start_class->flags |= ANYOF_EOS;
3762 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3764 /* Positive Lookahead/lookbehind
3765 In this case we can do fixed string optimisation,
3766 but we must be careful about it. Note in the case of
3767 lookbehind the positions will be offset by the minimum
3768 length of the pattern, something we won't know about
3769 until after the recurse.
3771 I32 deltanext, fake = 0;
3773 struct regnode_charclass_class intrnl;
3775 /* We use SAVEFREEPV so that when the full compile
3776 is finished perl will clean up the allocated
3777 minlens when its all done. This was we don't
3778 have to worry about freeing them when we know
3779 they wont be used, which would be a pain.
3782 Newx( minnextp, 1, I32 );
3783 SAVEFREEPV(minnextp);
3786 StructCopy(data, &data_fake, scan_data_t);
3787 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3790 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3791 data_fake.last_found=newSVsv(data->last_found);
3795 data_fake.last_closep = &fake;
3796 data_fake.flags = 0;
3797 data_fake.pos_delta = delta;
3799 data_fake.flags |= SF_IS_INF;
3800 if ( flags & SCF_DO_STCLASS && !scan->flags
3801 && OP(scan) == IFMATCH ) { /* Lookahead */
3802 cl_init(pRExC_state, &intrnl);
3803 data_fake.start_class = &intrnl;
3804 f |= SCF_DO_STCLASS_AND;
3806 if (flags & SCF_WHILEM_VISITED_POS)
3807 f |= SCF_WHILEM_VISITED_POS;
3808 next = regnext(scan);
3809 nscan = NEXTOPER(NEXTOPER(scan));
3811 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3812 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3815 FAIL("Variable length lookbehind not implemented");
3817 else if (*minnextp > (I32)U8_MAX) {
3818 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3820 scan->flags = (U8)*minnextp;
3825 if (f & SCF_DO_STCLASS_AND) {
3826 const int was = (data->start_class->flags & ANYOF_EOS);
3828 cl_and(data->start_class, &intrnl);
3830 data->start_class->flags |= ANYOF_EOS;
3833 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3835 if (data_fake.flags & SF_HAS_EVAL)
3836 data->flags |= SF_HAS_EVAL;
3837 data->whilem_c = data_fake.whilem_c;
3838 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3839 if (RExC_rx->minlen<*minnextp)
3840 RExC_rx->minlen=*minnextp;
3841 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3842 SvREFCNT_dec(data_fake.last_found);
3844 if ( data_fake.minlen_fixed != minlenp )
3846 data->offset_fixed= data_fake.offset_fixed;
3847 data->minlen_fixed= data_fake.minlen_fixed;
3848 data->lookbehind_fixed+= scan->flags;
3850 if ( data_fake.minlen_float != minlenp )
3852 data->minlen_float= data_fake.minlen_float;
3853 data->offset_float_min=data_fake.offset_float_min;
3854 data->offset_float_max=data_fake.offset_float_max;
3855 data->lookbehind_float+= scan->flags;
3864 else if (OP(scan) == OPEN) {
3865 if (stopparen != (I32)ARG(scan))
3868 else if (OP(scan) == CLOSE) {
3869 if (stopparen == (I32)ARG(scan)) {
3872 if ((I32)ARG(scan) == is_par) {
3873 next = regnext(scan);
3875 if ( next && (OP(next) != WHILEM) && next < last)
3876 is_par = 0; /* Disable optimization */
3879 *(data->last_closep) = ARG(scan);
3881 else if (OP(scan) == EVAL) {
3883 data->flags |= SF_HAS_EVAL;
3885 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3886 if (flags & SCF_DO_SUBSTR) {
3887 SCAN_COMMIT(pRExC_state,data,minlenp);
3888 flags &= ~SCF_DO_SUBSTR;
3890 if (data && OP(scan)==ACCEPT) {
3891 data->flags |= SCF_SEEN_ACCEPT;
3896 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3898 if (flags & SCF_DO_SUBSTR) {
3899 SCAN_COMMIT(pRExC_state,data,minlenp);
3900 data->longest = &(data->longest_float);
3902 is_inf = is_inf_internal = 1;
3903 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3904 cl_anything(pRExC_state, data->start_class);
3905 flags &= ~SCF_DO_STCLASS;
3907 else if (OP(scan) == GPOS) {
3908 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3909 !(delta || is_inf || (data && data->pos_delta)))
3911 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3912 RExC_rx->extflags |= RXf_ANCH_GPOS;
3913 if (RExC_rx->gofs < (U32)min)
3914 RExC_rx->gofs = min;
3916 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3920 #ifdef TRIE_STUDY_OPT
3921 #ifdef FULL_TRIE_STUDY
3922 else if (PL_regkind[OP(scan)] == TRIE) {
3923 /* NOTE - There is similar code to this block above for handling
3924 BRANCH nodes on the initial study. If you change stuff here
3926 regnode *trie_node= scan;
3927 regnode *tail= regnext(scan);
3928 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3929 I32 max1 = 0, min1 = I32_MAX;
3930 struct regnode_charclass_class accum;
3932 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3933 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3934 if (flags & SCF_DO_STCLASS)
3935 cl_init_zero(pRExC_state, &accum);
3941 const regnode *nextbranch= NULL;
3944 for ( word=1 ; word <= trie->wordcount ; word++)
3946 I32 deltanext=0, minnext=0, f = 0, fake;
3947 struct regnode_charclass_class this_class;
3949 data_fake.flags = 0;
3951 data_fake.whilem_c = data->whilem_c;
3952 data_fake.last_closep = data->last_closep;
3955 data_fake.last_closep = &fake;
3956 data_fake.pos_delta = delta;
3957 if (flags & SCF_DO_STCLASS) {
3958 cl_init(pRExC_state, &this_class);
3959 data_fake.start_class = &this_class;
3960 f = SCF_DO_STCLASS_AND;
3962 if (flags & SCF_WHILEM_VISITED_POS)
3963 f |= SCF_WHILEM_VISITED_POS;
3965 if (trie->jump[word]) {
3967 nextbranch = trie_node + trie->jump[0];
3968 scan= trie_node + trie->jump[word];
3969 /* We go from the jump point to the branch that follows
3970 it. Note this means we need the vestigal unused branches
3971 even though they arent otherwise used.
3973 minnext = study_chunk(pRExC_state, &scan, minlenp,
3974 &deltanext, (regnode *)nextbranch, &data_fake,
3975 stopparen, recursed, NULL, f,depth+1);
3977 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3978 nextbranch= regnext((regnode*)nextbranch);
3980 if (min1 > (I32)(minnext + trie->minlen))
3981 min1 = minnext + trie->minlen;
3982 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3983 max1 = minnext + deltanext + trie->maxlen;
3984 if (deltanext == I32_MAX)
3985 is_inf = is_inf_internal = 1;
3987 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3989 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3990 if ( stopmin > min + min1)
3991 stopmin = min + min1;
3992 flags &= ~SCF_DO_SUBSTR;
3994 data->flags |= SCF_SEEN_ACCEPT;
3997 if (data_fake.flags & SF_HAS_EVAL)
3998 data->flags |= SF_HAS_EVAL;
3999 data->whilem_c = data_fake.whilem_c;
4001 if (flags & SCF_DO_STCLASS)
4002 cl_or(pRExC_state, &accum, &this_class);
4005 if (flags & SCF_DO_SUBSTR) {
4006 data->pos_min += min1;
4007 data->pos_delta += max1 - min1;
4008 if (max1 != min1 || is_inf)
4009 data->longest = &(data->longest_float);
4012 delta += max1 - min1;
4013 if (flags & SCF_DO_STCLASS_OR) {
4014 cl_or(pRExC_state, data->start_class, &accum);
4016 cl_and(data->start_class, and_withp);
4017 flags &= ~SCF_DO_STCLASS;
4020 else if (flags & SCF_DO_STCLASS_AND) {
4022 cl_and(data->start_class, &accum);
4023 flags &= ~SCF_DO_STCLASS;
4026 /* Switch to OR mode: cache the old value of
4027 * data->start_class */
4029 StructCopy(data->start_class, and_withp,
4030 struct regnode_charclass_class);
4031 flags &= ~SCF_DO_STCLASS_AND;
4032 StructCopy(&accum, data->start_class,
4033 struct regnode_charclass_class);
4034 flags |= SCF_DO_STCLASS_OR;
4035 data->start_class->flags |= ANYOF_EOS;
4042 else if (PL_regkind[OP(scan)] == TRIE) {
4043 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4046 min += trie->minlen;
4047 delta += (trie->maxlen - trie->minlen);
4048 flags &= ~SCF_DO_STCLASS; /* xxx */
4049 if (flags & SCF_DO_SUBSTR) {
4050 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4051 data->pos_min += trie->minlen;
4052 data->pos_delta += (trie->maxlen - trie->minlen);
4053 if (trie->maxlen != trie->minlen)
4054 data->longest = &(data->longest_float);
4056 if (trie->jump) /* no more substrings -- for now /grr*/
4057 flags &= ~SCF_DO_SUBSTR;
4059 #endif /* old or new */
4060 #endif /* TRIE_STUDY_OPT */
4062 /* Else: zero-length, ignore. */
4063 scan = regnext(scan);
4068 stopparen = frame->stop;
4069 frame = frame->prev;
4070 goto fake_study_recurse;
4075 DEBUG_STUDYDATA("pre-fin:",data,depth);
4078 *deltap = is_inf_internal ? I32_MAX : delta;
4079 if (flags & SCF_DO_SUBSTR && is_inf)
4080 data->pos_delta = I32_MAX - data->pos_min;
4081 if (is_par > (I32)U8_MAX)
4083 if (is_par && pars==1 && data) {
4084 data->flags |= SF_IN_PAR;
4085 data->flags &= ~SF_HAS_PAR;
4087 else if (pars && data) {
4088 data->flags |= SF_HAS_PAR;
4089 data->flags &= ~SF_IN_PAR;
4091 if (flags & SCF_DO_STCLASS_OR)
4092 cl_and(data->start_class, and_withp);
4093 if (flags & SCF_TRIE_RESTUDY)
4094 data->flags |= SCF_TRIE_RESTUDY;
4096 DEBUG_STUDYDATA("post-fin:",data,depth);
4098 return min < stopmin ? min : stopmin;
4102 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4104 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4106 PERL_ARGS_ASSERT_ADD_DATA;
4108 Renewc(RExC_rxi->data,
4109 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4110 char, struct reg_data);
4112 Renew(RExC_rxi->data->what, count + n, U8);
4114 Newx(RExC_rxi->data->what, n, U8);
4115 RExC_rxi->data->count = count + n;
4116 Copy(s, RExC_rxi->data->what + count, n, U8);
4120 /*XXX: todo make this not included in a non debugging perl */
4121 #ifndef PERL_IN_XSUB_RE
4123 Perl_reginitcolors(pTHX)
4126 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4128 char *t = savepv(s);
4132 t = strchr(t, '\t');
4138 PL_colors[i] = t = (char *)"";
4143 PL_colors[i++] = (char *)"";
4150 #ifdef TRIE_STUDY_OPT
4151 #define CHECK_RESTUDY_GOTO \
4153 (data.flags & SCF_TRIE_RESTUDY) \
4157 #define CHECK_RESTUDY_GOTO
4161 - pregcomp - compile a regular expression into internal code
4163 * We can't allocate space until we know how big the compiled form will be,
4164 * but we can't compile it (and thus know how big it is) until we've got a
4165 * place to put the code. So we cheat: we compile it twice, once with code
4166 * generation turned off and size counting turned on, and once "for real".
4167 * This also means that we don't allocate space until we are sure that the
4168 * thing really will compile successfully, and we never have to move the
4169 * code and thus invalidate pointers into it. (Note that it has to be in
4170 * one piece because free() must be able to free it all.) [NB: not true in perl]
4172 * Beware that the optimization-preparation code in here knows about some
4173 * of the structure of the compiled regexp. [I'll say.]
4178 #ifndef PERL_IN_XSUB_RE
4179 #define RE_ENGINE_PTR &reh_regexp_engine
4181 extern const struct regexp_engine my_reg_engine;
4182 #define RE_ENGINE_PTR &my_reg_engine
4185 #ifndef PERL_IN_XSUB_RE
4187 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4190 HV * const table = GvHV(PL_hintgv);
4192 PERL_ARGS_ASSERT_PREGCOMP;
4194 /* Dispatch a request to compile a regexp to correct
4197 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4198 GET_RE_DEBUG_FLAGS_DECL;
4199 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4200 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4202 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4205 return CALLREGCOMP_ENG(eng, pattern, flags);
4208 return Perl_re_compile(aTHX_ pattern, flags);
4213 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4218 register regexp_internal *ri;
4220 char *exp = SvPV(pattern, plen);
4221 char* xend = exp + plen;
4228 RExC_state_t RExC_state;
4229 RExC_state_t * const pRExC_state = &RExC_state;
4230 #ifdef TRIE_STUDY_OPT
4232 RExC_state_t copyRExC_state;
4234 GET_RE_DEBUG_FLAGS_DECL;
4236 PERL_ARGS_ASSERT_RE_COMPILE;
4238 DEBUG_r(if (!PL_colorset) reginitcolors());
4240 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4243 SV *dsv= sv_newmortal();
4244 RE_PV_QUOTED_DECL(s, RExC_utf8,
4245 dsv, exp, plen, 60);
4246 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4247 PL_colors[4],PL_colors[5],s);
4252 RExC_flags = pm_flags;
4256 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4257 RExC_seen_evals = 0;
4260 /* First pass: determine size, legality. */
4268 RExC_emit = &PL_regdummy;
4269 RExC_whilem_seen = 0;
4270 RExC_open_parens = NULL;
4271 RExC_close_parens = NULL;
4273 RExC_paren_names = NULL;
4275 RExC_paren_name_list = NULL;
4277 RExC_recurse = NULL;
4278 RExC_recurse_count = 0;
4280 #if 0 /* REGC() is (currently) a NOP at the first pass.
4281 * Clever compilers notice this and complain. --jhi */
4282 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4284 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4285 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4286 RExC_precomp = NULL;
4289 if (RExC_utf8 && !RExC_orig_utf8) {
4290 /* It's possible to write a regexp in ascii that represents Unicode
4291 codepoints outside of the byte range, such as via \x{100}. If we
4292 detect such a sequence we have to convert the entire pattern to utf8
4293 and then recompile, as our sizing calculation will have been based
4294 on 1 byte == 1 character, but we will need to use utf8 to encode
4295 at least some part of the pattern, and therefore must convert the whole
4297 XXX: somehow figure out how to make this less expensive...
4300 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4301 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4302 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4304 RExC_orig_utf8 = RExC_utf8;
4306 goto redo_first_pass;
4309 PerlIO_printf(Perl_debug_log,
4310 "Required size %"IVdf" nodes\n"
4311 "Starting second pass (creation)\n",
4314 RExC_lastparse=NULL;
4316 /* Small enough for pointer-storage convention?
4317 If extralen==0, this means that we will not need long jumps. */
4318 if (RExC_size >= 0x10000L && RExC_extralen)
4319 RExC_size += RExC_extralen;
4322 if (RExC_whilem_seen > 15)
4323 RExC_whilem_seen = 15;
4325 /* Allocate space and zero-initialize. Note, the two step process
4326 of zeroing when in debug mode, thus anything assigned has to
4327 happen after that */
4328 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4329 r = (struct regexp*)SvANY(rx);
4330 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4331 char, regexp_internal);
4332 if ( r == NULL || ri == NULL )
4333 FAIL("Regexp out of space");
4335 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4336 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4338 /* bulk initialize base fields with 0. */
4339 Zero(ri, sizeof(regexp_internal), char);
4342 /* non-zero initialization begins here */
4344 r->engine= RE_ENGINE_PTR;
4345 r->extflags = pm_flags;
4347 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4348 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4349 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4350 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4351 >> RXf_PMf_STD_PMMOD_SHIFT);
4352 const char *fptr = STD_PAT_MODS; /*"msix"*/
4354 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4355 + (sizeof(STD_PAT_MODS) - 1)
4356 + (sizeof("(?:)") - 1);
4358 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4359 SvCUR_set(rx, wraplen);
4361 SvFLAGS(rx) |= SvUTF8(pattern);
4364 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4366 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4367 char *colon = r + 1;
4370 while((ch = *fptr++)) {
4384 Copy(RExC_precomp, p, plen, char);
4385 assert ((RX_WRAPPED(rx) - p) < 16);
4386 r->pre_prefix = p - RX_WRAPPED(rx);
4395 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4397 if (RExC_seen & REG_SEEN_RECURSE) {
4398 Newxz(RExC_open_parens, RExC_npar,regnode *);
4399 SAVEFREEPV(RExC_open_parens);
4400 Newxz(RExC_close_parens,RExC_npar,regnode *);
4401 SAVEFREEPV(RExC_close_parens);
4404 /* Useful during FAIL. */
4405 #ifdef RE_TRACK_PATTERN_OFFSETS
4406 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4407 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4408 "%s %"UVuf" bytes for offset annotations.\n",
4409 ri->u.offsets ? "Got" : "Couldn't get",
4410 (UV)((2*RExC_size+1) * sizeof(U32))));
4412 SetProgLen(ri,RExC_size);
4416 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4418 /* Second pass: emit code. */
4419 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4424 RExC_emit_start = ri->program;
4425 RExC_emit = ri->program;
4426 RExC_emit_bound = ri->program + RExC_size + 1;
4428 /* Store the count of eval-groups for security checks: */
4429 RExC_rx->seen_evals = RExC_seen_evals;
4430 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4431 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4435 /* XXXX To minimize changes to RE engine we always allocate
4436 3-units-long substrs field. */
4437 Newx(r->substrs, 1, struct reg_substr_data);
4438 if (RExC_recurse_count) {
4439 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4440 SAVEFREEPV(RExC_recurse);
4444 r->minlen = minlen = sawplus = sawopen = 0;
4445 Zero(r->substrs, 1, struct reg_substr_data);
4447 #ifdef TRIE_STUDY_OPT
4449 StructCopy(&zero_scan_data, &data, scan_data_t);
4450 copyRExC_state = RExC_state;
4453 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4455 RExC_state = copyRExC_state;
4456 if (seen & REG_TOP_LEVEL_BRANCHES)
4457 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4459 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4460 if (data.last_found) {
4461 SvREFCNT_dec(data.longest_fixed);
4462 SvREFCNT_dec(data.longest_float);
4463 SvREFCNT_dec(data.last_found);
4465 StructCopy(&zero_scan_data, &data, scan_data_t);
4468 StructCopy(&zero_scan_data, &data, scan_data_t);
4471 /* Dig out information for optimizations. */
4472 r->extflags = RExC_flags; /* was pm_op */
4473 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4476 SvUTF8_on(rx); /* Unicode in it? */
4477 ri->regstclass = NULL;
4478 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4479 r->intflags |= PREGf_NAUGHTY;
4480 scan = ri->program + 1; /* First BRANCH. */
4482 /* testing for BRANCH here tells us whether there is "must appear"
4483 data in the pattern. If there is then we can use it for optimisations */
4484 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4486 STRLEN longest_float_length, longest_fixed_length;
4487 struct regnode_charclass_class ch_class; /* pointed to by data */
4489 I32 last_close = 0; /* pointed to by data */
4490 regnode *first= scan;
4491 regnode *first_next= regnext(first);
4494 * Skip introductions and multiplicators >= 1
4495 * so that we can extract the 'meat' of the pattern that must
4496 * match in the large if() sequence following.
4497 * NOTE that EXACT is NOT covered here, as it is normally
4498 * picked up by the optimiser separately.
4500 * This is unfortunate as the optimiser isnt handling lookahead
4501 * properly currently.
4504 while ((OP(first) == OPEN && (sawopen = 1)) ||
4505 /* An OR of *one* alternative - should not happen now. */
4506 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4507 /* for now we can't handle lookbehind IFMATCH*/
4508 (OP(first) == IFMATCH && !first->flags) ||
4509 (OP(first) == PLUS) ||
4510 (OP(first) == MINMOD) ||
4511 /* An {n,m} with n>0 */
4512 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4513 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4516 * the only op that could be a regnode is PLUS, all the rest
4517 * will be regnode_1 or regnode_2.
4520 if (OP(first) == PLUS)
4523 first += regarglen[OP(first)];
4525 first = NEXTOPER(first);
4526 first_next= regnext(first);
4529 /* Starting-point info. */
4531 DEBUG_PEEP("first:",first,0);
4532 /* Ignore EXACT as we deal with it later. */
4533 if (PL_regkind[OP(first)] == EXACT) {
4534 if (OP(first) == EXACT)
4535 NOOP; /* Empty, get anchored substr later. */
4536 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4537 ri->regstclass = first;
4540 else if (PL_regkind[OP(first)] == TRIE &&
4541 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4544 /* this can happen only on restudy */
4545 if ( OP(first) == TRIE ) {
4546 struct regnode_1 *trieop = (struct regnode_1 *)
4547 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4548 StructCopy(first,trieop,struct regnode_1);
4549 trie_op=(regnode *)trieop;
4551 struct regnode_charclass *trieop = (struct regnode_charclass *)
4552 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4553 StructCopy(first,trieop,struct regnode_charclass);
4554 trie_op=(regnode *)trieop;
4557 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4558 ri->regstclass = trie_op;
4561 else if (strchr((const char*)PL_simple,OP(first)))
4562 ri->regstclass = first;
4563 else if (PL_regkind[OP(first)] == BOUND ||
4564 PL_regkind[OP(first)] == NBOUND)
4565 ri->regstclass = first;
4566 else if (PL_regkind[OP(first)] == BOL) {
4567 r->extflags |= (OP(first) == MBOL
4569 : (OP(first) == SBOL
4572 first = NEXTOPER(first);
4575 else if (OP(first) == GPOS) {
4576 r->extflags |= RXf_ANCH_GPOS;
4577 first = NEXTOPER(first);
4580 else if ((!sawopen || !RExC_sawback) &&
4581 (OP(first) == STAR &&
4582 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4583 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4585 /* turn .* into ^.* with an implied $*=1 */
4587 (OP(NEXTOPER(first)) == REG_ANY)
4590 r->extflags |= type;
4591 r->intflags |= PREGf_IMPLICIT;
4592 first = NEXTOPER(first);
4595 if (sawplus && (!sawopen || !RExC_sawback)
4596 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4597 /* x+ must match at the 1st pos of run of x's */
4598 r->intflags |= PREGf_SKIP;
4600 /* Scan is after the zeroth branch, first is atomic matcher. */
4601 #ifdef TRIE_STUDY_OPT
4604 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4605 (IV)(first - scan + 1))
4609 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4610 (IV)(first - scan + 1))
4616 * If there's something expensive in the r.e., find the
4617 * longest literal string that must appear and make it the
4618 * regmust. Resolve ties in favor of later strings, since
4619 * the regstart check works with the beginning of the r.e.
4620 * and avoiding duplication strengthens checking. Not a
4621 * strong reason, but sufficient in the absence of others.
4622 * [Now we resolve ties in favor of the earlier string if
4623 * it happens that c_offset_min has been invalidated, since the
4624 * earlier string may buy us something the later one won't.]
4627 data.longest_fixed = newSVpvs("");
4628 data.longest_float = newSVpvs("");
4629 data.last_found = newSVpvs("");
4630 data.longest = &(data.longest_fixed);
4632 if (!ri->regstclass) {
4633 cl_init(pRExC_state, &ch_class);
4634 data.start_class = &ch_class;
4635 stclass_flag = SCF_DO_STCLASS_AND;
4636 } else /* XXXX Check for BOUND? */
4638 data.last_closep = &last_close;
4640 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4641 &data, -1, NULL, NULL,
4642 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4648 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4649 && data.last_start_min == 0 && data.last_end > 0
4650 && !RExC_seen_zerolen
4651 && !(RExC_seen & REG_SEEN_VERBARG)
4652 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4653 r->extflags |= RXf_CHECK_ALL;
4654 scan_commit(pRExC_state, &data,&minlen,0);
4655 SvREFCNT_dec(data.last_found);
4657 /* Note that code very similar to this but for anchored string
4658 follows immediately below, changes may need to be made to both.
4661 longest_float_length = CHR_SVLEN(data.longest_float);
4662 if (longest_float_length
4663 || (data.flags & SF_FL_BEFORE_EOL
4664 && (!(data.flags & SF_FL_BEFORE_MEOL)
4665 || (RExC_flags & RXf_PMf_MULTILINE))))
4669 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4670 && data.offset_fixed == data.offset_float_min
4671 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4672 goto remove_float; /* As in (a)+. */
4674 /* copy the information about the longest float from the reg_scan_data
4675 over to the program. */
4676 if (SvUTF8(data.longest_float)) {
4677 r->float_utf8 = data.longest_float;
4678 r->float_substr = NULL;
4680 r->float_substr = data.longest_float;
4681 r->float_utf8 = NULL;
4683 /* float_end_shift is how many chars that must be matched that
4684 follow this item. We calculate it ahead of time as once the
4685 lookbehind offset is added in we lose the ability to correctly
4687 ml = data.minlen_float ? *(data.minlen_float)
4688 : (I32)longest_float_length;
4689 r->float_end_shift = ml - data.offset_float_min
4690 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4691 + data.lookbehind_float;
4692 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4693 r->float_max_offset = data.offset_float_max;
4694 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4695 r->float_max_offset -= data.lookbehind_float;
4697 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4698 && (!(data.flags & SF_FL_BEFORE_MEOL)
4699 || (RExC_flags & RXf_PMf_MULTILINE)));
4700 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4704 r->float_substr = r->float_utf8 = NULL;
4705 SvREFCNT_dec(data.longest_float);
4706 longest_float_length = 0;
4709 /* Note that code very similar to this but for floating string
4710 is immediately above, changes may need to be made to both.
4713 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4714 if (longest_fixed_length
4715 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4716 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4717 || (RExC_flags & RXf_PMf_MULTILINE))))
4721 /* copy the information about the longest fixed
4722 from the reg_scan_data over to the program. */
4723 if (SvUTF8(data.longest_fixed)) {
4724 r->anchored_utf8 = data.longest_fixed;
4725 r->anchored_substr = NULL;
4727 r->anchored_substr = data.longest_fixed;
4728 r->anchored_utf8 = NULL;
4730 /* fixed_end_shift is how many chars that must be matched that
4731 follow this item. We calculate it ahead of time as once the
4732 lookbehind offset is added in we lose the ability to correctly
4734 ml = data.minlen_fixed ? *(data.minlen_fixed)
4735 : (I32)longest_fixed_length;
4736 r->anchored_end_shift = ml - data.offset_fixed
4737 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4738 + data.lookbehind_fixed;
4739 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4741 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4742 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4743 || (RExC_flags & RXf_PMf_MULTILINE)));
4744 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4747 r->anchored_substr = r->anchored_utf8 = NULL;
4748 SvREFCNT_dec(data.longest_fixed);
4749 longest_fixed_length = 0;
4752 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4753 ri->regstclass = NULL;
4754 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4756 && !(data.start_class->flags & ANYOF_EOS)
4757 && !cl_is_anything(data.start_class))
4759 const U32 n = add_data(pRExC_state, 1, "f");
4761 Newx(RExC_rxi->data->data[n], 1,
4762 struct regnode_charclass_class);
4763 StructCopy(data.start_class,
4764 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4765 struct regnode_charclass_class);
4766 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4767 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4768 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4769 regprop(r, sv, (regnode*)data.start_class);
4770 PerlIO_printf(Perl_debug_log,
4771 "synthetic stclass \"%s\".\n",
4772 SvPVX_const(sv));});
4775 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4776 if (longest_fixed_length > longest_float_length) {
4777 r->check_end_shift = r->anchored_end_shift;
4778 r->check_substr = r->anchored_substr;
4779 r->check_utf8 = r->anchored_utf8;
4780 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4781 if (r->extflags & RXf_ANCH_SINGLE)
4782 r->extflags |= RXf_NOSCAN;
4785 r->check_end_shift = r->float_end_shift;
4786 r->check_substr = r->float_substr;
4787 r->check_utf8 = r->float_utf8;
4788 r->check_offset_min = r->float_min_offset;
4789 r->check_offset_max = r->float_max_offset;
4791 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4792 This should be changed ASAP! */
4793 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4794 r->extflags |= RXf_USE_INTUIT;
4795 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4796 r->extflags |= RXf_INTUIT_TAIL;
4798 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4799 if ( (STRLEN)minlen < longest_float_length )
4800 minlen= longest_float_length;
4801 if ( (STRLEN)minlen < longest_fixed_length )
4802 minlen= longest_fixed_length;
4806 /* Several toplevels. Best we can is to set minlen. */
4808 struct regnode_charclass_class ch_class;
4811 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4813 scan = ri->program + 1;
4814 cl_init(pRExC_state, &ch_class);
4815 data.start_class = &ch_class;
4816 data.last_closep = &last_close;
4819 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4820 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4824 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4825 = r->float_substr = r->float_utf8 = NULL;
4826 if (!(data.start_class->flags & ANYOF_EOS)
4827 && !cl_is_anything(data.start_class))
4829 const U32 n = add_data(pRExC_state, 1, "f");
4831 Newx(RExC_rxi->data->data[n], 1,
4832 struct regnode_charclass_class);
4833 StructCopy(data.start_class,
4834 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4835 struct regnode_charclass_class);
4836 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4837 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4838 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4839 regprop(r, sv, (regnode*)data.start_class);
4840 PerlIO_printf(Perl_debug_log,
4841 "synthetic stclass \"%s\".\n",
4842 SvPVX_const(sv));});
4846 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4847 the "real" pattern. */
4849 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4850 (IV)minlen, (IV)r->minlen);
4852 r->minlenret = minlen;
4853 if (r->minlen < minlen)
4856 if (RExC_seen & REG_SEEN_GPOS)
4857 r->extflags |= RXf_GPOS_SEEN;
4858 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4859 r->extflags |= RXf_LOOKBEHIND_SEEN;
4860 if (RExC_seen & REG_SEEN_EVAL)
4861 r->extflags |= RXf_EVAL_SEEN;
4862 if (RExC_seen & REG_SEEN_CANY)
4863 r->extflags |= RXf_CANY_SEEN;
4864 if (RExC_seen & REG_SEEN_VERBARG)
4865 r->intflags |= PREGf_VERBARG_SEEN;
4866 if (RExC_seen & REG_SEEN_CUTGROUP)
4867 r->intflags |= PREGf_CUTGROUP_SEEN;
4868 if (RExC_paren_names)
4869 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4871 RXp_PAREN_NAMES(r) = NULL;
4873 #ifdef STUPID_PATTERN_CHECKS
4874 if (RX_PRELEN(rx) == 0)
4875 r->extflags |= RXf_NULL;
4876 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4877 /* XXX: this should happen BEFORE we compile */
4878 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4879 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4880 r->extflags |= RXf_WHITE;
4881 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4882 r->extflags |= RXf_START_ONLY;
4884 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4885 /* XXX: this should happen BEFORE we compile */
4886 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4888 regnode *first = ri->program + 1;
4890 U8 nop = OP(NEXTOPER(first));
4892 if (PL_regkind[fop] == NOTHING && nop == END)
4893 r->extflags |= RXf_NULL;
4894 else if (PL_regkind[fop] == BOL && nop == END)
4895 r->extflags |= RXf_START_ONLY;
4896 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4897 r->extflags |= RXf_WHITE;
4901 if (RExC_paren_names) {
4902 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4903 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4906 ri->name_list_idx = 0;
4908 if (RExC_recurse_count) {
4909 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4910 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4911 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4914 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4915 /* assume we don't need to swap parens around before we match */
4918 PerlIO_printf(Perl_debug_log,"Final program:\n");
4921 #ifdef RE_TRACK_PATTERN_OFFSETS
4922 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4923 const U32 len = ri->u.offsets[0];
4925 GET_RE_DEBUG_FLAGS_DECL;
4926 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4927 for (i = 1; i <= len; i++) {
4928 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4929 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4930 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4932 PerlIO_printf(Perl_debug_log, "\n");
4938 #undef RE_ENGINE_PTR
4942 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4945 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4947 PERL_UNUSED_ARG(value);
4949 if (flags & RXapif_FETCH) {
4950 return reg_named_buff_fetch(rx, key, flags);
4951 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4952 Perl_croak(aTHX_ "%s", PL_no_modify);
4954 } else if (flags & RXapif_EXISTS) {
4955 return reg_named_buff_exists(rx, key, flags)
4958 } else if (flags & RXapif_REGNAMES) {
4959 return reg_named_buff_all(rx, flags);
4960 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4961 return reg_named_buff_scalar(rx, flags);
4963 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4969 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4972 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4973 PERL_UNUSED_ARG(lastkey);
4975 if (flags & RXapif_FIRSTKEY)
4976 return reg_named_buff_firstkey(rx, flags);
4977 else if (flags & RXapif_NEXTKEY)
4978 return reg_named_buff_nextkey(rx, flags);
4980 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4986 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4989 AV *retarray = NULL;
4991 struct regexp *const rx = (struct regexp *)SvANY(r);
4993 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4995 if (flags & RXapif_ALL)
4998 if (rx && RXp_PAREN_NAMES(rx)) {
4999 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5002 SV* sv_dat=HeVAL(he_str);
5003 I32 *nums=(I32*)SvPVX(sv_dat);
5004 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5005 if ((I32)(rx->nparens) >= nums[i]
5006 && rx->offs[nums[i]].start != -1
5007 && rx->offs[nums[i]].end != -1)
5010 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5014 ret = newSVsv(&PL_sv_undef);
5017 av_push(retarray, ret);
5020 return newRV_noinc(MUTABLE_SV(retarray));
5027 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5030 struct regexp *const rx = (struct regexp *)SvANY(r);
5032 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5034 if (rx && RXp_PAREN_NAMES(rx)) {
5035 if (flags & RXapif_ALL) {
5036 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5038 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5052 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5054 struct regexp *const rx = (struct regexp *)SvANY(r);
5056 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5058 if ( rx && RXp_PAREN_NAMES(rx) ) {
5059 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5061 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5068 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5070 struct regexp *const rx = (struct regexp *)SvANY(r);
5071 GET_RE_DEBUG_FLAGS_DECL;
5073 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5075 if (rx && RXp_PAREN_NAMES(rx)) {
5076 HV *hv = RXp_PAREN_NAMES(rx);
5078 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5081 SV* sv_dat = HeVAL(temphe);
5082 I32 *nums = (I32*)SvPVX(sv_dat);
5083 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5084 if ((I32)(rx->lastparen) >= nums[i] &&
5085 rx->offs[nums[i]].start != -1 &&
5086 rx->offs[nums[i]].end != -1)
5092 if (parno || flags & RXapif_ALL) {
5093 return newSVhek(HeKEY_hek(temphe));
5101 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5106 struct regexp *const rx = (struct regexp *)SvANY(r);
5108 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5110 if (rx && RXp_PAREN_NAMES(rx)) {
5111 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5112 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5113 } else if (flags & RXapif_ONE) {
5114 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5115 av = MUTABLE_AV(SvRV(ret));
5116 length = av_len(av);
5118 return newSViv(length + 1);
5120 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5124 return &PL_sv_undef;
5128 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5130 struct regexp *const rx = (struct regexp *)SvANY(r);
5133 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5135 if (rx && RXp_PAREN_NAMES(rx)) {
5136 HV *hv= RXp_PAREN_NAMES(rx);
5138 (void)hv_iterinit(hv);
5139 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5142 SV* sv_dat = HeVAL(temphe);
5143 I32 *nums = (I32*)SvPVX(sv_dat);
5144 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5145 if ((I32)(rx->lastparen) >= nums[i] &&
5146 rx->offs[nums[i]].start != -1 &&
5147 rx->offs[nums[i]].end != -1)
5153 if (parno || flags & RXapif_ALL) {
5154 av_push(av, newSVhek(HeKEY_hek(temphe)));
5159 return newRV_noinc(MUTABLE_SV(av));
5163 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5166 struct regexp *const rx = (struct regexp *)SvANY(r);
5171 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5174 sv_setsv(sv,&PL_sv_undef);
5178 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5180 i = rx->offs[0].start;
5184 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5186 s = rx->subbeg + rx->offs[0].end;
5187 i = rx->sublen - rx->offs[0].end;
5190 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5191 (s1 = rx->offs[paren].start) != -1 &&
5192 (t1 = rx->offs[paren].end) != -1)
5196 s = rx->subbeg + s1;
5198 sv_setsv(sv,&PL_sv_undef);
5201 assert(rx->sublen >= (s - rx->subbeg) + i );
5203 const int oldtainted = PL_tainted;
5205 sv_setpvn(sv, s, i);
5206 PL_tainted = oldtainted;
5207 if ( (rx->extflags & RXf_CANY_SEEN)
5208 ? (RXp_MATCH_UTF8(rx)
5209 && (!i || is_utf8_string((U8*)s, i)))
5210 : (RXp_MATCH_UTF8(rx)) )
5217 if (RXp_MATCH_TAINTED(rx)) {
5218 if (SvTYPE(sv) >= SVt_PVMG) {
5219 MAGIC* const mg = SvMAGIC(sv);
5222 SvMAGIC_set(sv, mg->mg_moremagic);
5224 if ((mgt = SvMAGIC(sv))) {
5225 mg->mg_moremagic = mgt;
5226 SvMAGIC_set(sv, mg);
5236 sv_setsv(sv,&PL_sv_undef);
5242 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5243 SV const * const value)
5245 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5247 PERL_UNUSED_ARG(rx);
5248 PERL_UNUSED_ARG(paren);
5249 PERL_UNUSED_ARG(value);
5252 Perl_croak(aTHX_ "%s", PL_no_modify);
5256 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5259 struct regexp *const rx = (struct regexp *)SvANY(r);
5263 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5265 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5267 /* $` / ${^PREMATCH} */
5268 case RX_BUFF_IDX_PREMATCH:
5269 if (rx->offs[0].start != -1) {
5270 i = rx->offs[0].start;
5278 /* $' / ${^POSTMATCH} */
5279 case RX_BUFF_IDX_POSTMATCH:
5280 if (rx->offs[0].end != -1) {
5281 i = rx->sublen - rx->offs[0].end;
5283 s1 = rx->offs[0].end;
5289 /* $& / ${^MATCH}, $1, $2, ... */
5291 if (paren <= (I32)rx->nparens &&
5292 (s1 = rx->offs[paren].start) != -1 &&
5293 (t1 = rx->offs[paren].end) != -1)
5298 if (ckWARN(WARN_UNINITIALIZED))
5299 report_uninit((const SV *)sv);
5304 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5305 const char * const s = rx->subbeg + s1;
5310 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5317 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5319 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5320 PERL_UNUSED_ARG(rx);
5324 return newSVpvs("Regexp");
5327 /* Scans the name of a named buffer from the pattern.
5328 * If flags is REG_RSN_RETURN_NULL returns null.
5329 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5330 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5331 * to the parsed name as looked up in the RExC_paren_names hash.
5332 * If there is an error throws a vFAIL().. type exception.
5335 #define REG_RSN_RETURN_NULL 0
5336 #define REG_RSN_RETURN_NAME 1
5337 #define REG_RSN_RETURN_DATA 2
5340 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5342 char *name_start = RExC_parse;
5344 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5346 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5347 /* skip IDFIRST by using do...while */
5350 RExC_parse += UTF8SKIP(RExC_parse);
5351 } while (isALNUM_utf8((U8*)RExC_parse));
5355 } while (isALNUM(*RExC_parse));
5360 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5361 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5362 if ( flags == REG_RSN_RETURN_NAME)
5364 else if (flags==REG_RSN_RETURN_DATA) {
5367 if ( ! sv_name ) /* should not happen*/
5368 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5369 if (RExC_paren_names)
5370 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5372 sv_dat = HeVAL(he_str);
5374 vFAIL("Reference to nonexistent named group");
5378 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5385 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5386 int rem=(int)(RExC_end - RExC_parse); \
5395 if (RExC_lastparse!=RExC_parse) \
5396 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5399 iscut ? "..." : "<" \
5402 PerlIO_printf(Perl_debug_log,"%16s",""); \
5405 num = RExC_size + 1; \
5407 num=REG_NODE_NUM(RExC_emit); \
5408 if (RExC_lastnum!=num) \
5409 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5411 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5412 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5413 (int)((depth*2)), "", \
5417 RExC_lastparse=RExC_parse; \
5422 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5423 DEBUG_PARSE_MSG((funcname)); \
5424 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5426 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5427 DEBUG_PARSE_MSG((funcname)); \
5428 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5431 - reg - regular expression, i.e. main body or parenthesized thing
5433 * Caller must absorb opening parenthesis.
5435 * Combining parenthesis handling with the base level of regular expression
5436 * is a trifle forced, but the need to tie the tails of the branches to what
5437 * follows makes it hard to avoid.
5439 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5441 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5443 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5447 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5448 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5451 register regnode *ret; /* Will be the head of the group. */
5452 register regnode *br;
5453 register regnode *lastbr;
5454 register regnode *ender = NULL;
5455 register I32 parno = 0;
5457 U32 oregflags = RExC_flags;
5458 bool have_branch = 0;
5460 I32 freeze_paren = 0;
5461 I32 after_freeze = 0;
5463 /* for (?g), (?gc), and (?o) warnings; warning
5464 about (?c) will warn about (?g) -- japhy */
5466 #define WASTED_O 0x01
5467 #define WASTED_G 0x02
5468 #define WASTED_C 0x04
5469 #define WASTED_GC (0x02|0x04)
5470 I32 wastedflags = 0x00;
5472 char * parse_start = RExC_parse; /* MJD */
5473 char * const oregcomp_parse = RExC_parse;
5475 GET_RE_DEBUG_FLAGS_DECL;
5477 PERL_ARGS_ASSERT_REG;
5478 DEBUG_PARSE("reg ");
5480 *flagp = 0; /* Tentatively. */
5483 /* Make an OPEN node, if parenthesized. */
5485 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5486 char *start_verb = RExC_parse;
5487 STRLEN verb_len = 0;
5488 char *start_arg = NULL;
5489 unsigned char op = 0;
5491 int internal_argval = 0; /* internal_argval is only useful if !argok */
5492 while ( *RExC_parse && *RExC_parse != ')' ) {
5493 if ( *RExC_parse == ':' ) {
5494 start_arg = RExC_parse + 1;
5500 verb_len = RExC_parse - start_verb;
5503 while ( *RExC_parse && *RExC_parse != ')' )
5505 if ( *RExC_parse != ')' )
5506 vFAIL("Unterminated verb pattern argument");
5507 if ( RExC_parse == start_arg )
5510 if ( *RExC_parse != ')' )
5511 vFAIL("Unterminated verb pattern");
5514 switch ( *start_verb ) {
5515 case 'A': /* (*ACCEPT) */
5516 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5518 internal_argval = RExC_nestroot;
5521 case 'C': /* (*COMMIT) */
5522 if ( memEQs(start_verb,verb_len,"COMMIT") )
5525 case 'F': /* (*FAIL) */
5526 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5531 case ':': /* (*:NAME) */
5532 case 'M': /* (*MARK:NAME) */
5533 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5538 case 'P': /* (*PRUNE) */
5539 if ( memEQs(start_verb,verb_len,"PRUNE") )
5542 case 'S': /* (*SKIP) */
5543 if ( memEQs(start_verb,verb_len,"SKIP") )
5546 case 'T': /* (*THEN) */
5547 /* [19:06] <TimToady> :: is then */
5548 if ( memEQs(start_verb,verb_len,"THEN") ) {
5550 RExC_seen |= REG_SEEN_CUTGROUP;
5556 vFAIL3("Unknown verb pattern '%.*s'",
5557 verb_len, start_verb);
5560 if ( start_arg && internal_argval ) {
5561 vFAIL3("Verb pattern '%.*s' may not have an argument",
5562 verb_len, start_verb);
5563 } else if ( argok < 0 && !start_arg ) {
5564 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5565 verb_len, start_verb);
5567 ret = reganode(pRExC_state, op, internal_argval);
5568 if ( ! internal_argval && ! SIZE_ONLY ) {
5570 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5571 ARG(ret) = add_data( pRExC_state, 1, "S" );
5572 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5579 if (!internal_argval)
5580 RExC_seen |= REG_SEEN_VERBARG;
5581 } else if ( start_arg ) {
5582 vFAIL3("Verb pattern '%.*s' may not have an argument",
5583 verb_len, start_verb);
5585 ret = reg_node(pRExC_state, op);
5587 nextchar(pRExC_state);
5590 if (*RExC_parse == '?') { /* (?...) */
5591 bool is_logical = 0;
5592 const char * const seqstart = RExC_parse;
5595 paren = *RExC_parse++;
5596 ret = NULL; /* For look-ahead/behind. */
5599 case 'P': /* (?P...) variants for those used to PCRE/Python */
5600 paren = *RExC_parse++;
5601 if ( paren == '<') /* (?P<...>) named capture */
5603 else if (paren == '>') { /* (?P>name) named recursion */
5604 goto named_recursion;
5606 else if (paren == '=') { /* (?P=...) named backref */
5607 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5608 you change this make sure you change that */
5609 char* name_start = RExC_parse;
5611 SV *sv_dat = reg_scan_name(pRExC_state,
5612 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5613 if (RExC_parse == name_start || *RExC_parse != ')')
5614 vFAIL2("Sequence %.3s... not terminated",parse_start);
5617 num = add_data( pRExC_state, 1, "S" );
5618 RExC_rxi->data->data[num]=(void*)sv_dat;
5619 SvREFCNT_inc_simple_void(sv_dat);
5622 ret = reganode(pRExC_state,
5623 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5627 Set_Node_Offset(ret, parse_start+1);
5628 Set_Node_Cur_Length(ret); /* MJD */
5630 nextchar(pRExC_state);
5634 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5636 case '<': /* (?<...) */
5637 if (*RExC_parse == '!')
5639 else if (*RExC_parse != '=')
5645 case '\'': /* (?'...') */
5646 name_start= RExC_parse;
5647 svname = reg_scan_name(pRExC_state,
5648 SIZE_ONLY ? /* reverse test from the others */
5649 REG_RSN_RETURN_NAME :
5650 REG_RSN_RETURN_NULL);
5651 if (RExC_parse == name_start) {
5653 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5656 if (*RExC_parse != paren)
5657 vFAIL2("Sequence (?%c... not terminated",
5658 paren=='>' ? '<' : paren);
5662 if (!svname) /* shouldnt happen */
5664 "panic: reg_scan_name returned NULL");
5665 if (!RExC_paren_names) {
5666 RExC_paren_names= newHV();
5667 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5669 RExC_paren_name_list= newAV();
5670 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5673 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5675 sv_dat = HeVAL(he_str);
5677 /* croak baby croak */
5679 "panic: paren_name hash element allocation failed");
5680 } else if ( SvPOK(sv_dat) ) {
5681 /* (?|...) can mean we have dupes so scan to check
5682 its already been stored. Maybe a flag indicating
5683 we are inside such a construct would be useful,
5684 but the arrays are likely to be quite small, so
5685 for now we punt -- dmq */
5686 IV count = SvIV(sv_dat);
5687 I32 *pv = (I32*)SvPVX(sv_dat);
5689 for ( i = 0 ; i < count ; i++ ) {
5690 if ( pv[i] == RExC_npar ) {
5696 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5697 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5698 pv[count] = RExC_npar;
5699 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5702 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5703 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5705 SvIV_set(sv_dat, 1);
5708 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5709 SvREFCNT_dec(svname);
5712 /*sv_dump(sv_dat);*/
5714 nextchar(pRExC_state);
5716 goto capturing_parens;
5718 RExC_seen |= REG_SEEN_LOOKBEHIND;
5720 case '=': /* (?=...) */
5721 RExC_seen_zerolen++;
5723 case '!': /* (?!...) */
5724 RExC_seen_zerolen++;
5725 if (*RExC_parse == ')') {
5726 ret=reg_node(pRExC_state, OPFAIL);
5727 nextchar(pRExC_state);
5731 case '|': /* (?|...) */
5732 /* branch reset, behave like a (?:...) except that
5733 buffers in alternations share the same numbers */
5735 after_freeze = freeze_paren = RExC_npar;
5737 case ':': /* (?:...) */
5738 case '>': /* (?>...) */
5740 case '$': /* (?$...) */
5741 case '@': /* (?@...) */
5742 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5744 case '#': /* (?#...) */
5745 while (*RExC_parse && *RExC_parse != ')')
5747 if (*RExC_parse != ')')
5748 FAIL("Sequence (?#... not terminated");
5749 nextchar(pRExC_state);
5752 case '0' : /* (?0) */
5753 case 'R' : /* (?R) */
5754 if (*RExC_parse != ')')
5755 FAIL("Sequence (?R) not terminated");
5756 ret = reg_node(pRExC_state, GOSTART);
5757 *flagp |= POSTPONED;
5758 nextchar(pRExC_state);
5761 { /* named and numeric backreferences */
5763 case '&': /* (?&NAME) */
5764 parse_start = RExC_parse - 1;
5767 SV *sv_dat = reg_scan_name(pRExC_state,
5768 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5769 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5771 goto gen_recurse_regop;
5774 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5776 vFAIL("Illegal pattern");
5778 goto parse_recursion;
5780 case '-': /* (?-1) */
5781 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5782 RExC_parse--; /* rewind to let it be handled later */
5786 case '1': case '2': case '3': case '4': /* (?1) */
5787 case '5': case '6': case '7': case '8': case '9':
5790 num = atoi(RExC_parse);
5791 parse_start = RExC_parse - 1; /* MJD */
5792 if (*RExC_parse == '-')
5794 while (isDIGIT(*RExC_parse))
5796 if (*RExC_parse!=')')
5797 vFAIL("Expecting close bracket");
5800 if ( paren == '-' ) {
5802 Diagram of capture buffer numbering.
5803 Top line is the normal capture buffer numbers
5804 Botton line is the negative indexing as from
5808 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5812 num = RExC_npar + num;
5815 vFAIL("Reference to nonexistent group");
5817 } else if ( paren == '+' ) {
5818 num = RExC_npar + num - 1;
5821 ret = reganode(pRExC_state, GOSUB, num);
5823 if (num > (I32)RExC_rx->nparens) {
5825 vFAIL("Reference to nonexistent group");
5827 ARG2L_SET( ret, RExC_recurse_count++);
5829 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5830 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5834 RExC_seen |= REG_SEEN_RECURSE;
5835 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5836 Set_Node_Offset(ret, parse_start); /* MJD */
5838 *flagp |= POSTPONED;
5839 nextchar(pRExC_state);
5841 } /* named and numeric backreferences */
5844 case '?': /* (??...) */
5846 if (*RExC_parse != '{') {
5848 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5851 *flagp |= POSTPONED;
5852 paren = *RExC_parse++;
5854 case '{': /* (?{...}) */
5859 char *s = RExC_parse;
5861 RExC_seen_zerolen++;
5862 RExC_seen |= REG_SEEN_EVAL;
5863 while (count && (c = *RExC_parse)) {
5874 if (*RExC_parse != ')') {
5876 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5880 OP_4tree *sop, *rop;
5881 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5884 Perl_save_re_context(aTHX);
5885 rop = sv_compile_2op(sv, &sop, "re", &pad);
5886 sop->op_private |= OPpREFCOUNTED;
5887 /* re_dup will OpREFCNT_inc */
5888 OpREFCNT_set(sop, 1);
5891 n = add_data(pRExC_state, 3, "nop");
5892 RExC_rxi->data->data[n] = (void*)rop;
5893 RExC_rxi->data->data[n+1] = (void*)sop;
5894 RExC_rxi->data->data[n+2] = (void*)pad;
5897 else { /* First pass */
5898 if (PL_reginterp_cnt < ++RExC_seen_evals
5900 /* No compiled RE interpolated, has runtime
5901 components ===> unsafe. */
5902 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5903 if (PL_tainting && PL_tainted)
5904 FAIL("Eval-group in insecure regular expression");
5905 #if PERL_VERSION > 8
5906 if (IN_PERL_COMPILETIME)
5911 nextchar(pRExC_state);
5913 ret = reg_node(pRExC_state, LOGICAL);
5916 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5917 /* deal with the length of this later - MJD */
5920 ret = reganode(pRExC_state, EVAL, n);
5921 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5922 Set_Node_Offset(ret, parse_start);
5925 case '(': /* (?(?{...})...) and (?(?=...)...) */
5928 if (RExC_parse[0] == '?') { /* (?(?...)) */
5929 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5930 || RExC_parse[1] == '<'
5931 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5934 ret = reg_node(pRExC_state, LOGICAL);
5937 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5941 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5942 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5944 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5945 char *name_start= RExC_parse++;
5947 SV *sv_dat=reg_scan_name(pRExC_state,
5948 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5949 if (RExC_parse == name_start || *RExC_parse != ch)
5950 vFAIL2("Sequence (?(%c... not terminated",
5951 (ch == '>' ? '<' : ch));
5954 num = add_data( pRExC_state, 1, "S" );
5955 RExC_rxi->data->data[num]=(void*)sv_dat;
5956 SvREFCNT_inc_simple_void(sv_dat);
5958 ret = reganode(pRExC_state,NGROUPP,num);
5959 goto insert_if_check_paren;
5961 else if (RExC_parse[0] == 'D' &&
5962 RExC_parse[1] == 'E' &&
5963 RExC_parse[2] == 'F' &&
5964 RExC_parse[3] == 'I' &&
5965 RExC_parse[4] == 'N' &&
5966 RExC_parse[5] == 'E')
5968 ret = reganode(pRExC_state,DEFINEP,0);
5971 goto insert_if_check_paren;
5973 else if (RExC_parse[0] == 'R') {
5976 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5977 parno = atoi(RExC_parse++);
5978 while (isDIGIT(*RExC_parse))
5980 } else if (RExC_parse[0] == '&') {
5983 sv_dat = reg_scan_name(pRExC_state,
5984 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5985 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5987 ret = reganode(pRExC_state,INSUBP,parno);
5988 goto insert_if_check_paren;
5990 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5993 parno = atoi(RExC_parse++);
5995 while (isDIGIT(*RExC_parse))
5997 ret = reganode(pRExC_state, GROUPP, parno);
5999 insert_if_check_paren:
6000 if ((c = *nextchar(pRExC_state)) != ')')
6001 vFAIL("Switch condition not recognized");
6003 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6004 br = regbranch(pRExC_state, &flags, 1,depth+1);
6006 br = reganode(pRExC_state, LONGJMP, 0);
6008 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6009 c = *nextchar(pRExC_state);
6014 vFAIL("(?(DEFINE)....) does not allow branches");
6015 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6016 regbranch(pRExC_state, &flags, 1,depth+1);
6017 REGTAIL(pRExC_state, ret, lastbr);
6020 c = *nextchar(pRExC_state);
6025 vFAIL("Switch (?(condition)... contains too many branches");
6026 ender = reg_node(pRExC_state, TAIL);
6027 REGTAIL(pRExC_state, br, ender);
6029 REGTAIL(pRExC_state, lastbr, ender);
6030 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6033 REGTAIL(pRExC_state, ret, ender);
6034 RExC_size++; /* XXX WHY do we need this?!!
6035 For large programs it seems to be required
6036 but I can't figure out why. -- dmq*/
6040 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6044 RExC_parse--; /* for vFAIL to print correctly */
6045 vFAIL("Sequence (? incomplete");
6049 parse_flags: /* (?i) */
6051 U32 posflags = 0, negflags = 0;
6052 U32 *flagsp = &posflags;
6054 while (*RExC_parse) {
6055 /* && strchr("iogcmsx", *RExC_parse) */
6056 /* (?g), (?gc) and (?o) are useless here
6057 and must be globally applied -- japhy */
6058 switch (*RExC_parse) {
6059 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6060 case ONCE_PAT_MOD: /* 'o' */
6061 case GLOBAL_PAT_MOD: /* 'g' */
6062 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6063 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6064 if (! (wastedflags & wflagbit) ) {
6065 wastedflags |= wflagbit;
6068 "Useless (%s%c) - %suse /%c modifier",
6069 flagsp == &negflags ? "?-" : "?",
6071 flagsp == &negflags ? "don't " : "",
6078 case CONTINUE_PAT_MOD: /* 'c' */
6079 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6080 if (! (wastedflags & WASTED_C) ) {
6081 wastedflags |= WASTED_GC;
6084 "Useless (%sc) - %suse /gc modifier",
6085 flagsp == &negflags ? "?-" : "?",
6086 flagsp == &negflags ? "don't " : ""
6091 case KEEPCOPY_PAT_MOD: /* 'p' */
6092 if (flagsp == &negflags) {
6094 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6096 *flagsp |= RXf_PMf_KEEPCOPY;
6100 if (flagsp == &negflags) {
6102 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6106 wastedflags = 0; /* reset so (?g-c) warns twice */
6112 RExC_flags |= posflags;
6113 RExC_flags &= ~negflags;
6115 oregflags |= posflags;
6116 oregflags &= ~negflags;
6118 nextchar(pRExC_state);
6129 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6134 }} /* one for the default block, one for the switch */
6141 ret = reganode(pRExC_state, OPEN, parno);
6144 RExC_nestroot = parno;
6145 if (RExC_seen & REG_SEEN_RECURSE
6146 && !RExC_open_parens[parno-1])
6148 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6149 "Setting open paren #%"IVdf" to %d\n",
6150 (IV)parno, REG_NODE_NUM(ret)));
6151 RExC_open_parens[parno-1]= ret;
6154 Set_Node_Length(ret, 1); /* MJD */
6155 Set_Node_Offset(ret, RExC_parse); /* MJD */
6163 /* Pick up the branches, linking them together. */
6164 parse_start = RExC_parse; /* MJD */
6165 br = regbranch(pRExC_state, &flags, 1,depth+1);
6168 if (RExC_npar > after_freeze)
6169 after_freeze = RExC_npar;
6170 RExC_npar = freeze_paren;
6173 /* branch_len = (paren != 0); */
6177 if (*RExC_parse == '|') {
6178 if (!SIZE_ONLY && RExC_extralen) {
6179 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6182 reginsert(pRExC_state, BRANCH, br, depth+1);
6183 Set_Node_Length(br, paren != 0);
6184 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6188 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6190 else if (paren == ':') {
6191 *flagp |= flags&SIMPLE;
6193 if (is_open) { /* Starts with OPEN. */
6194 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6196 else if (paren != '?') /* Not Conditional */
6198 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6200 while (*RExC_parse == '|') {
6201 if (!SIZE_ONLY && RExC_extralen) {
6202 ender = reganode(pRExC_state, LONGJMP,0);
6203 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6206 RExC_extralen += 2; /* Account for LONGJMP. */
6207 nextchar(pRExC_state);
6209 if (RExC_npar > after_freeze)
6210 after_freeze = RExC_npar;
6211 RExC_npar = freeze_paren;
6213 br = regbranch(pRExC_state, &flags, 0, depth+1);
6217 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6219 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6222 if (have_branch || paren != ':') {
6223 /* Make a closing node, and hook it on the end. */
6226 ender = reg_node(pRExC_state, TAIL);
6229 ender = reganode(pRExC_state, CLOSE, parno);
6230 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6231 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6232 "Setting close paren #%"IVdf" to %d\n",
6233 (IV)parno, REG_NODE_NUM(ender)));
6234 RExC_close_parens[parno-1]= ender;
6235 if (RExC_nestroot == parno)
6238 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6239 Set_Node_Length(ender,1); /* MJD */
6245 *flagp &= ~HASWIDTH;
6248 ender = reg_node(pRExC_state, SUCCEED);
6251 ender = reg_node(pRExC_state, END);
6253 assert(!RExC_opend); /* there can only be one! */
6258 REGTAIL(pRExC_state, lastbr, ender);
6260 if (have_branch && !SIZE_ONLY) {
6262 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6264 /* Hook the tails of the branches to the closing node. */
6265 for (br = ret; br; br = regnext(br)) {
6266 const U8 op = PL_regkind[OP(br)];
6268 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6270 else if (op == BRANCHJ) {
6271 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6279 static const char parens[] = "=!<,>";
6281 if (paren && (p = strchr(parens, paren))) {
6282 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6283 int flag = (p - parens) > 1;
6286 node = SUSPEND, flag = 0;
6287 reginsert(pRExC_state, node,ret, depth+1);
6288 Set_Node_Cur_Length(ret);
6289 Set_Node_Offset(ret, parse_start + 1);
6291 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6295 /* Check for proper termination. */
6297 RExC_flags = oregflags;
6298 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6299 RExC_parse = oregcomp_parse;
6300 vFAIL("Unmatched (");
6303 else if (!paren && RExC_parse < RExC_end) {
6304 if (*RExC_parse == ')') {
6306 vFAIL("Unmatched )");
6309 FAIL("Junk on end of regexp"); /* "Can't happen". */
6313 RExC_npar = after_freeze;
6318 - regbranch - one alternative of an | operator
6320 * Implements the concatenation operator.
6323 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6326 register regnode *ret;
6327 register regnode *chain = NULL;
6328 register regnode *latest;
6329 I32 flags = 0, c = 0;
6330 GET_RE_DEBUG_FLAGS_DECL;
6332 PERL_ARGS_ASSERT_REGBRANCH;
6334 DEBUG_PARSE("brnc");
6339 if (!SIZE_ONLY && RExC_extralen)
6340 ret = reganode(pRExC_state, BRANCHJ,0);
6342 ret = reg_node(pRExC_state, BRANCH);
6343 Set_Node_Length(ret, 1);
6347 if (!first && SIZE_ONLY)
6348 RExC_extralen += 1; /* BRANCHJ */
6350 *flagp = WORST; /* Tentatively. */
6353 nextchar(pRExC_state);
6354 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6356 latest = regpiece(pRExC_state, &flags,depth+1);
6357 if (latest == NULL) {
6358 if (flags & TRYAGAIN)
6362 else if (ret == NULL)
6364 *flagp |= flags&(HASWIDTH|POSTPONED);
6365 if (chain == NULL) /* First piece. */
6366 *flagp |= flags&SPSTART;
6369 REGTAIL(pRExC_state, chain, latest);
6374 if (chain == NULL) { /* Loop ran zero times. */
6375 chain = reg_node(pRExC_state, NOTHING);
6380 *flagp |= flags&SIMPLE;
6387 - regpiece - something followed by possible [*+?]
6389 * Note that the branching code sequences used for ? and the general cases
6390 * of * and + are somewhat optimized: they use the same NOTHING node as
6391 * both the endmarker for their branch list and the body of the last branch.
6392 * It might seem that this node could be dispensed with entirely, but the
6393 * endmarker role is not redundant.
6396 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6399 register regnode *ret;
6401 register char *next;
6403 const char * const origparse = RExC_parse;
6405 I32 max = REG_INFTY;
6407 const char *maxpos = NULL;
6408 GET_RE_DEBUG_FLAGS_DECL;
6410 PERL_ARGS_ASSERT_REGPIECE;
6412 DEBUG_PARSE("piec");
6414 ret = regatom(pRExC_state, &flags,depth+1);
6416 if (flags & TRYAGAIN)
6423 if (op == '{' && regcurly(RExC_parse)) {
6425 parse_start = RExC_parse; /* MJD */
6426 next = RExC_parse + 1;
6427 while (isDIGIT(*next) || *next == ',') {
6436 if (*next == '}') { /* got one */
6440 min = atoi(RExC_parse);
6444 maxpos = RExC_parse;
6446 if (!max && *maxpos != '0')
6447 max = REG_INFTY; /* meaning "infinity" */
6448 else if (max >= REG_INFTY)
6449 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6451 nextchar(pRExC_state);
6454 if ((flags&SIMPLE)) {
6455 RExC_naughty += 2 + RExC_naughty / 2;
6456 reginsert(pRExC_state, CURLY, ret, depth+1);
6457 Set_Node_Offset(ret, parse_start+1); /* MJD */
6458 Set_Node_Cur_Length(ret);
6461 regnode * const w = reg_node(pRExC_state, WHILEM);
6464 REGTAIL(pRExC_state, ret, w);
6465 if (!SIZE_ONLY && RExC_extralen) {
6466 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6467 reginsert(pRExC_state, NOTHING,ret, depth+1);
6468 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6470 reginsert(pRExC_state, CURLYX,ret, depth+1);
6472 Set_Node_Offset(ret, parse_start+1);
6473 Set_Node_Length(ret,
6474 op == '{' ? (RExC_parse - parse_start) : 1);
6476 if (!SIZE_ONLY && RExC_extralen)
6477 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6478 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6480 RExC_whilem_seen++, RExC_extralen += 3;
6481 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6490 vFAIL("Can't do {n,m} with n > m");
6492 ARG1_SET(ret, (U16)min);
6493 ARG2_SET(ret, (U16)max);
6505 #if 0 /* Now runtime fix should be reliable. */
6507 /* if this is reinstated, don't forget to put this back into perldiag:
6509 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6511 (F) The part of the regexp subject to either the * or + quantifier
6512 could match an empty string. The {#} shows in the regular
6513 expression about where the problem was discovered.
6517 if (!(flags&HASWIDTH) && op != '?')
6518 vFAIL("Regexp *+ operand could be empty");
6521 parse_start = RExC_parse;
6522 nextchar(pRExC_state);
6524 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6526 if (op == '*' && (flags&SIMPLE)) {
6527 reginsert(pRExC_state, STAR, ret, depth+1);
6531 else if (op == '*') {
6535 else if (op == '+' && (flags&SIMPLE)) {
6536 reginsert(pRExC_state, PLUS, ret, depth+1);
6540 else if (op == '+') {
6544 else if (op == '?') {
6549 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6550 ckWARN3reg(RExC_parse,
6551 "%.*s matches null string many times",
6552 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6556 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6557 nextchar(pRExC_state);
6558 reginsert(pRExC_state, MINMOD, ret, depth+1);
6559 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6561 #ifndef REG_ALLOW_MINMOD_SUSPEND
6564 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6566 nextchar(pRExC_state);
6567 ender = reg_node(pRExC_state, SUCCEED);
6568 REGTAIL(pRExC_state, ret, ender);
6569 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6571 ender = reg_node(pRExC_state, TAIL);
6572 REGTAIL(pRExC_state, ret, ender);
6576 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6578 vFAIL("Nested quantifiers");
6585 /* reg_namedseq(pRExC_state,UVp)
6587 This is expected to be called by a parser routine that has
6588 recognized '\N' and needs to handle the rest. RExC_parse is
6589 expected to point at the first char following the N at the time
6592 The \N may be inside (indicated by valuep not being NULL) or outside a
6595 \N may begin either a named sequence, or if outside a character class, mean
6596 to match a non-newline. For non single-quoted regexes, the tokenizer has
6597 attempted to decide which, and in the case of a named sequence converted it
6598 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6599 where c1... are the characters in the sequence. For single-quoted regexes,
6600 the tokenizer passes the \N sequence through unchanged; this code will not
6601 attempt to determine this nor expand those. The net effect is that if the
6602 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6603 signals that this \N occurrence means to match a non-newline.
6605 Only the \N{U+...} form should occur in a character class, for the same
6606 reason that '.' inside a character class means to just match a period: it
6607 just doesn't make sense.
6609 If valuep is non-null then it is assumed that we are parsing inside
6610 of a charclass definition and the first codepoint in the resolved
6611 string is returned via *valuep and the routine will return NULL.
6612 In this mode if a multichar string is returned from the charnames
6613 handler, a warning will be issued, and only the first char in the
6614 sequence will be examined. If the string returned is zero length
6615 then the value of *valuep is undefined and NON-NULL will
6616 be returned to indicate failure. (This will NOT be a valid pointer
6619 If valuep is null then it is assumed that we are parsing normal text and a
6620 new EXACT node is inserted into the program containing the resolved string,
6621 and a pointer to the new node is returned. But if the string is zero length
6622 a NOTHING node is emitted instead.
6624 On success RExC_parse is set to the char following the endbrace.
6625 Parsing failures will generate a fatal error via vFAIL(...)
6628 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6630 char * endbrace; /* '}' following the name */
6631 regnode *ret = NULL;
6633 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6637 GET_RE_DEBUG_FLAGS_DECL;
6639 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6643 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6644 * modifier. The other meaning does not */
6645 p = (RExC_flags & RXf_PMf_EXTENDED)
6646 ? regwhite( pRExC_state, RExC_parse )
6649 /* Disambiguate between \N meaning a named character versus \N meaning
6650 * [^\n]. The former is assumed when it can't be the latter. */
6651 if (*p != '{' || regcurly(p)) {
6654 /* no bare \N in a charclass */
6655 vFAIL("\\N in a character class must be a named character: \\N{...}");
6657 nextchar(pRExC_state);
6658 ret = reg_node(pRExC_state, REG_ANY);
6659 *flagp |= HASWIDTH|SIMPLE;
6662 Set_Node_Length(ret, 1); /* MJD */
6666 /* Here, we have decided it should be a named sequence */
6668 /* The test above made sure that the next real character is a '{', but
6669 * under the /x modifier, it could be separated by space (or a comment and
6670 * \n) and this is not allowed (for consistency with \x{...} and the
6671 * tokenizer handling of \N{NAME}). */
6672 if (*RExC_parse != '{') {
6673 vFAIL("Missing braces on \\N{}");
6676 RExC_parse++; /* Skip past the '{' */
6678 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6679 || ! (endbrace == RExC_parse /* nothing between the {} */
6680 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6681 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6683 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6684 vFAIL("\\N{NAME} must be resolved by the lexer");
6687 if (endbrace == RExC_parse) { /* empty: \N{} */
6689 RExC_parse = endbrace + 1;
6690 return reg_node(pRExC_state,NOTHING);
6694 ckWARNreg(RExC_parse,
6695 "Ignoring zero length \\N{} in character class"
6697 RExC_parse = endbrace + 1;
6700 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6703 RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6704 RExC_parse += 2; /* Skip past the 'U+' */
6706 if (valuep) { /* In a bracketed char class */
6707 /* We only pay attention to the first char of
6708 multichar strings being returned. I kinda wonder
6709 if this makes sense as it does change the behaviour
6710 from earlier versions, OTOH that behaviour was broken
6711 as well. XXX Solution is to recharacterize as
6712 [rest-of-class]|multi1|multi2... */
6714 STRLEN length_of_hex;
6715 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6716 | PERL_SCAN_DISALLOW_PREFIX
6717 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6719 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6720 if (endchar < endbrace) {
6721 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6724 length_of_hex = (STRLEN)(endchar - RExC_parse);
6725 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6727 /* The tokenizer should have guaranteed validity, but it's possible to
6728 * bypass it by using single quoting, so check */
6729 if (length_of_hex == 0
6730 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6732 RExC_parse += length_of_hex; /* Includes all the valid */
6733 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6734 ? UTF8SKIP(RExC_parse)
6736 /* Guard against malformed utf8 */
6737 if (RExC_parse >= endchar) RExC_parse = endchar;
6738 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6741 RExC_parse = endbrace + 1;
6742 if (endchar == endbrace) return NULL;
6744 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6746 else { /* Not a char class */
6747 char *s; /* String to put in generated EXACT node */
6748 STRLEN len = 0; /* Its current length */
6749 char *endchar; /* Points to '.' or '}' ending cur char in the input
6752 ret = reg_node(pRExC_state,
6753 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6756 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6757 * the input which is of the form now 'c1.c2.c3...}' until find the
6758 * ending brace or exeed length 255. The characters that exceed this
6759 * limit are dropped. The limit could be relaxed should it become
6760 * desirable by reparsing this as (?:\N{NAME}), so could generate
6761 * multiple EXACT nodes, as is done for just regular input. But this
6762 * is primarily a named character, and not intended to be a huge long
6763 * string, so 255 bytes should be good enough */
6765 STRLEN length_of_hex;
6766 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6767 | PERL_SCAN_DISALLOW_PREFIX
6768 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6769 UV cp; /* Ord of current character */
6771 /* Code points are separated by dots. If none, there is only one
6772 * code point, and is terminated by the brace */
6773 endchar = RExC_parse + strcspn(RExC_parse, ".}");
6775 /* The values are Unicode even on EBCDIC machines */
6776 length_of_hex = (STRLEN)(endchar - RExC_parse);
6777 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6778 if ( length_of_hex == 0
6779 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6781 RExC_parse += length_of_hex; /* Includes all the valid */
6782 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6783 ? UTF8SKIP(RExC_parse)
6785 /* Guard against malformed utf8 */
6786 if (RExC_parse >= endchar) RExC_parse = endchar;
6787 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6790 if (! FOLD) { /* Not folding, just append to the string */
6793 /* Quit before adding this character if would exceed limit */
6794 if (len + UNISKIP(cp) > U8_MAX) break;
6796 unilen = reguni(pRExC_state, cp, s);
6801 } else { /* Folding, output the folded equivalent */
6802 STRLEN foldlen,numlen;
6803 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6804 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6806 /* Quit before exceeding size limit */
6807 if (len + foldlen > U8_MAX) break;
6809 for (foldbuf = tmpbuf;
6813 cp = utf8_to_uvchr(foldbuf, &numlen);
6815 const STRLEN unilen = reguni(pRExC_state, cp, s);
6818 /* In EBCDIC the numlen and unilen can differ. */
6820 if (numlen >= foldlen)
6824 break; /* "Can't happen." */
6828 /* Point to the beginning of the next character in the sequence. */
6829 RExC_parse = endchar + 1;
6831 /* Quit if no more characters */
6832 if (RExC_parse >= endbrace) break;
6837 if (RExC_parse < endbrace) {
6838 ckWARNreg(RExC_parse - 1,
6839 "Using just the first characters returned by \\N{}");
6842 RExC_size += STR_SZ(len);
6845 RExC_emit += STR_SZ(len);
6848 RExC_parse = endbrace + 1;
6850 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6851 with malformed in t/re/pat_advanced.t */
6853 Set_Node_Cur_Length(ret); /* MJD */
6854 nextchar(pRExC_state);
6864 * It returns the code point in utf8 for the value in *encp.
6865 * value: a code value in the source encoding
6866 * encp: a pointer to an Encode object
6868 * If the result from Encode is not a single character,
6869 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6872 S_reg_recode(pTHX_ const char value, SV **encp)
6875 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6876 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6877 const STRLEN newlen = SvCUR(sv);
6878 UV uv = UNICODE_REPLACEMENT;
6880 PERL_ARGS_ASSERT_REG_RECODE;
6884 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6887 if (!newlen || numlen != newlen) {
6888 uv = UNICODE_REPLACEMENT;
6896 - regatom - the lowest level
6898 Try to identify anything special at the start of the pattern. If there
6899 is, then handle it as required. This may involve generating a single regop,
6900 such as for an assertion; or it may involve recursing, such as to
6901 handle a () structure.
6903 If the string doesn't start with something special then we gobble up
6904 as much literal text as we can.
6906 Once we have been able to handle whatever type of thing started the
6907 sequence, we return.
6909 Note: we have to be careful with escapes, as they can be both literal
6910 and special, and in the case of \10 and friends can either, depending
6911 on context. Specifically there are two seperate switches for handling
6912 escape sequences, with the one for handling literal escapes requiring
6913 a dummy entry for all of the special escapes that are actually handled
6918 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6921 register regnode *ret = NULL;
6923 char *parse_start = RExC_parse;
6924 GET_RE_DEBUG_FLAGS_DECL;
6925 DEBUG_PARSE("atom");
6926 *flagp = WORST; /* Tentatively. */
6928 PERL_ARGS_ASSERT_REGATOM;
6931 switch ((U8)*RExC_parse) {
6933 RExC_seen_zerolen++;
6934 nextchar(pRExC_state);
6935 if (RExC_flags & RXf_PMf_MULTILINE)
6936 ret = reg_node(pRExC_state, MBOL);
6937 else if (RExC_flags & RXf_PMf_SINGLELINE)
6938 ret = reg_node(pRExC_state, SBOL);
6940 ret = reg_node(pRExC_state, BOL);
6941 Set_Node_Length(ret, 1); /* MJD */
6944 nextchar(pRExC_state);
6946 RExC_seen_zerolen++;
6947 if (RExC_flags & RXf_PMf_MULTILINE)
6948 ret = reg_node(pRExC_state, MEOL);
6949 else if (RExC_flags & RXf_PMf_SINGLELINE)
6950 ret = reg_node(pRExC_state, SEOL);
6952 ret = reg_node(pRExC_state, EOL);
6953 Set_Node_Length(ret, 1); /* MJD */
6956 nextchar(pRExC_state);
6957 if (RExC_flags & RXf_PMf_SINGLELINE)
6958 ret = reg_node(pRExC_state, SANY);
6960 ret = reg_node(pRExC_state, REG_ANY);
6961 *flagp |= HASWIDTH|SIMPLE;
6963 Set_Node_Length(ret, 1); /* MJD */
6967 char * const oregcomp_parse = ++RExC_parse;
6968 ret = regclass(pRExC_state,depth+1);
6969 if (*RExC_parse != ']') {
6970 RExC_parse = oregcomp_parse;
6971 vFAIL("Unmatched [");
6973 nextchar(pRExC_state);
6974 *flagp |= HASWIDTH|SIMPLE;
6975 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6979 nextchar(pRExC_state);
6980 ret = reg(pRExC_state, 1, &flags,depth+1);
6982 if (flags & TRYAGAIN) {
6983 if (RExC_parse == RExC_end) {
6984 /* Make parent create an empty node if needed. */
6992 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6996 if (flags & TRYAGAIN) {
7000 vFAIL("Internal urp");
7001 /* Supposed to be caught earlier. */
7004 if (!regcurly(RExC_parse)) {
7013 vFAIL("Quantifier follows nothing");
7021 len=0; /* silence a spurious compiler warning */
7022 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7023 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7024 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7025 ret = reganode(pRExC_state, FOLDCHAR, cp);
7026 Set_Node_Length(ret, 1); /* MJD */
7027 nextchar(pRExC_state); /* kill whitespace under /x */
7035 This switch handles escape sequences that resolve to some kind
7036 of special regop and not to literal text. Escape sequnces that
7037 resolve to literal text are handled below in the switch marked
7040 Every entry in this switch *must* have a corresponding entry
7041 in the literal escape switch. However, the opposite is not
7042 required, as the default for this switch is to jump to the
7043 literal text handling code.
7045 switch ((U8)*++RExC_parse) {
7050 /* Special Escapes */
7052 RExC_seen_zerolen++;
7053 ret = reg_node(pRExC_state, SBOL);
7055 goto finish_meta_pat;
7057 ret = reg_node(pRExC_state, GPOS);
7058 RExC_seen |= REG_SEEN_GPOS;
7060 goto finish_meta_pat;
7062 RExC_seen_zerolen++;
7063 ret = reg_node(pRExC_state, KEEPS);
7065 /* XXX:dmq : disabling in-place substitution seems to
7066 * be necessary here to avoid cases of memory corruption, as
7067 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7069 RExC_seen |= REG_SEEN_LOOKBEHIND;
7070 goto finish_meta_pat;
7072 ret = reg_node(pRExC_state, SEOL);
7074 RExC_seen_zerolen++; /* Do not optimize RE away */
7075 goto finish_meta_pat;
7077 ret = reg_node(pRExC_state, EOS);
7079 RExC_seen_zerolen++; /* Do not optimize RE away */
7080 goto finish_meta_pat;
7082 ret = reg_node(pRExC_state, CANY);
7083 RExC_seen |= REG_SEEN_CANY;
7084 *flagp |= HASWIDTH|SIMPLE;
7085 goto finish_meta_pat;
7087 ret = reg_node(pRExC_state, CLUMP);
7089 goto finish_meta_pat;
7091 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7092 *flagp |= HASWIDTH|SIMPLE;
7093 goto finish_meta_pat;
7095 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7096 *flagp |= HASWIDTH|SIMPLE;
7097 goto finish_meta_pat;
7099 RExC_seen_zerolen++;
7100 RExC_seen |= REG_SEEN_LOOKBEHIND;
7101 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7103 goto finish_meta_pat;
7105 RExC_seen_zerolen++;
7106 RExC_seen |= REG_SEEN_LOOKBEHIND;
7107 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7109 goto finish_meta_pat;
7111 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7112 *flagp |= HASWIDTH|SIMPLE;
7113 goto finish_meta_pat;
7115 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7116 *flagp |= HASWIDTH|SIMPLE;
7117 goto finish_meta_pat;
7119 ret = reg_node(pRExC_state, DIGIT);
7120 *flagp |= HASWIDTH|SIMPLE;
7121 goto finish_meta_pat;
7123 ret = reg_node(pRExC_state, NDIGIT);
7124 *flagp |= HASWIDTH|SIMPLE;
7125 goto finish_meta_pat;
7127 ret = reg_node(pRExC_state, LNBREAK);
7128 *flagp |= HASWIDTH|SIMPLE;
7129 goto finish_meta_pat;
7131 ret = reg_node(pRExC_state, HORIZWS);
7132 *flagp |= HASWIDTH|SIMPLE;
7133 goto finish_meta_pat;
7135 ret = reg_node(pRExC_state, NHORIZWS);
7136 *flagp |= HASWIDTH|SIMPLE;
7137 goto finish_meta_pat;
7139 ret = reg_node(pRExC_state, VERTWS);
7140 *flagp |= HASWIDTH|SIMPLE;
7141 goto finish_meta_pat;
7143 ret = reg_node(pRExC_state, NVERTWS);
7144 *flagp |= HASWIDTH|SIMPLE;
7146 nextchar(pRExC_state);
7147 Set_Node_Length(ret, 2); /* MJD */
7152 char* const oldregxend = RExC_end;
7154 char* parse_start = RExC_parse - 2;
7157 if (RExC_parse[1] == '{') {
7158 /* a lovely hack--pretend we saw [\pX] instead */
7159 RExC_end = strchr(RExC_parse, '}');
7161 const U8 c = (U8)*RExC_parse;
7163 RExC_end = oldregxend;
7164 vFAIL2("Missing right brace on \\%c{}", c);
7169 RExC_end = RExC_parse + 2;
7170 if (RExC_end > oldregxend)
7171 RExC_end = oldregxend;
7175 ret = regclass(pRExC_state,depth+1);
7177 RExC_end = oldregxend;
7180 Set_Node_Offset(ret, parse_start + 2);
7181 Set_Node_Cur_Length(ret);
7182 nextchar(pRExC_state);
7183 *flagp |= HASWIDTH|SIMPLE;
7187 /* Handle \N and \N{NAME} here and not below because it can be
7188 multicharacter. join_exact() will join them up later on.
7189 Also this makes sure that things like /\N{BLAH}+/ and
7190 \N{BLAH} being multi char Just Happen. dmq*/
7192 ret= reg_namedseq(pRExC_state, NULL, flagp);
7194 case 'k': /* Handle \k<NAME> and \k'NAME' */
7197 char ch= RExC_parse[1];
7198 if (ch != '<' && ch != '\'' && ch != '{') {
7200 vFAIL2("Sequence %.2s... not terminated",parse_start);
7202 /* this pretty much dupes the code for (?P=...) in reg(), if
7203 you change this make sure you change that */
7204 char* name_start = (RExC_parse += 2);
7206 SV *sv_dat = reg_scan_name(pRExC_state,
7207 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7208 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7209 if (RExC_parse == name_start || *RExC_parse != ch)
7210 vFAIL2("Sequence %.3s... not terminated",parse_start);
7213 num = add_data( pRExC_state, 1, "S" );
7214 RExC_rxi->data->data[num]=(void*)sv_dat;
7215 SvREFCNT_inc_simple_void(sv_dat);
7219 ret = reganode(pRExC_state,
7220 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7224 /* override incorrect value set in reganode MJD */
7225 Set_Node_Offset(ret, parse_start+1);
7226 Set_Node_Cur_Length(ret); /* MJD */
7227 nextchar(pRExC_state);
7233 case '1': case '2': case '3': case '4':
7234 case '5': case '6': case '7': case '8': case '9':
7237 bool isg = *RExC_parse == 'g';
7242 if (*RExC_parse == '{') {
7246 if (*RExC_parse == '-') {
7250 if (hasbrace && !isDIGIT(*RExC_parse)) {
7251 if (isrel) RExC_parse--;
7253 goto parse_named_seq;
7255 num = atoi(RExC_parse);
7256 if (isg && num == 0)
7257 vFAIL("Reference to invalid group 0");
7259 num = RExC_npar - num;
7261 vFAIL("Reference to nonexistent or unclosed group");
7263 if (!isg && num > 9 && num >= RExC_npar)
7266 char * const parse_start = RExC_parse - 1; /* MJD */
7267 while (isDIGIT(*RExC_parse))
7269 if (parse_start == RExC_parse - 1)
7270 vFAIL("Unterminated \\g... pattern");
7272 if (*RExC_parse != '}')
7273 vFAIL("Unterminated \\g{...} pattern");
7277 if (num > (I32)RExC_rx->nparens)
7278 vFAIL("Reference to nonexistent group");
7281 ret = reganode(pRExC_state,
7282 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7286 /* override incorrect value set in reganode MJD */
7287 Set_Node_Offset(ret, parse_start+1);
7288 Set_Node_Cur_Length(ret); /* MJD */
7290 nextchar(pRExC_state);
7295 if (RExC_parse >= RExC_end)
7296 FAIL("Trailing \\");
7299 /* Do not generate "unrecognized" warnings here, we fall
7300 back into the quick-grab loop below */
7307 if (RExC_flags & RXf_PMf_EXTENDED) {
7308 if ( reg_skipcomment( pRExC_state ) )
7315 register STRLEN len;
7320 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7322 parse_start = RExC_parse - 1;
7328 ret = reg_node(pRExC_state,
7329 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7331 for (len = 0, p = RExC_parse - 1;
7332 len < 127 && p < RExC_end;
7335 char * const oldp = p;
7337 if (RExC_flags & RXf_PMf_EXTENDED)
7338 p = regwhite( pRExC_state, p );
7343 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7344 goto normal_default;
7354 /* Literal Escapes Switch
7356 This switch is meant to handle escape sequences that
7357 resolve to a literal character.
7359 Every escape sequence that represents something
7360 else, like an assertion or a char class, is handled
7361 in the switch marked 'Special Escapes' above in this
7362 routine, but also has an entry here as anything that
7363 isn't explicitly mentioned here will be treated as
7364 an unescaped equivalent literal.
7368 /* These are all the special escapes. */
7372 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7373 goto normal_default;
7374 case 'A': /* Start assertion */
7375 case 'b': case 'B': /* Word-boundary assertion*/
7376 case 'C': /* Single char !DANGEROUS! */
7377 case 'd': case 'D': /* digit class */
7378 case 'g': case 'G': /* generic-backref, pos assertion */
7379 case 'h': case 'H': /* HORIZWS */
7380 case 'k': case 'K': /* named backref, keep marker */
7381 case 'N': /* named char sequence */
7382 case 'p': case 'P': /* Unicode property */
7383 case 'R': /* LNBREAK */
7384 case 's': case 'S': /* space class */
7385 case 'v': case 'V': /* VERTWS */
7386 case 'w': case 'W': /* word class */
7387 case 'X': /* eXtended Unicode "combining character sequence" */
7388 case 'z': case 'Z': /* End of line/string assertion */
7392 /* Anything after here is an escape that resolves to a
7393 literal. (Except digits, which may or may not)
7412 ender = ASCII_TO_NATIVE('\033');
7416 ender = ASCII_TO_NATIVE('\007');
7421 char* const e = strchr(p, '}');
7425 vFAIL("Missing right brace on \\x{}");
7428 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7429 | PERL_SCAN_DISALLOW_PREFIX;
7430 STRLEN numlen = e - p - 1;
7431 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7438 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7440 ender = grok_hex(p, &numlen, &flags, NULL);
7443 if (PL_encoding && ender < 0x100)
7444 goto recode_encoding;
7448 ender = UCHARAT(p++);
7449 ender = toCTRL(ender);
7451 case '0': case '1': case '2': case '3':case '4':
7452 case '5': case '6': case '7': case '8':case '9':
7454 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7457 ender = grok_oct(p, &numlen, &flags, NULL);
7459 /* An octal above 0xff is interpreted differently
7460 * depending on if the re is in utf8 or not. If it
7461 * is in utf8, the value will be itself, otherwise
7462 * it is interpreted as modulo 0x100. It has been
7463 * decided to discourage the use of octal above the
7464 * single-byte range. For now, warn only when
7465 * it ends up modulo */
7466 if (SIZE_ONLY && ender >= 0x100
7467 && ! UTF && ! PL_encoding) {
7468 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7476 if (PL_encoding && ender < 0x100)
7477 goto recode_encoding;
7481 SV* enc = PL_encoding;
7482 ender = reg_recode((const char)(U8)ender, &enc);
7483 if (!enc && SIZE_ONLY)
7484 ckWARNreg(p, "Invalid escape in the specified encoding");
7490 FAIL("Trailing \\");
7493 if (!SIZE_ONLY&& isALPHA(*p))
7494 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7495 goto normal_default;
7500 if (UTF8_IS_START(*p) && UTF) {
7502 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7503 &numlen, UTF8_ALLOW_DEFAULT);
7510 if ( RExC_flags & RXf_PMf_EXTENDED)
7511 p = regwhite( pRExC_state, p );
7513 /* Prime the casefolded buffer. */
7514 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7516 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7521 /* Emit all the Unicode characters. */
7523 for (foldbuf = tmpbuf;
7525 foldlen -= numlen) {
7526 ender = utf8_to_uvchr(foldbuf, &numlen);
7528 const STRLEN unilen = reguni(pRExC_state, ender, s);
7531 /* In EBCDIC the numlen
7532 * and unilen can differ. */
7534 if (numlen >= foldlen)
7538 break; /* "Can't happen." */
7542 const STRLEN unilen = reguni(pRExC_state, ender, s);
7551 REGC((char)ender, s++);
7557 /* Emit all the Unicode characters. */
7559 for (foldbuf = tmpbuf;
7561 foldlen -= numlen) {
7562 ender = utf8_to_uvchr(foldbuf, &numlen);
7564 const STRLEN unilen = reguni(pRExC_state, ender, s);
7567 /* In EBCDIC the numlen
7568 * and unilen can differ. */
7570 if (numlen >= foldlen)
7578 const STRLEN unilen = reguni(pRExC_state, ender, s);
7587 REGC((char)ender, s++);
7591 Set_Node_Cur_Length(ret); /* MJD */
7592 nextchar(pRExC_state);
7594 /* len is STRLEN which is unsigned, need to copy to signed */
7597 vFAIL("Internal disaster");
7601 if (len == 1 && UNI_IS_INVARIANT(ender))
7605 RExC_size += STR_SZ(len);
7608 RExC_emit += STR_SZ(len);
7618 S_regwhite( RExC_state_t *pRExC_state, char *p )
7620 const char *e = RExC_end;
7622 PERL_ARGS_ASSERT_REGWHITE;
7627 else if (*p == '#') {
7636 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7644 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7645 Character classes ([:foo:]) can also be negated ([:^foo:]).
7646 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7647 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7648 but trigger failures because they are currently unimplemented. */
7650 #define POSIXCC_DONE(c) ((c) == ':')
7651 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7652 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7655 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7658 I32 namedclass = OOB_NAMEDCLASS;
7660 PERL_ARGS_ASSERT_REGPPOSIXCC;
7662 if (value == '[' && RExC_parse + 1 < RExC_end &&
7663 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7664 POSIXCC(UCHARAT(RExC_parse))) {
7665 const char c = UCHARAT(RExC_parse);
7666 char* const s = RExC_parse++;
7668 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7670 if (RExC_parse == RExC_end)
7671 /* Grandfather lone [:, [=, [. */
7674 const char* const t = RExC_parse++; /* skip over the c */
7677 if (UCHARAT(RExC_parse) == ']') {
7678 const char *posixcc = s + 1;
7679 RExC_parse++; /* skip over the ending ] */
7682 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7683 const I32 skip = t - posixcc;
7685 /* Initially switch on the length of the name. */
7688 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7689 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7692 /* Names all of length 5. */
7693 /* alnum alpha ascii blank cntrl digit graph lower
7694 print punct space upper */
7695 /* Offset 4 gives the best switch position. */
7696 switch (posixcc[4]) {
7698 if (memEQ(posixcc, "alph", 4)) /* alpha */
7699 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7702 if (memEQ(posixcc, "spac", 4)) /* space */
7703 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7706 if (memEQ(posixcc, "grap", 4)) /* graph */
7707 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7710 if (memEQ(posixcc, "asci", 4)) /* ascii */
7711 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7714 if (memEQ(posixcc, "blan", 4)) /* blank */
7715 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7718 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7719 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7722 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7723 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7726 if (memEQ(posixcc, "lowe", 4)) /* lower */
7727 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7728 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7729 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7732 if (memEQ(posixcc, "digi", 4)) /* digit */
7733 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7734 else if (memEQ(posixcc, "prin", 4)) /* print */
7735 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7736 else if (memEQ(posixcc, "punc", 4)) /* punct */
7737 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7742 if (memEQ(posixcc, "xdigit", 6))
7743 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7747 if (namedclass == OOB_NAMEDCLASS)
7748 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7750 assert (posixcc[skip] == ':');
7751 assert (posixcc[skip+1] == ']');
7752 } else if (!SIZE_ONLY) {
7753 /* [[=foo=]] and [[.foo.]] are still future. */
7755 /* adjust RExC_parse so the warning shows after
7757 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7759 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7762 /* Maternal grandfather:
7763 * "[:" ending in ":" but not in ":]" */
7773 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7777 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7779 if (POSIXCC(UCHARAT(RExC_parse))) {
7780 const char *s = RExC_parse;
7781 const char c = *s++;
7785 if (*s && c == *s && s[1] == ']') {
7787 "POSIX syntax [%c %c] belongs inside character classes",
7790 /* [[=foo=]] and [[.foo.]] are still future. */
7791 if (POSIXCC_NOTYET(c)) {
7792 /* adjust RExC_parse so the error shows after
7794 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7796 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7803 #define _C_C_T_(NAME,TEST,WORD) \
7806 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7808 for (value = 0; value < 256; value++) \
7810 ANYOF_BITMAP_SET(ret, value); \
7815 case ANYOF_N##NAME: \
7817 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7819 for (value = 0; value < 256; value++) \
7821 ANYOF_BITMAP_SET(ret, value); \
7827 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7829 for (value = 0; value < 256; value++) \
7831 ANYOF_BITMAP_SET(ret, value); \
7835 case ANYOF_N##NAME: \
7836 for (value = 0; value < 256; value++) \
7838 ANYOF_BITMAP_SET(ret, value); \
7844 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7845 so that it is possible to override the option here without having to
7846 rebuild the entire core. as we are required to do if we change regcomp.h
7847 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7849 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7850 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7853 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7854 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7856 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7860 parse a class specification and produce either an ANYOF node that
7861 matches the pattern or if the pattern matches a single char only and
7862 that char is < 256 and we are case insensitive then we produce an
7867 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7870 register UV nextvalue;
7871 register IV prevvalue = OOB_UNICODE;
7872 register IV range = 0;
7873 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7874 register regnode *ret;
7877 char *rangebegin = NULL;
7878 bool need_class = 0;
7881 bool optimize_invert = TRUE;
7882 AV* unicode_alternate = NULL;
7884 UV literal_endpoint = 0;
7886 UV stored = 0; /* number of chars stored in the class */
7888 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7889 case we need to change the emitted regop to an EXACT. */
7890 const char * orig_parse = RExC_parse;
7891 GET_RE_DEBUG_FLAGS_DECL;
7893 PERL_ARGS_ASSERT_REGCLASS;
7895 PERL_UNUSED_ARG(depth);
7898 DEBUG_PARSE("clas");
7900 /* Assume we are going to generate an ANYOF node. */
7901 ret = reganode(pRExC_state, ANYOF, 0);
7904 ANYOF_FLAGS(ret) = 0;
7906 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7910 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7914 RExC_size += ANYOF_SKIP;
7915 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7918 RExC_emit += ANYOF_SKIP;
7920 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7922 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7923 ANYOF_BITMAP_ZERO(ret);
7924 listsv = newSVpvs("# comment\n");
7927 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7929 if (!SIZE_ONLY && POSIXCC(nextvalue))
7930 checkposixcc(pRExC_state);
7932 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7933 if (UCHARAT(RExC_parse) == ']')
7937 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7941 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7944 rangebegin = RExC_parse;
7946 value = utf8n_to_uvchr((U8*)RExC_parse,
7947 RExC_end - RExC_parse,
7948 &numlen, UTF8_ALLOW_DEFAULT);
7949 RExC_parse += numlen;
7952 value = UCHARAT(RExC_parse++);
7954 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7955 if (value == '[' && POSIXCC(nextvalue))
7956 namedclass = regpposixcc(pRExC_state, value);
7957 else if (value == '\\') {
7959 value = utf8n_to_uvchr((U8*)RExC_parse,
7960 RExC_end - RExC_parse,
7961 &numlen, UTF8_ALLOW_DEFAULT);
7962 RExC_parse += numlen;
7965 value = UCHARAT(RExC_parse++);
7966 /* Some compilers cannot handle switching on 64-bit integer
7967 * values, therefore value cannot be an UV. Yes, this will
7968 * be a problem later if we want switch on Unicode.
7969 * A similar issue a little bit later when switching on
7970 * namedclass. --jhi */
7971 switch ((I32)value) {
7972 case 'w': namedclass = ANYOF_ALNUM; break;
7973 case 'W': namedclass = ANYOF_NALNUM; break;
7974 case 's': namedclass = ANYOF_SPACE; break;
7975 case 'S': namedclass = ANYOF_NSPACE; break;
7976 case 'd': namedclass = ANYOF_DIGIT; break;
7977 case 'D': namedclass = ANYOF_NDIGIT; break;
7978 case 'v': namedclass = ANYOF_VERTWS; break;
7979 case 'V': namedclass = ANYOF_NVERTWS; break;
7980 case 'h': namedclass = ANYOF_HORIZWS; break;
7981 case 'H': namedclass = ANYOF_NHORIZWS; break;
7982 case 'N': /* Handle \N{NAME} in class */
7984 /* We only pay attention to the first char of
7985 multichar strings being returned. I kinda wonder
7986 if this makes sense as it does change the behaviour
7987 from earlier versions, OTOH that behaviour was broken
7989 UV v; /* value is register so we cant & it /grrr */
7990 if (reg_namedseq(pRExC_state, &v, NULL)) {
8000 if (RExC_parse >= RExC_end)
8001 vFAIL2("Empty \\%c{}", (U8)value);
8002 if (*RExC_parse == '{') {
8003 const U8 c = (U8)value;
8004 e = strchr(RExC_parse++, '}');
8006 vFAIL2("Missing right brace on \\%c{}", c);
8007 while (isSPACE(UCHARAT(RExC_parse)))
8009 if (e == RExC_parse)
8010 vFAIL2("Empty \\%c{}", c);
8012 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8020 if (UCHARAT(RExC_parse) == '^') {
8023 value = value == 'p' ? 'P' : 'p'; /* toggle */
8024 while (isSPACE(UCHARAT(RExC_parse))) {
8029 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8030 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8033 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8034 namedclass = ANYOF_MAX; /* no official name, but it's named */
8037 case 'n': value = '\n'; break;
8038 case 'r': value = '\r'; break;
8039 case 't': value = '\t'; break;
8040 case 'f': value = '\f'; break;
8041 case 'b': value = '\b'; break;
8042 case 'e': value = ASCII_TO_NATIVE('\033');break;
8043 case 'a': value = ASCII_TO_NATIVE('\007');break;
8045 if (*RExC_parse == '{') {
8046 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8047 | PERL_SCAN_DISALLOW_PREFIX;
8048 char * const e = strchr(RExC_parse++, '}');
8050 vFAIL("Missing right brace on \\x{}");
8052 numlen = e - RExC_parse;
8053 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8057 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8059 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8060 RExC_parse += numlen;
8062 if (PL_encoding && value < 0x100)
8063 goto recode_encoding;
8066 value = UCHARAT(RExC_parse++);
8067 value = toCTRL(value);
8069 case '0': case '1': case '2': case '3': case '4':
8070 case '5': case '6': case '7': case '8': case '9':
8074 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8075 RExC_parse += numlen;
8076 if (PL_encoding && value < 0x100)
8077 goto recode_encoding;
8082 SV* enc = PL_encoding;
8083 value = reg_recode((const char)(U8)value, &enc);
8084 if (!enc && SIZE_ONLY)
8085 ckWARNreg(RExC_parse,
8086 "Invalid escape in the specified encoding");
8090 if (!SIZE_ONLY && isALPHA(value))
8091 ckWARN2reg(RExC_parse,
8092 "Unrecognized escape \\%c in character class passed through",
8096 } /* end of \blah */
8102 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8104 if (!SIZE_ONLY && !need_class)
8105 ANYOF_CLASS_ZERO(ret);
8109 /* a bad range like a-\d, a-[:digit:] ? */
8113 RExC_parse >= rangebegin ?
8114 RExC_parse - rangebegin : 0;
8115 ckWARN4reg(RExC_parse,
8116 "False [] range \"%*.*s\"",
8119 if (prevvalue < 256) {
8120 ANYOF_BITMAP_SET(ret, prevvalue);
8121 ANYOF_BITMAP_SET(ret, '-');
8124 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8125 Perl_sv_catpvf(aTHX_ listsv,
8126 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8130 range = 0; /* this was not a true range */
8136 const char *what = NULL;
8139 if (namedclass > OOB_NAMEDCLASS)
8140 optimize_invert = FALSE;
8141 /* Possible truncation here but in some 64-bit environments
8142 * the compiler gets heartburn about switch on 64-bit values.
8143 * A similar issue a little earlier when switching on value.
8145 switch ((I32)namedclass) {
8147 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8148 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8149 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8150 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8151 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8152 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8153 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8154 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8155 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8156 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8157 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8158 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8159 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8161 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8162 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8164 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8165 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8166 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8169 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8172 for (value = 0; value < 128; value++)
8173 ANYOF_BITMAP_SET(ret, value);
8175 for (value = 0; value < 256; value++) {
8177 ANYOF_BITMAP_SET(ret, value);
8186 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8189 for (value = 128; value < 256; value++)
8190 ANYOF_BITMAP_SET(ret, value);
8192 for (value = 0; value < 256; value++) {
8193 if (!isASCII(value))
8194 ANYOF_BITMAP_SET(ret, value);
8203 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8205 /* consecutive digits assumed */
8206 for (value = '0'; value <= '9'; value++)
8207 ANYOF_BITMAP_SET(ret, value);
8210 what = POSIX_CC_UNI_NAME("Digit");
8214 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8216 /* consecutive digits assumed */
8217 for (value = 0; value < '0'; value++)
8218 ANYOF_BITMAP_SET(ret, value);
8219 for (value = '9' + 1; value < 256; value++)
8220 ANYOF_BITMAP_SET(ret, value);
8223 what = POSIX_CC_UNI_NAME("Digit");
8226 /* this is to handle \p and \P */
8229 vFAIL("Invalid [::] class");
8233 /* Strings such as "+utf8::isWord\n" */
8234 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8237 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8240 } /* end of namedclass \blah */
8243 if (prevvalue > (IV)value) /* b-a */ {
8244 const int w = RExC_parse - rangebegin;
8245 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8246 range = 0; /* not a valid range */
8250 prevvalue = value; /* save the beginning of the range */
8251 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8252 RExC_parse[1] != ']') {
8255 /* a bad range like \w-, [:word:]- ? */
8256 if (namedclass > OOB_NAMEDCLASS) {
8257 if (ckWARN(WARN_REGEXP)) {
8259 RExC_parse >= rangebegin ?
8260 RExC_parse - rangebegin : 0;
8262 "False [] range \"%*.*s\"",
8266 ANYOF_BITMAP_SET(ret, '-');
8268 range = 1; /* yeah, it's a range! */
8269 continue; /* but do it the next time */
8273 /* now is the next time */
8274 /*stored += (value - prevvalue + 1);*/
8276 if (prevvalue < 256) {
8277 const IV ceilvalue = value < 256 ? value : 255;
8280 /* In EBCDIC [\x89-\x91] should include
8281 * the \x8e but [i-j] should not. */
8282 if (literal_endpoint == 2 &&
8283 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8284 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8286 if (isLOWER(prevvalue)) {
8287 for (i = prevvalue; i <= ceilvalue; i++)
8288 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8290 ANYOF_BITMAP_SET(ret, i);
8293 for (i = prevvalue; i <= ceilvalue; i++)
8294 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8296 ANYOF_BITMAP_SET(ret, i);
8302 for (i = prevvalue; i <= ceilvalue; i++) {
8303 if (!ANYOF_BITMAP_TEST(ret,i)) {
8305 ANYOF_BITMAP_SET(ret, i);
8309 if (value > 255 || UTF) {
8310 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8311 const UV natvalue = NATIVE_TO_UNI(value);
8312 stored+=2; /* can't optimize this class */
8313 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8314 if (prevnatvalue < natvalue) { /* what about > ? */
8315 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8316 prevnatvalue, natvalue);
8318 else if (prevnatvalue == natvalue) {
8319 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8321 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8323 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8325 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8326 if (RExC_precomp[0] == ':' &&
8327 RExC_precomp[1] == '[' &&
8328 (f == 0xDF || f == 0x92)) {
8329 f = NATIVE_TO_UNI(f);
8332 /* If folding and foldable and a single
8333 * character, insert also the folded version
8334 * to the charclass. */
8336 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8337 if ((RExC_precomp[0] == ':' &&
8338 RExC_precomp[1] == '[' &&
8340 (value == 0xFB05 || value == 0xFB06))) ?
8341 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8342 foldlen == (STRLEN)UNISKIP(f) )
8344 if (foldlen == (STRLEN)UNISKIP(f))
8346 Perl_sv_catpvf(aTHX_ listsv,
8349 /* Any multicharacter foldings
8350 * require the following transform:
8351 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8352 * where E folds into "pq" and F folds
8353 * into "rst", all other characters
8354 * fold to single characters. We save
8355 * away these multicharacter foldings,
8356 * to be later saved as part of the
8357 * additional "s" data. */
8360 if (!unicode_alternate)
8361 unicode_alternate = newAV();
8362 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8364 av_push(unicode_alternate, sv);
8368 /* If folding and the value is one of the Greek
8369 * sigmas insert a few more sigmas to make the
8370 * folding rules of the sigmas to work right.
8371 * Note that not all the possible combinations
8372 * are handled here: some of them are handled
8373 * by the standard folding rules, and some of
8374 * them (literal or EXACTF cases) are handled
8375 * during runtime in regexec.c:S_find_byclass(). */
8376 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8377 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8378 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8379 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8380 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8382 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8383 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8384 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8389 literal_endpoint = 0;
8393 range = 0; /* this range (if it was one) is done now */
8397 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8399 RExC_size += ANYOF_CLASS_ADD_SKIP;
8401 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8407 /****** !SIZE_ONLY AFTER HERE *********/
8409 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8410 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8412 /* optimize single char class to an EXACT node
8413 but *only* when its not a UTF/high char */
8414 const char * cur_parse= RExC_parse;
8415 RExC_emit = (regnode *)orig_emit;
8416 RExC_parse = (char *)orig_parse;
8417 ret = reg_node(pRExC_state,
8418 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8419 RExC_parse = (char *)cur_parse;
8420 *STRING(ret)= (char)value;
8422 RExC_emit += STR_SZ(1);
8423 SvREFCNT_dec(listsv);
8426 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8427 if ( /* If the only flag is folding (plus possibly inversion). */
8428 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8430 for (value = 0; value < 256; ++value) {
8431 if (ANYOF_BITMAP_TEST(ret, value)) {
8432 UV fold = PL_fold[value];
8435 ANYOF_BITMAP_SET(ret, fold);
8438 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8441 /* optimize inverted simple patterns (e.g. [^a-z]) */
8442 if (optimize_invert &&
8443 /* If the only flag is inversion. */
8444 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8445 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8446 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8447 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8450 AV * const av = newAV();
8452 /* The 0th element stores the character class description
8453 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8454 * to initialize the appropriate swash (which gets stored in
8455 * the 1st element), and also useful for dumping the regnode.
8456 * The 2nd element stores the multicharacter foldings,
8457 * used later (regexec.c:S_reginclass()). */
8458 av_store(av, 0, listsv);
8459 av_store(av, 1, NULL);
8460 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8461 rv = newRV_noinc(MUTABLE_SV(av));
8462 n = add_data(pRExC_state, 1, "s");
8463 RExC_rxi->data->data[n] = (void*)rv;
8471 /* reg_skipcomment()
8473 Absorbs an /x style # comments from the input stream.
8474 Returns true if there is more text remaining in the stream.
8475 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8476 terminates the pattern without including a newline.
8478 Note its the callers responsibility to ensure that we are
8484 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8488 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8490 while (RExC_parse < RExC_end)
8491 if (*RExC_parse++ == '\n') {
8496 /* we ran off the end of the pattern without ending
8497 the comment, so we have to add an \n when wrapping */
8498 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8506 Advance that parse position, and optionall absorbs
8507 "whitespace" from the inputstream.
8509 Without /x "whitespace" means (?#...) style comments only,
8510 with /x this means (?#...) and # comments and whitespace proper.
8512 Returns the RExC_parse point from BEFORE the scan occurs.
8514 This is the /x friendly way of saying RExC_parse++.
8518 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8520 char* const retval = RExC_parse++;
8522 PERL_ARGS_ASSERT_NEXTCHAR;
8525 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8526 RExC_parse[2] == '#') {
8527 while (*RExC_parse != ')') {
8528 if (RExC_parse == RExC_end)
8529 FAIL("Sequence (?#... not terminated");
8535 if (RExC_flags & RXf_PMf_EXTENDED) {
8536 if (isSPACE(*RExC_parse)) {
8540 else if (*RExC_parse == '#') {
8541 if ( reg_skipcomment( pRExC_state ) )
8550 - reg_node - emit a node
8552 STATIC regnode * /* Location. */
8553 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8556 register regnode *ptr;
8557 regnode * const ret = RExC_emit;
8558 GET_RE_DEBUG_FLAGS_DECL;
8560 PERL_ARGS_ASSERT_REG_NODE;
8563 SIZE_ALIGN(RExC_size);
8567 if (RExC_emit >= RExC_emit_bound)
8568 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8570 NODE_ALIGN_FILL(ret);
8572 FILL_ADVANCE_NODE(ptr, op);
8573 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8574 #ifdef RE_TRACK_PATTERN_OFFSETS
8575 if (RExC_offsets) { /* MJD */
8576 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8577 "reg_node", __LINE__,
8579 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8580 ? "Overwriting end of array!\n" : "OK",
8581 (UV)(RExC_emit - RExC_emit_start),
8582 (UV)(RExC_parse - RExC_start),
8583 (UV)RExC_offsets[0]));
8584 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8592 - reganode - emit a node with an argument
8594 STATIC regnode * /* Location. */
8595 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8598 register regnode *ptr;
8599 regnode * const ret = RExC_emit;
8600 GET_RE_DEBUG_FLAGS_DECL;
8602 PERL_ARGS_ASSERT_REGANODE;
8605 SIZE_ALIGN(RExC_size);
8610 assert(2==regarglen[op]+1);
8612 Anything larger than this has to allocate the extra amount.
8613 If we changed this to be:
8615 RExC_size += (1 + regarglen[op]);
8617 then it wouldn't matter. Its not clear what side effect
8618 might come from that so its not done so far.
8623 if (RExC_emit >= RExC_emit_bound)
8624 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8626 NODE_ALIGN_FILL(ret);
8628 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8629 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8630 #ifdef RE_TRACK_PATTERN_OFFSETS
8631 if (RExC_offsets) { /* MJD */
8632 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8636 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8637 "Overwriting end of array!\n" : "OK",
8638 (UV)(RExC_emit - RExC_emit_start),
8639 (UV)(RExC_parse - RExC_start),
8640 (UV)RExC_offsets[0]));
8641 Set_Cur_Node_Offset;
8649 - reguni - emit (if appropriate) a Unicode character
8652 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8656 PERL_ARGS_ASSERT_REGUNI;
8658 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8662 - reginsert - insert an operator in front of already-emitted operand
8664 * Means relocating the operand.
8667 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8670 register regnode *src;
8671 register regnode *dst;
8672 register regnode *place;
8673 const int offset = regarglen[(U8)op];
8674 const int size = NODE_STEP_REGNODE + offset;
8675 GET_RE_DEBUG_FLAGS_DECL;
8677 PERL_ARGS_ASSERT_REGINSERT;
8678 PERL_UNUSED_ARG(depth);
8679 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8680 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8689 if (RExC_open_parens) {
8691 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8692 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8693 if ( RExC_open_parens[paren] >= opnd ) {
8694 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8695 RExC_open_parens[paren] += size;
8697 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8699 if ( RExC_close_parens[paren] >= opnd ) {
8700 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8701 RExC_close_parens[paren] += size;
8703 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8708 while (src > opnd) {
8709 StructCopy(--src, --dst, regnode);
8710 #ifdef RE_TRACK_PATTERN_OFFSETS
8711 if (RExC_offsets) { /* MJD 20010112 */
8712 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8716 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8717 ? "Overwriting end of array!\n" : "OK",
8718 (UV)(src - RExC_emit_start),
8719 (UV)(dst - RExC_emit_start),
8720 (UV)RExC_offsets[0]));
8721 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8722 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8728 place = opnd; /* Op node, where operand used to be. */
8729 #ifdef RE_TRACK_PATTERN_OFFSETS
8730 if (RExC_offsets) { /* MJD */
8731 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8735 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8736 ? "Overwriting end of array!\n" : "OK",
8737 (UV)(place - RExC_emit_start),
8738 (UV)(RExC_parse - RExC_start),
8739 (UV)RExC_offsets[0]));
8740 Set_Node_Offset(place, RExC_parse);
8741 Set_Node_Length(place, 1);
8744 src = NEXTOPER(place);
8745 FILL_ADVANCE_NODE(place, op);
8746 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8747 Zero(src, offset, regnode);
8751 - regtail - set the next-pointer at the end of a node chain of p to val.
8752 - SEE ALSO: regtail_study
8754 /* TODO: All three parms should be const */
8756 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8759 register regnode *scan;
8760 GET_RE_DEBUG_FLAGS_DECL;
8762 PERL_ARGS_ASSERT_REGTAIL;
8764 PERL_UNUSED_ARG(depth);
8770 /* Find last node. */
8773 regnode * const temp = regnext(scan);
8775 SV * const mysv=sv_newmortal();
8776 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8777 regprop(RExC_rx, mysv, scan);
8778 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8779 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8780 (temp == NULL ? "->" : ""),
8781 (temp == NULL ? PL_reg_name[OP(val)] : "")
8789 if (reg_off_by_arg[OP(scan)]) {
8790 ARG_SET(scan, val - scan);
8793 NEXT_OFF(scan) = val - scan;
8799 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8800 - Look for optimizable sequences at the same time.
8801 - currently only looks for EXACT chains.
8803 This is expermental code. The idea is to use this routine to perform
8804 in place optimizations on branches and groups as they are constructed,
8805 with the long term intention of removing optimization from study_chunk so
8806 that it is purely analytical.
8808 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8809 to control which is which.
8812 /* TODO: All four parms should be const */
8815 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8818 register regnode *scan;
8820 #ifdef EXPERIMENTAL_INPLACESCAN
8823 GET_RE_DEBUG_FLAGS_DECL;
8825 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8831 /* Find last node. */
8835 regnode * const temp = regnext(scan);
8836 #ifdef EXPERIMENTAL_INPLACESCAN
8837 if (PL_regkind[OP(scan)] == EXACT)
8838 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8846 if( exact == PSEUDO )
8848 else if ( exact != OP(scan) )
8857 SV * const mysv=sv_newmortal();
8858 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8859 regprop(RExC_rx, mysv, scan);
8860 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8861 SvPV_nolen_const(mysv),
8863 PL_reg_name[exact]);
8870 SV * const mysv_val=sv_newmortal();
8871 DEBUG_PARSE_MSG("");
8872 regprop(RExC_rx, mysv_val, val);
8873 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8874 SvPV_nolen_const(mysv_val),
8875 (IV)REG_NODE_NUM(val),
8879 if (reg_off_by_arg[OP(scan)]) {
8880 ARG_SET(scan, val - scan);
8883 NEXT_OFF(scan) = val - scan;
8891 - regcurly - a little FSA that accepts {\d+,?\d*}
8893 #ifndef PERL_IN_XSUB_RE
8895 Perl_regcurly(register const char *s)
8897 PERL_ARGS_ASSERT_REGCURLY;
8916 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8920 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8925 for (bit=0; bit<32; bit++) {
8926 if (flags & (1<<bit)) {
8928 PerlIO_printf(Perl_debug_log, "%s",lead);
8929 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8934 PerlIO_printf(Perl_debug_log, "\n");
8936 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8942 Perl_regdump(pTHX_ const regexp *r)
8946 SV * const sv = sv_newmortal();
8947 SV *dsv= sv_newmortal();
8949 GET_RE_DEBUG_FLAGS_DECL;
8951 PERL_ARGS_ASSERT_REGDUMP;
8953 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8955 /* Header fields of interest. */
8956 if (r->anchored_substr) {
8957 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8958 RE_SV_DUMPLEN(r->anchored_substr), 30);
8959 PerlIO_printf(Perl_debug_log,
8960 "anchored %s%s at %"IVdf" ",
8961 s, RE_SV_TAIL(r->anchored_substr),
8962 (IV)r->anchored_offset);
8963 } else if (r->anchored_utf8) {
8964 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8965 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8966 PerlIO_printf(Perl_debug_log,
8967 "anchored utf8 %s%s at %"IVdf" ",
8968 s, RE_SV_TAIL(r->anchored_utf8),
8969 (IV)r->anchored_offset);
8971 if (r->float_substr) {
8972 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8973 RE_SV_DUMPLEN(r->float_substr), 30);
8974 PerlIO_printf(Perl_debug_log,
8975 "floating %s%s at %"IVdf"..%"UVuf" ",
8976 s, RE_SV_TAIL(r->float_substr),
8977 (IV)r->float_min_offset, (UV)r->float_max_offset);
8978 } else if (r->float_utf8) {
8979 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8980 RE_SV_DUMPLEN(r->float_utf8), 30);
8981 PerlIO_printf(Perl_debug_log,
8982 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8983 s, RE_SV_TAIL(r->float_utf8),
8984 (IV)r->float_min_offset, (UV)r->float_max_offset);
8986 if (r->check_substr || r->check_utf8)
8987 PerlIO_printf(Perl_debug_log,
8989 (r->check_substr == r->float_substr
8990 && r->check_utf8 == r->float_utf8
8991 ? "(checking floating" : "(checking anchored"));
8992 if (r->extflags & RXf_NOSCAN)
8993 PerlIO_printf(Perl_debug_log, " noscan");
8994 if (r->extflags & RXf_CHECK_ALL)
8995 PerlIO_printf(Perl_debug_log, " isall");
8996 if (r->check_substr || r->check_utf8)
8997 PerlIO_printf(Perl_debug_log, ") ");
8999 if (ri->regstclass) {
9000 regprop(r, sv, ri->regstclass);
9001 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9003 if (r->extflags & RXf_ANCH) {
9004 PerlIO_printf(Perl_debug_log, "anchored");
9005 if (r->extflags & RXf_ANCH_BOL)
9006 PerlIO_printf(Perl_debug_log, "(BOL)");
9007 if (r->extflags & RXf_ANCH_MBOL)
9008 PerlIO_printf(Perl_debug_log, "(MBOL)");
9009 if (r->extflags & RXf_ANCH_SBOL)
9010 PerlIO_printf(Perl_debug_log, "(SBOL)");
9011 if (r->extflags & RXf_ANCH_GPOS)
9012 PerlIO_printf(Perl_debug_log, "(GPOS)");
9013 PerlIO_putc(Perl_debug_log, ' ');
9015 if (r->extflags & RXf_GPOS_SEEN)
9016 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9017 if (r->intflags & PREGf_SKIP)
9018 PerlIO_printf(Perl_debug_log, "plus ");
9019 if (r->intflags & PREGf_IMPLICIT)
9020 PerlIO_printf(Perl_debug_log, "implicit ");
9021 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9022 if (r->extflags & RXf_EVAL_SEEN)
9023 PerlIO_printf(Perl_debug_log, "with eval ");
9024 PerlIO_printf(Perl_debug_log, "\n");
9025 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9027 PERL_ARGS_ASSERT_REGDUMP;
9028 PERL_UNUSED_CONTEXT;
9030 #endif /* DEBUGGING */
9034 - regprop - printable representation of opcode
9036 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9039 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9040 if (flags & ANYOF_INVERT) \
9041 /*make sure the invert info is in each */ \
9042 sv_catpvs(sv, "^"); \
9048 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9053 RXi_GET_DECL(prog,progi);
9054 GET_RE_DEBUG_FLAGS_DECL;
9056 PERL_ARGS_ASSERT_REGPROP;
9060 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9061 /* It would be nice to FAIL() here, but this may be called from
9062 regexec.c, and it would be hard to supply pRExC_state. */
9063 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9064 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9066 k = PL_regkind[OP(o)];
9070 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9071 * is a crude hack but it may be the best for now since
9072 * we have no flag "this EXACTish node was UTF-8"
9074 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9075 PERL_PV_ESCAPE_UNI_DETECT |
9076 PERL_PV_PRETTY_ELLIPSES |
9077 PERL_PV_PRETTY_LTGT |
9078 PERL_PV_PRETTY_NOCLEAR
9080 } else if (k == TRIE) {
9081 /* print the details of the trie in dumpuntil instead, as
9082 * progi->data isn't available here */
9083 const char op = OP(o);
9084 const U32 n = ARG(o);
9085 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9086 (reg_ac_data *)progi->data->data[n] :
9088 const reg_trie_data * const trie
9089 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9091 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9092 DEBUG_TRIE_COMPILE_r(
9093 Perl_sv_catpvf(aTHX_ sv,
9094 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9095 (UV)trie->startstate,
9096 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9097 (UV)trie->wordcount,
9100 (UV)TRIE_CHARCOUNT(trie),
9101 (UV)trie->uniquecharcount
9104 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9106 int rangestart = -1;
9107 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9109 for (i = 0; i <= 256; i++) {
9110 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9111 if (rangestart == -1)
9113 } else if (rangestart != -1) {
9114 if (i <= rangestart + 3)
9115 for (; rangestart < i; rangestart++)
9116 put_byte(sv, rangestart);
9118 put_byte(sv, rangestart);
9120 put_byte(sv, i - 1);
9128 } else if (k == CURLY) {
9129 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9130 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9131 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9133 else if (k == WHILEM && o->flags) /* Ordinal/of */
9134 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9135 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9136 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9137 if ( RXp_PAREN_NAMES(prog) ) {
9138 if ( k != REF || OP(o) < NREF) {
9139 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9140 SV **name= av_fetch(list, ARG(o), 0 );
9142 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9145 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9146 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9147 I32 *nums=(I32*)SvPVX(sv_dat);
9148 SV **name= av_fetch(list, nums[0], 0 );
9151 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9152 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9153 (n ? "," : ""), (IV)nums[n]);
9155 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9159 } else if (k == GOSUB)
9160 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9161 else if (k == VERB) {
9163 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9164 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9165 } else if (k == LOGICAL)
9166 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9167 else if (k == FOLDCHAR)
9168 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9169 else if (k == ANYOF) {
9170 int i, rangestart = -1;
9171 const U8 flags = ANYOF_FLAGS(o);
9174 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9175 static const char * const anyofs[] = {
9208 if (flags & ANYOF_LOCALE)
9209 sv_catpvs(sv, "{loc}");
9210 if (flags & ANYOF_FOLD)
9211 sv_catpvs(sv, "{i}");
9212 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9213 if (flags & ANYOF_INVERT)
9216 /* output what the standard cp 0-255 bitmap matches */
9217 for (i = 0; i <= 256; i++) {
9218 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9219 if (rangestart == -1)
9221 } else if (rangestart != -1) {
9222 if (i <= rangestart + 3)
9223 for (; rangestart < i; rangestart++)
9224 put_byte(sv, rangestart);
9226 put_byte(sv, rangestart);
9228 put_byte(sv, i - 1);
9235 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9236 /* output any special charclass tests (used mostly under use locale) */
9237 if (o->flags & ANYOF_CLASS)
9238 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9239 if (ANYOF_CLASS_TEST(o,i)) {
9240 sv_catpv(sv, anyofs[i]);
9244 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9246 /* output information about the unicode matching */
9247 if (flags & ANYOF_UNICODE)
9248 sv_catpvs(sv, "{unicode}");
9249 else if (flags & ANYOF_UNICODE_ALL)
9250 sv_catpvs(sv, "{unicode_all}");
9254 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9258 U8 s[UTF8_MAXBYTES_CASE+1];
9260 for (i = 0; i <= 256; i++) { /* just the first 256 */
9261 uvchr_to_utf8(s, i);
9263 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9264 if (rangestart == -1)
9266 } else if (rangestart != -1) {
9267 if (i <= rangestart + 3)
9268 for (; rangestart < i; rangestart++) {
9269 const U8 * const e = uvchr_to_utf8(s,rangestart);
9271 for(p = s; p < e; p++)
9275 const U8 *e = uvchr_to_utf8(s,rangestart);
9277 for (p = s; p < e; p++)
9280 e = uvchr_to_utf8(s, i-1);
9281 for (p = s; p < e; p++)
9288 sv_catpvs(sv, "..."); /* et cetera */
9292 char *s = savesvpv(lv);
9293 char * const origs = s;
9295 while (*s && *s != '\n')
9299 const char * const t = ++s;
9317 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9319 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9320 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9322 PERL_UNUSED_CONTEXT;
9323 PERL_UNUSED_ARG(sv);
9325 PERL_UNUSED_ARG(prog);
9326 #endif /* DEBUGGING */
9330 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9331 { /* Assume that RE_INTUIT is set */
9333 struct regexp *const prog = (struct regexp *)SvANY(r);
9334 GET_RE_DEBUG_FLAGS_DECL;
9336 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9337 PERL_UNUSED_CONTEXT;
9341 const char * const s = SvPV_nolen_const(prog->check_substr
9342 ? prog->check_substr : prog->check_utf8);
9344 if (!PL_colorset) reginitcolors();
9345 PerlIO_printf(Perl_debug_log,
9346 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9348 prog->check_substr ? "" : "utf8 ",
9349 PL_colors[5],PL_colors[0],
9352 (strlen(s) > 60 ? "..." : ""));
9355 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9361 handles refcounting and freeing the perl core regexp structure. When
9362 it is necessary to actually free the structure the first thing it
9363 does is call the 'free' method of the regexp_engine associated to to
9364 the regexp, allowing the handling of the void *pprivate; member
9365 first. (This routine is not overridable by extensions, which is why
9366 the extensions free is called first.)
9368 See regdupe and regdupe_internal if you change anything here.
9370 #ifndef PERL_IN_XSUB_RE
9372 Perl_pregfree(pTHX_ REGEXP *r)
9378 Perl_pregfree2(pTHX_ REGEXP *rx)
9381 struct regexp *const r = (struct regexp *)SvANY(rx);
9382 GET_RE_DEBUG_FLAGS_DECL;
9384 PERL_ARGS_ASSERT_PREGFREE2;
9387 ReREFCNT_dec(r->mother_re);
9389 CALLREGFREE_PVT(rx); /* free the private data */
9390 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9393 SvREFCNT_dec(r->anchored_substr);
9394 SvREFCNT_dec(r->anchored_utf8);
9395 SvREFCNT_dec(r->float_substr);
9396 SvREFCNT_dec(r->float_utf8);
9397 Safefree(r->substrs);
9399 RX_MATCH_COPY_FREE(rx);
9400 #ifdef PERL_OLD_COPY_ON_WRITE
9401 SvREFCNT_dec(r->saved_copy);
9408 This is a hacky workaround to the structural issue of match results
9409 being stored in the regexp structure which is in turn stored in
9410 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9411 could be PL_curpm in multiple contexts, and could require multiple
9412 result sets being associated with the pattern simultaneously, such
9413 as when doing a recursive match with (??{$qr})
9415 The solution is to make a lightweight copy of the regexp structure
9416 when a qr// is returned from the code executed by (??{$qr}) this
9417 lightweight copy doesnt actually own any of its data except for
9418 the starp/end and the actual regexp structure itself.
9424 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9427 struct regexp *const r = (struct regexp *)SvANY(rx);
9428 register const I32 npar = r->nparens+1;
9430 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9433 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9434 ret = (struct regexp *)SvANY(ret_x);
9436 (void)ReREFCNT_inc(rx);
9437 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9438 by pointing directly at the buffer, but flagging that the allocated
9439 space in the copy is zero. As we've just done a struct copy, it's now
9440 a case of zero-ing that, rather than copying the current length. */
9441 SvPV_set(ret_x, RX_WRAPPED(rx));
9442 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9443 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9444 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9445 SvLEN_set(ret_x, 0);
9446 SvSTASH_set(ret_x, NULL);
9447 SvMAGIC_set(ret_x, NULL);
9448 Newx(ret->offs, npar, regexp_paren_pair);
9449 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9451 Newx(ret->substrs, 1, struct reg_substr_data);
9452 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9454 SvREFCNT_inc_void(ret->anchored_substr);
9455 SvREFCNT_inc_void(ret->anchored_utf8);
9456 SvREFCNT_inc_void(ret->float_substr);
9457 SvREFCNT_inc_void(ret->float_utf8);
9459 /* check_substr and check_utf8, if non-NULL, point to either their
9460 anchored or float namesakes, and don't hold a second reference. */
9462 RX_MATCH_COPIED_off(ret_x);
9463 #ifdef PERL_OLD_COPY_ON_WRITE
9464 ret->saved_copy = NULL;
9466 ret->mother_re = rx;
9472 /* regfree_internal()
9474 Free the private data in a regexp. This is overloadable by
9475 extensions. Perl takes care of the regexp structure in pregfree(),
9476 this covers the *pprivate pointer which technically perldoesnt
9477 know about, however of course we have to handle the
9478 regexp_internal structure when no extension is in use.
9480 Note this is called before freeing anything in the regexp
9485 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9488 struct regexp *const r = (struct regexp *)SvANY(rx);
9490 GET_RE_DEBUG_FLAGS_DECL;
9492 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9498 SV *dsv= sv_newmortal();
9499 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9500 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9501 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9502 PL_colors[4],PL_colors[5],s);
9505 #ifdef RE_TRACK_PATTERN_OFFSETS
9507 Safefree(ri->u.offsets); /* 20010421 MJD */
9510 int n = ri->data->count;
9511 PAD* new_comppad = NULL;
9516 /* If you add a ->what type here, update the comment in regcomp.h */
9517 switch (ri->data->what[n]) {
9521 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9524 Safefree(ri->data->data[n]);
9527 new_comppad = MUTABLE_AV(ri->data->data[n]);
9530 if (new_comppad == NULL)
9531 Perl_croak(aTHX_ "panic: pregfree comppad");
9532 PAD_SAVE_LOCAL(old_comppad,
9533 /* Watch out for global destruction's random ordering. */
9534 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9537 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9540 op_free((OP_4tree*)ri->data->data[n]);
9542 PAD_RESTORE_LOCAL(old_comppad);
9543 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9549 { /* Aho Corasick add-on structure for a trie node.
9550 Used in stclass optimization only */
9552 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9554 refcount = --aho->refcount;
9557 PerlMemShared_free(aho->states);
9558 PerlMemShared_free(aho->fail);
9559 /* do this last!!!! */
9560 PerlMemShared_free(ri->data->data[n]);
9561 PerlMemShared_free(ri->regstclass);
9567 /* trie structure. */
9569 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9571 refcount = --trie->refcount;
9574 PerlMemShared_free(trie->charmap);
9575 PerlMemShared_free(trie->states);
9576 PerlMemShared_free(trie->trans);
9578 PerlMemShared_free(trie->bitmap);
9580 PerlMemShared_free(trie->wordlen);
9582 PerlMemShared_free(trie->jump);
9584 PerlMemShared_free(trie->nextword);
9585 /* do this last!!!! */
9586 PerlMemShared_free(ri->data->data[n]);
9591 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9594 Safefree(ri->data->what);
9601 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9602 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9603 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9604 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9607 re_dup - duplicate a regexp.
9609 This routine is expected to clone a given regexp structure. It is only
9610 compiled under USE_ITHREADS.
9612 After all of the core data stored in struct regexp is duplicated
9613 the regexp_engine.dupe method is used to copy any private data
9614 stored in the *pprivate pointer. This allows extensions to handle
9615 any duplication it needs to do.
9617 See pregfree() and regfree_internal() if you change anything here.
9619 #if defined(USE_ITHREADS)
9620 #ifndef PERL_IN_XSUB_RE
9622 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9626 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9627 struct regexp *ret = (struct regexp *)SvANY(dstr);
9629 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9631 npar = r->nparens+1;
9632 Newx(ret->offs, npar, regexp_paren_pair);
9633 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9635 /* no need to copy these */
9636 Newx(ret->swap, npar, regexp_paren_pair);
9640 /* Do it this way to avoid reading from *r after the StructCopy().
9641 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9642 cache, it doesn't matter. */
9643 const bool anchored = r->check_substr
9644 ? r->check_substr == r->anchored_substr
9645 : r->check_utf8 == r->anchored_utf8;
9646 Newx(ret->substrs, 1, struct reg_substr_data);
9647 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9649 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9650 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9651 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9652 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9654 /* check_substr and check_utf8, if non-NULL, point to either their
9655 anchored or float namesakes, and don't hold a second reference. */
9657 if (ret->check_substr) {
9659 assert(r->check_utf8 == r->anchored_utf8);
9660 ret->check_substr = ret->anchored_substr;
9661 ret->check_utf8 = ret->anchored_utf8;
9663 assert(r->check_substr == r->float_substr);
9664 assert(r->check_utf8 == r->float_utf8);
9665 ret->check_substr = ret->float_substr;
9666 ret->check_utf8 = ret->float_utf8;
9668 } else if (ret->check_utf8) {
9670 ret->check_utf8 = ret->anchored_utf8;
9672 ret->check_utf8 = ret->float_utf8;
9677 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9680 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9682 if (RX_MATCH_COPIED(dstr))
9683 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9686 #ifdef PERL_OLD_COPY_ON_WRITE
9687 ret->saved_copy = NULL;
9690 if (ret->mother_re) {
9691 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9692 /* Our storage points directly to our mother regexp, but that's
9693 1: a buffer in a different thread
9694 2: something we no longer hold a reference on
9695 so we need to copy it locally. */
9696 /* Note we need to sue SvCUR() on our mother_re, because it, in
9697 turn, may well be pointing to its own mother_re. */
9698 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9699 SvCUR(ret->mother_re)+1));
9700 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9702 ret->mother_re = NULL;
9706 #endif /* PERL_IN_XSUB_RE */
9711 This is the internal complement to regdupe() which is used to copy
9712 the structure pointed to by the *pprivate pointer in the regexp.
9713 This is the core version of the extension overridable cloning hook.
9714 The regexp structure being duplicated will be copied by perl prior
9715 to this and will be provided as the regexp *r argument, however
9716 with the /old/ structures pprivate pointer value. Thus this routine
9717 may override any copying normally done by perl.
9719 It returns a pointer to the new regexp_internal structure.
9723 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9726 struct regexp *const r = (struct regexp *)SvANY(rx);
9727 regexp_internal *reti;
9731 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9733 npar = r->nparens+1;
9736 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9737 Copy(ri->program, reti->program, len+1, regnode);
9740 reti->regstclass = NULL;
9744 const int count = ri->data->count;
9747 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9748 char, struct reg_data);
9749 Newx(d->what, count, U8);
9752 for (i = 0; i < count; i++) {
9753 d->what[i] = ri->data->what[i];
9754 switch (d->what[i]) {
9755 /* legal options are one of: sSfpontTu
9756 see also regcomp.h and pregfree() */
9759 case 'p': /* actually an AV, but the dup function is identical. */
9760 case 'u': /* actually an HV, but the dup function is identical. */
9761 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9764 /* This is cheating. */
9765 Newx(d->data[i], 1, struct regnode_charclass_class);
9766 StructCopy(ri->data->data[i], d->data[i],
9767 struct regnode_charclass_class);
9768 reti->regstclass = (regnode*)d->data[i];
9771 /* Compiled op trees are readonly and in shared memory,
9772 and can thus be shared without duplication. */
9774 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9778 /* Trie stclasses are readonly and can thus be shared
9779 * without duplication. We free the stclass in pregfree
9780 * when the corresponding reg_ac_data struct is freed.
9782 reti->regstclass= ri->regstclass;
9786 ((reg_trie_data*)ri->data->data[i])->refcount++;
9790 d->data[i] = ri->data->data[i];
9793 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9802 reti->name_list_idx = ri->name_list_idx;
9804 #ifdef RE_TRACK_PATTERN_OFFSETS
9805 if (ri->u.offsets) {
9806 Newx(reti->u.offsets, 2*len+1, U32);
9807 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9810 SetProgLen(reti,len);
9816 #endif /* USE_ITHREADS */
9818 #ifndef PERL_IN_XSUB_RE
9821 - regnext - dig the "next" pointer out of a node
9824 Perl_regnext(pTHX_ register regnode *p)
9827 register I32 offset;
9832 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9841 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9844 STRLEN l1 = strlen(pat1);
9845 STRLEN l2 = strlen(pat2);
9848 const char *message;
9850 PERL_ARGS_ASSERT_RE_CROAK2;
9856 Copy(pat1, buf, l1 , char);
9857 Copy(pat2, buf + l1, l2 , char);
9858 buf[l1 + l2] = '\n';
9859 buf[l1 + l2 + 1] = '\0';
9861 /* ANSI variant takes additional second argument */
9862 va_start(args, pat2);
9866 msv = vmess(buf, &args);
9868 message = SvPV_const(msv,l1);
9871 Copy(message, buf, l1 , char);
9872 buf[l1-1] = '\0'; /* Overwrite \n */
9873 Perl_croak(aTHX_ "%s", buf);
9876 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9878 #ifndef PERL_IN_XSUB_RE
9880 Perl_save_re_context(pTHX)
9884 struct re_save_state *state;
9886 SAVEVPTR(PL_curcop);
9887 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9889 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9890 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9891 SSPUSHINT(SAVEt_RE_STATE);
9893 Copy(&PL_reg_state, state, 1, struct re_save_state);
9895 PL_reg_start_tmp = 0;
9896 PL_reg_start_tmpl = 0;
9897 PL_reg_oldsaved = NULL;
9898 PL_reg_oldsavedlen = 0;
9900 PL_reg_leftiter = 0;
9901 PL_reg_poscache = NULL;
9902 PL_reg_poscache_size = 0;
9903 #ifdef PERL_OLD_COPY_ON_WRITE
9907 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9909 const REGEXP * const rx = PM_GETRE(PL_curpm);
9912 for (i = 1; i <= RX_NPARENS(rx); i++) {
9913 char digits[TYPE_CHARS(long)];
9914 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9915 GV *const *const gvp
9916 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9919 GV * const gv = *gvp;
9920 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9930 clear_re(pTHX_ void *r)
9933 ReREFCNT_dec((REGEXP *)r);
9939 S_put_byte(pTHX_ SV *sv, int c)
9941 PERL_ARGS_ASSERT_PUT_BYTE;
9943 /* Our definition of isPRINT() ignores locales, so only bytes that are
9944 not part of UTF-8 are considered printable. I assume that the same
9945 holds for UTF-EBCDIC.
9946 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9947 which Wikipedia says:
9949 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9950 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9951 identical, to the ASCII delete (DEL) or rubout control character.
9952 ) So the old condition can be simplified to !isPRINT(c) */
9954 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9956 const char string = c;
9957 if (c == '-' || c == ']' || c == '\\' || c == '^')
9958 sv_catpvs(sv, "\\");
9959 sv_catpvn(sv, &string, 1);
9964 #define CLEAR_OPTSTART \
9965 if (optstart) STMT_START { \
9966 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9970 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9972 STATIC const regnode *
9973 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9974 const regnode *last, const regnode *plast,
9975 SV* sv, I32 indent, U32 depth)
9978 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9979 register const regnode *next;
9980 const regnode *optstart= NULL;
9983 GET_RE_DEBUG_FLAGS_DECL;
9985 PERL_ARGS_ASSERT_DUMPUNTIL;
9987 #ifdef DEBUG_DUMPUNTIL
9988 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9989 last ? last-start : 0,plast ? plast-start : 0);
9992 if (plast && plast < last)
9995 while (PL_regkind[op] != END && (!last || node < last)) {
9996 /* While that wasn't END last time... */
9999 if (op == CLOSE || op == WHILEM)
10001 next = regnext((regnode *)node);
10004 if (OP(node) == OPTIMIZED) {
10005 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10012 regprop(r, sv, node);
10013 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10014 (int)(2*indent + 1), "", SvPVX_const(sv));
10016 if (OP(node) != OPTIMIZED) {
10017 if (next == NULL) /* Next ptr. */
10018 PerlIO_printf(Perl_debug_log, " (0)");
10019 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10020 PerlIO_printf(Perl_debug_log, " (FAIL)");
10022 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10023 (void)PerlIO_putc(Perl_debug_log, '\n');
10027 if (PL_regkind[(U8)op] == BRANCHJ) {
10030 register const regnode *nnode = (OP(next) == LONGJMP
10031 ? regnext((regnode *)next)
10033 if (last && nnode > last)
10035 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10038 else if (PL_regkind[(U8)op] == BRANCH) {
10040 DUMPUNTIL(NEXTOPER(node), next);
10042 else if ( PL_regkind[(U8)op] == TRIE ) {
10043 const regnode *this_trie = node;
10044 const char op = OP(node);
10045 const U32 n = ARG(node);
10046 const reg_ac_data * const ac = op>=AHOCORASICK ?
10047 (reg_ac_data *)ri->data->data[n] :
10049 const reg_trie_data * const trie =
10050 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10052 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10054 const regnode *nextbranch= NULL;
10057 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10058 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10060 PerlIO_printf(Perl_debug_log, "%*s%s ",
10061 (int)(2*(indent+3)), "",
10062 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10063 PL_colors[0], PL_colors[1],
10064 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10065 PERL_PV_PRETTY_ELLIPSES |
10066 PERL_PV_PRETTY_LTGT
10071 U16 dist= trie->jump[word_idx+1];
10072 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10073 (UV)((dist ? this_trie + dist : next) - start));
10076 nextbranch= this_trie + trie->jump[0];
10077 DUMPUNTIL(this_trie + dist, nextbranch);
10079 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10080 nextbranch= regnext((regnode *)nextbranch);
10082 PerlIO_printf(Perl_debug_log, "\n");
10085 if (last && next > last)
10090 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10091 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10092 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10094 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10096 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10098 else if ( op == PLUS || op == STAR) {
10099 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10101 else if (op == ANYOF) {
10102 /* arglen 1 + class block */
10103 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10104 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10105 node = NEXTOPER(node);
10107 else if (PL_regkind[(U8)op] == EXACT) {
10108 /* Literal string, where present. */
10109 node += NODE_SZ_STR(node) - 1;
10110 node = NEXTOPER(node);
10113 node = NEXTOPER(node);
10114 node += regarglen[(U8)op];
10116 if (op == CURLYX || op == OPEN)
10120 #ifdef DEBUG_DUMPUNTIL
10121 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10126 #endif /* DEBUGGING */
10130 * c-indentation-style: bsd
10131 * c-basic-offset: 4
10132 * indent-tabs-mode: t
10135 * ex: set ts=8 sts=4 sw=4 noet: