5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
110 char *start; /* Start of input for compile */
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
113 I32 whilem_seen; /* number of WHILEM in this expr */
114 regnode *emit_start; /* Start of emitted-code area */
115 regnode *emit_bound; /* First regnode outside of the allocated space */
116 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
117 I32 naughty; /* How bad is this pattern? */
118 I32 sawback; /* Did we see \1, ...? */
120 I32 size; /* Code size. */
121 I32 npar; /* Capture buffer count, (OPEN). */
122 I32 cpar; /* Capture buffer count, (CLOSE). */
123 I32 nestroot; /* root parens we are in - used by accept */
127 regnode **open_parens; /* pointers to open parens */
128 regnode **close_parens; /* pointers to close parens */
129 regnode *opend; /* END node in program */
130 I32 utf8; /* whether the pattern is utf8 or not */
131 I32 orig_utf8; /* whether the pattern was originally in utf8 */
132 /* XXX use this for future optimisation of case
133 * where pattern must be upgraded to utf8. */
134 HV *charnames; /* cache of named sequences */
135 HV *paren_names; /* Paren names */
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
144 const char *lastparse;
146 AV *paren_name_list; /* idx -> name */
147 #define RExC_lastparse (pRExC_state->lastparse)
148 #define RExC_lastnum (pRExC_state->lastnum)
149 #define RExC_paren_name_list (pRExC_state->paren_name_list)
153 #define RExC_flags (pRExC_state->flags)
154 #define RExC_precomp (pRExC_state->precomp)
155 #define RExC_rx (pRExC_state->rx)
156 #define RExC_rxi (pRExC_state->rxi)
157 #define RExC_start (pRExC_state->start)
158 #define RExC_end (pRExC_state->end)
159 #define RExC_parse (pRExC_state->parse)
160 #define RExC_whilem_seen (pRExC_state->whilem_seen)
161 #ifdef RE_TRACK_PATTERN_OFFSETS
162 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
164 #define RExC_emit (pRExC_state->emit)
165 #define RExC_emit_start (pRExC_state->emit_start)
166 #define RExC_emit_bound (pRExC_state->emit_bound)
167 #define RExC_naughty (pRExC_state->naughty)
168 #define RExC_sawback (pRExC_state->sawback)
169 #define RExC_seen (pRExC_state->seen)
170 #define RExC_size (pRExC_state->size)
171 #define RExC_npar (pRExC_state->npar)
172 #define RExC_nestroot (pRExC_state->nestroot)
173 #define RExC_extralen (pRExC_state->extralen)
174 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
175 #define RExC_seen_evals (pRExC_state->seen_evals)
176 #define RExC_utf8 (pRExC_state->utf8)
177 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
178 #define RExC_charnames (pRExC_state->charnames)
179 #define RExC_open_parens (pRExC_state->open_parens)
180 #define RExC_close_parens (pRExC_state->close_parens)
181 #define RExC_opend (pRExC_state->opend)
182 #define RExC_paren_names (pRExC_state->paren_names)
183 #define RExC_recurse (pRExC_state->recurse)
184 #define RExC_recurse_count (pRExC_state->recurse_count)
187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
192 #undef SPSTART /* dratted cpp namespace... */
195 * Flags to be passed up and down.
197 #define WORST 0 /* Worst case. */
198 #define HASWIDTH 0x01 /* Known to match non-null strings. */
199 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200 #define SPSTART 0x04 /* Starts with * or +. */
201 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
202 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
206 /* whether trie related optimizations are enabled */
207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208 #define TRIE_STUDY_OPT
209 #define FULL_TRIE_STUDY
215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216 #define PBITVAL(paren) (1 << ((paren) & 7))
217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222 /* About scan_data_t.
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
238 The strings can be composites, for instance
242 will result in a composite fixed substring 'foo'.
244 For each string some basic information is maintained:
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
297 typedef struct scan_data_t {
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
303 I32 last_end; /* min value, <0 unless valid. */
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
319 struct regnode_charclass_class *start_class;
323 * Forward declarations for pregcomp()'s friends.
326 static const scan_data_t zero_scan_data =
327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
329 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330 #define SF_BEFORE_SEOL 0x0001
331 #define SF_BEFORE_MEOL 0x0002
332 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336 # define SF_FIX_SHIFT_EOL (0+2)
337 # define SF_FL_SHIFT_EOL (0+4)
339 # define SF_FIX_SHIFT_EOL (+2)
340 # define SF_FL_SHIFT_EOL (+4)
343 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
348 #define SF_IS_INF 0x0040
349 #define SF_HAS_PAR 0x0080
350 #define SF_IN_PAR 0x0100
351 #define SF_HAS_EVAL 0x0200
352 #define SCF_DO_SUBSTR 0x0400
353 #define SCF_DO_STCLASS_AND 0x0800
354 #define SCF_DO_STCLASS_OR 0x1000
355 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356 #define SCF_WHILEM_VISITED_POS 0x2000
358 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
359 #define SCF_SEEN_ACCEPT 0x8000
361 #define UTF (RExC_utf8 != 0)
362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
365 #define OOB_UNICODE 12345678
366 #define OOB_NAMEDCLASS -1
368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372 /* length of regex to show in messages that don't mark a position within */
373 #define RegexLengthToShowInErrorMessages 127
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
380 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
381 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 #define _FAIL(code) STMT_START { \
391 const char *ellipses = ""; \
392 IV len = RExC_end - RExC_precomp; \
395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
404 #define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
408 #define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
415 #define Simple_vFAIL(m) STMT_START { \
416 const IV offset = RExC_parse - RExC_precomp; \
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
424 #define vFAIL(m) STMT_START { \
426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
431 * Like Simple_vFAIL(), but accepts two arguments.
433 #define Simple_vFAIL2(m,a1) STMT_START { \
434 const IV offset = RExC_parse - RExC_precomp; \
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
442 #define vFAIL2(m,a1) STMT_START { \
444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
445 Simple_vFAIL2(m, a1); \
450 * Like Simple_vFAIL(), but accepts three arguments.
452 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
453 const IV offset = RExC_parse - RExC_precomp; \
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
461 #define vFAIL3(m,a1,a2) STMT_START { \
463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
464 Simple_vFAIL3(m, a1, a2); \
468 * Like Simple_vFAIL(), but accepts four arguments.
470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
476 #define vWARN(loc,m) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
479 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
482 #define vWARNdep(loc,m) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
485 "%s" REPORT_LOCATION, \
486 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
490 #define vWARN2(loc, m, a1) STMT_START { \
491 const IV offset = loc - RExC_precomp; \
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
496 #define vWARN3(loc, m, a1, a2) STMT_START { \
497 const IV offset = loc - RExC_precomp; \
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
502 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
503 const IV offset = loc - RExC_precomp; \
504 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
505 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
508 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
509 const IV offset = loc - RExC_precomp; \
510 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
511 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
515 /* Allow for side effects in s */
516 #define REGC(c,s) STMT_START { \
517 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
520 /* Macros for recording node offsets. 20001227 mjd@plover.com
521 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
522 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
523 * Element 0 holds the number n.
524 * Position is 1 indexed.
526 #ifndef RE_TRACK_PATTERN_OFFSETS
527 #define Set_Node_Offset_To_R(node,byte)
528 #define Set_Node_Offset(node,byte)
529 #define Set_Cur_Node_Offset
530 #define Set_Node_Length_To_R(node,len)
531 #define Set_Node_Length(node,len)
532 #define Set_Node_Cur_Length(node)
533 #define Node_Offset(n)
534 #define Node_Length(n)
535 #define Set_Node_Offset_Length(node,offset,len)
536 #define ProgLen(ri) ri->u.proglen
537 #define SetProgLen(ri,x) ri->u.proglen = x
539 #define ProgLen(ri) ri->u.offsets[0]
540 #define SetProgLen(ri,x) ri->u.offsets[0] = x
541 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
543 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
544 __LINE__, (int)(node), (int)(byte))); \
546 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
548 RExC_offsets[2*(node)-1] = (byte); \
553 #define Set_Node_Offset(node,byte) \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
555 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
557 #define Set_Node_Length_To_R(node,len) STMT_START { \
559 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
560 __LINE__, (int)(node), (int)(len))); \
562 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
564 RExC_offsets[2*(node)] = (len); \
569 #define Set_Node_Length(node,len) \
570 Set_Node_Length_To_R((node)-RExC_emit_start, len)
571 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
572 #define Set_Node_Cur_Length(node) \
573 Set_Node_Length(node, RExC_parse - parse_start)
575 /* Get offsets and lengths */
576 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
577 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
579 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
580 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
581 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
585 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
586 #define EXPERIMENTAL_INPLACESCAN
587 #endif /*RE_TRACK_PATTERN_OFFSETS*/
589 #define DEBUG_STUDYDATA(str,data,depth) \
590 DEBUG_OPTIMISE_MORE_r(if(data){ \
591 PerlIO_printf(Perl_debug_log, \
592 "%*s" str "Pos:%"IVdf"/%"IVdf \
593 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
594 (int)(depth)*2, "", \
595 (IV)((data)->pos_min), \
596 (IV)((data)->pos_delta), \
597 (UV)((data)->flags), \
598 (IV)((data)->whilem_c), \
599 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
600 is_inf ? "INF " : "" \
602 if ((data)->last_found) \
603 PerlIO_printf(Perl_debug_log, \
604 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
605 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
606 SvPVX_const((data)->last_found), \
607 (IV)((data)->last_end), \
608 (IV)((data)->last_start_min), \
609 (IV)((data)->last_start_max), \
610 ((data)->longest && \
611 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
612 SvPVX_const((data)->longest_fixed), \
613 (IV)((data)->offset_fixed), \
614 ((data)->longest && \
615 (data)->longest==&((data)->longest_float)) ? "*" : "", \
616 SvPVX_const((data)->longest_float), \
617 (IV)((data)->offset_float_min), \
618 (IV)((data)->offset_float_max) \
620 PerlIO_printf(Perl_debug_log,"\n"); \
623 static void clear_re(pTHX_ void *r);
625 /* Mark that we cannot extend a found fixed substring at this point.
626 Update the longest found anchored substring and the longest found
627 floating substrings if needed. */
630 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
632 const STRLEN l = CHR_SVLEN(data->last_found);
633 const STRLEN old_l = CHR_SVLEN(*data->longest);
634 GET_RE_DEBUG_FLAGS_DECL;
636 PERL_ARGS_ASSERT_SCAN_COMMIT;
638 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
639 SvSetMagicSV(*data->longest, data->last_found);
640 if (*data->longest == data->longest_fixed) {
641 data->offset_fixed = l ? data->last_start_min : data->pos_min;
642 if (data->flags & SF_BEFORE_EOL)
644 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
646 data->flags &= ~SF_FIX_BEFORE_EOL;
647 data->minlen_fixed=minlenp;
648 data->lookbehind_fixed=0;
650 else { /* *data->longest == data->longest_float */
651 data->offset_float_min = l ? data->last_start_min : data->pos_min;
652 data->offset_float_max = (l
653 ? data->last_start_max
654 : data->pos_min + data->pos_delta);
655 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
656 data->offset_float_max = I32_MAX;
657 if (data->flags & SF_BEFORE_EOL)
659 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
661 data->flags &= ~SF_FL_BEFORE_EOL;
662 data->minlen_float=minlenp;
663 data->lookbehind_float=0;
666 SvCUR_set(data->last_found, 0);
668 SV * const sv = data->last_found;
669 if (SvUTF8(sv) && SvMAGICAL(sv)) {
670 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
676 data->flags &= ~SF_BEFORE_EOL;
677 DEBUG_STUDYDATA("commit: ",data,0);
680 /* Can match anything (initialization) */
682 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
684 PERL_ARGS_ASSERT_CL_ANYTHING;
686 ANYOF_CLASS_ZERO(cl);
687 ANYOF_BITMAP_SETALL(cl);
688 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
690 cl->flags |= ANYOF_LOCALE;
693 /* Can match anything (initialization) */
695 S_cl_is_anything(const struct regnode_charclass_class *cl)
699 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
701 for (value = 0; value <= ANYOF_MAX; value += 2)
702 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
704 if (!(cl->flags & ANYOF_UNICODE_ALL))
706 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
711 /* Can match anything (initialization) */
713 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
715 PERL_ARGS_ASSERT_CL_INIT;
717 Zero(cl, 1, struct regnode_charclass_class);
719 cl_anything(pRExC_state, cl);
723 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
725 PERL_ARGS_ASSERT_CL_INIT_ZERO;
727 Zero(cl, 1, struct regnode_charclass_class);
729 cl_anything(pRExC_state, cl);
731 cl->flags |= ANYOF_LOCALE;
734 /* 'And' a given class with another one. Can create false positives */
735 /* We assume that cl is not inverted */
737 S_cl_and(struct regnode_charclass_class *cl,
738 const struct regnode_charclass_class *and_with)
740 PERL_ARGS_ASSERT_CL_AND;
742 assert(and_with->type == ANYOF);
743 if (!(and_with->flags & ANYOF_CLASS)
744 && !(cl->flags & ANYOF_CLASS)
745 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
746 && !(and_with->flags & ANYOF_FOLD)
747 && !(cl->flags & ANYOF_FOLD)) {
750 if (and_with->flags & ANYOF_INVERT)
751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 cl->bitmap[i] &= ~and_with->bitmap[i];
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] &= and_with->bitmap[i];
756 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
757 if (!(and_with->flags & ANYOF_EOS))
758 cl->flags &= ~ANYOF_EOS;
760 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
761 !(and_with->flags & ANYOF_INVERT)) {
762 cl->flags &= ~ANYOF_UNICODE_ALL;
763 cl->flags |= ANYOF_UNICODE;
764 ARG_SET(cl, ARG(and_with));
766 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
767 !(and_with->flags & ANYOF_INVERT))
768 cl->flags &= ~ANYOF_UNICODE_ALL;
769 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
770 !(and_with->flags & ANYOF_INVERT))
771 cl->flags &= ~ANYOF_UNICODE;
774 /* 'OR' a given class with another one. Can create false positives */
775 /* We assume that cl is not inverted */
777 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
779 PERL_ARGS_ASSERT_CL_OR;
781 if (or_with->flags & ANYOF_INVERT) {
783 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
784 * <= (B1 | !B2) | (CL1 | !CL2)
785 * which is wasteful if CL2 is small, but we ignore CL2:
786 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
787 * XXXX Can we handle case-fold? Unclear:
788 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
789 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
791 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
792 && !(or_with->flags & ANYOF_FOLD)
793 && !(cl->flags & ANYOF_FOLD) ) {
796 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
797 cl->bitmap[i] |= ~or_with->bitmap[i];
798 } /* XXXX: logic is complicated otherwise */
800 cl_anything(pRExC_state, cl);
803 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && (!(or_with->flags & ANYOF_FOLD)
806 || (cl->flags & ANYOF_FOLD)) ) {
809 /* OR char bitmap and class bitmap separately */
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] |= or_with->bitmap[i];
812 if (or_with->flags & ANYOF_CLASS) {
813 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
814 cl->classflags[i] |= or_with->classflags[i];
815 cl->flags |= ANYOF_CLASS;
818 else { /* XXXX: logic is complicated, leave it along for a moment. */
819 cl_anything(pRExC_state, cl);
822 if (or_with->flags & ANYOF_EOS)
823 cl->flags |= ANYOF_EOS;
825 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
826 ARG(cl) != ARG(or_with)) {
827 cl->flags |= ANYOF_UNICODE_ALL;
828 cl->flags &= ~ANYOF_UNICODE;
830 if (or_with->flags & ANYOF_UNICODE_ALL) {
831 cl->flags |= ANYOF_UNICODE_ALL;
832 cl->flags &= ~ANYOF_UNICODE;
836 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
837 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
838 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
839 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
844 dump_trie(trie,widecharmap,revcharmap)
845 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
846 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
848 These routines dump out a trie in a somewhat readable format.
849 The _interim_ variants are used for debugging the interim
850 tables that are used to generate the final compressed
851 representation which is what dump_trie expects.
853 Part of the reason for their existance is to provide a form
854 of documentation as to how the different representations function.
859 Dumps the final compressed table form of the trie to Perl_debug_log.
860 Used for debugging make_trie().
864 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
865 AV *revcharmap, U32 depth)
868 SV *sv=sv_newmortal();
869 int colwidth= widecharmap ? 6 : 4;
870 GET_RE_DEBUG_FLAGS_DECL;
872 PERL_ARGS_ASSERT_DUMP_TRIE;
874 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
875 (int)depth * 2 + 2,"",
876 "Match","Base","Ofs" );
878 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
879 SV ** const tmp = av_fetch( revcharmap, state, 0);
881 PerlIO_printf( Perl_debug_log, "%*s",
883 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
884 PL_colors[0], PL_colors[1],
885 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
886 PERL_PV_ESCAPE_FIRSTCHAR
891 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
892 (int)depth * 2 + 2,"");
894 for( state = 0 ; state < trie->uniquecharcount ; state++ )
895 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
896 PerlIO_printf( Perl_debug_log, "\n");
898 for( state = 1 ; state < trie->statecount ; state++ ) {
899 const U32 base = trie->states[ state ].trans.base;
901 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
903 if ( trie->states[ state ].wordnum ) {
904 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
906 PerlIO_printf( Perl_debug_log, "%6s", "" );
909 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
914 while( ( base + ofs < trie->uniquecharcount ) ||
915 ( base + ofs - trie->uniquecharcount < trie->lasttrans
916 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
919 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
921 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
922 if ( ( base + ofs >= trie->uniquecharcount ) &&
923 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
924 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
926 PerlIO_printf( Perl_debug_log, "%*"UVXf,
928 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
930 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
934 PerlIO_printf( Perl_debug_log, "]");
937 PerlIO_printf( Perl_debug_log, "\n" );
941 Dumps a fully constructed but uncompressed trie in list form.
942 List tries normally only are used for construction when the number of
943 possible chars (trie->uniquecharcount) is very high.
944 Used for debugging make_trie().
947 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
948 HV *widecharmap, AV *revcharmap, U32 next_alloc,
952 SV *sv=sv_newmortal();
953 int colwidth= widecharmap ? 6 : 4;
954 GET_RE_DEBUG_FLAGS_DECL;
956 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
958 /* print out the table precompression. */
959 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
960 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
961 "------:-----+-----------------\n" );
963 for( state=1 ; state < next_alloc ; state ++ ) {
966 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
967 (int)depth * 2 + 2,"", (UV)state );
968 if ( ! trie->states[ state ].wordnum ) {
969 PerlIO_printf( Perl_debug_log, "%5s| ","");
971 PerlIO_printf( Perl_debug_log, "W%4x| ",
972 trie->states[ state ].wordnum
975 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
976 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
978 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
980 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
981 PL_colors[0], PL_colors[1],
982 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
983 PERL_PV_ESCAPE_FIRSTCHAR
985 TRIE_LIST_ITEM(state,charid).forid,
986 (UV)TRIE_LIST_ITEM(state,charid).newstate
989 PerlIO_printf(Perl_debug_log, "\n%*s| ",
990 (int)((depth * 2) + 14), "");
993 PerlIO_printf( Perl_debug_log, "\n");
998 Dumps a fully constructed but uncompressed trie in table form.
999 This is the normal DFA style state transition table, with a few
1000 twists to facilitate compression later.
1001 Used for debugging make_trie().
1004 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1005 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1010 SV *sv=sv_newmortal();
1011 int colwidth= widecharmap ? 6 : 4;
1012 GET_RE_DEBUG_FLAGS_DECL;
1014 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1017 print out the table precompression so that we can do a visual check
1018 that they are identical.
1021 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1023 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1024 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1026 PerlIO_printf( Perl_debug_log, "%*s",
1028 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029 PL_colors[0], PL_colors[1],
1030 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031 PERL_PV_ESCAPE_FIRSTCHAR
1037 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1039 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1040 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1043 PerlIO_printf( Perl_debug_log, "\n" );
1045 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1047 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1048 (int)depth * 2 + 2,"",
1049 (UV)TRIE_NODENUM( state ) );
1051 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1052 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1054 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1056 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1058 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1059 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1061 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1062 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1069 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1070 startbranch: the first branch in the whole branch sequence
1071 first : start branch of sequence of branch-exact nodes.
1072 May be the same as startbranch
1073 last : Thing following the last branch.
1074 May be the same as tail.
1075 tail : item following the branch sequence
1076 count : words in the sequence
1077 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1078 depth : indent depth
1080 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1082 A trie is an N'ary tree where the branches are determined by digital
1083 decomposition of the key. IE, at the root node you look up the 1st character and
1084 follow that branch repeat until you find the end of the branches. Nodes can be
1085 marked as "accepting" meaning they represent a complete word. Eg:
1089 would convert into the following structure. Numbers represent states, letters
1090 following numbers represent valid transitions on the letter from that state, if
1091 the number is in square brackets it represents an accepting state, otherwise it
1092 will be in parenthesis.
1094 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1098 (1) +-i->(6)-+-s->[7]
1100 +-s->(3)-+-h->(4)-+-e->[5]
1102 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1104 This shows that when matching against the string 'hers' we will begin at state 1
1105 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1106 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1107 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1108 single traverse. We store a mapping from accepting to state to which word was
1109 matched, and then when we have multiple possibilities we try to complete the
1110 rest of the regex in the order in which they occured in the alternation.
1112 The only prior NFA like behaviour that would be changed by the TRIE support is
1113 the silent ignoring of duplicate alternations which are of the form:
1115 / (DUPE|DUPE) X? (?{ ... }) Y /x
1117 Thus EVAL blocks follwing a trie may be called a different number of times with
1118 and without the optimisation. With the optimisations dupes will be silently
1119 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1120 the following demonstrates:
1122 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1124 which prints out 'word' three times, but
1126 'words'=~/(word|word|word)(?{ print $1 })S/
1128 which doesnt print it out at all. This is due to other optimisations kicking in.
1130 Example of what happens on a structural level:
1132 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1134 1: CURLYM[1] {1,32767}(18)
1145 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1146 and should turn into:
1148 1: CURLYM[1] {1,32767}(18)
1150 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1158 Cases where tail != last would be like /(?foo|bar)baz/:
1168 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1169 and would end up looking like:
1172 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1179 d = uvuni_to_utf8_flags(d, uv, 0);
1181 is the recommended Unicode-aware way of saying
1186 #define TRIE_STORE_REVCHAR \
1189 SV *zlopp = newSV(2); \
1190 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1191 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1192 SvCUR_set(zlopp, kapow - flrbbbbb); \
1195 av_push(revcharmap, zlopp); \
1197 char ooooff = (char)uvc; \
1198 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1202 #define TRIE_READ_CHAR STMT_START { \
1206 if ( foldlen > 0 ) { \
1207 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1212 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1213 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1214 foldlen -= UNISKIP( uvc ); \
1215 scan = foldbuf + UNISKIP( uvc ); \
1218 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1228 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1229 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1230 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1231 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1233 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1234 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1235 TRIE_LIST_CUR( state )++; \
1238 #define TRIE_LIST_NEW(state) STMT_START { \
1239 Newxz( trie->states[ state ].trans.list, \
1240 4, reg_trie_trans_le ); \
1241 TRIE_LIST_CUR( state ) = 1; \
1242 TRIE_LIST_LEN( state ) = 4; \
1245 #define TRIE_HANDLE_WORD(state) STMT_START { \
1246 U16 dupe= trie->states[ state ].wordnum; \
1247 regnode * const noper_next = regnext( noper ); \
1249 if (trie->wordlen) \
1250 trie->wordlen[ curword ] = wordlen; \
1252 /* store the word for dumping */ \
1254 if (OP(noper) != NOTHING) \
1255 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1257 tmp = newSVpvn_utf8( "", 0, UTF ); \
1258 av_push( trie_words, tmp ); \
1263 if ( noper_next < tail ) { \
1265 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1266 trie->jump[curword] = (U16)(noper_next - convert); \
1268 jumper = noper_next; \
1270 nextbranch= regnext(cur); \
1274 /* So it's a dupe. This means we need to maintain a */\
1275 /* linked-list from the first to the next. */\
1276 /* we only allocate the nextword buffer when there */\
1277 /* a dupe, so first time we have to do the allocation */\
1278 if (!trie->nextword) \
1279 trie->nextword = (U16 *) \
1280 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1281 while ( trie->nextword[dupe] ) \
1282 dupe= trie->nextword[dupe]; \
1283 trie->nextword[dupe]= curword; \
1285 /* we haven't inserted this word yet. */ \
1286 trie->states[ state ].wordnum = curword; \
1291 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1292 ( ( base + charid >= ucharcount \
1293 && base + charid < ubound \
1294 && state == trie->trans[ base - ucharcount + charid ].check \
1295 && trie->trans[ base - ucharcount + charid ].next ) \
1296 ? trie->trans[ base - ucharcount + charid ].next \
1297 : ( state==1 ? special : 0 ) \
1301 #define MADE_JUMP_TRIE 2
1302 #define MADE_EXACT_TRIE 4
1305 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1308 /* first pass, loop through and scan words */
1309 reg_trie_data *trie;
1310 HV *widecharmap = NULL;
1311 AV *revcharmap = newAV();
1313 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1318 regnode *jumper = NULL;
1319 regnode *nextbranch = NULL;
1320 regnode *convert = NULL;
1321 /* we just use folder as a flag in utf8 */
1322 const U8 * const folder = ( flags == EXACTF
1324 : ( flags == EXACTFL
1331 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1332 AV *trie_words = NULL;
1333 /* along with revcharmap, this only used during construction but both are
1334 * useful during debugging so we store them in the struct when debugging.
1337 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1338 STRLEN trie_charcount=0;
1340 SV *re_trie_maxbuff;
1341 GET_RE_DEBUG_FLAGS_DECL;
1343 PERL_ARGS_ASSERT_MAKE_TRIE;
1345 PERL_UNUSED_ARG(depth);
1348 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1350 trie->startstate = 1;
1351 trie->wordcount = word_count;
1352 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1353 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1354 if (!(UTF && folder))
1355 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1357 trie_words = newAV();
1360 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1361 if (!SvIOK(re_trie_maxbuff)) {
1362 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1365 PerlIO_printf( Perl_debug_log,
1366 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1367 (int)depth * 2 + 2, "",
1368 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1369 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1373 /* Find the node we are going to overwrite */
1374 if ( first == startbranch && OP( last ) != BRANCH ) {
1375 /* whole branch chain */
1378 /* branch sub-chain */
1379 convert = NEXTOPER( first );
1382 /* -- First loop and Setup --
1384 We first traverse the branches and scan each word to determine if it
1385 contains widechars, and how many unique chars there are, this is
1386 important as we have to build a table with at least as many columns as we
1389 We use an array of integers to represent the character codes 0..255
1390 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1391 native representation of the character value as the key and IV's for the
1394 *TODO* If we keep track of how many times each character is used we can
1395 remap the columns so that the table compression later on is more
1396 efficient in terms of memory by ensuring most common value is in the
1397 middle and the least common are on the outside. IMO this would be better
1398 than a most to least common mapping as theres a decent chance the most
1399 common letter will share a node with the least common, meaning the node
1400 will not be compressable. With a middle is most common approach the worst
1401 case is when we have the least common nodes twice.
1405 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1406 regnode * const noper = NEXTOPER( cur );
1407 const U8 *uc = (U8*)STRING( noper );
1408 const U8 * const e = uc + STR_LEN( noper );
1410 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1411 const U8 *scan = (U8*)NULL;
1412 U32 wordlen = 0; /* required init */
1414 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1416 if (OP(noper) == NOTHING) {
1420 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1421 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1422 regardless of encoding */
1424 for ( ; uc < e ; uc += len ) {
1425 TRIE_CHARCOUNT(trie)++;
1429 if ( !trie->charmap[ uvc ] ) {
1430 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1432 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1436 /* store the codepoint in the bitmap, and if its ascii
1437 also store its folded equivelent. */
1438 TRIE_BITMAP_SET(trie,uvc);
1440 /* store the folded codepoint */
1441 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1444 /* store first byte of utf8 representation of
1445 codepoints in the 127 < uvc < 256 range */
1446 if (127 < uvc && uvc < 192) {
1447 TRIE_BITMAP_SET(trie,194);
1448 } else if (191 < uvc ) {
1449 TRIE_BITMAP_SET(trie,195);
1450 /* && uvc < 256 -- we know uvc is < 256 already */
1453 set_bit = 0; /* We've done our bit :-) */
1458 widecharmap = newHV();
1460 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1463 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1465 if ( !SvTRUE( *svpp ) ) {
1466 sv_setiv( *svpp, ++trie->uniquecharcount );
1471 if( cur == first ) {
1474 } else if (chars < trie->minlen) {
1476 } else if (chars > trie->maxlen) {
1480 } /* end first pass */
1481 DEBUG_TRIE_COMPILE_r(
1482 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1483 (int)depth * 2 + 2,"",
1484 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1485 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1486 (int)trie->minlen, (int)trie->maxlen )
1488 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1491 We now know what we are dealing with in terms of unique chars and
1492 string sizes so we can calculate how much memory a naive
1493 representation using a flat table will take. If it's over a reasonable
1494 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1495 conservative but potentially much slower representation using an array
1498 At the end we convert both representations into the same compressed
1499 form that will be used in regexec.c for matching with. The latter
1500 is a form that cannot be used to construct with but has memory
1501 properties similar to the list form and access properties similar
1502 to the table form making it both suitable for fast searches and
1503 small enough that its feasable to store for the duration of a program.
1505 See the comment in the code where the compressed table is produced
1506 inplace from the flat tabe representation for an explanation of how
1507 the compression works.
1512 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1514 Second Pass -- Array Of Lists Representation
1516 Each state will be represented by a list of charid:state records
1517 (reg_trie_trans_le) the first such element holds the CUR and LEN
1518 points of the allocated array. (See defines above).
1520 We build the initial structure using the lists, and then convert
1521 it into the compressed table form which allows faster lookups
1522 (but cant be modified once converted).
1525 STRLEN transcount = 1;
1527 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1528 "%*sCompiling trie using list compiler\n",
1529 (int)depth * 2 + 2, ""));
1531 trie->states = (reg_trie_state *)
1532 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1533 sizeof(reg_trie_state) );
1537 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1539 regnode * const noper = NEXTOPER( cur );
1540 U8 *uc = (U8*)STRING( noper );
1541 const U8 * const e = uc + STR_LEN( noper );
1542 U32 state = 1; /* required init */
1543 U16 charid = 0; /* sanity init */
1544 U8 *scan = (U8*)NULL; /* sanity init */
1545 STRLEN foldlen = 0; /* required init */
1546 U32 wordlen = 0; /* required init */
1547 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1549 if (OP(noper) != NOTHING) {
1550 for ( ; uc < e ; uc += len ) {
1555 charid = trie->charmap[ uvc ];
1557 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1561 charid=(U16)SvIV( *svpp );
1564 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1571 if ( !trie->states[ state ].trans.list ) {
1572 TRIE_LIST_NEW( state );
1574 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1575 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1576 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1581 newstate = next_alloc++;
1582 TRIE_LIST_PUSH( state, charid, newstate );
1587 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1591 TRIE_HANDLE_WORD(state);
1593 } /* end second pass */
1595 /* next alloc is the NEXT state to be allocated */
1596 trie->statecount = next_alloc;
1597 trie->states = (reg_trie_state *)
1598 PerlMemShared_realloc( trie->states,
1600 * sizeof(reg_trie_state) );
1602 /* and now dump it out before we compress it */
1603 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1604 revcharmap, next_alloc,
1608 trie->trans = (reg_trie_trans *)
1609 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1616 for( state=1 ; state < next_alloc ; state ++ ) {
1620 DEBUG_TRIE_COMPILE_MORE_r(
1621 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1625 if (trie->states[state].trans.list) {
1626 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1630 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1631 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1632 if ( forid < minid ) {
1634 } else if ( forid > maxid ) {
1638 if ( transcount < tp + maxid - minid + 1) {
1640 trie->trans = (reg_trie_trans *)
1641 PerlMemShared_realloc( trie->trans,
1643 * sizeof(reg_trie_trans) );
1644 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1646 base = trie->uniquecharcount + tp - minid;
1647 if ( maxid == minid ) {
1649 for ( ; zp < tp ; zp++ ) {
1650 if ( ! trie->trans[ zp ].next ) {
1651 base = trie->uniquecharcount + zp - minid;
1652 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1653 trie->trans[ zp ].check = state;
1659 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1660 trie->trans[ tp ].check = state;
1665 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1666 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1667 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1668 trie->trans[ tid ].check = state;
1670 tp += ( maxid - minid + 1 );
1672 Safefree(trie->states[ state ].trans.list);
1675 DEBUG_TRIE_COMPILE_MORE_r(
1676 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1679 trie->states[ state ].trans.base=base;
1681 trie->lasttrans = tp + 1;
1685 Second Pass -- Flat Table Representation.
1687 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1688 We know that we will need Charcount+1 trans at most to store the data
1689 (one row per char at worst case) So we preallocate both structures
1690 assuming worst case.
1692 We then construct the trie using only the .next slots of the entry
1695 We use the .check field of the first entry of the node temporarily to
1696 make compression both faster and easier by keeping track of how many non
1697 zero fields are in the node.
1699 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1702 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1703 number representing the first entry of the node, and state as a
1704 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1705 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1706 are 2 entrys per node. eg:
1714 The table is internally in the right hand, idx form. However as we also
1715 have to deal with the states array which is indexed by nodenum we have to
1716 use TRIE_NODENUM() to convert.
1719 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1720 "%*sCompiling trie using table compiler\n",
1721 (int)depth * 2 + 2, ""));
1723 trie->trans = (reg_trie_trans *)
1724 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1725 * trie->uniquecharcount + 1,
1726 sizeof(reg_trie_trans) );
1727 trie->states = (reg_trie_state *)
1728 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1729 sizeof(reg_trie_state) );
1730 next_alloc = trie->uniquecharcount + 1;
1733 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1735 regnode * const noper = NEXTOPER( cur );
1736 const U8 *uc = (U8*)STRING( noper );
1737 const U8 * const e = uc + STR_LEN( noper );
1739 U32 state = 1; /* required init */
1741 U16 charid = 0; /* sanity init */
1742 U32 accept_state = 0; /* sanity init */
1743 U8 *scan = (U8*)NULL; /* sanity init */
1745 STRLEN foldlen = 0; /* required init */
1746 U32 wordlen = 0; /* required init */
1747 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1749 if ( OP(noper) != NOTHING ) {
1750 for ( ; uc < e ; uc += len ) {
1755 charid = trie->charmap[ uvc ];
1757 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1758 charid = svpp ? (U16)SvIV(*svpp) : 0;
1762 if ( !trie->trans[ state + charid ].next ) {
1763 trie->trans[ state + charid ].next = next_alloc;
1764 trie->trans[ state ].check++;
1765 next_alloc += trie->uniquecharcount;
1767 state = trie->trans[ state + charid ].next;
1769 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1771 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1774 accept_state = TRIE_NODENUM( state );
1775 TRIE_HANDLE_WORD(accept_state);
1777 } /* end second pass */
1779 /* and now dump it out before we compress it */
1780 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1782 next_alloc, depth+1));
1786 * Inplace compress the table.*
1788 For sparse data sets the table constructed by the trie algorithm will
1789 be mostly 0/FAIL transitions or to put it another way mostly empty.
1790 (Note that leaf nodes will not contain any transitions.)
1792 This algorithm compresses the tables by eliminating most such
1793 transitions, at the cost of a modest bit of extra work during lookup:
1795 - Each states[] entry contains a .base field which indicates the
1796 index in the state[] array wheres its transition data is stored.
1798 - If .base is 0 there are no valid transitions from that node.
1800 - If .base is nonzero then charid is added to it to find an entry in
1803 -If trans[states[state].base+charid].check!=state then the
1804 transition is taken to be a 0/Fail transition. Thus if there are fail
1805 transitions at the front of the node then the .base offset will point
1806 somewhere inside the previous nodes data (or maybe even into a node
1807 even earlier), but the .check field determines if the transition is
1811 The following process inplace converts the table to the compressed
1812 table: We first do not compress the root node 1,and mark its all its
1813 .check pointers as 1 and set its .base pointer as 1 as well. This
1814 allows to do a DFA construction from the compressed table later, and
1815 ensures that any .base pointers we calculate later are greater than
1818 - We set 'pos' to indicate the first entry of the second node.
1820 - We then iterate over the columns of the node, finding the first and
1821 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1822 and set the .check pointers accordingly, and advance pos
1823 appropriately and repreat for the next node. Note that when we copy
1824 the next pointers we have to convert them from the original
1825 NODEIDX form to NODENUM form as the former is not valid post
1828 - If a node has no transitions used we mark its base as 0 and do not
1829 advance the pos pointer.
1831 - If a node only has one transition we use a second pointer into the
1832 structure to fill in allocated fail transitions from other states.
1833 This pointer is independent of the main pointer and scans forward
1834 looking for null transitions that are allocated to a state. When it
1835 finds one it writes the single transition into the "hole". If the
1836 pointer doesnt find one the single transition is appended as normal.
1838 - Once compressed we can Renew/realloc the structures to release the
1841 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1842 specifically Fig 3.47 and the associated pseudocode.
1846 const U32 laststate = TRIE_NODENUM( next_alloc );
1849 trie->statecount = laststate;
1851 for ( state = 1 ; state < laststate ; state++ ) {
1853 const U32 stateidx = TRIE_NODEIDX( state );
1854 const U32 o_used = trie->trans[ stateidx ].check;
1855 U32 used = trie->trans[ stateidx ].check;
1856 trie->trans[ stateidx ].check = 0;
1858 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1859 if ( flag || trie->trans[ stateidx + charid ].next ) {
1860 if ( trie->trans[ stateidx + charid ].next ) {
1862 for ( ; zp < pos ; zp++ ) {
1863 if ( ! trie->trans[ zp ].next ) {
1867 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1868 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1869 trie->trans[ zp ].check = state;
1870 if ( ++zp > pos ) pos = zp;
1877 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1879 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ pos ].check = state;
1885 trie->lasttrans = pos + 1;
1886 trie->states = (reg_trie_state *)
1887 PerlMemShared_realloc( trie->states, laststate
1888 * sizeof(reg_trie_state) );
1889 DEBUG_TRIE_COMPILE_MORE_r(
1890 PerlIO_printf( Perl_debug_log,
1891 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1892 (int)depth * 2 + 2,"",
1893 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1896 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1899 } /* end table compress */
1901 DEBUG_TRIE_COMPILE_MORE_r(
1902 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1903 (int)depth * 2 + 2, "",
1904 (UV)trie->statecount,
1905 (UV)trie->lasttrans)
1907 /* resize the trans array to remove unused space */
1908 trie->trans = (reg_trie_trans *)
1909 PerlMemShared_realloc( trie->trans, trie->lasttrans
1910 * sizeof(reg_trie_trans) );
1912 /* and now dump out the compressed format */
1913 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1915 { /* Modify the program and insert the new TRIE node*/
1916 U8 nodetype =(U8)(flags & 0xFF);
1920 regnode *optimize = NULL;
1921 #ifdef RE_TRACK_PATTERN_OFFSETS
1924 U32 mjd_nodelen = 0;
1925 #endif /* RE_TRACK_PATTERN_OFFSETS */
1926 #endif /* DEBUGGING */
1928 This means we convert either the first branch or the first Exact,
1929 depending on whether the thing following (in 'last') is a branch
1930 or not and whther first is the startbranch (ie is it a sub part of
1931 the alternation or is it the whole thing.)
1932 Assuming its a sub part we conver the EXACT otherwise we convert
1933 the whole branch sequence, including the first.
1935 /* Find the node we are going to overwrite */
1936 if ( first != startbranch || OP( last ) == BRANCH ) {
1937 /* branch sub-chain */
1938 NEXT_OFF( first ) = (U16)(last - first);
1939 #ifdef RE_TRACK_PATTERN_OFFSETS
1941 mjd_offset= Node_Offset((convert));
1942 mjd_nodelen= Node_Length((convert));
1945 /* whole branch chain */
1947 #ifdef RE_TRACK_PATTERN_OFFSETS
1950 const regnode *nop = NEXTOPER( convert );
1951 mjd_offset= Node_Offset((nop));
1952 mjd_nodelen= Node_Length((nop));
1956 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1957 (int)depth * 2 + 2, "",
1958 (UV)mjd_offset, (UV)mjd_nodelen)
1961 /* But first we check to see if there is a common prefix we can
1962 split out as an EXACT and put in front of the TRIE node. */
1963 trie->startstate= 1;
1964 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1966 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1970 const U32 base = trie->states[ state ].trans.base;
1972 if ( trie->states[state].wordnum )
1975 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1976 if ( ( base + ofs >= trie->uniquecharcount ) &&
1977 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1978 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1980 if ( ++count > 1 ) {
1981 SV **tmp = av_fetch( revcharmap, ofs, 0);
1982 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1983 if ( state == 1 ) break;
1985 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1987 PerlIO_printf(Perl_debug_log,
1988 "%*sNew Start State=%"UVuf" Class: [",
1989 (int)depth * 2 + 2, "",
1992 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1993 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1995 TRIE_BITMAP_SET(trie,*ch);
1997 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1999 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2003 TRIE_BITMAP_SET(trie,*ch);
2005 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2006 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2012 SV **tmp = av_fetch( revcharmap, idx, 0);
2014 char *ch = SvPV( *tmp, len );
2016 SV *sv=sv_newmortal();
2017 PerlIO_printf( Perl_debug_log,
2018 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2019 (int)depth * 2 + 2, "",
2021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2022 PL_colors[0], PL_colors[1],
2023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2024 PERL_PV_ESCAPE_FIRSTCHAR
2029 OP( convert ) = nodetype;
2030 str=STRING(convert);
2033 STR_LEN(convert) += len;
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2045 regnode *n = convert+NODE_SZ_STR(convert);
2046 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2047 trie->startstate = state;
2048 trie->minlen -= (state - 1);
2049 trie->maxlen -= (state - 1);
2051 /* At least the UNICOS C compiler choked on this
2052 * being argument to DEBUG_r(), so let's just have
2055 #ifdef PERL_EXT_RE_BUILD
2061 regnode *fix = convert;
2062 U32 word = trie->wordcount;
2064 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2065 while( ++fix < n ) {
2066 Set_Node_Offset_Length(fix, 0, 0);
2069 SV ** const tmp = av_fetch( trie_words, word, 0 );
2071 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2072 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2074 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2082 NEXT_OFF(convert) = (U16)(tail - convert);
2083 DEBUG_r(optimize= n);
2089 if ( trie->maxlen ) {
2090 NEXT_OFF( convert ) = (U16)(tail - convert);
2091 ARG_SET( convert, data_slot );
2092 /* Store the offset to the first unabsorbed branch in
2093 jump[0], which is otherwise unused by the jump logic.
2094 We use this when dumping a trie and during optimisation. */
2096 trie->jump[0] = (U16)(nextbranch - convert);
2099 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2100 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2102 OP( convert ) = TRIEC;
2103 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2104 PerlMemShared_free(trie->bitmap);
2107 OP( convert ) = TRIE;
2109 /* store the type in the flags */
2110 convert->flags = nodetype;
2114 + regarglen[ OP( convert ) ];
2116 /* XXX We really should free up the resource in trie now,
2117 as we won't use them - (which resources?) dmq */
2119 /* needed for dumping*/
2120 DEBUG_r(if (optimize) {
2121 regnode *opt = convert;
2123 while ( ++opt < optimize) {
2124 Set_Node_Offset_Length(opt,0,0);
2127 Try to clean up some of the debris left after the
2130 while( optimize < jumper ) {
2131 mjd_nodelen += Node_Length((optimize));
2132 OP( optimize ) = OPTIMIZED;
2133 Set_Node_Offset_Length(optimize,0,0);
2136 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2138 } /* end node insert */
2139 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2140 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2142 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2143 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2145 SvREFCNT_dec(revcharmap);
2149 : trie->startstate>1
2155 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2157 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2159 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2160 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2163 We find the fail state for each state in the trie, this state is the longest proper
2164 suffix of the current states 'word' that is also a proper prefix of another word in our
2165 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2166 the DFA not to have to restart after its tried and failed a word at a given point, it
2167 simply continues as though it had been matching the other word in the first place.
2169 'abcdgu'=~/abcdefg|cdgu/
2170 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2171 fail, which would bring use to the state representing 'd' in the second word where we would
2172 try 'g' and succeed, prodceding to match 'cdgu'.
2174 /* add a fail transition */
2175 const U32 trie_offset = ARG(source);
2176 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2178 const U32 ucharcount = trie->uniquecharcount;
2179 const U32 numstates = trie->statecount;
2180 const U32 ubound = trie->lasttrans + ucharcount;
2184 U32 base = trie->states[ 1 ].trans.base;
2187 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2188 GET_RE_DEBUG_FLAGS_DECL;
2190 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2192 PERL_UNUSED_ARG(depth);
2196 ARG_SET( stclass, data_slot );
2197 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2198 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2199 aho->trie=trie_offset;
2200 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2201 Copy( trie->states, aho->states, numstates, reg_trie_state );
2202 Newxz( q, numstates, U32);
2203 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2206 /* initialize fail[0..1] to be 1 so that we always have
2207 a valid final fail state */
2208 fail[ 0 ] = fail[ 1 ] = 1;
2210 for ( charid = 0; charid < ucharcount ; charid++ ) {
2211 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2213 q[ q_write ] = newstate;
2214 /* set to point at the root */
2215 fail[ q[ q_write++ ] ]=1;
2218 while ( q_read < q_write) {
2219 const U32 cur = q[ q_read++ % numstates ];
2220 base = trie->states[ cur ].trans.base;
2222 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2223 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2225 U32 fail_state = cur;
2228 fail_state = fail[ fail_state ];
2229 fail_base = aho->states[ fail_state ].trans.base;
2230 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2232 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2233 fail[ ch_state ] = fail_state;
2234 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2236 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2238 q[ q_write++ % numstates] = ch_state;
2242 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2243 when we fail in state 1, this allows us to use the
2244 charclass scan to find a valid start char. This is based on the principle
2245 that theres a good chance the string being searched contains lots of stuff
2246 that cant be a start char.
2248 fail[ 0 ] = fail[ 1 ] = 0;
2249 DEBUG_TRIE_COMPILE_r({
2250 PerlIO_printf(Perl_debug_log,
2251 "%*sStclass Failtable (%"UVuf" states): 0",
2252 (int)(depth * 2), "", (UV)numstates
2254 for( q_read=1; q_read<numstates; q_read++ ) {
2255 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2257 PerlIO_printf(Perl_debug_log, "\n");
2260 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2265 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2266 * These need to be revisited when a newer toolchain becomes available.
2268 #if defined(__sparc64__) && defined(__GNUC__)
2269 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2270 # undef SPARC64_GCC_WORKAROUND
2271 # define SPARC64_GCC_WORKAROUND 1
2275 #define DEBUG_PEEP(str,scan,depth) \
2276 DEBUG_OPTIMISE_r({if (scan){ \
2277 SV * const mysv=sv_newmortal(); \
2278 regnode *Next = regnext(scan); \
2279 regprop(RExC_rx, mysv, scan); \
2280 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2281 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2282 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2289 #define JOIN_EXACT(scan,min,flags) \
2290 if (PL_regkind[OP(scan)] == EXACT) \
2291 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2294 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2295 /* Merge several consecutive EXACTish nodes into one. */
2296 regnode *n = regnext(scan);
2298 regnode *next = scan + NODE_SZ_STR(scan);
2302 regnode *stop = scan;
2303 GET_RE_DEBUG_FLAGS_DECL;
2305 PERL_UNUSED_ARG(depth);
2308 PERL_ARGS_ASSERT_JOIN_EXACT;
2309 #ifndef EXPERIMENTAL_INPLACESCAN
2310 PERL_UNUSED_ARG(flags);
2311 PERL_UNUSED_ARG(val);
2313 DEBUG_PEEP("join",scan,depth);
2315 /* Skip NOTHING, merge EXACT*. */
2317 ( PL_regkind[OP(n)] == NOTHING ||
2318 (stringok && (OP(n) == OP(scan))))
2320 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2322 if (OP(n) == TAIL || n > next)
2324 if (PL_regkind[OP(n)] == NOTHING) {
2325 DEBUG_PEEP("skip:",n,depth);
2326 NEXT_OFF(scan) += NEXT_OFF(n);
2327 next = n + NODE_STEP_REGNODE;
2334 else if (stringok) {
2335 const unsigned int oldl = STR_LEN(scan);
2336 regnode * const nnext = regnext(n);
2338 DEBUG_PEEP("merg",n,depth);
2341 if (oldl + STR_LEN(n) > U8_MAX)
2343 NEXT_OFF(scan) += NEXT_OFF(n);
2344 STR_LEN(scan) += STR_LEN(n);
2345 next = n + NODE_SZ_STR(n);
2346 /* Now we can overwrite *n : */
2347 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2355 #ifdef EXPERIMENTAL_INPLACESCAN
2356 if (flags && !NEXT_OFF(n)) {
2357 DEBUG_PEEP("atch", val, depth);
2358 if (reg_off_by_arg[OP(n)]) {
2359 ARG_SET(n, val - n);
2362 NEXT_OFF(n) = val - n;
2369 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2371 Two problematic code points in Unicode casefolding of EXACT nodes:
2373 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2374 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2380 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2381 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2383 This means that in case-insensitive matching (or "loose matching",
2384 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2385 length of the above casefolded versions) can match a target string
2386 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2387 This would rather mess up the minimum length computation.
2389 What we'll do is to look for the tail four bytes, and then peek
2390 at the preceding two bytes to see whether we need to decrease
2391 the minimum length by four (six minus two).
2393 Thanks to the design of UTF-8, there cannot be false matches:
2394 A sequence of valid UTF-8 bytes cannot be a subsequence of
2395 another valid sequence of UTF-8 bytes.
2398 char * const s0 = STRING(scan), *s, *t;
2399 char * const s1 = s0 + STR_LEN(scan) - 1;
2400 char * const s2 = s1 - 4;
2401 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2402 const char t0[] = "\xaf\x49\xaf\x42";
2404 const char t0[] = "\xcc\x88\xcc\x81";
2406 const char * const t1 = t0 + 3;
2409 s < s2 && (t = ninstr(s, s1, t0, t1));
2412 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2413 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2415 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2416 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2424 n = scan + NODE_SZ_STR(scan);
2426 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2433 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2437 /* REx optimizer. Converts nodes into quickier variants "in place".
2438 Finds fixed substrings. */
2440 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2441 to the position after last scanned or to NULL. */
2443 #define INIT_AND_WITHP \
2444 assert(!and_withp); \
2445 Newx(and_withp,1,struct regnode_charclass_class); \
2446 SAVEFREEPV(and_withp)
2448 /* this is a chain of data about sub patterns we are processing that
2449 need to be handled seperately/specially in study_chunk. Its so
2450 we can simulate recursion without losing state. */
2452 typedef struct scan_frame {
2453 regnode *last; /* last node to process in this frame */
2454 regnode *next; /* next node to process when last is reached */
2455 struct scan_frame *prev; /*previous frame*/
2456 I32 stop; /* what stopparen do we use */
2460 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2462 #define CASE_SYNST_FNC(nAmE) \
2464 if (flags & SCF_DO_STCLASS_AND) { \
2465 for (value = 0; value < 256; value++) \
2466 if (!is_ ## nAmE ## _cp(value)) \
2467 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2470 for (value = 0; value < 256; value++) \
2471 if (is_ ## nAmE ## _cp(value)) \
2472 ANYOF_BITMAP_SET(data->start_class, value); \
2476 if (flags & SCF_DO_STCLASS_AND) { \
2477 for (value = 0; value < 256; value++) \
2478 if (is_ ## nAmE ## _cp(value)) \
2479 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2482 for (value = 0; value < 256; value++) \
2483 if (!is_ ## nAmE ## _cp(value)) \
2484 ANYOF_BITMAP_SET(data->start_class, value); \
2491 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2492 I32 *minlenp, I32 *deltap,
2497 struct regnode_charclass_class *and_withp,
2498 U32 flags, U32 depth)
2499 /* scanp: Start here (read-write). */
2500 /* deltap: Write maxlen-minlen here. */
2501 /* last: Stop before this one. */
2502 /* data: string data about the pattern */
2503 /* stopparen: treat close N as END */
2504 /* recursed: which subroutines have we recursed into */
2505 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2508 I32 min = 0, pars = 0, code;
2509 regnode *scan = *scanp, *next;
2511 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2512 int is_inf_internal = 0; /* The studied chunk is infinite */
2513 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2514 scan_data_t data_fake;
2515 SV *re_trie_maxbuff = NULL;
2516 regnode *first_non_open = scan;
2517 I32 stopmin = I32_MAX;
2518 scan_frame *frame = NULL;
2519 GET_RE_DEBUG_FLAGS_DECL;
2521 PERL_ARGS_ASSERT_STUDY_CHUNK;
2524 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2528 while (first_non_open && OP(first_non_open) == OPEN)
2529 first_non_open=regnext(first_non_open);
2534 while ( scan && OP(scan) != END && scan < last ){
2535 /* Peephole optimizer: */
2536 DEBUG_STUDYDATA("Peep:", data,depth);
2537 DEBUG_PEEP("Peep",scan,depth);
2538 JOIN_EXACT(scan,&min,0);
2540 /* Follow the next-chain of the current node and optimize
2541 away all the NOTHINGs from it. */
2542 if (OP(scan) != CURLYX) {
2543 const int max = (reg_off_by_arg[OP(scan)]
2545 /* I32 may be smaller than U16 on CRAYs! */
2546 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2547 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2551 /* Skip NOTHING and LONGJMP. */
2552 while ((n = regnext(n))
2553 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2554 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2555 && off + noff < max)
2557 if (reg_off_by_arg[OP(scan)])
2560 NEXT_OFF(scan) = off;
2565 /* The principal pseudo-switch. Cannot be a switch, since we
2566 look into several different things. */
2567 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2568 || OP(scan) == IFTHEN) {
2569 next = regnext(scan);
2571 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2573 if (OP(next) == code || code == IFTHEN) {
2574 /* NOTE - There is similar code to this block below for handling
2575 TRIE nodes on a re-study. If you change stuff here check there
2577 I32 max1 = 0, min1 = I32_MAX, num = 0;
2578 struct regnode_charclass_class accum;
2579 regnode * const startbranch=scan;
2581 if (flags & SCF_DO_SUBSTR)
2582 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2583 if (flags & SCF_DO_STCLASS)
2584 cl_init_zero(pRExC_state, &accum);
2586 while (OP(scan) == code) {
2587 I32 deltanext, minnext, f = 0, fake;
2588 struct regnode_charclass_class this_class;
2591 data_fake.flags = 0;
2593 data_fake.whilem_c = data->whilem_c;
2594 data_fake.last_closep = data->last_closep;
2597 data_fake.last_closep = &fake;
2599 data_fake.pos_delta = delta;
2600 next = regnext(scan);
2601 scan = NEXTOPER(scan);
2603 scan = NEXTOPER(scan);
2604 if (flags & SCF_DO_STCLASS) {
2605 cl_init(pRExC_state, &this_class);
2606 data_fake.start_class = &this_class;
2607 f = SCF_DO_STCLASS_AND;
2609 if (flags & SCF_WHILEM_VISITED_POS)
2610 f |= SCF_WHILEM_VISITED_POS;
2612 /* we suppose the run is continuous, last=next...*/
2613 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2615 stopparen, recursed, NULL, f,depth+1);
2618 if (max1 < minnext + deltanext)
2619 max1 = minnext + deltanext;
2620 if (deltanext == I32_MAX)
2621 is_inf = is_inf_internal = 1;
2623 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2625 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2626 if ( stopmin > minnext)
2627 stopmin = min + min1;
2628 flags &= ~SCF_DO_SUBSTR;
2630 data->flags |= SCF_SEEN_ACCEPT;
2633 if (data_fake.flags & SF_HAS_EVAL)
2634 data->flags |= SF_HAS_EVAL;
2635 data->whilem_c = data_fake.whilem_c;
2637 if (flags & SCF_DO_STCLASS)
2638 cl_or(pRExC_state, &accum, &this_class);
2640 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2642 if (flags & SCF_DO_SUBSTR) {
2643 data->pos_min += min1;
2644 data->pos_delta += max1 - min1;
2645 if (max1 != min1 || is_inf)
2646 data->longest = &(data->longest_float);
2649 delta += max1 - min1;
2650 if (flags & SCF_DO_STCLASS_OR) {
2651 cl_or(pRExC_state, data->start_class, &accum);
2653 cl_and(data->start_class, and_withp);
2654 flags &= ~SCF_DO_STCLASS;
2657 else if (flags & SCF_DO_STCLASS_AND) {
2659 cl_and(data->start_class, &accum);
2660 flags &= ~SCF_DO_STCLASS;
2663 /* Switch to OR mode: cache the old value of
2664 * data->start_class */
2666 StructCopy(data->start_class, and_withp,
2667 struct regnode_charclass_class);
2668 flags &= ~SCF_DO_STCLASS_AND;
2669 StructCopy(&accum, data->start_class,
2670 struct regnode_charclass_class);
2671 flags |= SCF_DO_STCLASS_OR;
2672 data->start_class->flags |= ANYOF_EOS;
2676 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2679 Assuming this was/is a branch we are dealing with: 'scan' now
2680 points at the item that follows the branch sequence, whatever
2681 it is. We now start at the beginning of the sequence and look
2688 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2690 If we can find such a subseqence we need to turn the first
2691 element into a trie and then add the subsequent branch exact
2692 strings to the trie.
2696 1. patterns where the whole set of branch can be converted.
2698 2. patterns where only a subset can be converted.
2700 In case 1 we can replace the whole set with a single regop
2701 for the trie. In case 2 we need to keep the start and end
2704 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2705 becomes BRANCH TRIE; BRANCH X;
2707 There is an additional case, that being where there is a
2708 common prefix, which gets split out into an EXACT like node
2709 preceding the TRIE node.
2711 If x(1..n)==tail then we can do a simple trie, if not we make
2712 a "jump" trie, such that when we match the appropriate word
2713 we "jump" to the appopriate tail node. Essentailly we turn
2714 a nested if into a case structure of sorts.
2719 if (!re_trie_maxbuff) {
2720 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2721 if (!SvIOK(re_trie_maxbuff))
2722 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2724 if ( SvIV(re_trie_maxbuff)>=0 ) {
2726 regnode *first = (regnode *)NULL;
2727 regnode *last = (regnode *)NULL;
2728 regnode *tail = scan;
2733 SV * const mysv = sv_newmortal(); /* for dumping */
2735 /* var tail is used because there may be a TAIL
2736 regop in the way. Ie, the exacts will point to the
2737 thing following the TAIL, but the last branch will
2738 point at the TAIL. So we advance tail. If we
2739 have nested (?:) we may have to move through several
2743 while ( OP( tail ) == TAIL ) {
2744 /* this is the TAIL generated by (?:) */
2745 tail = regnext( tail );
2750 regprop(RExC_rx, mysv, tail );
2751 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2752 (int)depth * 2 + 2, "",
2753 "Looking for TRIE'able sequences. Tail node is: ",
2754 SvPV_nolen_const( mysv )
2760 step through the branches, cur represents each
2761 branch, noper is the first thing to be matched
2762 as part of that branch and noper_next is the
2763 regnext() of that node. if noper is an EXACT
2764 and noper_next is the same as scan (our current
2765 position in the regex) then the EXACT branch is
2766 a possible optimization target. Once we have
2767 two or more consequetive such branches we can
2768 create a trie of the EXACT's contents and stich
2769 it in place. If the sequence represents all of
2770 the branches we eliminate the whole thing and
2771 replace it with a single TRIE. If it is a
2772 subsequence then we need to stitch it in. This
2773 means the first branch has to remain, and needs
2774 to be repointed at the item on the branch chain
2775 following the last branch optimized. This could
2776 be either a BRANCH, in which case the
2777 subsequence is internal, or it could be the
2778 item following the branch sequence in which
2779 case the subsequence is at the end.
2783 /* dont use tail as the end marker for this traverse */
2784 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2785 regnode * const noper = NEXTOPER( cur );
2786 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2787 regnode * const noper_next = regnext( noper );
2791 regprop(RExC_rx, mysv, cur);
2792 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2793 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2795 regprop(RExC_rx, mysv, noper);
2796 PerlIO_printf( Perl_debug_log, " -> %s",
2797 SvPV_nolen_const(mysv));
2800 regprop(RExC_rx, mysv, noper_next );
2801 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2802 SvPV_nolen_const(mysv));
2804 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2805 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2807 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2808 : PL_regkind[ OP( noper ) ] == EXACT )
2809 || OP(noper) == NOTHING )
2811 && noper_next == tail
2816 if ( !first || optype == NOTHING ) {
2817 if (!first) first = cur;
2818 optype = OP( noper );
2824 Currently we assume that the trie can handle unicode and ascii
2825 matches fold cased matches. If this proves true then the following
2826 define will prevent tries in this situation.
2828 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2830 #define TRIE_TYPE_IS_SAFE 1
2831 if ( last && TRIE_TYPE_IS_SAFE ) {
2832 make_trie( pRExC_state,
2833 startbranch, first, cur, tail, count,
2836 if ( PL_regkind[ OP( noper ) ] == EXACT
2838 && noper_next == tail
2843 optype = OP( noper );
2853 regprop(RExC_rx, mysv, cur);
2854 PerlIO_printf( Perl_debug_log,
2855 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2856 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2860 if ( last && TRIE_TYPE_IS_SAFE ) {
2861 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2862 #ifdef TRIE_STUDY_OPT
2863 if ( ((made == MADE_EXACT_TRIE &&
2864 startbranch == first)
2865 || ( first_non_open == first )) &&
2867 flags |= SCF_TRIE_RESTUDY;
2868 if ( startbranch == first
2871 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2881 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2882 scan = NEXTOPER(NEXTOPER(scan));
2883 } else /* single branch is optimized. */
2884 scan = NEXTOPER(scan);
2886 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2887 scan_frame *newframe = NULL;
2892 if (OP(scan) != SUSPEND) {
2893 /* set the pointer */
2894 if (OP(scan) == GOSUB) {
2896 RExC_recurse[ARG2L(scan)] = scan;
2897 start = RExC_open_parens[paren-1];
2898 end = RExC_close_parens[paren-1];
2901 start = RExC_rxi->program + 1;
2905 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2906 SAVEFREEPV(recursed);
2908 if (!PAREN_TEST(recursed,paren+1)) {
2909 PAREN_SET(recursed,paren+1);
2910 Newx(newframe,1,scan_frame);
2912 if (flags & SCF_DO_SUBSTR) {
2913 SCAN_COMMIT(pRExC_state,data,minlenp);
2914 data->longest = &(data->longest_float);
2916 is_inf = is_inf_internal = 1;
2917 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2918 cl_anything(pRExC_state, data->start_class);
2919 flags &= ~SCF_DO_STCLASS;
2922 Newx(newframe,1,scan_frame);
2925 end = regnext(scan);
2930 SAVEFREEPV(newframe);
2931 newframe->next = regnext(scan);
2932 newframe->last = last;
2933 newframe->stop = stopparen;
2934 newframe->prev = frame;
2944 else if (OP(scan) == EXACT) {
2945 I32 l = STR_LEN(scan);
2948 const U8 * const s = (U8*)STRING(scan);
2949 l = utf8_length(s, s + l);
2950 uc = utf8_to_uvchr(s, NULL);
2952 uc = *((U8*)STRING(scan));
2955 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2956 /* The code below prefers earlier match for fixed
2957 offset, later match for variable offset. */
2958 if (data->last_end == -1) { /* Update the start info. */
2959 data->last_start_min = data->pos_min;
2960 data->last_start_max = is_inf
2961 ? I32_MAX : data->pos_min + data->pos_delta;
2963 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2965 SvUTF8_on(data->last_found);
2967 SV * const sv = data->last_found;
2968 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2969 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2970 if (mg && mg->mg_len >= 0)
2971 mg->mg_len += utf8_length((U8*)STRING(scan),
2972 (U8*)STRING(scan)+STR_LEN(scan));
2974 data->last_end = data->pos_min + l;
2975 data->pos_min += l; /* As in the first entry. */
2976 data->flags &= ~SF_BEFORE_EOL;
2978 if (flags & SCF_DO_STCLASS_AND) {
2979 /* Check whether it is compatible with what we know already! */
2983 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2984 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2985 && (!(data->start_class->flags & ANYOF_FOLD)
2986 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2989 ANYOF_CLASS_ZERO(data->start_class);
2990 ANYOF_BITMAP_ZERO(data->start_class);
2992 ANYOF_BITMAP_SET(data->start_class, uc);
2993 data->start_class->flags &= ~ANYOF_EOS;
2995 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2997 else if (flags & SCF_DO_STCLASS_OR) {
2998 /* false positive possible if the class is case-folded */
3000 ANYOF_BITMAP_SET(data->start_class, uc);
3002 data->start_class->flags |= ANYOF_UNICODE_ALL;
3003 data->start_class->flags &= ~ANYOF_EOS;
3004 cl_and(data->start_class, and_withp);
3006 flags &= ~SCF_DO_STCLASS;
3008 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3009 I32 l = STR_LEN(scan);
3010 UV uc = *((U8*)STRING(scan));
3012 /* Search for fixed substrings supports EXACT only. */
3013 if (flags & SCF_DO_SUBSTR) {
3015 SCAN_COMMIT(pRExC_state, data, minlenp);
3018 const U8 * const s = (U8 *)STRING(scan);
3019 l = utf8_length(s, s + l);
3020 uc = utf8_to_uvchr(s, NULL);
3023 if (flags & SCF_DO_SUBSTR)
3025 if (flags & SCF_DO_STCLASS_AND) {
3026 /* Check whether it is compatible with what we know already! */
3030 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3031 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3032 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3034 ANYOF_CLASS_ZERO(data->start_class);
3035 ANYOF_BITMAP_ZERO(data->start_class);
3037 ANYOF_BITMAP_SET(data->start_class, uc);
3038 data->start_class->flags &= ~ANYOF_EOS;
3039 data->start_class->flags |= ANYOF_FOLD;
3040 if (OP(scan) == EXACTFL)
3041 data->start_class->flags |= ANYOF_LOCALE;
3044 else if (flags & SCF_DO_STCLASS_OR) {
3045 if (data->start_class->flags & ANYOF_FOLD) {
3046 /* false positive possible if the class is case-folded.
3047 Assume that the locale settings are the same... */
3049 ANYOF_BITMAP_SET(data->start_class, uc);
3050 data->start_class->flags &= ~ANYOF_EOS;
3052 cl_and(data->start_class, and_withp);
3054 flags &= ~SCF_DO_STCLASS;
3056 else if (strchr((const char*)PL_varies,OP(scan))) {
3057 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3058 I32 f = flags, pos_before = 0;
3059 regnode * const oscan = scan;
3060 struct regnode_charclass_class this_class;
3061 struct regnode_charclass_class *oclass = NULL;
3062 I32 next_is_eval = 0;
3064 switch (PL_regkind[OP(scan)]) {
3065 case WHILEM: /* End of (?:...)* . */
3066 scan = NEXTOPER(scan);
3069 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3070 next = NEXTOPER(scan);
3071 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3073 maxcount = REG_INFTY;
3074 next = regnext(scan);
3075 scan = NEXTOPER(scan);
3079 if (flags & SCF_DO_SUBSTR)
3084 if (flags & SCF_DO_STCLASS) {
3086 maxcount = REG_INFTY;
3087 next = regnext(scan);
3088 scan = NEXTOPER(scan);
3091 is_inf = is_inf_internal = 1;
3092 scan = regnext(scan);
3093 if (flags & SCF_DO_SUBSTR) {
3094 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3095 data->longest = &(data->longest_float);
3097 goto optimize_curly_tail;
3099 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3100 && (scan->flags == stopparen))
3105 mincount = ARG1(scan);
3106 maxcount = ARG2(scan);
3108 next = regnext(scan);
3109 if (OP(scan) == CURLYX) {
3110 I32 lp = (data ? *(data->last_closep) : 0);
3111 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3113 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3114 next_is_eval = (OP(scan) == EVAL);
3116 if (flags & SCF_DO_SUBSTR) {
3117 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3118 pos_before = data->pos_min;
3122 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3124 data->flags |= SF_IS_INF;
3126 if (flags & SCF_DO_STCLASS) {
3127 cl_init(pRExC_state, &this_class);
3128 oclass = data->start_class;
3129 data->start_class = &this_class;
3130 f |= SCF_DO_STCLASS_AND;
3131 f &= ~SCF_DO_STCLASS_OR;
3133 /* These are the cases when once a subexpression
3134 fails at a particular position, it cannot succeed
3135 even after backtracking at the enclosing scope.
3137 XXXX what if minimal match and we are at the
3138 initial run of {n,m}? */
3139 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3140 f &= ~SCF_WHILEM_VISITED_POS;
3142 /* This will finish on WHILEM, setting scan, or on NULL: */
3143 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3144 last, data, stopparen, recursed, NULL,
3146 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3148 if (flags & SCF_DO_STCLASS)
3149 data->start_class = oclass;
3150 if (mincount == 0 || minnext == 0) {
3151 if (flags & SCF_DO_STCLASS_OR) {
3152 cl_or(pRExC_state, data->start_class, &this_class);
3154 else if (flags & SCF_DO_STCLASS_AND) {
3155 /* Switch to OR mode: cache the old value of
3156 * data->start_class */
3158 StructCopy(data->start_class, and_withp,
3159 struct regnode_charclass_class);
3160 flags &= ~SCF_DO_STCLASS_AND;
3161 StructCopy(&this_class, data->start_class,
3162 struct regnode_charclass_class);
3163 flags |= SCF_DO_STCLASS_OR;
3164 data->start_class->flags |= ANYOF_EOS;
3166 } else { /* Non-zero len */
3167 if (flags & SCF_DO_STCLASS_OR) {
3168 cl_or(pRExC_state, data->start_class, &this_class);
3169 cl_and(data->start_class, and_withp);
3171 else if (flags & SCF_DO_STCLASS_AND)
3172 cl_and(data->start_class, &this_class);
3173 flags &= ~SCF_DO_STCLASS;
3175 if (!scan) /* It was not CURLYX, but CURLY. */
3177 if ( /* ? quantifier ok, except for (?{ ... }) */
3178 (next_is_eval || !(mincount == 0 && maxcount == 1))
3179 && (minnext == 0) && (deltanext == 0)
3180 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3181 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3182 && ckWARN(WARN_REGEXP))
3185 "Quantifier unexpected on zero-length expression");
3188 min += minnext * mincount;
3189 is_inf_internal |= ((maxcount == REG_INFTY
3190 && (minnext + deltanext) > 0)
3191 || deltanext == I32_MAX);
3192 is_inf |= is_inf_internal;
3193 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3195 /* Try powerful optimization CURLYX => CURLYN. */
3196 if ( OP(oscan) == CURLYX && data
3197 && data->flags & SF_IN_PAR
3198 && !(data->flags & SF_HAS_EVAL)
3199 && !deltanext && minnext == 1 ) {
3200 /* Try to optimize to CURLYN. */
3201 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3202 regnode * const nxt1 = nxt;
3209 if (!strchr((const char*)PL_simple,OP(nxt))
3210 && !(PL_regkind[OP(nxt)] == EXACT
3211 && STR_LEN(nxt) == 1))
3217 if (OP(nxt) != CLOSE)
3219 if (RExC_open_parens) {
3220 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3221 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3223 /* Now we know that nxt2 is the only contents: */
3224 oscan->flags = (U8)ARG(nxt);
3226 OP(nxt1) = NOTHING; /* was OPEN. */
3229 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3230 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3231 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3232 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3233 OP(nxt + 1) = OPTIMIZED; /* was count. */
3234 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3239 /* Try optimization CURLYX => CURLYM. */
3240 if ( OP(oscan) == CURLYX && data
3241 && !(data->flags & SF_HAS_PAR)
3242 && !(data->flags & SF_HAS_EVAL)
3243 && !deltanext /* atom is fixed width */
3244 && minnext != 0 /* CURLYM can't handle zero width */
3246 /* XXXX How to optimize if data == 0? */
3247 /* Optimize to a simpler form. */
3248 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3252 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3253 && (OP(nxt2) != WHILEM))
3255 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3256 /* Need to optimize away parenths. */
3257 if (data->flags & SF_IN_PAR) {
3258 /* Set the parenth number. */
3259 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3261 if (OP(nxt) != CLOSE)
3262 FAIL("Panic opt close");
3263 oscan->flags = (U8)ARG(nxt);
3264 if (RExC_open_parens) {
3265 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3266 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3268 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3269 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3272 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3273 OP(nxt + 1) = OPTIMIZED; /* was count. */
3274 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3275 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3278 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3279 regnode *nnxt = regnext(nxt1);
3282 if (reg_off_by_arg[OP(nxt1)])
3283 ARG_SET(nxt1, nxt2 - nxt1);
3284 else if (nxt2 - nxt1 < U16_MAX)
3285 NEXT_OFF(nxt1) = nxt2 - nxt1;
3287 OP(nxt) = NOTHING; /* Cannot beautify */
3292 /* Optimize again: */
3293 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3294 NULL, stopparen, recursed, NULL, 0,depth+1);
3299 else if ((OP(oscan) == CURLYX)
3300 && (flags & SCF_WHILEM_VISITED_POS)
3301 /* See the comment on a similar expression above.
3302 However, this time it not a subexpression
3303 we care about, but the expression itself. */
3304 && (maxcount == REG_INFTY)
3305 && data && ++data->whilem_c < 16) {
3306 /* This stays as CURLYX, we can put the count/of pair. */
3307 /* Find WHILEM (as in regexec.c) */
3308 regnode *nxt = oscan + NEXT_OFF(oscan);
3310 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3312 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3313 | (RExC_whilem_seen << 4)); /* On WHILEM */
3315 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3317 if (flags & SCF_DO_SUBSTR) {
3318 SV *last_str = NULL;
3319 int counted = mincount != 0;
3321 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3322 #if defined(SPARC64_GCC_WORKAROUND)
3325 const char *s = NULL;
3328 if (pos_before >= data->last_start_min)
3331 b = data->last_start_min;
3334 s = SvPV_const(data->last_found, l);
3335 old = b - data->last_start_min;
3338 I32 b = pos_before >= data->last_start_min
3339 ? pos_before : data->last_start_min;
3341 const char * const s = SvPV_const(data->last_found, l);
3342 I32 old = b - data->last_start_min;
3346 old = utf8_hop((U8*)s, old) - (U8*)s;
3349 /* Get the added string: */
3350 last_str = newSVpvn_utf8(s + old, l, UTF);
3351 if (deltanext == 0 && pos_before == b) {
3352 /* What was added is a constant string */
3354 SvGROW(last_str, (mincount * l) + 1);
3355 repeatcpy(SvPVX(last_str) + l,
3356 SvPVX_const(last_str), l, mincount - 1);
3357 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3358 /* Add additional parts. */
3359 SvCUR_set(data->last_found,
3360 SvCUR(data->last_found) - l);
3361 sv_catsv(data->last_found, last_str);
3363 SV * sv = data->last_found;
3365 SvUTF8(sv) && SvMAGICAL(sv) ?
3366 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3367 if (mg && mg->mg_len >= 0)
3368 mg->mg_len += CHR_SVLEN(last_str) - l;
3370 data->last_end += l * (mincount - 1);
3373 /* start offset must point into the last copy */
3374 data->last_start_min += minnext * (mincount - 1);
3375 data->last_start_max += is_inf ? I32_MAX
3376 : (maxcount - 1) * (minnext + data->pos_delta);
3379 /* It is counted once already... */
3380 data->pos_min += minnext * (mincount - counted);
3381 data->pos_delta += - counted * deltanext +
3382 (minnext + deltanext) * maxcount - minnext * mincount;
3383 if (mincount != maxcount) {
3384 /* Cannot extend fixed substrings found inside
3386 SCAN_COMMIT(pRExC_state,data,minlenp);
3387 if (mincount && last_str) {
3388 SV * const sv = data->last_found;
3389 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3390 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3394 sv_setsv(sv, last_str);
3395 data->last_end = data->pos_min;
3396 data->last_start_min =
3397 data->pos_min - CHR_SVLEN(last_str);
3398 data->last_start_max = is_inf
3400 : data->pos_min + data->pos_delta
3401 - CHR_SVLEN(last_str);
3403 data->longest = &(data->longest_float);
3405 SvREFCNT_dec(last_str);
3407 if (data && (fl & SF_HAS_EVAL))
3408 data->flags |= SF_HAS_EVAL;
3409 optimize_curly_tail:
3410 if (OP(oscan) != CURLYX) {
3411 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3413 NEXT_OFF(oscan) += NEXT_OFF(next);
3416 default: /* REF and CLUMP only? */
3417 if (flags & SCF_DO_SUBSTR) {
3418 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3419 data->longest = &(data->longest_float);
3421 is_inf = is_inf_internal = 1;
3422 if (flags & SCF_DO_STCLASS_OR)
3423 cl_anything(pRExC_state, data->start_class);
3424 flags &= ~SCF_DO_STCLASS;
3428 else if (OP(scan) == LNBREAK) {
3429 if (flags & SCF_DO_STCLASS) {
3431 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3432 if (flags & SCF_DO_STCLASS_AND) {
3433 for (value = 0; value < 256; value++)
3434 if (!is_VERTWS_cp(value))
3435 ANYOF_BITMAP_CLEAR(data->start_class, value);
3438 for (value = 0; value < 256; value++)
3439 if (is_VERTWS_cp(value))
3440 ANYOF_BITMAP_SET(data->start_class, value);
3442 if (flags & SCF_DO_STCLASS_OR)
3443 cl_and(data->start_class, and_withp);
3444 flags &= ~SCF_DO_STCLASS;
3448 if (flags & SCF_DO_SUBSTR) {
3449 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3451 data->pos_delta += 1;
3452 data->longest = &(data->longest_float);
3456 else if (OP(scan) == FOLDCHAR) {
3457 int d = ARG(scan)==0xDF ? 1 : 2;
3458 flags &= ~SCF_DO_STCLASS;
3461 if (flags & SCF_DO_SUBSTR) {
3462 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3464 data->pos_delta += d;
3465 data->longest = &(data->longest_float);
3468 else if (strchr((const char*)PL_simple,OP(scan))) {
3471 if (flags & SCF_DO_SUBSTR) {
3472 SCAN_COMMIT(pRExC_state,data,minlenp);
3476 if (flags & SCF_DO_STCLASS) {
3477 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3479 /* Some of the logic below assumes that switching
3480 locale on will only add false positives. */
3481 switch (PL_regkind[OP(scan)]) {
3485 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3486 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3487 cl_anything(pRExC_state, data->start_class);
3490 if (OP(scan) == SANY)
3492 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3493 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3494 || (data->start_class->flags & ANYOF_CLASS));
3495 cl_anything(pRExC_state, data->start_class);
3497 if (flags & SCF_DO_STCLASS_AND || !value)
3498 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3501 if (flags & SCF_DO_STCLASS_AND)
3502 cl_and(data->start_class,
3503 (struct regnode_charclass_class*)scan);
3505 cl_or(pRExC_state, data->start_class,
3506 (struct regnode_charclass_class*)scan);
3509 if (flags & SCF_DO_STCLASS_AND) {
3510 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3512 for (value = 0; value < 256; value++)
3513 if (!isALNUM(value))
3514 ANYOF_BITMAP_CLEAR(data->start_class, value);
3518 if (data->start_class->flags & ANYOF_LOCALE)
3519 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3521 for (value = 0; value < 256; value++)
3523 ANYOF_BITMAP_SET(data->start_class, value);
3528 if (flags & SCF_DO_STCLASS_AND) {
3529 if (data->start_class->flags & ANYOF_LOCALE)
3530 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3533 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3534 data->start_class->flags |= ANYOF_LOCALE;
3538 if (flags & SCF_DO_STCLASS_AND) {
3539 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3541 for (value = 0; value < 256; value++)
3543 ANYOF_BITMAP_CLEAR(data->start_class, value);
3547 if (data->start_class->flags & ANYOF_LOCALE)
3548 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3550 for (value = 0; value < 256; value++)
3551 if (!isALNUM(value))
3552 ANYOF_BITMAP_SET(data->start_class, value);
3557 if (flags & SCF_DO_STCLASS_AND) {
3558 if (data->start_class->flags & ANYOF_LOCALE)
3559 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3562 data->start_class->flags |= ANYOF_LOCALE;
3563 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3567 if (flags & SCF_DO_STCLASS_AND) {
3568 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3569 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3570 for (value = 0; value < 256; value++)
3571 if (!isSPACE(value))
3572 ANYOF_BITMAP_CLEAR(data->start_class, value);
3576 if (data->start_class->flags & ANYOF_LOCALE)
3577 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3579 for (value = 0; value < 256; value++)
3581 ANYOF_BITMAP_SET(data->start_class, value);
3586 if (flags & SCF_DO_STCLASS_AND) {
3587 if (data->start_class->flags & ANYOF_LOCALE)
3588 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3591 data->start_class->flags |= ANYOF_LOCALE;
3592 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3596 if (flags & SCF_DO_STCLASS_AND) {
3597 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3598 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3599 for (value = 0; value < 256; value++)
3601 ANYOF_BITMAP_CLEAR(data->start_class, value);
3605 if (data->start_class->flags & ANYOF_LOCALE)
3606 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3608 for (value = 0; value < 256; value++)
3609 if (!isSPACE(value))
3610 ANYOF_BITMAP_SET(data->start_class, value);
3615 if (flags & SCF_DO_STCLASS_AND) {
3616 if (data->start_class->flags & ANYOF_LOCALE) {
3617 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3618 for (value = 0; value < 256; value++)
3619 if (!isSPACE(value))
3620 ANYOF_BITMAP_CLEAR(data->start_class, value);
3624 data->start_class->flags |= ANYOF_LOCALE;
3625 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3629 if (flags & SCF_DO_STCLASS_AND) {
3630 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3631 for (value = 0; value < 256; value++)
3632 if (!isDIGIT(value))
3633 ANYOF_BITMAP_CLEAR(data->start_class, value);
3636 if (data->start_class->flags & ANYOF_LOCALE)
3637 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3639 for (value = 0; value < 256; value++)
3641 ANYOF_BITMAP_SET(data->start_class, value);
3646 if (flags & SCF_DO_STCLASS_AND) {
3647 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3648 for (value = 0; value < 256; value++)
3650 ANYOF_BITMAP_CLEAR(data->start_class, value);
3653 if (data->start_class->flags & ANYOF_LOCALE)
3654 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3656 for (value = 0; value < 256; value++)
3657 if (!isDIGIT(value))
3658 ANYOF_BITMAP_SET(data->start_class, value);
3662 CASE_SYNST_FNC(VERTWS);
3663 CASE_SYNST_FNC(HORIZWS);
3666 if (flags & SCF_DO_STCLASS_OR)
3667 cl_and(data->start_class, and_withp);
3668 flags &= ~SCF_DO_STCLASS;
3671 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3672 data->flags |= (OP(scan) == MEOL
3676 else if ( PL_regkind[OP(scan)] == BRANCHJ
3677 /* Lookbehind, or need to calculate parens/evals/stclass: */
3678 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3679 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3680 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3681 || OP(scan) == UNLESSM )
3683 /* Negative Lookahead/lookbehind
3684 In this case we can't do fixed string optimisation.
3687 I32 deltanext, minnext, fake = 0;
3689 struct regnode_charclass_class intrnl;
3692 data_fake.flags = 0;
3694 data_fake.whilem_c = data->whilem_c;
3695 data_fake.last_closep = data->last_closep;
3698 data_fake.last_closep = &fake;
3699 data_fake.pos_delta = delta;
3700 if ( flags & SCF_DO_STCLASS && !scan->flags
3701 && OP(scan) == IFMATCH ) { /* Lookahead */
3702 cl_init(pRExC_state, &intrnl);
3703 data_fake.start_class = &intrnl;
3704 f |= SCF_DO_STCLASS_AND;
3706 if (flags & SCF_WHILEM_VISITED_POS)
3707 f |= SCF_WHILEM_VISITED_POS;
3708 next = regnext(scan);
3709 nscan = NEXTOPER(NEXTOPER(scan));
3710 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3711 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3714 FAIL("Variable length lookbehind not implemented");
3716 else if (minnext > (I32)U8_MAX) {
3717 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3719 scan->flags = (U8)minnext;
3722 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3724 if (data_fake.flags & SF_HAS_EVAL)
3725 data->flags |= SF_HAS_EVAL;
3726 data->whilem_c = data_fake.whilem_c;
3728 if (f & SCF_DO_STCLASS_AND) {
3729 if (flags & SCF_DO_STCLASS_OR) {
3730 /* OR before, AND after: ideally we would recurse with
3731 * data_fake to get the AND applied by study of the
3732 * remainder of the pattern, and then derecurse;
3733 * *** HACK *** for now just treat as "no information".
3734 * See [perl #56690].
3736 cl_init(pRExC_state, data->start_class);
3738 /* AND before and after: combine and continue */
3739 const int was = (data->start_class->flags & ANYOF_EOS);
3741 cl_and(data->start_class, &intrnl);
3743 data->start_class->flags |= ANYOF_EOS;
3747 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3749 /* Positive Lookahead/lookbehind
3750 In this case we can do fixed string optimisation,
3751 but we must be careful about it. Note in the case of
3752 lookbehind the positions will be offset by the minimum
3753 length of the pattern, something we won't know about
3754 until after the recurse.
3756 I32 deltanext, fake = 0;
3758 struct regnode_charclass_class intrnl;
3760 /* We use SAVEFREEPV so that when the full compile
3761 is finished perl will clean up the allocated
3762 minlens when its all done. This was we don't
3763 have to worry about freeing them when we know
3764 they wont be used, which would be a pain.
3767 Newx( minnextp, 1, I32 );
3768 SAVEFREEPV(minnextp);
3771 StructCopy(data, &data_fake, scan_data_t);
3772 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3775 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3776 data_fake.last_found=newSVsv(data->last_found);
3780 data_fake.last_closep = &fake;
3781 data_fake.flags = 0;
3782 data_fake.pos_delta = delta;
3784 data_fake.flags |= SF_IS_INF;
3785 if ( flags & SCF_DO_STCLASS && !scan->flags
3786 && OP(scan) == IFMATCH ) { /* Lookahead */
3787 cl_init(pRExC_state, &intrnl);
3788 data_fake.start_class = &intrnl;
3789 f |= SCF_DO_STCLASS_AND;
3791 if (flags & SCF_WHILEM_VISITED_POS)
3792 f |= SCF_WHILEM_VISITED_POS;
3793 next = regnext(scan);
3794 nscan = NEXTOPER(NEXTOPER(scan));
3796 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3797 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3800 FAIL("Variable length lookbehind not implemented");
3802 else if (*minnextp > (I32)U8_MAX) {
3803 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3805 scan->flags = (U8)*minnextp;
3810 if (f & SCF_DO_STCLASS_AND) {
3811 const int was = (data->start_class->flags & ANYOF_EOS);
3813 cl_and(data->start_class, &intrnl);
3815 data->start_class->flags |= ANYOF_EOS;
3818 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3820 if (data_fake.flags & SF_HAS_EVAL)
3821 data->flags |= SF_HAS_EVAL;
3822 data->whilem_c = data_fake.whilem_c;
3823 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3824 if (RExC_rx->minlen<*minnextp)
3825 RExC_rx->minlen=*minnextp;
3826 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3827 SvREFCNT_dec(data_fake.last_found);
3829 if ( data_fake.minlen_fixed != minlenp )
3831 data->offset_fixed= data_fake.offset_fixed;
3832 data->minlen_fixed= data_fake.minlen_fixed;
3833 data->lookbehind_fixed+= scan->flags;
3835 if ( data_fake.minlen_float != minlenp )
3837 data->minlen_float= data_fake.minlen_float;
3838 data->offset_float_min=data_fake.offset_float_min;
3839 data->offset_float_max=data_fake.offset_float_max;
3840 data->lookbehind_float+= scan->flags;
3849 else if (OP(scan) == OPEN) {
3850 if (stopparen != (I32)ARG(scan))
3853 else if (OP(scan) == CLOSE) {
3854 if (stopparen == (I32)ARG(scan)) {
3857 if ((I32)ARG(scan) == is_par) {
3858 next = regnext(scan);
3860 if ( next && (OP(next) != WHILEM) && next < last)
3861 is_par = 0; /* Disable optimization */
3864 *(data->last_closep) = ARG(scan);
3866 else if (OP(scan) == EVAL) {
3868 data->flags |= SF_HAS_EVAL;
3870 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3871 if (flags & SCF_DO_SUBSTR) {
3872 SCAN_COMMIT(pRExC_state,data,minlenp);
3873 flags &= ~SCF_DO_SUBSTR;
3875 if (data && OP(scan)==ACCEPT) {
3876 data->flags |= SCF_SEEN_ACCEPT;
3881 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3883 if (flags & SCF_DO_SUBSTR) {
3884 SCAN_COMMIT(pRExC_state,data,minlenp);
3885 data->longest = &(data->longest_float);
3887 is_inf = is_inf_internal = 1;
3888 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3889 cl_anything(pRExC_state, data->start_class);
3890 flags &= ~SCF_DO_STCLASS;
3892 else if (OP(scan) == GPOS) {
3893 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3894 !(delta || is_inf || (data && data->pos_delta)))
3896 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3897 RExC_rx->extflags |= RXf_ANCH_GPOS;
3898 if (RExC_rx->gofs < (U32)min)
3899 RExC_rx->gofs = min;
3901 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3905 #ifdef TRIE_STUDY_OPT
3906 #ifdef FULL_TRIE_STUDY
3907 else if (PL_regkind[OP(scan)] == TRIE) {
3908 /* NOTE - There is similar code to this block above for handling
3909 BRANCH nodes on the initial study. If you change stuff here
3911 regnode *trie_node= scan;
3912 regnode *tail= regnext(scan);
3913 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3914 I32 max1 = 0, min1 = I32_MAX;
3915 struct regnode_charclass_class accum;
3917 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3918 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3919 if (flags & SCF_DO_STCLASS)
3920 cl_init_zero(pRExC_state, &accum);
3926 const regnode *nextbranch= NULL;
3929 for ( word=1 ; word <= trie->wordcount ; word++)
3931 I32 deltanext=0, minnext=0, f = 0, fake;
3932 struct regnode_charclass_class this_class;
3934 data_fake.flags = 0;
3936 data_fake.whilem_c = data->whilem_c;
3937 data_fake.last_closep = data->last_closep;
3940 data_fake.last_closep = &fake;
3941 data_fake.pos_delta = delta;
3942 if (flags & SCF_DO_STCLASS) {
3943 cl_init(pRExC_state, &this_class);
3944 data_fake.start_class = &this_class;
3945 f = SCF_DO_STCLASS_AND;
3947 if (flags & SCF_WHILEM_VISITED_POS)
3948 f |= SCF_WHILEM_VISITED_POS;
3950 if (trie->jump[word]) {
3952 nextbranch = trie_node + trie->jump[0];
3953 scan= trie_node + trie->jump[word];
3954 /* We go from the jump point to the branch that follows
3955 it. Note this means we need the vestigal unused branches
3956 even though they arent otherwise used.
3958 minnext = study_chunk(pRExC_state, &scan, minlenp,
3959 &deltanext, (regnode *)nextbranch, &data_fake,
3960 stopparen, recursed, NULL, f,depth+1);
3962 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3963 nextbranch= regnext((regnode*)nextbranch);
3965 if (min1 > (I32)(minnext + trie->minlen))
3966 min1 = minnext + trie->minlen;
3967 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3968 max1 = minnext + deltanext + trie->maxlen;
3969 if (deltanext == I32_MAX)
3970 is_inf = is_inf_internal = 1;
3972 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3974 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3975 if ( stopmin > min + min1)
3976 stopmin = min + min1;
3977 flags &= ~SCF_DO_SUBSTR;
3979 data->flags |= SCF_SEEN_ACCEPT;
3982 if (data_fake.flags & SF_HAS_EVAL)
3983 data->flags |= SF_HAS_EVAL;
3984 data->whilem_c = data_fake.whilem_c;
3986 if (flags & SCF_DO_STCLASS)
3987 cl_or(pRExC_state, &accum, &this_class);
3990 if (flags & SCF_DO_SUBSTR) {
3991 data->pos_min += min1;
3992 data->pos_delta += max1 - min1;
3993 if (max1 != min1 || is_inf)
3994 data->longest = &(data->longest_float);
3997 delta += max1 - min1;
3998 if (flags & SCF_DO_STCLASS_OR) {
3999 cl_or(pRExC_state, data->start_class, &accum);
4001 cl_and(data->start_class, and_withp);
4002 flags &= ~SCF_DO_STCLASS;
4005 else if (flags & SCF_DO_STCLASS_AND) {
4007 cl_and(data->start_class, &accum);
4008 flags &= ~SCF_DO_STCLASS;
4011 /* Switch to OR mode: cache the old value of
4012 * data->start_class */
4014 StructCopy(data->start_class, and_withp,
4015 struct regnode_charclass_class);
4016 flags &= ~SCF_DO_STCLASS_AND;
4017 StructCopy(&accum, data->start_class,
4018 struct regnode_charclass_class);
4019 flags |= SCF_DO_STCLASS_OR;
4020 data->start_class->flags |= ANYOF_EOS;
4027 else if (PL_regkind[OP(scan)] == TRIE) {
4028 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4031 min += trie->minlen;
4032 delta += (trie->maxlen - trie->minlen);
4033 flags &= ~SCF_DO_STCLASS; /* xxx */
4034 if (flags & SCF_DO_SUBSTR) {
4035 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4036 data->pos_min += trie->minlen;
4037 data->pos_delta += (trie->maxlen - trie->minlen);
4038 if (trie->maxlen != trie->minlen)
4039 data->longest = &(data->longest_float);
4041 if (trie->jump) /* no more substrings -- for now /grr*/
4042 flags &= ~SCF_DO_SUBSTR;
4044 #endif /* old or new */
4045 #endif /* TRIE_STUDY_OPT */
4047 /* Else: zero-length, ignore. */
4048 scan = regnext(scan);
4053 stopparen = frame->stop;
4054 frame = frame->prev;
4055 goto fake_study_recurse;
4060 DEBUG_STUDYDATA("pre-fin:",data,depth);
4063 *deltap = is_inf_internal ? I32_MAX : delta;
4064 if (flags & SCF_DO_SUBSTR && is_inf)
4065 data->pos_delta = I32_MAX - data->pos_min;
4066 if (is_par > (I32)U8_MAX)
4068 if (is_par && pars==1 && data) {
4069 data->flags |= SF_IN_PAR;
4070 data->flags &= ~SF_HAS_PAR;
4072 else if (pars && data) {
4073 data->flags |= SF_HAS_PAR;
4074 data->flags &= ~SF_IN_PAR;
4076 if (flags & SCF_DO_STCLASS_OR)
4077 cl_and(data->start_class, and_withp);
4078 if (flags & SCF_TRIE_RESTUDY)
4079 data->flags |= SCF_TRIE_RESTUDY;
4081 DEBUG_STUDYDATA("post-fin:",data,depth);
4083 return min < stopmin ? min : stopmin;
4087 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4089 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4091 PERL_ARGS_ASSERT_ADD_DATA;
4093 Renewc(RExC_rxi->data,
4094 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4095 char, struct reg_data);
4097 Renew(RExC_rxi->data->what, count + n, U8);
4099 Newx(RExC_rxi->data->what, n, U8);
4100 RExC_rxi->data->count = count + n;
4101 Copy(s, RExC_rxi->data->what + count, n, U8);
4105 /*XXX: todo make this not included in a non debugging perl */
4106 #ifndef PERL_IN_XSUB_RE
4108 Perl_reginitcolors(pTHX)
4111 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4113 char *t = savepv(s);
4117 t = strchr(t, '\t');
4123 PL_colors[i] = t = (char *)"";
4128 PL_colors[i++] = (char *)"";
4135 #ifdef TRIE_STUDY_OPT
4136 #define CHECK_RESTUDY_GOTO \
4138 (data.flags & SCF_TRIE_RESTUDY) \
4142 #define CHECK_RESTUDY_GOTO
4146 - pregcomp - compile a regular expression into internal code
4148 * We can't allocate space until we know how big the compiled form will be,
4149 * but we can't compile it (and thus know how big it is) until we've got a
4150 * place to put the code. So we cheat: we compile it twice, once with code
4151 * generation turned off and size counting turned on, and once "for real".
4152 * This also means that we don't allocate space until we are sure that the
4153 * thing really will compile successfully, and we never have to move the
4154 * code and thus invalidate pointers into it. (Note that it has to be in
4155 * one piece because free() must be able to free it all.) [NB: not true in perl]
4157 * Beware that the optimization-preparation code in here knows about some
4158 * of the structure of the compiled regexp. [I'll say.]
4163 #ifndef PERL_IN_XSUB_RE
4164 #define RE_ENGINE_PTR &reh_regexp_engine
4166 extern const struct regexp_engine my_reg_engine;
4167 #define RE_ENGINE_PTR &my_reg_engine
4170 #ifndef PERL_IN_XSUB_RE
4172 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4175 HV * const table = GvHV(PL_hintgv);
4177 PERL_ARGS_ASSERT_PREGCOMP;
4179 /* Dispatch a request to compile a regexp to correct
4182 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4183 GET_RE_DEBUG_FLAGS_DECL;
4184 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4185 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4187 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4190 return CALLREGCOMP_ENG(eng, pattern, flags);
4193 return Perl_re_compile(aTHX_ pattern, flags);
4198 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4202 register regexp_internal *ri;
4204 char* exp = SvPV((SV*)pattern, plen);
4205 char* xend = exp + plen;
4212 RExC_state_t RExC_state;
4213 RExC_state_t * const pRExC_state = &RExC_state;
4214 #ifdef TRIE_STUDY_OPT
4216 RExC_state_t copyRExC_state;
4218 GET_RE_DEBUG_FLAGS_DECL;
4220 PERL_ARGS_ASSERT_RE_COMPILE;
4222 DEBUG_r(if (!PL_colorset) reginitcolors());
4224 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4227 SV *dsv= sv_newmortal();
4228 RE_PV_QUOTED_DECL(s, RExC_utf8,
4229 dsv, exp, plen, 60);
4230 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4231 PL_colors[4],PL_colors[5],s);
4236 RExC_flags = pm_flags;
4240 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4241 RExC_seen_evals = 0;
4244 /* First pass: determine size, legality. */
4252 RExC_emit = &PL_regdummy;
4253 RExC_whilem_seen = 0;
4254 RExC_charnames = NULL;
4255 RExC_open_parens = NULL;
4256 RExC_close_parens = NULL;
4258 RExC_paren_names = NULL;
4260 RExC_paren_name_list = NULL;
4262 RExC_recurse = NULL;
4263 RExC_recurse_count = 0;
4265 #if 0 /* REGC() is (currently) a NOP at the first pass.
4266 * Clever compilers notice this and complain. --jhi */
4267 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4269 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4270 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4271 RExC_precomp = NULL;
4274 if (RExC_utf8 && !RExC_orig_utf8) {
4275 /* It's possible to write a regexp in ascii that represents Unicode
4276 codepoints outside of the byte range, such as via \x{100}. If we
4277 detect such a sequence we have to convert the entire pattern to utf8
4278 and then recompile, as our sizing calculation will have been based
4279 on 1 byte == 1 character, but we will need to use utf8 to encode
4280 at least some part of the pattern, and therefore must convert the whole
4282 XXX: somehow figure out how to make this less expensive...
4285 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4286 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4287 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4289 RExC_orig_utf8 = RExC_utf8;
4291 goto redo_first_pass;
4294 PerlIO_printf(Perl_debug_log,
4295 "Required size %"IVdf" nodes\n"
4296 "Starting second pass (creation)\n",
4299 RExC_lastparse=NULL;
4301 /* Small enough for pointer-storage convention?
4302 If extralen==0, this means that we will not need long jumps. */
4303 if (RExC_size >= 0x10000L && RExC_extralen)
4304 RExC_size += RExC_extralen;
4307 if (RExC_whilem_seen > 15)
4308 RExC_whilem_seen = 15;
4310 /* Allocate space and zero-initialize. Note, the two step process
4311 of zeroing when in debug mode, thus anything assigned has to
4312 happen after that */
4313 Newxz(r, 1, regexp);
4314 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4315 char, regexp_internal);
4316 if ( r == NULL || ri == NULL )
4317 FAIL("Regexp out of space");
4319 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4320 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4322 /* bulk initialize base fields with 0. */
4323 Zero(ri, sizeof(regexp_internal), char);
4326 /* non-zero initialization begins here */
4328 r->engine= RE_ENGINE_PTR;
4330 RX_PRELEN(r) = plen;
4331 r->extflags = pm_flags;
4333 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4334 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4335 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4336 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4337 >> RXf_PMf_STD_PMMOD_SHIFT);
4338 const char *fptr = STD_PAT_MODS; /*"msix"*/
4340 RX_WRAPLEN(r) = plen + has_minus + has_p + has_runon
4341 + (sizeof(STD_PAT_MODS) - 1)
4342 + (sizeof("(?:)") - 1);
4344 Newx(RX_WRAPPED(r), RX_WRAPLEN(r) + 1, char );
4348 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4350 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4351 char *colon = r + 1;
4354 while((ch = *fptr++)) {
4368 Copy(RExC_precomp, p, plen, char);
4378 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4380 if (RExC_seen & REG_SEEN_RECURSE) {
4381 Newxz(RExC_open_parens, RExC_npar,regnode *);
4382 SAVEFREEPV(RExC_open_parens);
4383 Newxz(RExC_close_parens,RExC_npar,regnode *);
4384 SAVEFREEPV(RExC_close_parens);
4387 /* Useful during FAIL. */
4388 #ifdef RE_TRACK_PATTERN_OFFSETS
4389 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4390 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4391 "%s %"UVuf" bytes for offset annotations.\n",
4392 ri->u.offsets ? "Got" : "Couldn't get",
4393 (UV)((2*RExC_size+1) * sizeof(U32))));
4395 SetProgLen(ri,RExC_size);
4398 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4400 /* Second pass: emit code. */
4401 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4406 RExC_emit_start = ri->program;
4407 RExC_emit = ri->program;
4408 RExC_emit_bound = ri->program + RExC_size + 1;
4410 /* Store the count of eval-groups for security checks: */
4411 RExC_rx->seen_evals = RExC_seen_evals;
4412 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4413 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4417 /* XXXX To minimize changes to RE engine we always allocate
4418 3-units-long substrs field. */
4419 Newx(r->substrs, 1, struct reg_substr_data);
4420 if (RExC_recurse_count) {
4421 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4422 SAVEFREEPV(RExC_recurse);
4426 r->minlen = minlen = sawplus = sawopen = 0;
4427 Zero(r->substrs, 1, struct reg_substr_data);
4429 #ifdef TRIE_STUDY_OPT
4431 StructCopy(&zero_scan_data, &data, scan_data_t);
4432 copyRExC_state = RExC_state;
4435 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4437 RExC_state = copyRExC_state;
4438 if (seen & REG_TOP_LEVEL_BRANCHES)
4439 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4441 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4442 if (data.last_found) {
4443 SvREFCNT_dec(data.longest_fixed);
4444 SvREFCNT_dec(data.longest_float);
4445 SvREFCNT_dec(data.last_found);
4447 StructCopy(&zero_scan_data, &data, scan_data_t);
4450 StructCopy(&zero_scan_data, &data, scan_data_t);
4453 /* Dig out information for optimizations. */
4454 r->extflags = RExC_flags; /* was pm_op */
4455 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4458 r->extflags |= RXf_UTF8; /* Unicode in it? */
4459 ri->regstclass = NULL;
4460 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4461 r->intflags |= PREGf_NAUGHTY;
4462 scan = ri->program + 1; /* First BRANCH. */
4464 /* testing for BRANCH here tells us whether there is "must appear"
4465 data in the pattern. If there is then we can use it for optimisations */
4466 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4468 STRLEN longest_float_length, longest_fixed_length;
4469 struct regnode_charclass_class ch_class; /* pointed to by data */
4471 I32 last_close = 0; /* pointed to by data */
4472 regnode *first= scan;
4473 regnode *first_next= regnext(first);
4476 * Skip introductions and multiplicators >= 1
4477 * so that we can extract the 'meat' of the pattern that must
4478 * match in the large if() sequence following.
4479 * NOTE that EXACT is NOT covered here, as it is normally
4480 * picked up by the optimiser separately.
4482 * This is unfortunate as the optimiser isnt handling lookahead
4483 * properly currently.
4486 while ((OP(first) == OPEN && (sawopen = 1)) ||
4487 /* An OR of *one* alternative - should not happen now. */
4488 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4489 /* for now we can't handle lookbehind IFMATCH*/
4490 (OP(first) == IFMATCH && !first->flags) ||
4491 (OP(first) == PLUS) ||
4492 (OP(first) == MINMOD) ||
4493 /* An {n,m} with n>0 */
4494 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4495 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4498 * the only op that could be a regnode is PLUS, all the rest
4499 * will be regnode_1 or regnode_2.
4502 if (OP(first) == PLUS)
4505 first += regarglen[OP(first)];
4507 first = NEXTOPER(first);
4508 first_next= regnext(first);
4511 /* Starting-point info. */
4513 DEBUG_PEEP("first:",first,0);
4514 /* Ignore EXACT as we deal with it later. */
4515 if (PL_regkind[OP(first)] == EXACT) {
4516 if (OP(first) == EXACT)
4517 NOOP; /* Empty, get anchored substr later. */
4518 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4519 ri->regstclass = first;
4522 else if (PL_regkind[OP(first)] == TRIE &&
4523 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4526 /* this can happen only on restudy */
4527 if ( OP(first) == TRIE ) {
4528 struct regnode_1 *trieop = (struct regnode_1 *)
4529 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4530 StructCopy(first,trieop,struct regnode_1);
4531 trie_op=(regnode *)trieop;
4533 struct regnode_charclass *trieop = (struct regnode_charclass *)
4534 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4535 StructCopy(first,trieop,struct regnode_charclass);
4536 trie_op=(regnode *)trieop;
4539 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4540 ri->regstclass = trie_op;
4543 else if (strchr((const char*)PL_simple,OP(first)))
4544 ri->regstclass = first;
4545 else if (PL_regkind[OP(first)] == BOUND ||
4546 PL_regkind[OP(first)] == NBOUND)
4547 ri->regstclass = first;
4548 else if (PL_regkind[OP(first)] == BOL) {
4549 r->extflags |= (OP(first) == MBOL
4551 : (OP(first) == SBOL
4554 first = NEXTOPER(first);
4557 else if (OP(first) == GPOS) {
4558 r->extflags |= RXf_ANCH_GPOS;
4559 first = NEXTOPER(first);
4562 else if ((!sawopen || !RExC_sawback) &&
4563 (OP(first) == STAR &&
4564 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4565 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4567 /* turn .* into ^.* with an implied $*=1 */
4569 (OP(NEXTOPER(first)) == REG_ANY)
4572 r->extflags |= type;
4573 r->intflags |= PREGf_IMPLICIT;
4574 first = NEXTOPER(first);
4577 if (sawplus && (!sawopen || !RExC_sawback)
4578 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4579 /* x+ must match at the 1st pos of run of x's */
4580 r->intflags |= PREGf_SKIP;
4582 /* Scan is after the zeroth branch, first is atomic matcher. */
4583 #ifdef TRIE_STUDY_OPT
4586 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4587 (IV)(first - scan + 1))
4591 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4592 (IV)(first - scan + 1))
4598 * If there's something expensive in the r.e., find the
4599 * longest literal string that must appear and make it the
4600 * regmust. Resolve ties in favor of later strings, since
4601 * the regstart check works with the beginning of the r.e.
4602 * and avoiding duplication strengthens checking. Not a
4603 * strong reason, but sufficient in the absence of others.
4604 * [Now we resolve ties in favor of the earlier string if
4605 * it happens that c_offset_min has been invalidated, since the
4606 * earlier string may buy us something the later one won't.]
4609 data.longest_fixed = newSVpvs("");
4610 data.longest_float = newSVpvs("");
4611 data.last_found = newSVpvs("");
4612 data.longest = &(data.longest_fixed);
4614 if (!ri->regstclass) {
4615 cl_init(pRExC_state, &ch_class);
4616 data.start_class = &ch_class;
4617 stclass_flag = SCF_DO_STCLASS_AND;
4618 } else /* XXXX Check for BOUND? */
4620 data.last_closep = &last_close;
4622 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4623 &data, -1, NULL, NULL,
4624 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4630 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4631 && data.last_start_min == 0 && data.last_end > 0
4632 && !RExC_seen_zerolen
4633 && !(RExC_seen & REG_SEEN_VERBARG)
4634 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4635 r->extflags |= RXf_CHECK_ALL;
4636 scan_commit(pRExC_state, &data,&minlen,0);
4637 SvREFCNT_dec(data.last_found);
4639 /* Note that code very similar to this but for anchored string
4640 follows immediately below, changes may need to be made to both.
4643 longest_float_length = CHR_SVLEN(data.longest_float);
4644 if (longest_float_length
4645 || (data.flags & SF_FL_BEFORE_EOL
4646 && (!(data.flags & SF_FL_BEFORE_MEOL)
4647 || (RExC_flags & RXf_PMf_MULTILINE))))
4651 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4652 && data.offset_fixed == data.offset_float_min
4653 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4654 goto remove_float; /* As in (a)+. */
4656 /* copy the information about the longest float from the reg_scan_data
4657 over to the program. */
4658 if (SvUTF8(data.longest_float)) {
4659 r->float_utf8 = data.longest_float;
4660 r->float_substr = NULL;
4662 r->float_substr = data.longest_float;
4663 r->float_utf8 = NULL;
4665 /* float_end_shift is how many chars that must be matched that
4666 follow this item. We calculate it ahead of time as once the
4667 lookbehind offset is added in we lose the ability to correctly
4669 ml = data.minlen_float ? *(data.minlen_float)
4670 : (I32)longest_float_length;
4671 r->float_end_shift = ml - data.offset_float_min
4672 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4673 + data.lookbehind_float;
4674 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4675 r->float_max_offset = data.offset_float_max;
4676 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4677 r->float_max_offset -= data.lookbehind_float;
4679 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4680 && (!(data.flags & SF_FL_BEFORE_MEOL)
4681 || (RExC_flags & RXf_PMf_MULTILINE)));
4682 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4686 r->float_substr = r->float_utf8 = NULL;
4687 SvREFCNT_dec(data.longest_float);
4688 longest_float_length = 0;
4691 /* Note that code very similar to this but for floating string
4692 is immediately above, changes may need to be made to both.
4695 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4696 if (longest_fixed_length
4697 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4698 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4699 || (RExC_flags & RXf_PMf_MULTILINE))))
4703 /* copy the information about the longest fixed
4704 from the reg_scan_data over to the program. */
4705 if (SvUTF8(data.longest_fixed)) {
4706 r->anchored_utf8 = data.longest_fixed;
4707 r->anchored_substr = NULL;
4709 r->anchored_substr = data.longest_fixed;
4710 r->anchored_utf8 = NULL;
4712 /* fixed_end_shift is how many chars that must be matched that
4713 follow this item. We calculate it ahead of time as once the
4714 lookbehind offset is added in we lose the ability to correctly
4716 ml = data.minlen_fixed ? *(data.minlen_fixed)
4717 : (I32)longest_fixed_length;
4718 r->anchored_end_shift = ml - data.offset_fixed
4719 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4720 + data.lookbehind_fixed;
4721 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4723 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4724 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4725 || (RExC_flags & RXf_PMf_MULTILINE)));
4726 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4729 r->anchored_substr = r->anchored_utf8 = NULL;
4730 SvREFCNT_dec(data.longest_fixed);
4731 longest_fixed_length = 0;
4734 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4735 ri->regstclass = NULL;
4736 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4738 && !(data.start_class->flags & ANYOF_EOS)
4739 && !cl_is_anything(data.start_class))
4741 const U32 n = add_data(pRExC_state, 1, "f");
4743 Newx(RExC_rxi->data->data[n], 1,
4744 struct regnode_charclass_class);
4745 StructCopy(data.start_class,
4746 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4747 struct regnode_charclass_class);
4748 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4749 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4750 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4751 regprop(r, sv, (regnode*)data.start_class);
4752 PerlIO_printf(Perl_debug_log,
4753 "synthetic stclass \"%s\".\n",
4754 SvPVX_const(sv));});
4757 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4758 if (longest_fixed_length > longest_float_length) {
4759 r->check_end_shift = r->anchored_end_shift;
4760 r->check_substr = r->anchored_substr;
4761 r->check_utf8 = r->anchored_utf8;
4762 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4763 if (r->extflags & RXf_ANCH_SINGLE)
4764 r->extflags |= RXf_NOSCAN;
4767 r->check_end_shift = r->float_end_shift;
4768 r->check_substr = r->float_substr;
4769 r->check_utf8 = r->float_utf8;
4770 r->check_offset_min = r->float_min_offset;
4771 r->check_offset_max = r->float_max_offset;
4773 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4774 This should be changed ASAP! */
4775 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4776 r->extflags |= RXf_USE_INTUIT;
4777 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4778 r->extflags |= RXf_INTUIT_TAIL;
4780 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4781 if ( (STRLEN)minlen < longest_float_length )
4782 minlen= longest_float_length;
4783 if ( (STRLEN)minlen < longest_fixed_length )
4784 minlen= longest_fixed_length;
4788 /* Several toplevels. Best we can is to set minlen. */
4790 struct regnode_charclass_class ch_class;
4793 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4795 scan = ri->program + 1;
4796 cl_init(pRExC_state, &ch_class);
4797 data.start_class = &ch_class;
4798 data.last_closep = &last_close;
4801 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4802 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4806 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4807 = r->float_substr = r->float_utf8 = NULL;
4808 if (!(data.start_class->flags & ANYOF_EOS)
4809 && !cl_is_anything(data.start_class))
4811 const U32 n = add_data(pRExC_state, 1, "f");
4813 Newx(RExC_rxi->data->data[n], 1,
4814 struct regnode_charclass_class);
4815 StructCopy(data.start_class,
4816 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4817 struct regnode_charclass_class);
4818 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4819 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4820 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4821 regprop(r, sv, (regnode*)data.start_class);
4822 PerlIO_printf(Perl_debug_log,
4823 "synthetic stclass \"%s\".\n",
4824 SvPVX_const(sv));});
4828 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4829 the "real" pattern. */
4831 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4832 (IV)minlen, (IV)r->minlen);
4834 r->minlenret = minlen;
4835 if (r->minlen < minlen)
4838 if (RExC_seen & REG_SEEN_GPOS)
4839 r->extflags |= RXf_GPOS_SEEN;
4840 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4841 r->extflags |= RXf_LOOKBEHIND_SEEN;
4842 if (RExC_seen & REG_SEEN_EVAL)
4843 r->extflags |= RXf_EVAL_SEEN;
4844 if (RExC_seen & REG_SEEN_CANY)
4845 r->extflags |= RXf_CANY_SEEN;
4846 if (RExC_seen & REG_SEEN_VERBARG)
4847 r->intflags |= PREGf_VERBARG_SEEN;
4848 if (RExC_seen & REG_SEEN_CUTGROUP)
4849 r->intflags |= PREGf_CUTGROUP_SEEN;
4850 if (RExC_paren_names)
4851 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4853 RXp_PAREN_NAMES(r) = NULL;
4855 #ifdef STUPID_PATTERN_CHECKS
4856 if (RX_PRELEN(r) == 0)
4857 r->extflags |= RXf_NULL;
4858 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4859 /* XXX: this should happen BEFORE we compile */
4860 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4861 else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RX_PRECOMP(r), 3))
4862 r->extflags |= RXf_WHITE;
4863 else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^')
4864 r->extflags |= RXf_START_ONLY;
4866 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4867 /* XXX: this should happen BEFORE we compile */
4868 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4870 regnode *first = ri->program + 1;
4872 U8 nop = OP(NEXTOPER(first));
4874 if (PL_regkind[fop] == NOTHING && nop == END)
4875 r->extflags |= RXf_NULL;
4876 else if (PL_regkind[fop] == BOL && nop == END)
4877 r->extflags |= RXf_START_ONLY;
4878 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4879 r->extflags |= RXf_WHITE;
4883 if (RExC_paren_names) {
4884 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4885 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4888 ri->name_list_idx = 0;
4890 if (RExC_recurse_count) {
4891 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4892 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4893 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4896 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4897 /* assume we don't need to swap parens around before we match */
4900 PerlIO_printf(Perl_debug_log,"Final program:\n");
4903 #ifdef RE_TRACK_PATTERN_OFFSETS
4904 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4905 const U32 len = ri->u.offsets[0];
4907 GET_RE_DEBUG_FLAGS_DECL;
4908 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4909 for (i = 1; i <= len; i++) {
4910 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4911 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4912 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4914 PerlIO_printf(Perl_debug_log, "\n");
4920 #undef RE_ENGINE_PTR
4924 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4927 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4929 PERL_UNUSED_ARG(value);
4931 if (flags & RXapif_FETCH) {
4932 return reg_named_buff_fetch(rx, key, flags);
4933 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4934 Perl_croak(aTHX_ "%s", PL_no_modify);
4936 } else if (flags & RXapif_EXISTS) {
4937 return reg_named_buff_exists(rx, key, flags)
4940 } else if (flags & RXapif_REGNAMES) {
4941 return reg_named_buff_all(rx, flags);
4942 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4943 return reg_named_buff_scalar(rx, flags);
4945 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4951 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4954 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4955 PERL_UNUSED_ARG(lastkey);
4957 if (flags & RXapif_FIRSTKEY)
4958 return reg_named_buff_firstkey(rx, flags);
4959 else if (flags & RXapif_NEXTKEY)
4960 return reg_named_buff_nextkey(rx, flags);
4962 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4968 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4970 AV *retarray = NULL;
4973 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4975 if (flags & RXapif_ALL)
4978 if (rx && RXp_PAREN_NAMES(rx)) {
4979 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4982 SV* sv_dat=HeVAL(he_str);
4983 I32 *nums=(I32*)SvPVX(sv_dat);
4984 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4985 if ((I32)(rx->nparens) >= nums[i]
4986 && rx->offs[nums[i]].start != -1
4987 && rx->offs[nums[i]].end != -1)
4990 CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4994 ret = newSVsv(&PL_sv_undef);
4997 av_push(retarray, ret);
5000 return newRV_noinc(MUTABLE_SV(retarray));
5007 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
5011 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5013 if (rx && RXp_PAREN_NAMES(rx)) {
5014 if (flags & RXapif_ALL) {
5015 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5017 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
5031 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
5034 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5036 if ( rx && RXp_PAREN_NAMES(rx) ) {
5037 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5039 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
5046 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
5048 GET_RE_DEBUG_FLAGS_DECL;
5050 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5052 if (rx && RXp_PAREN_NAMES(rx)) {
5053 HV *hv = RXp_PAREN_NAMES(rx);
5055 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5058 SV* sv_dat = HeVAL(temphe);
5059 I32 *nums = (I32*)SvPVX(sv_dat);
5060 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5061 if ((I32)(rx->lastparen) >= nums[i] &&
5062 rx->offs[nums[i]].start != -1 &&
5063 rx->offs[nums[i]].end != -1)
5069 if (parno || flags & RXapif_ALL) {
5070 return newSVhek(HeKEY_hek(temphe));
5078 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5084 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5086 if (rx && RXp_PAREN_NAMES(rx)) {
5087 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5088 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5089 } else if (flags & RXapif_ONE) {
5090 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5091 av = MUTABLE_AV(SvRV(ret));
5092 length = av_len(av);
5094 return newSViv(length + 1);
5096 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5100 return &PL_sv_undef;
5104 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5108 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5110 if (rx && RXp_PAREN_NAMES(rx)) {
5111 HV *hv= RXp_PAREN_NAMES(rx);
5113 (void)hv_iterinit(hv);
5114 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5117 SV* sv_dat = HeVAL(temphe);
5118 I32 *nums = (I32*)SvPVX(sv_dat);
5119 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5120 if ((I32)(rx->lastparen) >= nums[i] &&
5121 rx->offs[nums[i]].start != -1 &&
5122 rx->offs[nums[i]].end != -1)
5128 if (parno || flags & RXapif_ALL) {
5129 av_push(av, newSVhek(HeKEY_hek(temphe)));
5134 return newRV_noinc(MUTABLE_SV(av));
5138 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5144 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5147 sv_setsv(sv,&PL_sv_undef);
5151 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5153 i = rx->offs[0].start;
5157 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5159 s = rx->subbeg + rx->offs[0].end;
5160 i = rx->sublen - rx->offs[0].end;
5163 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5164 (s1 = rx->offs[paren].start) != -1 &&
5165 (t1 = rx->offs[paren].end) != -1)
5169 s = rx->subbeg + s1;
5171 sv_setsv(sv,&PL_sv_undef);
5174 assert(rx->sublen >= (s - rx->subbeg) + i );
5176 const int oldtainted = PL_tainted;
5178 sv_setpvn(sv, s, i);
5179 PL_tainted = oldtainted;
5180 if ( (rx->extflags & RXf_CANY_SEEN)
5181 ? (RXp_MATCH_UTF8(rx)
5182 && (!i || is_utf8_string((U8*)s, i)))
5183 : (RXp_MATCH_UTF8(rx)) )
5190 if (RXp_MATCH_TAINTED(rx)) {
5191 if (SvTYPE(sv) >= SVt_PVMG) {
5192 MAGIC* const mg = SvMAGIC(sv);
5195 SvMAGIC_set(sv, mg->mg_moremagic);
5197 if ((mgt = SvMAGIC(sv))) {
5198 mg->mg_moremagic = mgt;
5199 SvMAGIC_set(sv, mg);
5209 sv_setsv(sv,&PL_sv_undef);
5215 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5216 SV const * const value)
5218 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5220 PERL_UNUSED_ARG(rx);
5221 PERL_UNUSED_ARG(paren);
5222 PERL_UNUSED_ARG(value);
5225 Perl_croak(aTHX_ "%s", PL_no_modify);
5229 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5235 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5237 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5239 /* $` / ${^PREMATCH} */
5240 case RX_BUFF_IDX_PREMATCH:
5241 if (rx->offs[0].start != -1) {
5242 i = rx->offs[0].start;
5250 /* $' / ${^POSTMATCH} */
5251 case RX_BUFF_IDX_POSTMATCH:
5252 if (rx->offs[0].end != -1) {
5253 i = rx->sublen - rx->offs[0].end;
5255 s1 = rx->offs[0].end;
5261 /* $& / ${^MATCH}, $1, $2, ... */
5263 if (paren <= (I32)rx->nparens &&
5264 (s1 = rx->offs[paren].start) != -1 &&
5265 (t1 = rx->offs[paren].end) != -1)
5270 if (ckWARN(WARN_UNINITIALIZED))
5271 report_uninit((SV *)sv);
5276 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5277 const char * const s = rx->subbeg + s1;
5282 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5289 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5291 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5292 PERL_UNUSED_ARG(rx);
5293 return newSVpvs("Regexp");
5296 /* Scans the name of a named buffer from the pattern.
5297 * If flags is REG_RSN_RETURN_NULL returns null.
5298 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5299 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5300 * to the parsed name as looked up in the RExC_paren_names hash.
5301 * If there is an error throws a vFAIL().. type exception.
5304 #define REG_RSN_RETURN_NULL 0
5305 #define REG_RSN_RETURN_NAME 1
5306 #define REG_RSN_RETURN_DATA 2
5309 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5311 char *name_start = RExC_parse;
5313 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5315 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5316 /* skip IDFIRST by using do...while */
5319 RExC_parse += UTF8SKIP(RExC_parse);
5320 } while (isALNUM_utf8((U8*)RExC_parse));
5324 } while (isALNUM(*RExC_parse));
5329 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5330 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5331 if ( flags == REG_RSN_RETURN_NAME)
5333 else if (flags==REG_RSN_RETURN_DATA) {
5336 if ( ! sv_name ) /* should not happen*/
5337 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5338 if (RExC_paren_names)
5339 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5341 sv_dat = HeVAL(he_str);
5343 vFAIL("Reference to nonexistent named group");
5347 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5354 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5355 int rem=(int)(RExC_end - RExC_parse); \
5364 if (RExC_lastparse!=RExC_parse) \
5365 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5368 iscut ? "..." : "<" \
5371 PerlIO_printf(Perl_debug_log,"%16s",""); \
5374 num = RExC_size + 1; \
5376 num=REG_NODE_NUM(RExC_emit); \
5377 if (RExC_lastnum!=num) \
5378 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5380 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5381 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5382 (int)((depth*2)), "", \
5386 RExC_lastparse=RExC_parse; \
5391 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5392 DEBUG_PARSE_MSG((funcname)); \
5393 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5395 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5396 DEBUG_PARSE_MSG((funcname)); \
5397 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5400 - reg - regular expression, i.e. main body or parenthesized thing
5402 * Caller must absorb opening parenthesis.
5404 * Combining parenthesis handling with the base level of regular expression
5405 * is a trifle forced, but the need to tie the tails of the branches to what
5406 * follows makes it hard to avoid.
5408 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5410 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5412 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5416 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5417 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5420 register regnode *ret; /* Will be the head of the group. */
5421 register regnode *br;
5422 register regnode *lastbr;
5423 register regnode *ender = NULL;
5424 register I32 parno = 0;
5426 U32 oregflags = RExC_flags;
5427 bool have_branch = 0;
5429 I32 freeze_paren = 0;
5430 I32 after_freeze = 0;
5432 /* for (?g), (?gc), and (?o) warnings; warning
5433 about (?c) will warn about (?g) -- japhy */
5435 #define WASTED_O 0x01
5436 #define WASTED_G 0x02
5437 #define WASTED_C 0x04
5438 #define WASTED_GC (0x02|0x04)
5439 I32 wastedflags = 0x00;
5441 char * parse_start = RExC_parse; /* MJD */
5442 char * const oregcomp_parse = RExC_parse;
5444 GET_RE_DEBUG_FLAGS_DECL;
5446 PERL_ARGS_ASSERT_REG;
5447 DEBUG_PARSE("reg ");
5449 *flagp = 0; /* Tentatively. */
5452 /* Make an OPEN node, if parenthesized. */
5454 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5455 char *start_verb = RExC_parse;
5456 STRLEN verb_len = 0;
5457 char *start_arg = NULL;
5458 unsigned char op = 0;
5460 int internal_argval = 0; /* internal_argval is only useful if !argok */
5461 while ( *RExC_parse && *RExC_parse != ')' ) {
5462 if ( *RExC_parse == ':' ) {
5463 start_arg = RExC_parse + 1;
5469 verb_len = RExC_parse - start_verb;
5472 while ( *RExC_parse && *RExC_parse != ')' )
5474 if ( *RExC_parse != ')' )
5475 vFAIL("Unterminated verb pattern argument");
5476 if ( RExC_parse == start_arg )
5479 if ( *RExC_parse != ')' )
5480 vFAIL("Unterminated verb pattern");
5483 switch ( *start_verb ) {
5484 case 'A': /* (*ACCEPT) */
5485 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5487 internal_argval = RExC_nestroot;
5490 case 'C': /* (*COMMIT) */
5491 if ( memEQs(start_verb,verb_len,"COMMIT") )
5494 case 'F': /* (*FAIL) */
5495 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5500 case ':': /* (*:NAME) */
5501 case 'M': /* (*MARK:NAME) */
5502 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5507 case 'P': /* (*PRUNE) */
5508 if ( memEQs(start_verb,verb_len,"PRUNE") )
5511 case 'S': /* (*SKIP) */
5512 if ( memEQs(start_verb,verb_len,"SKIP") )
5515 case 'T': /* (*THEN) */
5516 /* [19:06] <TimToady> :: is then */
5517 if ( memEQs(start_verb,verb_len,"THEN") ) {
5519 RExC_seen |= REG_SEEN_CUTGROUP;
5525 vFAIL3("Unknown verb pattern '%.*s'",
5526 verb_len, start_verb);
5529 if ( start_arg && internal_argval ) {
5530 vFAIL3("Verb pattern '%.*s' may not have an argument",
5531 verb_len, start_verb);
5532 } else if ( argok < 0 && !start_arg ) {
5533 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5534 verb_len, start_verb);
5536 ret = reganode(pRExC_state, op, internal_argval);
5537 if ( ! internal_argval && ! SIZE_ONLY ) {
5539 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5540 ARG(ret) = add_data( pRExC_state, 1, "S" );
5541 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5548 if (!internal_argval)
5549 RExC_seen |= REG_SEEN_VERBARG;
5550 } else if ( start_arg ) {
5551 vFAIL3("Verb pattern '%.*s' may not have an argument",
5552 verb_len, start_verb);
5554 ret = reg_node(pRExC_state, op);
5556 nextchar(pRExC_state);
5559 if (*RExC_parse == '?') { /* (?...) */
5560 bool is_logical = 0;
5561 const char * const seqstart = RExC_parse;
5564 paren = *RExC_parse++;
5565 ret = NULL; /* For look-ahead/behind. */
5568 case 'P': /* (?P...) variants for those used to PCRE/Python */
5569 paren = *RExC_parse++;
5570 if ( paren == '<') /* (?P<...>) named capture */
5572 else if (paren == '>') { /* (?P>name) named recursion */
5573 goto named_recursion;
5575 else if (paren == '=') { /* (?P=...) named backref */
5576 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5577 you change this make sure you change that */
5578 char* name_start = RExC_parse;
5580 SV *sv_dat = reg_scan_name(pRExC_state,
5581 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5582 if (RExC_parse == name_start || *RExC_parse != ')')
5583 vFAIL2("Sequence %.3s... not terminated",parse_start);
5586 num = add_data( pRExC_state, 1, "S" );
5587 RExC_rxi->data->data[num]=(void*)sv_dat;
5588 SvREFCNT_inc_simple_void(sv_dat);
5591 ret = reganode(pRExC_state,
5592 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5596 Set_Node_Offset(ret, parse_start+1);
5597 Set_Node_Cur_Length(ret); /* MJD */
5599 nextchar(pRExC_state);
5603 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5605 case '<': /* (?<...) */
5606 if (*RExC_parse == '!')
5608 else if (*RExC_parse != '=')
5614 case '\'': /* (?'...') */
5615 name_start= RExC_parse;
5616 svname = reg_scan_name(pRExC_state,
5617 SIZE_ONLY ? /* reverse test from the others */
5618 REG_RSN_RETURN_NAME :
5619 REG_RSN_RETURN_NULL);
5620 if (RExC_parse == name_start) {
5622 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5625 if (*RExC_parse != paren)
5626 vFAIL2("Sequence (?%c... not terminated",
5627 paren=='>' ? '<' : paren);
5631 if (!svname) /* shouldnt happen */
5633 "panic: reg_scan_name returned NULL");
5634 if (!RExC_paren_names) {
5635 RExC_paren_names= newHV();
5636 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5638 RExC_paren_name_list= newAV();
5639 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5642 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5644 sv_dat = HeVAL(he_str);
5646 /* croak baby croak */
5648 "panic: paren_name hash element allocation failed");
5649 } else if ( SvPOK(sv_dat) ) {
5650 /* (?|...) can mean we have dupes so scan to check
5651 its already been stored. Maybe a flag indicating
5652 we are inside such a construct would be useful,
5653 but the arrays are likely to be quite small, so
5654 for now we punt -- dmq */
5655 IV count = SvIV(sv_dat);
5656 I32 *pv = (I32*)SvPVX(sv_dat);
5658 for ( i = 0 ; i < count ; i++ ) {
5659 if ( pv[i] == RExC_npar ) {
5665 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5666 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5667 pv[count] = RExC_npar;
5668 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5671 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5672 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5674 SvIV_set(sv_dat, 1);
5677 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5678 SvREFCNT_dec(svname);
5681 /*sv_dump(sv_dat);*/
5683 nextchar(pRExC_state);
5685 goto capturing_parens;
5687 RExC_seen |= REG_SEEN_LOOKBEHIND;
5689 case '=': /* (?=...) */
5690 RExC_seen_zerolen++;
5692 case '!': /* (?!...) */
5693 RExC_seen_zerolen++;
5694 if (*RExC_parse == ')') {
5695 ret=reg_node(pRExC_state, OPFAIL);
5696 nextchar(pRExC_state);
5700 case '|': /* (?|...) */
5701 /* branch reset, behave like a (?:...) except that
5702 buffers in alternations share the same numbers */
5704 after_freeze = freeze_paren = RExC_npar;
5706 case ':': /* (?:...) */
5707 case '>': /* (?>...) */
5709 case '$': /* (?$...) */
5710 case '@': /* (?@...) */
5711 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5713 case '#': /* (?#...) */
5714 while (*RExC_parse && *RExC_parse != ')')
5716 if (*RExC_parse != ')')
5717 FAIL("Sequence (?#... not terminated");
5718 nextchar(pRExC_state);
5721 case '0' : /* (?0) */
5722 case 'R' : /* (?R) */
5723 if (*RExC_parse != ')')
5724 FAIL("Sequence (?R) not terminated");
5725 ret = reg_node(pRExC_state, GOSTART);
5726 *flagp |= POSTPONED;
5727 nextchar(pRExC_state);
5730 { /* named and numeric backreferences */
5732 case '&': /* (?&NAME) */
5733 parse_start = RExC_parse - 1;
5736 SV *sv_dat = reg_scan_name(pRExC_state,
5737 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5738 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5740 goto gen_recurse_regop;
5743 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5745 vFAIL("Illegal pattern");
5747 goto parse_recursion;
5749 case '-': /* (?-1) */
5750 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5751 RExC_parse--; /* rewind to let it be handled later */
5755 case '1': case '2': case '3': case '4': /* (?1) */
5756 case '5': case '6': case '7': case '8': case '9':
5759 num = atoi(RExC_parse);
5760 parse_start = RExC_parse - 1; /* MJD */
5761 if (*RExC_parse == '-')
5763 while (isDIGIT(*RExC_parse))
5765 if (*RExC_parse!=')')
5766 vFAIL("Expecting close bracket");
5769 if ( paren == '-' ) {
5771 Diagram of capture buffer numbering.
5772 Top line is the normal capture buffer numbers
5773 Botton line is the negative indexing as from
5777 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5781 num = RExC_npar + num;
5784 vFAIL("Reference to nonexistent group");
5786 } else if ( paren == '+' ) {
5787 num = RExC_npar + num - 1;
5790 ret = reganode(pRExC_state, GOSUB, num);
5792 if (num > (I32)RExC_rx->nparens) {
5794 vFAIL("Reference to nonexistent group");
5796 ARG2L_SET( ret, RExC_recurse_count++);
5798 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5799 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5803 RExC_seen |= REG_SEEN_RECURSE;
5804 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5805 Set_Node_Offset(ret, parse_start); /* MJD */
5807 *flagp |= POSTPONED;
5808 nextchar(pRExC_state);
5810 } /* named and numeric backreferences */
5813 case '?': /* (??...) */
5815 if (*RExC_parse != '{') {
5817 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5820 *flagp |= POSTPONED;
5821 paren = *RExC_parse++;
5823 case '{': /* (?{...}) */
5828 char *s = RExC_parse;
5830 RExC_seen_zerolen++;
5831 RExC_seen |= REG_SEEN_EVAL;
5832 while (count && (c = *RExC_parse)) {
5843 if (*RExC_parse != ')') {
5845 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5849 OP_4tree *sop, *rop;
5850 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5853 Perl_save_re_context(aTHX);
5854 rop = sv_compile_2op(sv, &sop, "re", &pad);
5855 sop->op_private |= OPpREFCOUNTED;
5856 /* re_dup will OpREFCNT_inc */
5857 OpREFCNT_set(sop, 1);
5860 n = add_data(pRExC_state, 3, "nop");
5861 RExC_rxi->data->data[n] = (void*)rop;
5862 RExC_rxi->data->data[n+1] = (void*)sop;
5863 RExC_rxi->data->data[n+2] = (void*)pad;
5866 else { /* First pass */
5867 if (PL_reginterp_cnt < ++RExC_seen_evals
5869 /* No compiled RE interpolated, has runtime
5870 components ===> unsafe. */
5871 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5872 if (PL_tainting && PL_tainted)
5873 FAIL("Eval-group in insecure regular expression");
5874 #if PERL_VERSION > 8
5875 if (IN_PERL_COMPILETIME)
5880 nextchar(pRExC_state);
5882 ret = reg_node(pRExC_state, LOGICAL);
5885 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5886 /* deal with the length of this later - MJD */
5889 ret = reganode(pRExC_state, EVAL, n);
5890 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5891 Set_Node_Offset(ret, parse_start);
5894 case '(': /* (?(?{...})...) and (?(?=...)...) */
5897 if (RExC_parse[0] == '?') { /* (?(?...)) */
5898 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5899 || RExC_parse[1] == '<'
5900 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5903 ret = reg_node(pRExC_state, LOGICAL);
5906 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5910 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5911 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5913 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5914 char *name_start= RExC_parse++;
5916 SV *sv_dat=reg_scan_name(pRExC_state,
5917 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5918 if (RExC_parse == name_start || *RExC_parse != ch)
5919 vFAIL2("Sequence (?(%c... not terminated",
5920 (ch == '>' ? '<' : ch));
5923 num = add_data( pRExC_state, 1, "S" );
5924 RExC_rxi->data->data[num]=(void*)sv_dat;
5925 SvREFCNT_inc_simple_void(sv_dat);
5927 ret = reganode(pRExC_state,NGROUPP,num);
5928 goto insert_if_check_paren;
5930 else if (RExC_parse[0] == 'D' &&
5931 RExC_parse[1] == 'E' &&
5932 RExC_parse[2] == 'F' &&
5933 RExC_parse[3] == 'I' &&
5934 RExC_parse[4] == 'N' &&
5935 RExC_parse[5] == 'E')
5937 ret = reganode(pRExC_state,DEFINEP,0);
5940 goto insert_if_check_paren;
5942 else if (RExC_parse[0] == 'R') {
5945 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5946 parno = atoi(RExC_parse++);
5947 while (isDIGIT(*RExC_parse))
5949 } else if (RExC_parse[0] == '&') {
5952 sv_dat = reg_scan_name(pRExC_state,
5953 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5954 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5956 ret = reganode(pRExC_state,INSUBP,parno);
5957 goto insert_if_check_paren;
5959 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5962 parno = atoi(RExC_parse++);
5964 while (isDIGIT(*RExC_parse))
5966 ret = reganode(pRExC_state, GROUPP, parno);
5968 insert_if_check_paren:
5969 if ((c = *nextchar(pRExC_state)) != ')')
5970 vFAIL("Switch condition not recognized");
5972 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5973 br = regbranch(pRExC_state, &flags, 1,depth+1);
5975 br = reganode(pRExC_state, LONGJMP, 0);
5977 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5978 c = *nextchar(pRExC_state);
5983 vFAIL("(?(DEFINE)....) does not allow branches");
5984 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5985 regbranch(pRExC_state, &flags, 1,depth+1);
5986 REGTAIL(pRExC_state, ret, lastbr);
5989 c = *nextchar(pRExC_state);
5994 vFAIL("Switch (?(condition)... contains too many branches");
5995 ender = reg_node(pRExC_state, TAIL);
5996 REGTAIL(pRExC_state, br, ender);
5998 REGTAIL(pRExC_state, lastbr, ender);
5999 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6002 REGTAIL(pRExC_state, ret, ender);
6003 RExC_size++; /* XXX WHY do we need this?!!
6004 For large programs it seems to be required
6005 but I can't figure out why. -- dmq*/
6009 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6013 RExC_parse--; /* for vFAIL to print correctly */
6014 vFAIL("Sequence (? incomplete");
6018 parse_flags: /* (?i) */
6020 U32 posflags = 0, negflags = 0;
6021 U32 *flagsp = &posflags;
6023 while (*RExC_parse) {
6024 /* && strchr("iogcmsx", *RExC_parse) */
6025 /* (?g), (?gc) and (?o) are useless here
6026 and must be globally applied -- japhy */
6027 switch (*RExC_parse) {
6028 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6029 case ONCE_PAT_MOD: /* 'o' */
6030 case GLOBAL_PAT_MOD: /* 'g' */
6031 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6032 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6033 if (! (wastedflags & wflagbit) ) {
6034 wastedflags |= wflagbit;
6037 "Useless (%s%c) - %suse /%c modifier",
6038 flagsp == &negflags ? "?-" : "?",
6040 flagsp == &negflags ? "don't " : "",
6047 case CONTINUE_PAT_MOD: /* 'c' */
6048 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6049 if (! (wastedflags & WASTED_C) ) {
6050 wastedflags |= WASTED_GC;
6053 "Useless (%sc) - %suse /gc modifier",
6054 flagsp == &negflags ? "?-" : "?",
6055 flagsp == &negflags ? "don't " : ""
6060 case KEEPCOPY_PAT_MOD: /* 'p' */
6061 if (flagsp == &negflags) {
6062 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
6063 vWARN(RExC_parse + 1,"Useless use of (?-p)");
6065 *flagsp |= RXf_PMf_KEEPCOPY;
6069 if (flagsp == &negflags) {
6071 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6075 wastedflags = 0; /* reset so (?g-c) warns twice */
6081 RExC_flags |= posflags;
6082 RExC_flags &= ~negflags;
6084 oregflags |= posflags;
6085 oregflags &= ~negflags;
6087 nextchar(pRExC_state);
6098 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6103 }} /* one for the default block, one for the switch */
6110 ret = reganode(pRExC_state, OPEN, parno);
6113 RExC_nestroot = parno;
6114 if (RExC_seen & REG_SEEN_RECURSE
6115 && !RExC_open_parens[parno-1])
6117 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6118 "Setting open paren #%"IVdf" to %d\n",
6119 (IV)parno, REG_NODE_NUM(ret)));
6120 RExC_open_parens[parno-1]= ret;
6123 Set_Node_Length(ret, 1); /* MJD */
6124 Set_Node_Offset(ret, RExC_parse); /* MJD */
6132 /* Pick up the branches, linking them together. */
6133 parse_start = RExC_parse; /* MJD */
6134 br = regbranch(pRExC_state, &flags, 1,depth+1);
6137 if (RExC_npar > after_freeze)
6138 after_freeze = RExC_npar;
6139 RExC_npar = freeze_paren;
6142 /* branch_len = (paren != 0); */
6146 if (*RExC_parse == '|') {
6147 if (!SIZE_ONLY && RExC_extralen) {
6148 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6151 reginsert(pRExC_state, BRANCH, br, depth+1);
6152 Set_Node_Length(br, paren != 0);
6153 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6157 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6159 else if (paren == ':') {
6160 *flagp |= flags&SIMPLE;
6162 if (is_open) { /* Starts with OPEN. */
6163 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6165 else if (paren != '?') /* Not Conditional */
6167 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6169 while (*RExC_parse == '|') {
6170 if (!SIZE_ONLY && RExC_extralen) {
6171 ender = reganode(pRExC_state, LONGJMP,0);
6172 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6175 RExC_extralen += 2; /* Account for LONGJMP. */
6176 nextchar(pRExC_state);
6178 if (RExC_npar > after_freeze)
6179 after_freeze = RExC_npar;
6180 RExC_npar = freeze_paren;
6182 br = regbranch(pRExC_state, &flags, 0, depth+1);
6186 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6188 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6191 if (have_branch || paren != ':') {
6192 /* Make a closing node, and hook it on the end. */
6195 ender = reg_node(pRExC_state, TAIL);
6198 ender = reganode(pRExC_state, CLOSE, parno);
6199 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6200 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6201 "Setting close paren #%"IVdf" to %d\n",
6202 (IV)parno, REG_NODE_NUM(ender)));
6203 RExC_close_parens[parno-1]= ender;
6204 if (RExC_nestroot == parno)
6207 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6208 Set_Node_Length(ender,1); /* MJD */
6214 *flagp &= ~HASWIDTH;
6217 ender = reg_node(pRExC_state, SUCCEED);
6220 ender = reg_node(pRExC_state, END);
6222 assert(!RExC_opend); /* there can only be one! */
6227 REGTAIL(pRExC_state, lastbr, ender);
6229 if (have_branch && !SIZE_ONLY) {
6231 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6233 /* Hook the tails of the branches to the closing node. */
6234 for (br = ret; br; br = regnext(br)) {
6235 const U8 op = PL_regkind[OP(br)];
6237 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6239 else if (op == BRANCHJ) {
6240 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6248 static const char parens[] = "=!<,>";
6250 if (paren && (p = strchr(parens, paren))) {
6251 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6252 int flag = (p - parens) > 1;
6255 node = SUSPEND, flag = 0;
6256 reginsert(pRExC_state, node,ret, depth+1);
6257 Set_Node_Cur_Length(ret);
6258 Set_Node_Offset(ret, parse_start + 1);
6260 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6264 /* Check for proper termination. */
6266 RExC_flags = oregflags;
6267 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6268 RExC_parse = oregcomp_parse;
6269 vFAIL("Unmatched (");
6272 else if (!paren && RExC_parse < RExC_end) {
6273 if (*RExC_parse == ')') {
6275 vFAIL("Unmatched )");
6278 FAIL("Junk on end of regexp"); /* "Can't happen". */
6282 RExC_npar = after_freeze;
6287 - regbranch - one alternative of an | operator
6289 * Implements the concatenation operator.
6292 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6295 register regnode *ret;
6296 register regnode *chain = NULL;
6297 register regnode *latest;
6298 I32 flags = 0, c = 0;
6299 GET_RE_DEBUG_FLAGS_DECL;
6301 PERL_ARGS_ASSERT_REGBRANCH;
6303 DEBUG_PARSE("brnc");
6308 if (!SIZE_ONLY && RExC_extralen)
6309 ret = reganode(pRExC_state, BRANCHJ,0);
6311 ret = reg_node(pRExC_state, BRANCH);
6312 Set_Node_Length(ret, 1);
6316 if (!first && SIZE_ONLY)
6317 RExC_extralen += 1; /* BRANCHJ */
6319 *flagp = WORST; /* Tentatively. */
6322 nextchar(pRExC_state);
6323 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6325 latest = regpiece(pRExC_state, &flags,depth+1);
6326 if (latest == NULL) {
6327 if (flags & TRYAGAIN)
6331 else if (ret == NULL)
6333 *flagp |= flags&(HASWIDTH|POSTPONED);
6334 if (chain == NULL) /* First piece. */
6335 *flagp |= flags&SPSTART;
6338 REGTAIL(pRExC_state, chain, latest);
6343 if (chain == NULL) { /* Loop ran zero times. */
6344 chain = reg_node(pRExC_state, NOTHING);
6349 *flagp |= flags&SIMPLE;
6356 - regpiece - something followed by possible [*+?]
6358 * Note that the branching code sequences used for ? and the general cases
6359 * of * and + are somewhat optimized: they use the same NOTHING node as
6360 * both the endmarker for their branch list and the body of the last branch.
6361 * It might seem that this node could be dispensed with entirely, but the
6362 * endmarker role is not redundant.
6365 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6368 register regnode *ret;
6370 register char *next;
6372 const char * const origparse = RExC_parse;
6374 I32 max = REG_INFTY;
6376 const char *maxpos = NULL;
6377 GET_RE_DEBUG_FLAGS_DECL;
6379 PERL_ARGS_ASSERT_REGPIECE;
6381 DEBUG_PARSE("piec");
6383 ret = regatom(pRExC_state, &flags,depth+1);
6385 if (flags & TRYAGAIN)
6392 if (op == '{' && regcurly(RExC_parse)) {
6394 parse_start = RExC_parse; /* MJD */
6395 next = RExC_parse + 1;
6396 while (isDIGIT(*next) || *next == ',') {
6405 if (*next == '}') { /* got one */
6409 min = atoi(RExC_parse);
6413 maxpos = RExC_parse;
6415 if (!max && *maxpos != '0')
6416 max = REG_INFTY; /* meaning "infinity" */
6417 else if (max >= REG_INFTY)
6418 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6420 nextchar(pRExC_state);
6423 if ((flags&SIMPLE)) {
6424 RExC_naughty += 2 + RExC_naughty / 2;
6425 reginsert(pRExC_state, CURLY, ret, depth+1);
6426 Set_Node_Offset(ret, parse_start+1); /* MJD */
6427 Set_Node_Cur_Length(ret);
6430 regnode * const w = reg_node(pRExC_state, WHILEM);
6433 REGTAIL(pRExC_state, ret, w);
6434 if (!SIZE_ONLY && RExC_extralen) {
6435 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6436 reginsert(pRExC_state, NOTHING,ret, depth+1);
6437 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6439 reginsert(pRExC_state, CURLYX,ret, depth+1);
6441 Set_Node_Offset(ret, parse_start+1);
6442 Set_Node_Length(ret,
6443 op == '{' ? (RExC_parse - parse_start) : 1);
6445 if (!SIZE_ONLY && RExC_extralen)
6446 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6447 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6449 RExC_whilem_seen++, RExC_extralen += 3;
6450 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6459 vFAIL("Can't do {n,m} with n > m");
6461 ARG1_SET(ret, (U16)min);
6462 ARG2_SET(ret, (U16)max);
6474 #if 0 /* Now runtime fix should be reliable. */
6476 /* if this is reinstated, don't forget to put this back into perldiag:
6478 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6480 (F) The part of the regexp subject to either the * or + quantifier
6481 could match an empty string. The {#} shows in the regular
6482 expression about where the problem was discovered.
6486 if (!(flags&HASWIDTH) && op != '?')
6487 vFAIL("Regexp *+ operand could be empty");
6490 parse_start = RExC_parse;
6491 nextchar(pRExC_state);
6493 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6495 if (op == '*' && (flags&SIMPLE)) {
6496 reginsert(pRExC_state, STAR, ret, depth+1);
6500 else if (op == '*') {
6504 else if (op == '+' && (flags&SIMPLE)) {
6505 reginsert(pRExC_state, PLUS, ret, depth+1);
6509 else if (op == '+') {
6513 else if (op == '?') {
6518 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6520 "%.*s matches null string many times",
6521 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6525 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6526 nextchar(pRExC_state);
6527 reginsert(pRExC_state, MINMOD, ret, depth+1);
6528 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6530 #ifndef REG_ALLOW_MINMOD_SUSPEND
6533 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6535 nextchar(pRExC_state);
6536 ender = reg_node(pRExC_state, SUCCEED);
6537 REGTAIL(pRExC_state, ret, ender);
6538 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6540 ender = reg_node(pRExC_state, TAIL);
6541 REGTAIL(pRExC_state, ret, ender);
6545 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6547 vFAIL("Nested quantifiers");
6554 /* reg_namedseq(pRExC_state,UVp)
6556 This is expected to be called by a parser routine that has
6557 recognized'\N' and needs to handle the rest. RExC_parse is
6558 expected to point at the first char following the N at the time
6561 If valuep is non-null then it is assumed that we are parsing inside
6562 of a charclass definition and the first codepoint in the resolved
6563 string is returned via *valuep and the routine will return NULL.
6564 In this mode if a multichar string is returned from the charnames
6565 handler a warning will be issued, and only the first char in the
6566 sequence will be examined. If the string returned is zero length
6567 then the value of *valuep is undefined and NON-NULL will
6568 be returned to indicate failure. (This will NOT be a valid pointer
6571 If value is null then it is assumed that we are parsing normal text
6572 and inserts a new EXACT node into the program containing the resolved
6573 string and returns a pointer to the new node. If the string is
6574 zerolength a NOTHING node is emitted.
6576 On success RExC_parse is set to the char following the endbrace.
6577 Parsing failures will generate a fatal errorvia vFAIL(...)
6579 NOTE: We cache all results from the charnames handler locally in
6580 the RExC_charnames hash (created on first use) to prevent a charnames
6581 handler from playing silly-buggers and returning a short string and
6582 then a long string for a given pattern. Since the regexp program
6583 size is calculated during an initial parse this would result
6584 in a buffer overrun so we cache to prevent the charname result from
6585 changing during the course of the parse.
6589 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6591 char * name; /* start of the content of the name */
6592 char * endbrace; /* endbrace following the name */
6595 STRLEN len; /* this has various purposes throughout the code */
6596 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6597 regnode *ret = NULL;
6599 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6601 if (*RExC_parse != '{') {
6602 vFAIL("Missing braces on \\N{}");
6604 name = RExC_parse+1;
6605 endbrace = strchr(RExC_parse, '}');
6608 vFAIL("Missing right brace on \\N{}");
6610 RExC_parse = endbrace + 1;
6613 /* RExC_parse points at the beginning brace,
6614 endbrace points at the last */
6615 if ( name[0]=='U' && name[1]=='+' ) {
6616 /* its a "Unicode hex" notation {U+89AB} */
6617 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6618 | PERL_SCAN_DISALLOW_PREFIX
6619 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6621 len = (STRLEN)(endbrace - name - 2);
6622 cp = grok_hex(name + 2, &len, &fl, NULL);
6623 if ( len != (STRLEN)(endbrace - name - 2) ) {
6627 if (cp > 0xff) RExC_utf8 = 1;
6632 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6633 * is going to be in utf8 and the representation changes under utf8. */
6634 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6635 U8 string[UTF8_MAXBYTES+1];
6638 tmps = uvuni_to_utf8(string, cp);
6639 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6640 } else { /* Otherwise, no need for utf8, can skip that step */
6643 sv_str= newSVpvn(&string, 1);
6646 /* fetch the charnames handler for this scope */
6647 HV * const table = GvHV(PL_hintgv);
6649 hv_fetchs(table, "charnames", FALSE) :
6651 SV *cv= cvp ? *cvp : NULL;
6654 /* create an SV with the name as argument */
6655 sv_name = newSVpvn(name, endbrace - name);
6657 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6658 vFAIL2("Constant(\\N{%s}) unknown: "
6659 "(possibly a missing \"use charnames ...\")",
6662 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6663 vFAIL2("Constant(\\N{%s}): "
6664 "$^H{charnames} is not defined",SvPVX(sv_name));
6669 if (!RExC_charnames) {
6670 /* make sure our cache is allocated */
6671 RExC_charnames = newHV();
6672 sv_2mortal(MUTABLE_SV(RExC_charnames));
6674 /* see if we have looked this one up before */
6675 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6677 sv_str = HeVAL(he_str);
6690 count= call_sv(cv, G_SCALAR);
6692 if (count == 1) { /* XXXX is this right? dmq */
6694 SvREFCNT_inc_simple_void(sv_str);
6702 if ( !sv_str || !SvOK(sv_str) ) {
6703 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6704 "did not return a defined value",SvPVX(sv_name));
6706 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6711 char *p = SvPV(sv_str, len);
6714 if ( SvUTF8(sv_str) ) {
6715 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6719 We have to turn on utf8 for high bit chars otherwise
6720 we get failures with
6722 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6723 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6725 This is different from what \x{} would do with the same
6726 codepoint, where the condition is > 0xFF.
6733 /* warn if we havent used the whole string? */
6735 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6737 "Ignoring excess chars from \\N{%s} in character class",
6741 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6743 "Ignoring zero length \\N{%s} in character class",
6748 SvREFCNT_dec(sv_name);
6750 SvREFCNT_dec(sv_str);
6751 return len ? NULL : (regnode *)&len;
6752 } else if(SvCUR(sv_str)) {
6758 char * parse_start = name-3; /* needed for the offsets */
6760 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6762 ret = reg_node(pRExC_state,
6763 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6766 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6767 sv_utf8_upgrade(sv_str);
6768 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6772 p = SvPV(sv_str, len);
6774 /* len is the length written, charlen is the size the char read */
6775 for ( len = 0; p < pend; p += charlen ) {
6777 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6779 STRLEN foldlen,numlen;
6780 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6781 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6782 /* Emit all the Unicode characters. */
6784 for (foldbuf = tmpbuf;
6788 uvc = utf8_to_uvchr(foldbuf, &numlen);
6790 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6793 /* In EBCDIC the numlen
6794 * and unilen can differ. */
6796 if (numlen >= foldlen)
6800 break; /* "Can't happen." */
6803 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6815 RExC_size += STR_SZ(len);
6818 RExC_emit += STR_SZ(len);
6820 Set_Node_Cur_Length(ret); /* MJD */
6822 nextchar(pRExC_state);
6823 } else { /* zero length */
6824 ret = reg_node(pRExC_state,NOTHING);
6827 SvREFCNT_dec(sv_str);
6830 SvREFCNT_dec(sv_name);
6840 * It returns the code point in utf8 for the value in *encp.
6841 * value: a code value in the source encoding
6842 * encp: a pointer to an Encode object
6844 * If the result from Encode is not a single character,
6845 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6848 S_reg_recode(pTHX_ const char value, SV **encp)
6851 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6852 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6853 const STRLEN newlen = SvCUR(sv);
6854 UV uv = UNICODE_REPLACEMENT;
6856 PERL_ARGS_ASSERT_REG_RECODE;
6860 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6863 if (!newlen || numlen != newlen) {
6864 uv = UNICODE_REPLACEMENT;
6872 - regatom - the lowest level
6874 Try to identify anything special at the start of the pattern. If there
6875 is, then handle it as required. This may involve generating a single regop,
6876 such as for an assertion; or it may involve recursing, such as to
6877 handle a () structure.
6879 If the string doesn't start with something special then we gobble up
6880 as much literal text as we can.
6882 Once we have been able to handle whatever type of thing started the
6883 sequence, we return.
6885 Note: we have to be careful with escapes, as they can be both literal
6886 and special, and in the case of \10 and friends can either, depending
6887 on context. Specifically there are two seperate switches for handling
6888 escape sequences, with the one for handling literal escapes requiring
6889 a dummy entry for all of the special escapes that are actually handled
6894 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6897 register regnode *ret = NULL;
6899 char *parse_start = RExC_parse;
6900 GET_RE_DEBUG_FLAGS_DECL;
6901 DEBUG_PARSE("atom");
6902 *flagp = WORST; /* Tentatively. */
6904 PERL_ARGS_ASSERT_REGATOM;
6907 switch ((U8)*RExC_parse) {
6909 RExC_seen_zerolen++;
6910 nextchar(pRExC_state);
6911 if (RExC_flags & RXf_PMf_MULTILINE)
6912 ret = reg_node(pRExC_state, MBOL);
6913 else if (RExC_flags & RXf_PMf_SINGLELINE)
6914 ret = reg_node(pRExC_state, SBOL);
6916 ret = reg_node(pRExC_state, BOL);
6917 Set_Node_Length(ret, 1); /* MJD */
6920 nextchar(pRExC_state);
6922 RExC_seen_zerolen++;
6923 if (RExC_flags & RXf_PMf_MULTILINE)
6924 ret = reg_node(pRExC_state, MEOL);
6925 else if (RExC_flags & RXf_PMf_SINGLELINE)
6926 ret = reg_node(pRExC_state, SEOL);
6928 ret = reg_node(pRExC_state, EOL);
6929 Set_Node_Length(ret, 1); /* MJD */
6932 nextchar(pRExC_state);
6933 if (RExC_flags & RXf_PMf_SINGLELINE)
6934 ret = reg_node(pRExC_state, SANY);
6936 ret = reg_node(pRExC_state, REG_ANY);
6937 *flagp |= HASWIDTH|SIMPLE;
6939 Set_Node_Length(ret, 1); /* MJD */
6943 char * const oregcomp_parse = ++RExC_parse;
6944 ret = regclass(pRExC_state,depth+1);
6945 if (*RExC_parse != ']') {
6946 RExC_parse = oregcomp_parse;
6947 vFAIL("Unmatched [");
6949 nextchar(pRExC_state);
6950 *flagp |= HASWIDTH|SIMPLE;
6951 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6955 nextchar(pRExC_state);
6956 ret = reg(pRExC_state, 1, &flags,depth+1);
6958 if (flags & TRYAGAIN) {
6959 if (RExC_parse == RExC_end) {
6960 /* Make parent create an empty node if needed. */
6968 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6972 if (flags & TRYAGAIN) {
6976 vFAIL("Internal urp");
6977 /* Supposed to be caught earlier. */
6980 if (!regcurly(RExC_parse)) {
6989 vFAIL("Quantifier follows nothing");
6997 len=0; /* silence a spurious compiler warning */
6998 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
6999 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7000 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7001 ret = reganode(pRExC_state, FOLDCHAR, cp);
7002 Set_Node_Length(ret, 1); /* MJD */
7003 nextchar(pRExC_state); /* kill whitespace under /x */
7011 This switch handles escape sequences that resolve to some kind
7012 of special regop and not to literal text. Escape sequnces that
7013 resolve to literal text are handled below in the switch marked
7016 Every entry in this switch *must* have a corresponding entry
7017 in the literal escape switch. However, the opposite is not
7018 required, as the default for this switch is to jump to the
7019 literal text handling code.
7021 switch ((U8)*++RExC_parse) {
7026 /* Special Escapes */
7028 RExC_seen_zerolen++;
7029 ret = reg_node(pRExC_state, SBOL);
7031 goto finish_meta_pat;
7033 ret = reg_node(pRExC_state, GPOS);
7034 RExC_seen |= REG_SEEN_GPOS;
7036 goto finish_meta_pat;
7038 RExC_seen_zerolen++;
7039 ret = reg_node(pRExC_state, KEEPS);
7041 /* XXX:dmq : disabling in-place substitution seems to
7042 * be necessary here to avoid cases of memory corruption, as
7043 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7045 RExC_seen |= REG_SEEN_LOOKBEHIND;
7046 goto finish_meta_pat;
7048 ret = reg_node(pRExC_state, SEOL);
7050 RExC_seen_zerolen++; /* Do not optimize RE away */
7051 goto finish_meta_pat;
7053 ret = reg_node(pRExC_state, EOS);
7055 RExC_seen_zerolen++; /* Do not optimize RE away */
7056 goto finish_meta_pat;
7058 ret = reg_node(pRExC_state, CANY);
7059 RExC_seen |= REG_SEEN_CANY;
7060 *flagp |= HASWIDTH|SIMPLE;
7061 goto finish_meta_pat;
7063 ret = reg_node(pRExC_state, CLUMP);
7065 goto finish_meta_pat;
7067 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7068 *flagp |= HASWIDTH|SIMPLE;
7069 goto finish_meta_pat;
7071 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7072 *flagp |= HASWIDTH|SIMPLE;
7073 goto finish_meta_pat;
7075 RExC_seen_zerolen++;
7076 RExC_seen |= REG_SEEN_LOOKBEHIND;
7077 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7079 goto finish_meta_pat;
7081 RExC_seen_zerolen++;
7082 RExC_seen |= REG_SEEN_LOOKBEHIND;
7083 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7085 goto finish_meta_pat;
7087 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7088 *flagp |= HASWIDTH|SIMPLE;
7089 goto finish_meta_pat;
7091 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7092 *flagp |= HASWIDTH|SIMPLE;
7093 goto finish_meta_pat;
7095 ret = reg_node(pRExC_state, DIGIT);
7096 *flagp |= HASWIDTH|SIMPLE;
7097 goto finish_meta_pat;
7099 ret = reg_node(pRExC_state, NDIGIT);
7100 *flagp |= HASWIDTH|SIMPLE;
7101 goto finish_meta_pat;
7103 ret = reg_node(pRExC_state, LNBREAK);
7104 *flagp |= HASWIDTH|SIMPLE;
7105 goto finish_meta_pat;
7107 ret = reg_node(pRExC_state, HORIZWS);
7108 *flagp |= HASWIDTH|SIMPLE;
7109 goto finish_meta_pat;
7111 ret = reg_node(pRExC_state, NHORIZWS);
7112 *flagp |= HASWIDTH|SIMPLE;
7113 goto finish_meta_pat;
7115 ret = reg_node(pRExC_state, VERTWS);
7116 *flagp |= HASWIDTH|SIMPLE;
7117 goto finish_meta_pat;
7119 ret = reg_node(pRExC_state, NVERTWS);
7120 *flagp |= HASWIDTH|SIMPLE;
7122 nextchar(pRExC_state);
7123 Set_Node_Length(ret, 2); /* MJD */
7128 char* const oldregxend = RExC_end;
7130 char* parse_start = RExC_parse - 2;
7133 if (RExC_parse[1] == '{') {
7134 /* a lovely hack--pretend we saw [\pX] instead */
7135 RExC_end = strchr(RExC_parse, '}');
7137 const U8 c = (U8)*RExC_parse;
7139 RExC_end = oldregxend;
7140 vFAIL2("Missing right brace on \\%c{}", c);
7145 RExC_end = RExC_parse + 2;
7146 if (RExC_end > oldregxend)
7147 RExC_end = oldregxend;
7151 ret = regclass(pRExC_state,depth+1);
7153 RExC_end = oldregxend;
7156 Set_Node_Offset(ret, parse_start + 2);
7157 Set_Node_Cur_Length(ret);
7158 nextchar(pRExC_state);
7159 *flagp |= HASWIDTH|SIMPLE;
7163 /* Handle \N{NAME} here and not below because it can be
7164 multicharacter. join_exact() will join them up later on.
7165 Also this makes sure that things like /\N{BLAH}+/ and
7166 \N{BLAH} being multi char Just Happen. dmq*/
7168 ret= reg_namedseq(pRExC_state, NULL);
7170 case 'k': /* Handle \k<NAME> and \k'NAME' */
7173 char ch= RExC_parse[1];
7174 if (ch != '<' && ch != '\'' && ch != '{') {
7176 vFAIL2("Sequence %.2s... not terminated",parse_start);
7178 /* this pretty much dupes the code for (?P=...) in reg(), if
7179 you change this make sure you change that */
7180 char* name_start = (RExC_parse += 2);
7182 SV *sv_dat = reg_scan_name(pRExC_state,
7183 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7184 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7185 if (RExC_parse == name_start || *RExC_parse != ch)
7186 vFAIL2("Sequence %.3s... not terminated",parse_start);
7189 num = add_data( pRExC_state, 1, "S" );
7190 RExC_rxi->data->data[num]=(void*)sv_dat;
7191 SvREFCNT_inc_simple_void(sv_dat);
7195 ret = reganode(pRExC_state,
7196 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7200 /* override incorrect value set in reganode MJD */
7201 Set_Node_Offset(ret, parse_start+1);
7202 Set_Node_Cur_Length(ret); /* MJD */
7203 nextchar(pRExC_state);
7209 case '1': case '2': case '3': case '4':
7210 case '5': case '6': case '7': case '8': case '9':
7213 bool isg = *RExC_parse == 'g';
7218 if (*RExC_parse == '{') {
7222 if (*RExC_parse == '-') {
7226 if (hasbrace && !isDIGIT(*RExC_parse)) {
7227 if (isrel) RExC_parse--;
7229 goto parse_named_seq;
7231 num = atoi(RExC_parse);
7232 if (isg && num == 0)
7233 vFAIL("Reference to invalid group 0");
7235 num = RExC_npar - num;
7237 vFAIL("Reference to nonexistent or unclosed group");
7239 if (!isg && num > 9 && num >= RExC_npar)
7242 char * const parse_start = RExC_parse - 1; /* MJD */
7243 while (isDIGIT(*RExC_parse))
7245 if (parse_start == RExC_parse - 1)
7246 vFAIL("Unterminated \\g... pattern");
7248 if (*RExC_parse != '}')
7249 vFAIL("Unterminated \\g{...} pattern");
7253 if (num > (I32)RExC_rx->nparens)
7254 vFAIL("Reference to nonexistent group");
7257 ret = reganode(pRExC_state,
7258 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7262 /* override incorrect value set in reganode MJD */
7263 Set_Node_Offset(ret, parse_start+1);
7264 Set_Node_Cur_Length(ret); /* MJD */
7266 nextchar(pRExC_state);
7271 if (RExC_parse >= RExC_end)
7272 FAIL("Trailing \\");
7275 /* Do not generate "unrecognized" warnings here, we fall
7276 back into the quick-grab loop below */
7283 if (RExC_flags & RXf_PMf_EXTENDED) {
7284 if ( reg_skipcomment( pRExC_state ) )
7291 register STRLEN len;
7296 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7298 parse_start = RExC_parse - 1;
7304 ret = reg_node(pRExC_state,
7305 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7307 for (len = 0, p = RExC_parse - 1;
7308 len < 127 && p < RExC_end;
7311 char * const oldp = p;
7313 if (RExC_flags & RXf_PMf_EXTENDED)
7314 p = regwhite( pRExC_state, p );
7319 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7320 goto normal_default;
7330 /* Literal Escapes Switch
7332 This switch is meant to handle escape sequences that
7333 resolve to a literal character.
7335 Every escape sequence that represents something
7336 else, like an assertion or a char class, is handled
7337 in the switch marked 'Special Escapes' above in this
7338 routine, but also has an entry here as anything that
7339 isn't explicitly mentioned here will be treated as
7340 an unescaped equivalent literal.
7344 /* These are all the special escapes. */
7348 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7349 goto normal_default;
7350 case 'A': /* Start assertion */
7351 case 'b': case 'B': /* Word-boundary assertion*/
7352 case 'C': /* Single char !DANGEROUS! */
7353 case 'd': case 'D': /* digit class */
7354 case 'g': case 'G': /* generic-backref, pos assertion */
7355 case 'h': case 'H': /* HORIZWS */
7356 case 'k': case 'K': /* named backref, keep marker */
7357 case 'N': /* named char sequence */
7358 case 'p': case 'P': /* Unicode property */
7359 case 'R': /* LNBREAK */
7360 case 's': case 'S': /* space class */
7361 case 'v': case 'V': /* VERTWS */
7362 case 'w': case 'W': /* word class */
7363 case 'X': /* eXtended Unicode "combining character sequence" */
7364 case 'z': case 'Z': /* End of line/string assertion */
7368 /* Anything after here is an escape that resolves to a
7369 literal. (Except digits, which may or may not)
7388 ender = ASCII_TO_NATIVE('\033');
7392 ender = ASCII_TO_NATIVE('\007');
7397 char* const e = strchr(p, '}');
7401 vFAIL("Missing right brace on \\x{}");
7404 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7405 | PERL_SCAN_DISALLOW_PREFIX;
7406 STRLEN numlen = e - p - 1;
7407 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7414 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7416 ender = grok_hex(p, &numlen, &flags, NULL);
7419 if (PL_encoding && ender < 0x100)
7420 goto recode_encoding;
7424 ender = UCHARAT(p++);
7425 ender = toCTRL(ender);
7427 case '0': case '1': case '2': case '3':case '4':
7428 case '5': case '6': case '7': case '8':case '9':
7430 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7433 ender = grok_oct(p, &numlen, &flags, NULL);
7440 if (PL_encoding && ender < 0x100)
7441 goto recode_encoding;
7445 SV* enc = PL_encoding;
7446 ender = reg_recode((const char)(U8)ender, &enc);
7447 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7448 vWARN(p, "Invalid escape in the specified encoding");
7454 FAIL("Trailing \\");
7457 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7458 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7459 goto normal_default;
7464 if (UTF8_IS_START(*p) && UTF) {
7466 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7467 &numlen, UTF8_ALLOW_DEFAULT);
7474 if ( RExC_flags & RXf_PMf_EXTENDED)
7475 p = regwhite( pRExC_state, p );
7477 /* Prime the casefolded buffer. */
7478 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7480 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7485 /* Emit all the Unicode characters. */
7487 for (foldbuf = tmpbuf;
7489 foldlen -= numlen) {
7490 ender = utf8_to_uvchr(foldbuf, &numlen);
7492 const STRLEN unilen = reguni(pRExC_state, ender, s);
7495 /* In EBCDIC the numlen
7496 * and unilen can differ. */
7498 if (numlen >= foldlen)
7502 break; /* "Can't happen." */
7506 const STRLEN unilen = reguni(pRExC_state, ender, s);
7515 REGC((char)ender, s++);
7521 /* Emit all the Unicode characters. */
7523 for (foldbuf = tmpbuf;
7525 foldlen -= numlen) {
7526 ender = utf8_to_uvchr(foldbuf, &numlen);
7528 const STRLEN unilen = reguni(pRExC_state, ender, s);
7531 /* In EBCDIC the numlen
7532 * and unilen can differ. */
7534 if (numlen >= foldlen)
7542 const STRLEN unilen = reguni(pRExC_state, ender, s);
7551 REGC((char)ender, s++);
7555 Set_Node_Cur_Length(ret); /* MJD */
7556 nextchar(pRExC_state);
7558 /* len is STRLEN which is unsigned, need to copy to signed */
7561 vFAIL("Internal disaster");
7565 if (len == 1 && UNI_IS_INVARIANT(ender))
7569 RExC_size += STR_SZ(len);
7572 RExC_emit += STR_SZ(len);
7582 S_regwhite( RExC_state_t *pRExC_state, char *p )
7584 const char *e = RExC_end;
7586 PERL_ARGS_ASSERT_REGWHITE;
7591 else if (*p == '#') {
7600 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7608 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7609 Character classes ([:foo:]) can also be negated ([:^foo:]).
7610 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7611 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7612 but trigger failures because they are currently unimplemented. */
7614 #define POSIXCC_DONE(c) ((c) == ':')
7615 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7616 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7619 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7622 I32 namedclass = OOB_NAMEDCLASS;
7624 PERL_ARGS_ASSERT_REGPPOSIXCC;
7626 if (value == '[' && RExC_parse + 1 < RExC_end &&
7627 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7628 POSIXCC(UCHARAT(RExC_parse))) {
7629 const char c = UCHARAT(RExC_parse);
7630 char* const s = RExC_parse++;
7632 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7634 if (RExC_parse == RExC_end)
7635 /* Grandfather lone [:, [=, [. */
7638 const char* const t = RExC_parse++; /* skip over the c */
7641 if (UCHARAT(RExC_parse) == ']') {
7642 const char *posixcc = s + 1;
7643 RExC_parse++; /* skip over the ending ] */
7646 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7647 const I32 skip = t - posixcc;
7649 /* Initially switch on the length of the name. */
7652 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7653 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7656 /* Names all of length 5. */
7657 /* alnum alpha ascii blank cntrl digit graph lower
7658 print punct space upper */
7659 /* Offset 4 gives the best switch position. */
7660 switch (posixcc[4]) {
7662 if (memEQ(posixcc, "alph", 4)) /* alpha */
7663 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7666 if (memEQ(posixcc, "spac", 4)) /* space */
7667 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7670 if (memEQ(posixcc, "grap", 4)) /* graph */
7671 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7674 if (memEQ(posixcc, "asci", 4)) /* ascii */
7675 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7678 if (memEQ(posixcc, "blan", 4)) /* blank */
7679 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7682 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7683 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7686 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7687 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7690 if (memEQ(posixcc, "lowe", 4)) /* lower */
7691 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7692 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7693 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7696 if (memEQ(posixcc, "digi", 4)) /* digit */
7697 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7698 else if (memEQ(posixcc, "prin", 4)) /* print */
7699 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7700 else if (memEQ(posixcc, "punc", 4)) /* punct */
7701 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7706 if (memEQ(posixcc, "xdigit", 6))
7707 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7711 if (namedclass == OOB_NAMEDCLASS)
7712 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7714 assert (posixcc[skip] == ':');
7715 assert (posixcc[skip+1] == ']');
7716 } else if (!SIZE_ONLY) {
7717 /* [[=foo=]] and [[.foo.]] are still future. */
7719 /* adjust RExC_parse so the warning shows after
7721 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7723 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7726 /* Maternal grandfather:
7727 * "[:" ending in ":" but not in ":]" */
7737 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7741 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7743 if (POSIXCC(UCHARAT(RExC_parse))) {
7744 const char *s = RExC_parse;
7745 const char c = *s++;
7749 if (*s && c == *s && s[1] == ']') {
7750 if (ckWARN(WARN_REGEXP))
7752 "POSIX syntax [%c %c] belongs inside character classes",
7755 /* [[=foo=]] and [[.foo.]] are still future. */
7756 if (POSIXCC_NOTYET(c)) {
7757 /* adjust RExC_parse so the error shows after
7759 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7761 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7768 #define _C_C_T_(NAME,TEST,WORD) \
7771 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7773 for (value = 0; value < 256; value++) \
7775 ANYOF_BITMAP_SET(ret, value); \
7780 case ANYOF_N##NAME: \
7782 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7784 for (value = 0; value < 256; value++) \
7786 ANYOF_BITMAP_SET(ret, value); \
7792 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7794 for (value = 0; value < 256; value++) \
7796 ANYOF_BITMAP_SET(ret, value); \
7800 case ANYOF_N##NAME: \
7801 for (value = 0; value < 256; value++) \
7803 ANYOF_BITMAP_SET(ret, value); \
7809 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7810 so that it is possible to override the option here without having to
7811 rebuild the entire core. as we are required to do if we change regcomp.h
7812 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7814 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7815 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7818 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7819 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7821 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7825 parse a class specification and produce either an ANYOF node that
7826 matches the pattern or if the pattern matches a single char only and
7827 that char is < 256 and we are case insensitive then we produce an
7832 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7835 register UV nextvalue;
7836 register IV prevvalue = OOB_UNICODE;
7837 register IV range = 0;
7838 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7839 register regnode *ret;
7842 char *rangebegin = NULL;
7843 bool need_class = 0;
7846 bool optimize_invert = TRUE;
7847 AV* unicode_alternate = NULL;
7849 UV literal_endpoint = 0;
7851 UV stored = 0; /* number of chars stored in the class */
7853 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7854 case we need to change the emitted regop to an EXACT. */
7855 const char * orig_parse = RExC_parse;
7856 GET_RE_DEBUG_FLAGS_DECL;
7858 PERL_ARGS_ASSERT_REGCLASS;
7860 PERL_UNUSED_ARG(depth);
7863 DEBUG_PARSE("clas");
7865 /* Assume we are going to generate an ANYOF node. */
7866 ret = reganode(pRExC_state, ANYOF, 0);
7869 ANYOF_FLAGS(ret) = 0;
7871 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7875 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7879 RExC_size += ANYOF_SKIP;
7880 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7883 RExC_emit += ANYOF_SKIP;
7885 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7887 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7888 ANYOF_BITMAP_ZERO(ret);
7889 listsv = newSVpvs("# comment\n");
7892 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7894 if (!SIZE_ONLY && POSIXCC(nextvalue))
7895 checkposixcc(pRExC_state);
7897 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7898 if (UCHARAT(RExC_parse) == ']')
7902 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7906 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7909 rangebegin = RExC_parse;
7911 value = utf8n_to_uvchr((U8*)RExC_parse,
7912 RExC_end - RExC_parse,
7913 &numlen, UTF8_ALLOW_DEFAULT);
7914 RExC_parse += numlen;
7917 value = UCHARAT(RExC_parse++);
7919 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7920 if (value == '[' && POSIXCC(nextvalue))
7921 namedclass = regpposixcc(pRExC_state, value);
7922 else if (value == '\\') {
7924 value = utf8n_to_uvchr((U8*)RExC_parse,
7925 RExC_end - RExC_parse,
7926 &numlen, UTF8_ALLOW_DEFAULT);
7927 RExC_parse += numlen;
7930 value = UCHARAT(RExC_parse++);
7931 /* Some compilers cannot handle switching on 64-bit integer
7932 * values, therefore value cannot be an UV. Yes, this will
7933 * be a problem later if we want switch on Unicode.
7934 * A similar issue a little bit later when switching on
7935 * namedclass. --jhi */
7936 switch ((I32)value) {
7937 case 'w': namedclass = ANYOF_ALNUM; break;
7938 case 'W': namedclass = ANYOF_NALNUM; break;
7939 case 's': namedclass = ANYOF_SPACE; break;
7940 case 'S': namedclass = ANYOF_NSPACE; break;
7941 case 'd': namedclass = ANYOF_DIGIT; break;
7942 case 'D': namedclass = ANYOF_NDIGIT; break;
7943 case 'v': namedclass = ANYOF_VERTWS; break;
7944 case 'V': namedclass = ANYOF_NVERTWS; break;
7945 case 'h': namedclass = ANYOF_HORIZWS; break;
7946 case 'H': namedclass = ANYOF_NHORIZWS; break;
7947 case 'N': /* Handle \N{NAME} in class */
7949 /* We only pay attention to the first char of
7950 multichar strings being returned. I kinda wonder
7951 if this makes sense as it does change the behaviour
7952 from earlier versions, OTOH that behaviour was broken
7954 UV v; /* value is register so we cant & it /grrr */
7955 if (reg_namedseq(pRExC_state, &v)) {
7965 if (RExC_parse >= RExC_end)
7966 vFAIL2("Empty \\%c{}", (U8)value);
7967 if (*RExC_parse == '{') {
7968 const U8 c = (U8)value;
7969 e = strchr(RExC_parse++, '}');
7971 vFAIL2("Missing right brace on \\%c{}", c);
7972 while (isSPACE(UCHARAT(RExC_parse)))
7974 if (e == RExC_parse)
7975 vFAIL2("Empty \\%c{}", c);
7977 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7985 if (UCHARAT(RExC_parse) == '^') {
7988 value = value == 'p' ? 'P' : 'p'; /* toggle */
7989 while (isSPACE(UCHARAT(RExC_parse))) {
7994 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7995 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7998 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7999 namedclass = ANYOF_MAX; /* no official name, but it's named */
8002 case 'n': value = '\n'; break;
8003 case 'r': value = '\r'; break;
8004 case 't': value = '\t'; break;
8005 case 'f': value = '\f'; break;
8006 case 'b': value = '\b'; break;
8007 case 'e': value = ASCII_TO_NATIVE('\033');break;
8008 case 'a': value = ASCII_TO_NATIVE('\007');break;
8010 if (*RExC_parse == '{') {
8011 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8012 | PERL_SCAN_DISALLOW_PREFIX;
8013 char * const e = strchr(RExC_parse++, '}');
8015 vFAIL("Missing right brace on \\x{}");
8017 numlen = e - RExC_parse;
8018 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8022 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8024 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8025 RExC_parse += numlen;
8027 if (PL_encoding && value < 0x100)
8028 goto recode_encoding;
8031 value = UCHARAT(RExC_parse++);
8032 value = toCTRL(value);
8034 case '0': case '1': case '2': case '3': case '4':
8035 case '5': case '6': case '7': case '8': case '9':
8039 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8040 RExC_parse += numlen;
8041 if (PL_encoding && value < 0x100)
8042 goto recode_encoding;
8047 SV* enc = PL_encoding;
8048 value = reg_recode((const char)(U8)value, &enc);
8049 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
8051 "Invalid escape in the specified encoding");
8055 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
8057 "Unrecognized escape \\%c in character class passed through",
8061 } /* end of \blah */
8067 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8069 if (!SIZE_ONLY && !need_class)
8070 ANYOF_CLASS_ZERO(ret);
8074 /* a bad range like a-\d, a-[:digit:] ? */
8077 if (ckWARN(WARN_REGEXP)) {
8079 RExC_parse >= rangebegin ?
8080 RExC_parse - rangebegin : 0;
8082 "False [] range \"%*.*s\"",
8085 if (prevvalue < 256) {
8086 ANYOF_BITMAP_SET(ret, prevvalue);
8087 ANYOF_BITMAP_SET(ret, '-');
8090 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8091 Perl_sv_catpvf(aTHX_ listsv,
8092 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8096 range = 0; /* this was not a true range */
8102 const char *what = NULL;
8105 if (namedclass > OOB_NAMEDCLASS)
8106 optimize_invert = FALSE;
8107 /* Possible truncation here but in some 64-bit environments
8108 * the compiler gets heartburn about switch on 64-bit values.
8109 * A similar issue a little earlier when switching on value.
8111 switch ((I32)namedclass) {
8113 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8114 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8115 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8116 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8117 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8118 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8119 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8120 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8121 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8122 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8123 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8124 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8125 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8127 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8128 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8130 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8131 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8132 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8135 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8138 for (value = 0; value < 128; value++)
8139 ANYOF_BITMAP_SET(ret, value);
8141 for (value = 0; value < 256; value++) {
8143 ANYOF_BITMAP_SET(ret, value);
8152 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8155 for (value = 128; value < 256; value++)
8156 ANYOF_BITMAP_SET(ret, value);
8158 for (value = 0; value < 256; value++) {
8159 if (!isASCII(value))
8160 ANYOF_BITMAP_SET(ret, value);
8169 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8171 /* consecutive digits assumed */
8172 for (value = '0'; value <= '9'; value++)
8173 ANYOF_BITMAP_SET(ret, value);
8176 what = POSIX_CC_UNI_NAME("Digit");
8180 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8182 /* consecutive digits assumed */
8183 for (value = 0; value < '0'; value++)
8184 ANYOF_BITMAP_SET(ret, value);
8185 for (value = '9' + 1; value < 256; value++)
8186 ANYOF_BITMAP_SET(ret, value);
8189 what = POSIX_CC_UNI_NAME("Digit");
8192 /* this is to handle \p and \P */
8195 vFAIL("Invalid [::] class");
8199 /* Strings such as "+utf8::isWord\n" */
8200 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8203 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8206 } /* end of namedclass \blah */
8209 if (prevvalue > (IV)value) /* b-a */ {
8210 const int w = RExC_parse - rangebegin;
8211 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8212 range = 0; /* not a valid range */
8216 prevvalue = value; /* save the beginning of the range */
8217 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8218 RExC_parse[1] != ']') {
8221 /* a bad range like \w-, [:word:]- ? */
8222 if (namedclass > OOB_NAMEDCLASS) {
8223 if (ckWARN(WARN_REGEXP)) {
8225 RExC_parse >= rangebegin ?
8226 RExC_parse - rangebegin : 0;
8228 "False [] range \"%*.*s\"",
8232 ANYOF_BITMAP_SET(ret, '-');
8234 range = 1; /* yeah, it's a range! */
8235 continue; /* but do it the next time */
8239 /* now is the next time */
8240 /*stored += (value - prevvalue + 1);*/
8242 if (prevvalue < 256) {
8243 const IV ceilvalue = value < 256 ? value : 255;
8246 /* In EBCDIC [\x89-\x91] should include
8247 * the \x8e but [i-j] should not. */
8248 if (literal_endpoint == 2 &&
8249 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8250 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8252 if (isLOWER(prevvalue)) {
8253 for (i = prevvalue; i <= ceilvalue; i++)
8254 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8256 ANYOF_BITMAP_SET(ret, i);
8259 for (i = prevvalue; i <= ceilvalue; i++)
8260 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8262 ANYOF_BITMAP_SET(ret, i);
8268 for (i = prevvalue; i <= ceilvalue; i++) {
8269 if (!ANYOF_BITMAP_TEST(ret,i)) {
8271 ANYOF_BITMAP_SET(ret, i);
8275 if (value > 255 || UTF) {
8276 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8277 const UV natvalue = NATIVE_TO_UNI(value);
8278 stored+=2; /* can't optimize this class */
8279 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8280 if (prevnatvalue < natvalue) { /* what about > ? */
8281 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8282 prevnatvalue, natvalue);
8284 else if (prevnatvalue == natvalue) {
8285 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8287 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8289 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8291 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8292 if (RExC_precomp[0] == ':' &&
8293 RExC_precomp[1] == '[' &&
8294 (f == 0xDF || f == 0x92)) {
8295 f = NATIVE_TO_UNI(f);
8298 /* If folding and foldable and a single
8299 * character, insert also the folded version
8300 * to the charclass. */
8302 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8303 if ((RExC_precomp[0] == ':' &&
8304 RExC_precomp[1] == '[' &&
8306 (value == 0xFB05 || value == 0xFB06))) ?
8307 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8308 foldlen == (STRLEN)UNISKIP(f) )
8310 if (foldlen == (STRLEN)UNISKIP(f))
8312 Perl_sv_catpvf(aTHX_ listsv,
8315 /* Any multicharacter foldings
8316 * require the following transform:
8317 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8318 * where E folds into "pq" and F folds
8319 * into "rst", all other characters
8320 * fold to single characters. We save
8321 * away these multicharacter foldings,
8322 * to be later saved as part of the
8323 * additional "s" data. */
8326 if (!unicode_alternate)
8327 unicode_alternate = newAV();
8328 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8330 av_push(unicode_alternate, sv);
8334 /* If folding and the value is one of the Greek
8335 * sigmas insert a few more sigmas to make the
8336 * folding rules of the sigmas to work right.
8337 * Note that not all the possible combinations
8338 * are handled here: some of them are handled
8339 * by the standard folding rules, and some of
8340 * them (literal or EXACTF cases) are handled
8341 * during runtime in regexec.c:S_find_byclass(). */
8342 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8343 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8344 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8345 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8346 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8348 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8349 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8350 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8355 literal_endpoint = 0;
8359 range = 0; /* this range (if it was one) is done now */
8363 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8365 RExC_size += ANYOF_CLASS_ADD_SKIP;
8367 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8373 /****** !SIZE_ONLY AFTER HERE *********/
8375 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8376 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8378 /* optimize single char class to an EXACT node
8379 but *only* when its not a UTF/high char */
8380 const char * cur_parse= RExC_parse;
8381 RExC_emit = (regnode *)orig_emit;
8382 RExC_parse = (char *)orig_parse;
8383 ret = reg_node(pRExC_state,
8384 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8385 RExC_parse = (char *)cur_parse;
8386 *STRING(ret)= (char)value;
8388 RExC_emit += STR_SZ(1);
8390 SvREFCNT_dec(listsv);
8394 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8395 if ( /* If the only flag is folding (plus possibly inversion). */
8396 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8398 for (value = 0; value < 256; ++value) {
8399 if (ANYOF_BITMAP_TEST(ret, value)) {
8400 UV fold = PL_fold[value];
8403 ANYOF_BITMAP_SET(ret, fold);
8406 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8409 /* optimize inverted simple patterns (e.g. [^a-z]) */
8410 if (optimize_invert &&
8411 /* If the only flag is inversion. */
8412 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8413 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8414 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8415 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8418 AV * const av = newAV();
8420 /* The 0th element stores the character class description
8421 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8422 * to initialize the appropriate swash (which gets stored in
8423 * the 1st element), and also useful for dumping the regnode.
8424 * The 2nd element stores the multicharacter foldings,
8425 * used later (regexec.c:S_reginclass()). */
8426 av_store(av, 0, listsv);
8427 av_store(av, 1, NULL);
8428 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8429 rv = newRV_noinc(MUTABLE_SV(av));
8430 n = add_data(pRExC_state, 1, "s");
8431 RExC_rxi->data->data[n] = (void*)rv;
8439 /* reg_skipcomment()
8441 Absorbs an /x style # comments from the input stream.
8442 Returns true if there is more text remaining in the stream.
8443 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8444 terminates the pattern without including a newline.
8446 Note its the callers responsibility to ensure that we are
8452 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8456 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8458 while (RExC_parse < RExC_end)
8459 if (*RExC_parse++ == '\n') {
8464 /* we ran off the end of the pattern without ending
8465 the comment, so we have to add an \n when wrapping */
8466 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8474 Advance that parse position, and optionall absorbs
8475 "whitespace" from the inputstream.
8477 Without /x "whitespace" means (?#...) style comments only,
8478 with /x this means (?#...) and # comments and whitespace proper.
8480 Returns the RExC_parse point from BEFORE the scan occurs.
8482 This is the /x friendly way of saying RExC_parse++.
8486 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8488 char* const retval = RExC_parse++;
8490 PERL_ARGS_ASSERT_NEXTCHAR;
8493 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8494 RExC_parse[2] == '#') {
8495 while (*RExC_parse != ')') {
8496 if (RExC_parse == RExC_end)
8497 FAIL("Sequence (?#... not terminated");
8503 if (RExC_flags & RXf_PMf_EXTENDED) {
8504 if (isSPACE(*RExC_parse)) {
8508 else if (*RExC_parse == '#') {
8509 if ( reg_skipcomment( pRExC_state ) )
8518 - reg_node - emit a node
8520 STATIC regnode * /* Location. */
8521 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8524 register regnode *ptr;
8525 regnode * const ret = RExC_emit;
8526 GET_RE_DEBUG_FLAGS_DECL;
8528 PERL_ARGS_ASSERT_REG_NODE;
8531 SIZE_ALIGN(RExC_size);
8535 if (RExC_emit >= RExC_emit_bound)
8536 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8538 NODE_ALIGN_FILL(ret);
8540 FILL_ADVANCE_NODE(ptr, op);
8541 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8542 #ifdef RE_TRACK_PATTERN_OFFSETS
8543 if (RExC_offsets) { /* MJD */
8544 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8545 "reg_node", __LINE__,
8547 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8548 ? "Overwriting end of array!\n" : "OK",
8549 (UV)(RExC_emit - RExC_emit_start),
8550 (UV)(RExC_parse - RExC_start),
8551 (UV)RExC_offsets[0]));
8552 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8560 - reganode - emit a node with an argument
8562 STATIC regnode * /* Location. */
8563 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8566 register regnode *ptr;
8567 regnode * const ret = RExC_emit;
8568 GET_RE_DEBUG_FLAGS_DECL;
8570 PERL_ARGS_ASSERT_REGANODE;
8573 SIZE_ALIGN(RExC_size);
8578 assert(2==regarglen[op]+1);
8580 Anything larger than this has to allocate the extra amount.
8581 If we changed this to be:
8583 RExC_size += (1 + regarglen[op]);
8585 then it wouldn't matter. Its not clear what side effect
8586 might come from that so its not done so far.
8591 if (RExC_emit >= RExC_emit_bound)
8592 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8594 NODE_ALIGN_FILL(ret);
8596 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8597 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8598 #ifdef RE_TRACK_PATTERN_OFFSETS
8599 if (RExC_offsets) { /* MJD */
8600 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8604 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8605 "Overwriting end of array!\n" : "OK",
8606 (UV)(RExC_emit - RExC_emit_start),
8607 (UV)(RExC_parse - RExC_start),
8608 (UV)RExC_offsets[0]));
8609 Set_Cur_Node_Offset;
8617 - reguni - emit (if appropriate) a Unicode character
8620 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8624 PERL_ARGS_ASSERT_REGUNI;
8626 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8630 - reginsert - insert an operator in front of already-emitted operand
8632 * Means relocating the operand.
8635 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8638 register regnode *src;
8639 register regnode *dst;
8640 register regnode *place;
8641 const int offset = regarglen[(U8)op];
8642 const int size = NODE_STEP_REGNODE + offset;
8643 GET_RE_DEBUG_FLAGS_DECL;
8645 PERL_ARGS_ASSERT_REGINSERT;
8646 PERL_UNUSED_ARG(depth);
8647 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8648 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8657 if (RExC_open_parens) {
8659 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8660 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8661 if ( RExC_open_parens[paren] >= opnd ) {
8662 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8663 RExC_open_parens[paren] += size;
8665 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8667 if ( RExC_close_parens[paren] >= opnd ) {
8668 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8669 RExC_close_parens[paren] += size;
8671 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8676 while (src > opnd) {
8677 StructCopy(--src, --dst, regnode);
8678 #ifdef RE_TRACK_PATTERN_OFFSETS
8679 if (RExC_offsets) { /* MJD 20010112 */
8680 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8684 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8685 ? "Overwriting end of array!\n" : "OK",
8686 (UV)(src - RExC_emit_start),
8687 (UV)(dst - RExC_emit_start),
8688 (UV)RExC_offsets[0]));
8689 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8690 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8696 place = opnd; /* Op node, where operand used to be. */
8697 #ifdef RE_TRACK_PATTERN_OFFSETS
8698 if (RExC_offsets) { /* MJD */
8699 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8703 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8704 ? "Overwriting end of array!\n" : "OK",
8705 (UV)(place - RExC_emit_start),
8706 (UV)(RExC_parse - RExC_start),
8707 (UV)RExC_offsets[0]));
8708 Set_Node_Offset(place, RExC_parse);
8709 Set_Node_Length(place, 1);
8712 src = NEXTOPER(place);
8713 FILL_ADVANCE_NODE(place, op);
8714 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8715 Zero(src, offset, regnode);
8719 - regtail - set the next-pointer at the end of a node chain of p to val.
8720 - SEE ALSO: regtail_study
8722 /* TODO: All three parms should be const */
8724 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8727 register regnode *scan;
8728 GET_RE_DEBUG_FLAGS_DECL;
8730 PERL_ARGS_ASSERT_REGTAIL;
8732 PERL_UNUSED_ARG(depth);
8738 /* Find last node. */
8741 regnode * const temp = regnext(scan);
8743 SV * const mysv=sv_newmortal();
8744 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8745 regprop(RExC_rx, mysv, scan);
8746 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8747 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8748 (temp == NULL ? "->" : ""),
8749 (temp == NULL ? PL_reg_name[OP(val)] : "")
8757 if (reg_off_by_arg[OP(scan)]) {
8758 ARG_SET(scan, val - scan);
8761 NEXT_OFF(scan) = val - scan;
8767 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8768 - Look for optimizable sequences at the same time.
8769 - currently only looks for EXACT chains.
8771 This is expermental code. The idea is to use this routine to perform
8772 in place optimizations on branches and groups as they are constructed,
8773 with the long term intention of removing optimization from study_chunk so
8774 that it is purely analytical.
8776 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8777 to control which is which.
8780 /* TODO: All four parms should be const */
8783 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8786 register regnode *scan;
8788 #ifdef EXPERIMENTAL_INPLACESCAN
8791 GET_RE_DEBUG_FLAGS_DECL;
8793 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8799 /* Find last node. */
8803 regnode * const temp = regnext(scan);
8804 #ifdef EXPERIMENTAL_INPLACESCAN
8805 if (PL_regkind[OP(scan)] == EXACT)
8806 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8814 if( exact == PSEUDO )
8816 else if ( exact != OP(scan) )
8825 SV * const mysv=sv_newmortal();
8826 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8827 regprop(RExC_rx, mysv, scan);
8828 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8829 SvPV_nolen_const(mysv),
8831 PL_reg_name[exact]);
8838 SV * const mysv_val=sv_newmortal();
8839 DEBUG_PARSE_MSG("");
8840 regprop(RExC_rx, mysv_val, val);
8841 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8842 SvPV_nolen_const(mysv_val),
8843 (IV)REG_NODE_NUM(val),
8847 if (reg_off_by_arg[OP(scan)]) {
8848 ARG_SET(scan, val - scan);
8851 NEXT_OFF(scan) = val - scan;
8859 - regcurly - a little FSA that accepts {\d+,?\d*}
8862 S_regcurly(register const char *s)
8864 PERL_ARGS_ASSERT_REGCURLY;
8883 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8887 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8892 for (bit=0; bit<32; bit++) {
8893 if (flags & (1<<bit)) {
8895 PerlIO_printf(Perl_debug_log, "%s",lead);
8896 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8901 PerlIO_printf(Perl_debug_log, "\n");
8903 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8909 Perl_regdump(pTHX_ const regexp *r)
8913 SV * const sv = sv_newmortal();
8914 SV *dsv= sv_newmortal();
8916 GET_RE_DEBUG_FLAGS_DECL;
8918 PERL_ARGS_ASSERT_REGDUMP;
8920 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8922 /* Header fields of interest. */
8923 if (r->anchored_substr) {
8924 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8925 RE_SV_DUMPLEN(r->anchored_substr), 30);
8926 PerlIO_printf(Perl_debug_log,
8927 "anchored %s%s at %"IVdf" ",
8928 s, RE_SV_TAIL(r->anchored_substr),
8929 (IV)r->anchored_offset);
8930 } else if (r->anchored_utf8) {
8931 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8932 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8933 PerlIO_printf(Perl_debug_log,
8934 "anchored utf8 %s%s at %"IVdf" ",
8935 s, RE_SV_TAIL(r->anchored_utf8),
8936 (IV)r->anchored_offset);
8938 if (r->float_substr) {
8939 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8940 RE_SV_DUMPLEN(r->float_substr), 30);
8941 PerlIO_printf(Perl_debug_log,
8942 "floating %s%s at %"IVdf"..%"UVuf" ",
8943 s, RE_SV_TAIL(r->float_substr),
8944 (IV)r->float_min_offset, (UV)r->float_max_offset);
8945 } else if (r->float_utf8) {
8946 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8947 RE_SV_DUMPLEN(r->float_utf8), 30);
8948 PerlIO_printf(Perl_debug_log,
8949 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8950 s, RE_SV_TAIL(r->float_utf8),
8951 (IV)r->float_min_offset, (UV)r->float_max_offset);
8953 if (r->check_substr || r->check_utf8)
8954 PerlIO_printf(Perl_debug_log,
8956 (r->check_substr == r->float_substr
8957 && r->check_utf8 == r->float_utf8
8958 ? "(checking floating" : "(checking anchored"));
8959 if (r->extflags & RXf_NOSCAN)
8960 PerlIO_printf(Perl_debug_log, " noscan");
8961 if (r->extflags & RXf_CHECK_ALL)
8962 PerlIO_printf(Perl_debug_log, " isall");
8963 if (r->check_substr || r->check_utf8)
8964 PerlIO_printf(Perl_debug_log, ") ");
8966 if (ri->regstclass) {
8967 regprop(r, sv, ri->regstclass);
8968 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8970 if (r->extflags & RXf_ANCH) {
8971 PerlIO_printf(Perl_debug_log, "anchored");
8972 if (r->extflags & RXf_ANCH_BOL)
8973 PerlIO_printf(Perl_debug_log, "(BOL)");
8974 if (r->extflags & RXf_ANCH_MBOL)
8975 PerlIO_printf(Perl_debug_log, "(MBOL)");
8976 if (r->extflags & RXf_ANCH_SBOL)
8977 PerlIO_printf(Perl_debug_log, "(SBOL)");
8978 if (r->extflags & RXf_ANCH_GPOS)
8979 PerlIO_printf(Perl_debug_log, "(GPOS)");
8980 PerlIO_putc(Perl_debug_log, ' ');
8982 if (r->extflags & RXf_GPOS_SEEN)
8983 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8984 if (r->intflags & PREGf_SKIP)
8985 PerlIO_printf(Perl_debug_log, "plus ");
8986 if (r->intflags & PREGf_IMPLICIT)
8987 PerlIO_printf(Perl_debug_log, "implicit ");
8988 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8989 if (r->extflags & RXf_EVAL_SEEN)
8990 PerlIO_printf(Perl_debug_log, "with eval ");
8991 PerlIO_printf(Perl_debug_log, "\n");
8992 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8994 PERL_ARGS_ASSERT_REGDUMP;
8995 PERL_UNUSED_CONTEXT;
8997 #endif /* DEBUGGING */
9001 - regprop - printable representation of opcode
9003 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9006 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9007 if (flags & ANYOF_INVERT) \
9008 /*make sure the invert info is in each */ \
9009 sv_catpvs(sv, "^"); \
9015 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9020 RXi_GET_DECL(prog,progi);
9021 GET_RE_DEBUG_FLAGS_DECL;
9023 PERL_ARGS_ASSERT_REGPROP;
9027 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9028 /* It would be nice to FAIL() here, but this may be called from
9029 regexec.c, and it would be hard to supply pRExC_state. */
9030 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9031 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9033 k = PL_regkind[OP(o)];
9037 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9038 * is a crude hack but it may be the best for now since
9039 * we have no flag "this EXACTish node was UTF-8"
9041 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9042 PERL_PV_ESCAPE_UNI_DETECT |
9043 PERL_PV_PRETTY_ELLIPSES |
9044 PERL_PV_PRETTY_LTGT |
9045 PERL_PV_PRETTY_NOCLEAR
9047 } else if (k == TRIE) {
9048 /* print the details of the trie in dumpuntil instead, as
9049 * progi->data isn't available here */
9050 const char op = OP(o);
9051 const U32 n = ARG(o);
9052 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9053 (reg_ac_data *)progi->data->data[n] :
9055 const reg_trie_data * const trie
9056 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9058 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9059 DEBUG_TRIE_COMPILE_r(
9060 Perl_sv_catpvf(aTHX_ sv,
9061 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9062 (UV)trie->startstate,
9063 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9064 (UV)trie->wordcount,
9067 (UV)TRIE_CHARCOUNT(trie),
9068 (UV)trie->uniquecharcount
9071 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9073 int rangestart = -1;
9074 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9076 for (i = 0; i <= 256; i++) {
9077 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9078 if (rangestart == -1)
9080 } else if (rangestart != -1) {
9081 if (i <= rangestart + 3)
9082 for (; rangestart < i; rangestart++)
9083 put_byte(sv, rangestart);
9085 put_byte(sv, rangestart);
9087 put_byte(sv, i - 1);
9095 } else if (k == CURLY) {
9096 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9097 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9098 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9100 else if (k == WHILEM && o->flags) /* Ordinal/of */
9101 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9102 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9103 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9104 if ( RXp_PAREN_NAMES(prog) ) {
9105 if ( k != REF || OP(o) < NREF) {
9106 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9107 SV **name= av_fetch(list, ARG(o), 0 );
9109 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9112 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9113 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9114 I32 *nums=(I32*)SvPVX(sv_dat);
9115 SV **name= av_fetch(list, nums[0], 0 );
9118 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9119 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9120 (n ? "," : ""), (IV)nums[n]);
9122 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9126 } else if (k == GOSUB)
9127 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9128 else if (k == VERB) {
9130 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9131 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9132 } else if (k == LOGICAL)
9133 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9134 else if (k == FOLDCHAR)
9135 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9136 else if (k == ANYOF) {
9137 int i, rangestart = -1;
9138 const U8 flags = ANYOF_FLAGS(o);
9141 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9142 static const char * const anyofs[] = {
9175 if (flags & ANYOF_LOCALE)
9176 sv_catpvs(sv, "{loc}");
9177 if (flags & ANYOF_FOLD)
9178 sv_catpvs(sv, "{i}");
9179 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9180 if (flags & ANYOF_INVERT)
9183 /* output what the standard cp 0-255 bitmap matches */
9184 for (i = 0; i <= 256; i++) {
9185 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9186 if (rangestart == -1)
9188 } else if (rangestart != -1) {
9189 if (i <= rangestart + 3)
9190 for (; rangestart < i; rangestart++)
9191 put_byte(sv, rangestart);
9193 put_byte(sv, rangestart);
9195 put_byte(sv, i - 1);
9202 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9203 /* output any special charclass tests (used mostly under use locale) */
9204 if (o->flags & ANYOF_CLASS)
9205 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9206 if (ANYOF_CLASS_TEST(o,i)) {
9207 sv_catpv(sv, anyofs[i]);
9211 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9213 /* output information about the unicode matching */
9214 if (flags & ANYOF_UNICODE)
9215 sv_catpvs(sv, "{unicode}");
9216 else if (flags & ANYOF_UNICODE_ALL)
9217 sv_catpvs(sv, "{unicode_all}");
9221 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9225 U8 s[UTF8_MAXBYTES_CASE+1];
9227 for (i = 0; i <= 256; i++) { /* just the first 256 */
9228 uvchr_to_utf8(s, i);
9230 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9231 if (rangestart == -1)
9233 } else if (rangestart != -1) {
9234 if (i <= rangestart + 3)
9235 for (; rangestart < i; rangestart++) {
9236 const U8 * const e = uvchr_to_utf8(s,rangestart);
9238 for(p = s; p < e; p++)
9242 const U8 *e = uvchr_to_utf8(s,rangestart);
9244 for (p = s; p < e; p++)
9247 e = uvchr_to_utf8(s, i-1);
9248 for (p = s; p < e; p++)
9255 sv_catpvs(sv, "..."); /* et cetera */
9259 char *s = savesvpv(lv);
9260 char * const origs = s;
9262 while (*s && *s != '\n')
9266 const char * const t = ++s;
9284 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9286 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9287 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9289 PERL_UNUSED_CONTEXT;
9290 PERL_UNUSED_ARG(sv);
9292 PERL_UNUSED_ARG(prog);
9293 #endif /* DEBUGGING */
9297 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9298 { /* Assume that RE_INTUIT is set */
9300 GET_RE_DEBUG_FLAGS_DECL;
9302 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9303 PERL_UNUSED_CONTEXT;
9307 const char * const s = SvPV_nolen_const(prog->check_substr
9308 ? prog->check_substr : prog->check_utf8);
9310 if (!PL_colorset) reginitcolors();
9311 PerlIO_printf(Perl_debug_log,
9312 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9314 prog->check_substr ? "" : "utf8 ",
9315 PL_colors[5],PL_colors[0],
9318 (strlen(s) > 60 ? "..." : ""));
9321 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9327 handles refcounting and freeing the perl core regexp structure. When
9328 it is necessary to actually free the structure the first thing it
9329 does is call the 'free' method of the regexp_engine associated to to
9330 the regexp, allowing the handling of the void *pprivate; member
9331 first. (This routine is not overridable by extensions, which is why
9332 the extensions free is called first.)
9334 See regdupe and regdupe_internal if you change anything here.
9336 #ifndef PERL_IN_XSUB_RE
9338 Perl_pregfree(pTHX_ REGEXP *r)
9341 GET_RE_DEBUG_FLAGS_DECL;
9343 if (!r || (--r->refcnt > 0))
9346 ReREFCNT_dec(r->mother_re);
9348 CALLREGFREE_PVT(r); /* free the private data */
9349 if (RXp_PAREN_NAMES(r))
9350 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9351 Safefree(RX_WRAPPED(r));
9354 if (r->anchored_substr)
9355 SvREFCNT_dec(r->anchored_substr);
9356 if (r->anchored_utf8)
9357 SvREFCNT_dec(r->anchored_utf8);
9358 if (r->float_substr)
9359 SvREFCNT_dec(r->float_substr);
9361 SvREFCNT_dec(r->float_utf8);
9362 Safefree(r->substrs);
9364 RX_MATCH_COPY_FREE(r);
9365 #ifdef PERL_OLD_COPY_ON_WRITE
9367 SvREFCNT_dec(r->saved_copy);
9376 This is a hacky workaround to the structural issue of match results
9377 being stored in the regexp structure which is in turn stored in
9378 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9379 could be PL_curpm in multiple contexts, and could require multiple
9380 result sets being associated with the pattern simultaneously, such
9381 as when doing a recursive match with (??{$qr})
9383 The solution is to make a lightweight copy of the regexp structure
9384 when a qr// is returned from the code executed by (??{$qr}) this
9385 lightweight copy doesnt actually own any of its data except for
9386 the starp/end and the actual regexp structure itself.
9392 Perl_reg_temp_copy (pTHX_ REGEXP *r) {
9394 register const I32 npar = r->nparens+1;
9396 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9398 (void)ReREFCNT_inc(r);
9399 Newx(ret, 1, regexp);
9400 StructCopy(r, ret, regexp);
9401 Newx(ret->offs, npar, regexp_paren_pair);
9402 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9405 Newx(ret->substrs, 1, struct reg_substr_data);
9406 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9408 SvREFCNT_inc_void(ret->anchored_substr);
9409 SvREFCNT_inc_void(ret->anchored_utf8);
9410 SvREFCNT_inc_void(ret->float_substr);
9411 SvREFCNT_inc_void(ret->float_utf8);
9413 /* check_substr and check_utf8, if non-NULL, point to either their
9414 anchored or float namesakes, and don't hold a second reference. */
9416 RX_MATCH_COPIED_off(ret);
9417 #ifdef PERL_OLD_COPY_ON_WRITE
9418 ret->saved_copy = NULL;
9427 /* regfree_internal()
9429 Free the private data in a regexp. This is overloadable by
9430 extensions. Perl takes care of the regexp structure in pregfree(),
9431 this covers the *pprivate pointer which technically perldoesnt
9432 know about, however of course we have to handle the
9433 regexp_internal structure when no extension is in use.
9435 Note this is called before freeing anything in the regexp
9440 Perl_regfree_internal(pTHX_ REGEXP * const r)
9444 GET_RE_DEBUG_FLAGS_DECL;
9446 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9452 SV *dsv= sv_newmortal();
9453 RE_PV_QUOTED_DECL(s, RX_UTF8(r),
9454 dsv, RX_PRECOMP(r), RX_PRELEN(r), 60);
9455 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9456 PL_colors[4],PL_colors[5],s);
9459 #ifdef RE_TRACK_PATTERN_OFFSETS
9461 Safefree(ri->u.offsets); /* 20010421 MJD */
9464 int n = ri->data->count;
9465 PAD* new_comppad = NULL;
9470 /* If you add a ->what type here, update the comment in regcomp.h */
9471 switch (ri->data->what[n]) {
9475 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9478 Safefree(ri->data->data[n]);
9481 new_comppad = MUTABLE_AV(ri->data->data[n]);
9484 if (new_comppad == NULL)
9485 Perl_croak(aTHX_ "panic: pregfree comppad");
9486 PAD_SAVE_LOCAL(old_comppad,
9487 /* Watch out for global destruction's random ordering. */
9488 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9491 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9494 op_free((OP_4tree*)ri->data->data[n]);
9496 PAD_RESTORE_LOCAL(old_comppad);
9497 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9503 { /* Aho Corasick add-on structure for a trie node.
9504 Used in stclass optimization only */
9506 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9508 refcount = --aho->refcount;
9511 PerlMemShared_free(aho->states);
9512 PerlMemShared_free(aho->fail);
9513 /* do this last!!!! */
9514 PerlMemShared_free(ri->data->data[n]);
9515 PerlMemShared_free(ri->regstclass);
9521 /* trie structure. */
9523 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9525 refcount = --trie->refcount;
9528 PerlMemShared_free(trie->charmap);
9529 PerlMemShared_free(trie->states);
9530 PerlMemShared_free(trie->trans);
9532 PerlMemShared_free(trie->bitmap);
9534 PerlMemShared_free(trie->wordlen);
9536 PerlMemShared_free(trie->jump);
9538 PerlMemShared_free(trie->nextword);
9539 /* do this last!!!! */
9540 PerlMemShared_free(ri->data->data[n]);
9545 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9548 Safefree(ri->data->what);
9555 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9556 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9557 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9558 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9561 re_dup - duplicate a regexp.
9563 This routine is expected to clone a given regexp structure. It is only
9564 compiled under USE_ITHREADS.
9566 After all of the core data stored in struct regexp is duplicated
9567 the regexp_engine.dupe method is used to copy any private data
9568 stored in the *pprivate pointer. This allows extensions to handle
9569 any duplication it needs to do.
9571 See pregfree() and regfree_internal() if you change anything here.
9573 #if defined(USE_ITHREADS)
9574 #ifndef PERL_IN_XSUB_RE
9576 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9583 PERL_ARGS_ASSERT_RE_DUP;
9586 return (REGEXP *)NULL;
9588 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9591 npar = r->nparens+1;
9592 Newx(ret, 1, regexp);
9593 StructCopy(r, ret, regexp);
9594 Newx(ret->offs, npar, regexp_paren_pair);
9595 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9597 /* no need to copy these */
9598 Newx(ret->swap, npar, regexp_paren_pair);
9602 /* Do it this way to avoid reading from *r after the StructCopy().
9603 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9604 cache, it doesn't matter. */
9605 const bool anchored = r->check_substr
9606 ? r->check_substr == r->anchored_substr
9607 : r->check_utf8 == r->anchored_utf8;
9608 Newx(ret->substrs, 1, struct reg_substr_data);
9609 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9611 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9612 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9613 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9614 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9616 /* check_substr and check_utf8, if non-NULL, point to either their
9617 anchored or float namesakes, and don't hold a second reference. */
9619 if (ret->check_substr) {
9621 assert(r->check_utf8 == r->anchored_utf8);
9622 ret->check_substr = ret->anchored_substr;
9623 ret->check_utf8 = ret->anchored_utf8;
9625 assert(r->check_substr == r->float_substr);
9626 assert(r->check_utf8 == r->float_utf8);
9627 ret->check_substr = ret->float_substr;
9628 ret->check_utf8 = ret->float_utf8;
9630 } else if (ret->check_utf8) {
9632 ret->check_utf8 = ret->anchored_utf8;
9634 ret->check_utf8 = ret->float_utf8;
9639 precomp_offset = RX_PRECOMP(ret) - ret->wrapped;
9641 RX_WRAPPED(ret) = SAVEPVN(RX_WRAPPED(ret), RX_WRAPLEN(ret)+1);
9642 RX_PRECOMP(ret) = ret->wrapped + precomp_offset;
9643 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9646 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9648 if (RX_MATCH_COPIED(ret))
9649 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9652 #ifdef PERL_OLD_COPY_ON_WRITE
9653 ret->saved_copy = NULL;
9656 ret->mother_re = NULL;
9659 ptr_table_store(PL_ptr_table, r, ret);
9662 #endif /* PERL_IN_XSUB_RE */
9667 This is the internal complement to regdupe() which is used to copy
9668 the structure pointed to by the *pprivate pointer in the regexp.
9669 This is the core version of the extension overridable cloning hook.
9670 The regexp structure being duplicated will be copied by perl prior
9671 to this and will be provided as the regexp *r argument, however
9672 with the /old/ structures pprivate pointer value. Thus this routine
9673 may override any copying normally done by perl.
9675 It returns a pointer to the new regexp_internal structure.
9679 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9682 regexp_internal *reti;
9686 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9688 npar = r->nparens+1;
9691 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9692 Copy(ri->program, reti->program, len+1, regnode);
9695 reti->regstclass = NULL;
9699 const int count = ri->data->count;
9702 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9703 char, struct reg_data);
9704 Newx(d->what, count, U8);
9707 for (i = 0; i < count; i++) {
9708 d->what[i] = ri->data->what[i];
9709 switch (d->what[i]) {
9710 /* legal options are one of: sSfpontTu
9711 see also regcomp.h and pregfree() */
9714 case 'p': /* actually an AV, but the dup function is identical. */
9715 case 'u': /* actually an HV, but the dup function is identical. */
9716 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9719 /* This is cheating. */
9720 Newx(d->data[i], 1, struct regnode_charclass_class);
9721 StructCopy(ri->data->data[i], d->data[i],
9722 struct regnode_charclass_class);
9723 reti->regstclass = (regnode*)d->data[i];
9726 /* Compiled op trees are readonly and in shared memory,
9727 and can thus be shared without duplication. */
9729 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9733 /* Trie stclasses are readonly and can thus be shared
9734 * without duplication. We free the stclass in pregfree
9735 * when the corresponding reg_ac_data struct is freed.
9737 reti->regstclass= ri->regstclass;
9741 ((reg_trie_data*)ri->data->data[i])->refcount++;
9745 d->data[i] = ri->data->data[i];
9748 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9757 reti->name_list_idx = ri->name_list_idx;
9759 #ifdef RE_TRACK_PATTERN_OFFSETS
9760 if (ri->u.offsets) {
9761 Newx(reti->u.offsets, 2*len+1, U32);
9762 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9765 SetProgLen(reti,len);
9771 #endif /* USE_ITHREADS */
9776 converts a regexp embedded in a MAGIC struct to its stringified form,
9777 caching the converted form in the struct and returns the cached
9780 If lp is nonnull then it is used to return the length of the
9783 If flags is nonnull and the returned string contains UTF8 then
9784 (*flags & 1) will be true.
9786 If haseval is nonnull then it is used to return whether the pattern
9789 Normally called via macro:
9791 CALLREG_STRINGIFY(mg,&len,&utf8);
9795 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9797 See sv_2pv_flags() in sv.c for an example of internal usage.
9800 #ifndef PERL_IN_XSUB_RE
9803 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9805 const REGEXP * const re = (REGEXP *)mg->mg_obj;
9807 *haseval = RX_SEEN_EVALS(re);
9809 *flags = RX_UTF8(re) ? 1 : 0;
9811 *lp = RX_WRAPLEN(re);
9812 return RX_WRAPPED(re);
9816 - regnext - dig the "next" pointer out of a node
9819 Perl_regnext(pTHX_ register regnode *p)
9822 register I32 offset;
9827 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9836 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9839 STRLEN l1 = strlen(pat1);
9840 STRLEN l2 = strlen(pat2);
9843 const char *message;
9845 PERL_ARGS_ASSERT_RE_CROAK2;
9851 Copy(pat1, buf, l1 , char);
9852 Copy(pat2, buf + l1, l2 , char);
9853 buf[l1 + l2] = '\n';
9854 buf[l1 + l2 + 1] = '\0';
9856 /* ANSI variant takes additional second argument */
9857 va_start(args, pat2);
9861 msv = vmess(buf, &args);
9863 message = SvPV_const(msv,l1);
9866 Copy(message, buf, l1 , char);
9867 buf[l1-1] = '\0'; /* Overwrite \n */
9868 Perl_croak(aTHX_ "%s", buf);
9871 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9873 #ifndef PERL_IN_XSUB_RE
9875 Perl_save_re_context(pTHX)
9879 struct re_save_state *state;
9881 SAVEVPTR(PL_curcop);
9882 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9884 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9885 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9886 SSPUSHINT(SAVEt_RE_STATE);
9888 Copy(&PL_reg_state, state, 1, struct re_save_state);
9890 PL_reg_start_tmp = 0;
9891 PL_reg_start_tmpl = 0;
9892 PL_reg_oldsaved = NULL;
9893 PL_reg_oldsavedlen = 0;
9895 PL_reg_leftiter = 0;
9896 PL_reg_poscache = NULL;
9897 PL_reg_poscache_size = 0;
9898 #ifdef PERL_OLD_COPY_ON_WRITE
9902 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9904 const REGEXP * const rx = PM_GETRE(PL_curpm);
9907 for (i = 1; i <= RX_NPARENS(rx); i++) {
9908 char digits[TYPE_CHARS(long)];
9909 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9910 GV *const *const gvp
9911 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9914 GV * const gv = *gvp;
9915 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9925 clear_re(pTHX_ void *r)
9928 ReREFCNT_dec((REGEXP *)r);
9934 S_put_byte(pTHX_ SV *sv, int c)
9936 PERL_ARGS_ASSERT_PUT_BYTE;
9938 /* Our definition of isPRINT() ignores locales, so only bytes that are
9939 not part of UTF-8 are considered printable. I assume that the same
9940 holds for UTF-EBCDIC.
9941 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9942 which Wikipedia says:
9944 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9945 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9946 identical, to the ASCII delete (DEL) or rubout control character.
9947 ) So the old condition can be simplified to !isPRINT(c) */
9949 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9951 const char string = c;
9952 if (c == '-' || c == ']' || c == '\\' || c == '^')
9953 sv_catpvs(sv, "\\");
9954 sv_catpvn(sv, &string, 1);
9959 #define CLEAR_OPTSTART \
9960 if (optstart) STMT_START { \
9961 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9965 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9967 STATIC const regnode *
9968 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9969 const regnode *last, const regnode *plast,
9970 SV* sv, I32 indent, U32 depth)
9973 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9974 register const regnode *next;
9975 const regnode *optstart= NULL;
9978 GET_RE_DEBUG_FLAGS_DECL;
9980 PERL_ARGS_ASSERT_DUMPUNTIL;
9982 #ifdef DEBUG_DUMPUNTIL
9983 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9984 last ? last-start : 0,plast ? plast-start : 0);
9987 if (plast && plast < last)
9990 while (PL_regkind[op] != END && (!last || node < last)) {
9991 /* While that wasn't END last time... */
9994 if (op == CLOSE || op == WHILEM)
9996 next = regnext((regnode *)node);
9999 if (OP(node) == OPTIMIZED) {
10000 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10007 regprop(r, sv, node);
10008 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10009 (int)(2*indent + 1), "", SvPVX_const(sv));
10011 if (OP(node) != OPTIMIZED) {
10012 if (next == NULL) /* Next ptr. */
10013 PerlIO_printf(Perl_debug_log, " (0)");
10014 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10015 PerlIO_printf(Perl_debug_log, " (FAIL)");
10017 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10018 (void)PerlIO_putc(Perl_debug_log, '\n');
10022 if (PL_regkind[(U8)op] == BRANCHJ) {
10025 register const regnode *nnode = (OP(next) == LONGJMP
10026 ? regnext((regnode *)next)
10028 if (last && nnode > last)
10030 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10033 else if (PL_regkind[(U8)op] == BRANCH) {
10035 DUMPUNTIL(NEXTOPER(node), next);
10037 else if ( PL_regkind[(U8)op] == TRIE ) {
10038 const regnode *this_trie = node;
10039 const char op = OP(node);
10040 const U32 n = ARG(node);
10041 const reg_ac_data * const ac = op>=AHOCORASICK ?
10042 (reg_ac_data *)ri->data->data[n] :
10044 const reg_trie_data * const trie =
10045 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10047 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10049 const regnode *nextbranch= NULL;
10052 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10053 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10055 PerlIO_printf(Perl_debug_log, "%*s%s ",
10056 (int)(2*(indent+3)), "",
10057 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10058 PL_colors[0], PL_colors[1],
10059 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10060 PERL_PV_PRETTY_ELLIPSES |
10061 PERL_PV_PRETTY_LTGT
10066 U16 dist= trie->jump[word_idx+1];
10067 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10068 (UV)((dist ? this_trie + dist : next) - start));
10071 nextbranch= this_trie + trie->jump[0];
10072 DUMPUNTIL(this_trie + dist, nextbranch);
10074 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10075 nextbranch= regnext((regnode *)nextbranch);
10077 PerlIO_printf(Perl_debug_log, "\n");
10080 if (last && next > last)
10085 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10086 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10087 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10089 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10091 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10093 else if ( op == PLUS || op == STAR) {
10094 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10096 else if (op == ANYOF) {
10097 /* arglen 1 + class block */
10098 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10099 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10100 node = NEXTOPER(node);
10102 else if (PL_regkind[(U8)op] == EXACT) {
10103 /* Literal string, where present. */
10104 node += NODE_SZ_STR(node) - 1;
10105 node = NEXTOPER(node);
10108 node = NEXTOPER(node);
10109 node += regarglen[(U8)op];
10111 if (op == CURLYX || op == OPEN)
10115 #ifdef DEBUG_DUMPUNTIL
10116 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10121 #endif /* DEBUGGING */
10125 * c-indentation-style: bsd
10126 * c-basic-offset: 4
10127 * indent-tabs-mode: t
10130 * ex: set ts=8 sts=4 sw=4 noet: