5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *paren_names; /* Paren names */
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
144 const char *lastparse;
146 AV *paren_name_list; /* idx -> name */
147 #define RExC_lastparse (pRExC_state->lastparse)
148 #define RExC_lastnum (pRExC_state->lastnum)
149 #define RExC_paren_name_list (pRExC_state->paren_name_list)
153 #define RExC_flags (pRExC_state->flags)
154 #define RExC_precomp (pRExC_state->precomp)
155 #define RExC_rx_sv (pRExC_state->rx_sv)
156 #define RExC_rx (pRExC_state->rx)
157 #define RExC_rxi (pRExC_state->rxi)
158 #define RExC_start (pRExC_state->start)
159 #define RExC_end (pRExC_state->end)
160 #define RExC_parse (pRExC_state->parse)
161 #define RExC_whilem_seen (pRExC_state->whilem_seen)
162 #ifdef RE_TRACK_PATTERN_OFFSETS
163 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165 #define RExC_emit (pRExC_state->emit)
166 #define RExC_emit_start (pRExC_state->emit_start)
167 #define RExC_emit_bound (pRExC_state->emit_bound)
168 #define RExC_naughty (pRExC_state->naughty)
169 #define RExC_sawback (pRExC_state->sawback)
170 #define RExC_seen (pRExC_state->seen)
171 #define RExC_size (pRExC_state->size)
172 #define RExC_npar (pRExC_state->npar)
173 #define RExC_nestroot (pRExC_state->nestroot)
174 #define RExC_extralen (pRExC_state->extralen)
175 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176 #define RExC_seen_evals (pRExC_state->seen_evals)
177 #define RExC_utf8 (pRExC_state->utf8)
178 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
179 #define RExC_open_parens (pRExC_state->open_parens)
180 #define RExC_close_parens (pRExC_state->close_parens)
181 #define RExC_opend (pRExC_state->opend)
182 #define RExC_paren_names (pRExC_state->paren_names)
183 #define RExC_recurse (pRExC_state->recurse)
184 #define RExC_recurse_count (pRExC_state->recurse_count)
187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
192 #undef SPSTART /* dratted cpp namespace... */
195 * Flags to be passed up and down.
197 #define WORST 0 /* Worst case. */
198 #define HASWIDTH 0x01 /* Known to match non-null strings. */
200 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
201 * character, and if utf8, must be invariant. */
203 #define SPSTART 0x04 /* Starts with * or +. */
204 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
205 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
207 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
209 /* whether trie related optimizations are enabled */
210 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
211 #define TRIE_STUDY_OPT
212 #define FULL_TRIE_STUDY
218 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
219 #define PBITVAL(paren) (1 << ((paren) & 7))
220 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
221 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
222 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
224 /* If not already in utf8, do a longjmp back to the beginning */
225 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
226 #define REQUIRE_UTF8 STMT_START { \
227 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 /* About scan_data_t.
232 During optimisation we recurse through the regexp program performing
233 various inplace (keyhole style) optimisations. In addition study_chunk
234 and scan_commit populate this data structure with information about
235 what strings MUST appear in the pattern. We look for the longest
236 string that must appear for at a fixed location, and we look for the
237 longest string that may appear at a floating location. So for instance
242 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
243 strings (because they follow a .* construct). study_chunk will identify
244 both FOO and BAR as being the longest fixed and floating strings respectively.
246 The strings can be composites, for instance
250 will result in a composite fixed substring 'foo'.
252 For each string some basic information is maintained:
254 - offset or min_offset
255 This is the position the string must appear at, or not before.
256 It also implicitly (when combined with minlenp) tells us how many
257 character must match before the string we are searching.
258 Likewise when combined with minlenp and the length of the string
259 tells us how many characters must appear after the string we have
263 Only used for floating strings. This is the rightmost point that
264 the string can appear at. Ifset to I32 max it indicates that the
265 string can occur infinitely far to the right.
268 A pointer to the minimum length of the pattern that the string
269 was found inside. This is important as in the case of positive
270 lookahead or positive lookbehind we can have multiple patterns
275 The minimum length of the pattern overall is 3, the minimum length
276 of the lookahead part is 3, but the minimum length of the part that
277 will actually match is 1. So 'FOO's minimum length is 3, but the
278 minimum length for the F is 1. This is important as the minimum length
279 is used to determine offsets in front of and behind the string being
280 looked for. Since strings can be composites this is the length of the
281 pattern at the time it was commited with a scan_commit. Note that
282 the length is calculated by study_chunk, so that the minimum lengths
283 are not known until the full pattern has been compiled, thus the
284 pointer to the value.
288 In the case of lookbehind the string being searched for can be
289 offset past the start point of the final matching string.
290 If this value was just blithely removed from the min_offset it would
291 invalidate some of the calculations for how many chars must match
292 before or after (as they are derived from min_offset and minlen and
293 the length of the string being searched for).
294 When the final pattern is compiled and the data is moved from the
295 scan_data_t structure into the regexp structure the information
296 about lookbehind is factored in, with the information that would
297 have been lost precalculated in the end_shift field for the
300 The fields pos_min and pos_delta are used to store the minimum offset
301 and the delta to the maximum offset at the current point in the pattern.
305 typedef struct scan_data_t {
306 /*I32 len_min; unused */
307 /*I32 len_delta; unused */
311 I32 last_end; /* min value, <0 unless valid. */
314 SV **longest; /* Either &l_fixed, or &l_float. */
315 SV *longest_fixed; /* longest fixed string found in pattern */
316 I32 offset_fixed; /* offset where it starts */
317 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
318 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
319 SV *longest_float; /* longest floating string found in pattern */
320 I32 offset_float_min; /* earliest point in string it can appear */
321 I32 offset_float_max; /* latest point in string it can appear */
322 I32 *minlen_float; /* pointer to the minlen relevent to the string */
323 I32 lookbehind_float; /* is the position of the string modified by LB */
327 struct regnode_charclass_class *start_class;
331 * Forward declarations for pregcomp()'s friends.
334 static const scan_data_t zero_scan_data =
335 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
337 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
338 #define SF_BEFORE_SEOL 0x0001
339 #define SF_BEFORE_MEOL 0x0002
340 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
341 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344 # define SF_FIX_SHIFT_EOL (0+2)
345 # define SF_FL_SHIFT_EOL (0+4)
347 # define SF_FIX_SHIFT_EOL (+2)
348 # define SF_FL_SHIFT_EOL (+4)
351 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
352 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
354 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
355 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
356 #define SF_IS_INF 0x0040
357 #define SF_HAS_PAR 0x0080
358 #define SF_IN_PAR 0x0100
359 #define SF_HAS_EVAL 0x0200
360 #define SCF_DO_SUBSTR 0x0400
361 #define SCF_DO_STCLASS_AND 0x0800
362 #define SCF_DO_STCLASS_OR 0x1000
363 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
364 #define SCF_WHILEM_VISITED_POS 0x2000
366 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
367 #define SCF_SEEN_ACCEPT 0x8000
369 #define UTF (RExC_utf8 != 0)
370 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
371 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
373 #define OOB_UNICODE 12345678
374 #define OOB_NAMEDCLASS -1
376 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
377 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
380 /* length of regex to show in messages that don't mark a position within */
381 #define RegexLengthToShowInErrorMessages 127
384 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
385 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
386 * op/pragma/warn/regcomp.
388 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
389 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
391 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
394 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
395 * arg. Show regex, up to a maximum length. If it's too long, chop and add
398 #define _FAIL(code) STMT_START { \
399 const char *ellipses = ""; \
400 IV len = RExC_end - RExC_precomp; \
403 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
404 if (len > RegexLengthToShowInErrorMessages) { \
405 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
406 len = RegexLengthToShowInErrorMessages - 10; \
412 #define FAIL(msg) _FAIL( \
413 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
414 msg, (int)len, RExC_precomp, ellipses))
416 #define FAIL2(msg,arg) _FAIL( \
417 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
418 arg, (int)len, RExC_precomp, ellipses))
421 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
423 #define Simple_vFAIL(m) STMT_START { \
424 const IV offset = RExC_parse - RExC_precomp; \
425 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
426 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
432 #define vFAIL(m) STMT_START { \
434 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
439 * Like Simple_vFAIL(), but accepts two arguments.
441 #define Simple_vFAIL2(m,a1) STMT_START { \
442 const IV offset = RExC_parse - RExC_precomp; \
443 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
444 (int)offset, RExC_precomp, RExC_precomp + offset); \
448 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
450 #define vFAIL2(m,a1) STMT_START { \
452 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
453 Simple_vFAIL2(m, a1); \
458 * Like Simple_vFAIL(), but accepts three arguments.
460 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
461 const IV offset = RExC_parse - RExC_precomp; \
462 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
463 (int)offset, RExC_precomp, RExC_precomp + offset); \
467 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
469 #define vFAIL3(m,a1,a2) STMT_START { \
471 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
472 Simple_vFAIL3(m, a1, a2); \
476 * Like Simple_vFAIL(), but accepts four arguments.
478 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
479 const IV offset = RExC_parse - RExC_precomp; \
480 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define ckWARNreg(loc,m) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 (int)offset, RExC_precomp, RExC_precomp + offset); \
490 #define ckWARNregdep(loc,m) STMT_START { \
491 const IV offset = loc - RExC_precomp; \
492 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
494 (int)offset, RExC_precomp, RExC_precomp + offset); \
497 #define ckWARN2reg(loc, m, a1) STMT_START { \
498 const IV offset = loc - RExC_precomp; \
499 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 #define vWARN3(loc, m, a1, a2) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
509 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
510 const IV offset = loc - RExC_precomp; \
511 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
512 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
515 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
516 const IV offset = loc - RExC_precomp; \
517 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
518 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
527 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
528 const IV offset = loc - RExC_precomp; \
529 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
530 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534 /* Allow for side effects in s */
535 #define REGC(c,s) STMT_START { \
536 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
539 /* Macros for recording node offsets. 20001227 mjd@plover.com
540 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
541 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
542 * Element 0 holds the number n.
543 * Position is 1 indexed.
545 #ifndef RE_TRACK_PATTERN_OFFSETS
546 #define Set_Node_Offset_To_R(node,byte)
547 #define Set_Node_Offset(node,byte)
548 #define Set_Cur_Node_Offset
549 #define Set_Node_Length_To_R(node,len)
550 #define Set_Node_Length(node,len)
551 #define Set_Node_Cur_Length(node)
552 #define Node_Offset(n)
553 #define Node_Length(n)
554 #define Set_Node_Offset_Length(node,offset,len)
555 #define ProgLen(ri) ri->u.proglen
556 #define SetProgLen(ri,x) ri->u.proglen = x
558 #define ProgLen(ri) ri->u.offsets[0]
559 #define SetProgLen(ri,x) ri->u.offsets[0] = x
560 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
562 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
563 __LINE__, (int)(node), (int)(byte))); \
565 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
567 RExC_offsets[2*(node)-1] = (byte); \
572 #define Set_Node_Offset(node,byte) \
573 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
574 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
576 #define Set_Node_Length_To_R(node,len) STMT_START { \
578 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
579 __LINE__, (int)(node), (int)(len))); \
581 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
583 RExC_offsets[2*(node)] = (len); \
588 #define Set_Node_Length(node,len) \
589 Set_Node_Length_To_R((node)-RExC_emit_start, len)
590 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
591 #define Set_Node_Cur_Length(node) \
592 Set_Node_Length(node, RExC_parse - parse_start)
594 /* Get offsets and lengths */
595 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
596 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
598 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
599 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
600 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
605 #define EXPERIMENTAL_INPLACESCAN
606 #endif /*RE_TRACK_PATTERN_OFFSETS*/
608 #define DEBUG_STUDYDATA(str,data,depth) \
609 DEBUG_OPTIMISE_MORE_r(if(data){ \
610 PerlIO_printf(Perl_debug_log, \
611 "%*s" str "Pos:%"IVdf"/%"IVdf \
612 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
613 (int)(depth)*2, "", \
614 (IV)((data)->pos_min), \
615 (IV)((data)->pos_delta), \
616 (UV)((data)->flags), \
617 (IV)((data)->whilem_c), \
618 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
619 is_inf ? "INF " : "" \
621 if ((data)->last_found) \
622 PerlIO_printf(Perl_debug_log, \
623 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
624 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
625 SvPVX_const((data)->last_found), \
626 (IV)((data)->last_end), \
627 (IV)((data)->last_start_min), \
628 (IV)((data)->last_start_max), \
629 ((data)->longest && \
630 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
631 SvPVX_const((data)->longest_fixed), \
632 (IV)((data)->offset_fixed), \
633 ((data)->longest && \
634 (data)->longest==&((data)->longest_float)) ? "*" : "", \
635 SvPVX_const((data)->longest_float), \
636 (IV)((data)->offset_float_min), \
637 (IV)((data)->offset_float_max) \
639 PerlIO_printf(Perl_debug_log,"\n"); \
642 static void clear_re(pTHX_ void *r);
644 /* Mark that we cannot extend a found fixed substring at this point.
645 Update the longest found anchored substring and the longest found
646 floating substrings if needed. */
649 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
651 const STRLEN l = CHR_SVLEN(data->last_found);
652 const STRLEN old_l = CHR_SVLEN(*data->longest);
653 GET_RE_DEBUG_FLAGS_DECL;
655 PERL_ARGS_ASSERT_SCAN_COMMIT;
657 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
658 SvSetMagicSV(*data->longest, data->last_found);
659 if (*data->longest == data->longest_fixed) {
660 data->offset_fixed = l ? data->last_start_min : data->pos_min;
661 if (data->flags & SF_BEFORE_EOL)
663 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
665 data->flags &= ~SF_FIX_BEFORE_EOL;
666 data->minlen_fixed=minlenp;
667 data->lookbehind_fixed=0;
669 else { /* *data->longest == data->longest_float */
670 data->offset_float_min = l ? data->last_start_min : data->pos_min;
671 data->offset_float_max = (l
672 ? data->last_start_max
673 : data->pos_min + data->pos_delta);
674 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
675 data->offset_float_max = I32_MAX;
676 if (data->flags & SF_BEFORE_EOL)
678 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
680 data->flags &= ~SF_FL_BEFORE_EOL;
681 data->minlen_float=minlenp;
682 data->lookbehind_float=0;
685 SvCUR_set(data->last_found, 0);
687 SV * const sv = data->last_found;
688 if (SvUTF8(sv) && SvMAGICAL(sv)) {
689 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
695 data->flags &= ~SF_BEFORE_EOL;
696 DEBUG_STUDYDATA("commit: ",data,0);
699 /* Can match anything (initialization) */
701 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
703 PERL_ARGS_ASSERT_CL_ANYTHING;
705 ANYOF_CLASS_ZERO(cl);
706 ANYOF_BITMAP_SETALL(cl);
707 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
709 cl->flags |= ANYOF_LOCALE;
712 /* Can match anything (initialization) */
714 S_cl_is_anything(const struct regnode_charclass_class *cl)
718 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
720 for (value = 0; value <= ANYOF_MAX; value += 2)
721 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
723 if (!(cl->flags & ANYOF_UNICODE_ALL))
725 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
730 /* Can match anything (initialization) */
732 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
734 PERL_ARGS_ASSERT_CL_INIT;
736 Zero(cl, 1, struct regnode_charclass_class);
738 cl_anything(pRExC_state, cl);
742 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
744 PERL_ARGS_ASSERT_CL_INIT_ZERO;
746 Zero(cl, 1, struct regnode_charclass_class);
748 cl_anything(pRExC_state, cl);
750 cl->flags |= ANYOF_LOCALE;
753 /* 'And' a given class with another one. Can create false positives */
754 /* We assume that cl is not inverted */
756 S_cl_and(struct regnode_charclass_class *cl,
757 const struct regnode_charclass_class *and_with)
759 PERL_ARGS_ASSERT_CL_AND;
761 assert(and_with->type == ANYOF);
762 if (!(and_with->flags & ANYOF_CLASS)
763 && !(cl->flags & ANYOF_CLASS)
764 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
765 && !(and_with->flags & ANYOF_FOLD)
766 && !(cl->flags & ANYOF_FOLD)) {
769 if (and_with->flags & ANYOF_INVERT)
770 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
771 cl->bitmap[i] &= ~and_with->bitmap[i];
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= and_with->bitmap[i];
775 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
776 if (!(and_with->flags & ANYOF_EOS))
777 cl->flags &= ~ANYOF_EOS;
779 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
780 !(and_with->flags & ANYOF_INVERT)) {
781 cl->flags &= ~ANYOF_UNICODE_ALL;
782 cl->flags |= ANYOF_UNICODE;
783 ARG_SET(cl, ARG(and_with));
785 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
786 !(and_with->flags & ANYOF_INVERT))
787 cl->flags &= ~ANYOF_UNICODE_ALL;
788 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
789 !(and_with->flags & ANYOF_INVERT))
790 cl->flags &= ~ANYOF_UNICODE;
793 /* 'OR' a given class with another one. Can create false positives */
794 /* We assume that cl is not inverted */
796 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
798 PERL_ARGS_ASSERT_CL_OR;
800 if (or_with->flags & ANYOF_INVERT) {
802 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
803 * <= (B1 | !B2) | (CL1 | !CL2)
804 * which is wasteful if CL2 is small, but we ignore CL2:
805 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
806 * XXXX Can we handle case-fold? Unclear:
807 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
808 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
810 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
811 && !(or_with->flags & ANYOF_FOLD)
812 && !(cl->flags & ANYOF_FOLD) ) {
815 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
816 cl->bitmap[i] |= ~or_with->bitmap[i];
817 } /* XXXX: logic is complicated otherwise */
819 cl_anything(pRExC_state, cl);
822 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
823 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
824 && (!(or_with->flags & ANYOF_FOLD)
825 || (cl->flags & ANYOF_FOLD)) ) {
828 /* OR char bitmap and class bitmap separately */
829 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
830 cl->bitmap[i] |= or_with->bitmap[i];
831 if (or_with->flags & ANYOF_CLASS) {
832 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
833 cl->classflags[i] |= or_with->classflags[i];
834 cl->flags |= ANYOF_CLASS;
837 else { /* XXXX: logic is complicated, leave it along for a moment. */
838 cl_anything(pRExC_state, cl);
841 if (or_with->flags & ANYOF_EOS)
842 cl->flags |= ANYOF_EOS;
844 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
845 ARG(cl) != ARG(or_with)) {
846 cl->flags |= ANYOF_UNICODE_ALL;
847 cl->flags &= ~ANYOF_UNICODE;
849 if (or_with->flags & ANYOF_UNICODE_ALL) {
850 cl->flags |= ANYOF_UNICODE_ALL;
851 cl->flags &= ~ANYOF_UNICODE;
855 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
856 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
857 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
858 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
863 dump_trie(trie,widecharmap,revcharmap)
864 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
865 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
867 These routines dump out a trie in a somewhat readable format.
868 The _interim_ variants are used for debugging the interim
869 tables that are used to generate the final compressed
870 representation which is what dump_trie expects.
872 Part of the reason for their existance is to provide a form
873 of documentation as to how the different representations function.
878 Dumps the final compressed table form of the trie to Perl_debug_log.
879 Used for debugging make_trie().
883 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
884 AV *revcharmap, U32 depth)
887 SV *sv=sv_newmortal();
888 int colwidth= widecharmap ? 6 : 4;
890 GET_RE_DEBUG_FLAGS_DECL;
892 PERL_ARGS_ASSERT_DUMP_TRIE;
894 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
895 (int)depth * 2 + 2,"",
896 "Match","Base","Ofs" );
898 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
899 SV ** const tmp = av_fetch( revcharmap, state, 0);
901 PerlIO_printf( Perl_debug_log, "%*s",
903 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
904 PL_colors[0], PL_colors[1],
905 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
906 PERL_PV_ESCAPE_FIRSTCHAR
911 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
912 (int)depth * 2 + 2,"");
914 for( state = 0 ; state < trie->uniquecharcount ; state++ )
915 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
916 PerlIO_printf( Perl_debug_log, "\n");
918 for( state = 1 ; state < trie->statecount ; state++ ) {
919 const U32 base = trie->states[ state ].trans.base;
921 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
923 if ( trie->states[ state ].wordnum ) {
924 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
926 PerlIO_printf( Perl_debug_log, "%6s", "" );
929 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
934 while( ( base + ofs < trie->uniquecharcount ) ||
935 ( base + ofs - trie->uniquecharcount < trie->lasttrans
936 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
939 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
941 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
942 if ( ( base + ofs >= trie->uniquecharcount ) &&
943 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
944 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
946 PerlIO_printf( Perl_debug_log, "%*"UVXf,
948 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
950 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
954 PerlIO_printf( Perl_debug_log, "]");
957 PerlIO_printf( Perl_debug_log, "\n" );
959 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
960 for (word=1; word <= trie->wordcount; word++) {
961 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
962 (int)word, (int)(trie->wordinfo[word].prev),
963 (int)(trie->wordinfo[word].len));
965 PerlIO_printf(Perl_debug_log, "\n" );
968 Dumps a fully constructed but uncompressed trie in list form.
969 List tries normally only are used for construction when the number of
970 possible chars (trie->uniquecharcount) is very high.
971 Used for debugging make_trie().
974 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
975 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 SV *sv=sv_newmortal();
980 int colwidth= widecharmap ? 6 : 4;
981 GET_RE_DEBUG_FLAGS_DECL;
983 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
985 /* print out the table precompression. */
986 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
987 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
988 "------:-----+-----------------\n" );
990 for( state=1 ; state < next_alloc ; state ++ ) {
993 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
994 (int)depth * 2 + 2,"", (UV)state );
995 if ( ! trie->states[ state ].wordnum ) {
996 PerlIO_printf( Perl_debug_log, "%5s| ","");
998 PerlIO_printf( Perl_debug_log, "W%4x| ",
999 trie->states[ state ].wordnum
1002 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1003 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1005 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1007 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1008 PL_colors[0], PL_colors[1],
1009 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1010 PERL_PV_ESCAPE_FIRSTCHAR
1012 TRIE_LIST_ITEM(state,charid).forid,
1013 (UV)TRIE_LIST_ITEM(state,charid).newstate
1016 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1017 (int)((depth * 2) + 14), "");
1020 PerlIO_printf( Perl_debug_log, "\n");
1025 Dumps a fully constructed but uncompressed trie in table form.
1026 This is the normal DFA style state transition table, with a few
1027 twists to facilitate compression later.
1028 Used for debugging make_trie().
1031 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1032 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1037 SV *sv=sv_newmortal();
1038 int colwidth= widecharmap ? 6 : 4;
1039 GET_RE_DEBUG_FLAGS_DECL;
1041 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1044 print out the table precompression so that we can do a visual check
1045 that they are identical.
1048 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1050 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1051 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1053 PerlIO_printf( Perl_debug_log, "%*s",
1055 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1056 PL_colors[0], PL_colors[1],
1057 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1058 PERL_PV_ESCAPE_FIRSTCHAR
1064 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1066 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1067 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1070 PerlIO_printf( Perl_debug_log, "\n" );
1072 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1074 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1075 (int)depth * 2 + 2,"",
1076 (UV)TRIE_NODENUM( state ) );
1078 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1079 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1081 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1083 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1085 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1086 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1088 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1089 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1097 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1098 startbranch: the first branch in the whole branch sequence
1099 first : start branch of sequence of branch-exact nodes.
1100 May be the same as startbranch
1101 last : Thing following the last branch.
1102 May be the same as tail.
1103 tail : item following the branch sequence
1104 count : words in the sequence
1105 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1106 depth : indent depth
1108 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1110 A trie is an N'ary tree where the branches are determined by digital
1111 decomposition of the key. IE, at the root node you look up the 1st character and
1112 follow that branch repeat until you find the end of the branches. Nodes can be
1113 marked as "accepting" meaning they represent a complete word. Eg:
1117 would convert into the following structure. Numbers represent states, letters
1118 following numbers represent valid transitions on the letter from that state, if
1119 the number is in square brackets it represents an accepting state, otherwise it
1120 will be in parenthesis.
1122 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1126 (1) +-i->(6)-+-s->[7]
1128 +-s->(3)-+-h->(4)-+-e->[5]
1130 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1132 This shows that when matching against the string 'hers' we will begin at state 1
1133 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1134 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1135 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1136 single traverse. We store a mapping from accepting to state to which word was
1137 matched, and then when we have multiple possibilities we try to complete the
1138 rest of the regex in the order in which they occured in the alternation.
1140 The only prior NFA like behaviour that would be changed by the TRIE support is
1141 the silent ignoring of duplicate alternations which are of the form:
1143 / (DUPE|DUPE) X? (?{ ... }) Y /x
1145 Thus EVAL blocks follwing a trie may be called a different number of times with
1146 and without the optimisation. With the optimisations dupes will be silently
1147 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1148 the following demonstrates:
1150 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1152 which prints out 'word' three times, but
1154 'words'=~/(word|word|word)(?{ print $1 })S/
1156 which doesnt print it out at all. This is due to other optimisations kicking in.
1158 Example of what happens on a structural level:
1160 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1162 1: CURLYM[1] {1,32767}(18)
1173 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1174 and should turn into:
1176 1: CURLYM[1] {1,32767}(18)
1178 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1186 Cases where tail != last would be like /(?foo|bar)baz/:
1196 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1197 and would end up looking like:
1200 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1207 d = uvuni_to_utf8_flags(d, uv, 0);
1209 is the recommended Unicode-aware way of saying
1214 #define TRIE_STORE_REVCHAR \
1217 SV *zlopp = newSV(2); \
1218 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1219 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1220 SvCUR_set(zlopp, kapow - flrbbbbb); \
1223 av_push(revcharmap, zlopp); \
1225 char ooooff = (char)uvc; \
1226 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1230 #define TRIE_READ_CHAR STMT_START { \
1234 if ( foldlen > 0 ) { \
1235 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1240 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1241 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1242 foldlen -= UNISKIP( uvc ); \
1243 scan = foldbuf + UNISKIP( uvc ); \
1246 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1256 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1257 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1258 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1259 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1261 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1262 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1263 TRIE_LIST_CUR( state )++; \
1266 #define TRIE_LIST_NEW(state) STMT_START { \
1267 Newxz( trie->states[ state ].trans.list, \
1268 4, reg_trie_trans_le ); \
1269 TRIE_LIST_CUR( state ) = 1; \
1270 TRIE_LIST_LEN( state ) = 4; \
1273 #define TRIE_HANDLE_WORD(state) STMT_START { \
1274 U16 dupe= trie->states[ state ].wordnum; \
1275 regnode * const noper_next = regnext( noper ); \
1278 /* store the word for dumping */ \
1280 if (OP(noper) != NOTHING) \
1281 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1283 tmp = newSVpvn_utf8( "", 0, UTF ); \
1284 av_push( trie_words, tmp ); \
1288 trie->wordinfo[curword].prev = 0; \
1289 trie->wordinfo[curword].len = wordlen; \
1290 trie->wordinfo[curword].accept = state; \
1292 if ( noper_next < tail ) { \
1294 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1295 trie->jump[curword] = (U16)(noper_next - convert); \
1297 jumper = noper_next; \
1299 nextbranch= regnext(cur); \
1303 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1304 /* chain, so that when the bits of chain are later */\
1305 /* linked together, the dups appear in the chain */\
1306 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1307 trie->wordinfo[dupe].prev = curword; \
1309 /* we haven't inserted this word yet. */ \
1310 trie->states[ state ].wordnum = curword; \
1315 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1316 ( ( base + charid >= ucharcount \
1317 && base + charid < ubound \
1318 && state == trie->trans[ base - ucharcount + charid ].check \
1319 && trie->trans[ base - ucharcount + charid ].next ) \
1320 ? trie->trans[ base - ucharcount + charid ].next \
1321 : ( state==1 ? special : 0 ) \
1325 #define MADE_JUMP_TRIE 2
1326 #define MADE_EXACT_TRIE 4
1329 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1332 /* first pass, loop through and scan words */
1333 reg_trie_data *trie;
1334 HV *widecharmap = NULL;
1335 AV *revcharmap = newAV();
1337 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1342 regnode *jumper = NULL;
1343 regnode *nextbranch = NULL;
1344 regnode *convert = NULL;
1345 U32 *prev_states; /* temp array mapping each state to previous one */
1346 /* we just use folder as a flag in utf8 */
1347 const U8 * const folder = ( flags == EXACTF
1349 : ( flags == EXACTFL
1356 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1357 AV *trie_words = NULL;
1358 /* along with revcharmap, this only used during construction but both are
1359 * useful during debugging so we store them in the struct when debugging.
1362 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1363 STRLEN trie_charcount=0;
1365 SV *re_trie_maxbuff;
1366 GET_RE_DEBUG_FLAGS_DECL;
1368 PERL_ARGS_ASSERT_MAKE_TRIE;
1370 PERL_UNUSED_ARG(depth);
1373 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1375 trie->startstate = 1;
1376 trie->wordcount = word_count;
1377 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1378 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1379 if (!(UTF && folder))
1380 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1381 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1382 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1385 trie_words = newAV();
1388 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1389 if (!SvIOK(re_trie_maxbuff)) {
1390 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1393 PerlIO_printf( Perl_debug_log,
1394 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1395 (int)depth * 2 + 2, "",
1396 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1397 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1401 /* Find the node we are going to overwrite */
1402 if ( first == startbranch && OP( last ) != BRANCH ) {
1403 /* whole branch chain */
1406 /* branch sub-chain */
1407 convert = NEXTOPER( first );
1410 /* -- First loop and Setup --
1412 We first traverse the branches and scan each word to determine if it
1413 contains widechars, and how many unique chars there are, this is
1414 important as we have to build a table with at least as many columns as we
1417 We use an array of integers to represent the character codes 0..255
1418 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1419 native representation of the character value as the key and IV's for the
1422 *TODO* If we keep track of how many times each character is used we can
1423 remap the columns so that the table compression later on is more
1424 efficient in terms of memory by ensuring most common value is in the
1425 middle and the least common are on the outside. IMO this would be better
1426 than a most to least common mapping as theres a decent chance the most
1427 common letter will share a node with the least common, meaning the node
1428 will not be compressable. With a middle is most common approach the worst
1429 case is when we have the least common nodes twice.
1433 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1434 regnode * const noper = NEXTOPER( cur );
1435 const U8 *uc = (U8*)STRING( noper );
1436 const U8 * const e = uc + STR_LEN( noper );
1438 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1439 const U8 *scan = (U8*)NULL;
1440 U32 wordlen = 0; /* required init */
1442 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1444 if (OP(noper) == NOTHING) {
1448 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1449 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1450 regardless of encoding */
1452 for ( ; uc < e ; uc += len ) {
1453 TRIE_CHARCOUNT(trie)++;
1457 if ( !trie->charmap[ uvc ] ) {
1458 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1460 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1464 /* store the codepoint in the bitmap, and if its ascii
1465 also store its folded equivelent. */
1466 TRIE_BITMAP_SET(trie,uvc);
1468 /* store the folded codepoint */
1469 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1472 /* store first byte of utf8 representation of
1473 codepoints in the 127 < uvc < 256 range */
1474 if (127 < uvc && uvc < 192) {
1475 TRIE_BITMAP_SET(trie,194);
1476 } else if (191 < uvc ) {
1477 TRIE_BITMAP_SET(trie,195);
1478 /* && uvc < 256 -- we know uvc is < 256 already */
1481 set_bit = 0; /* We've done our bit :-) */
1486 widecharmap = newHV();
1488 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1491 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1493 if ( !SvTRUE( *svpp ) ) {
1494 sv_setiv( *svpp, ++trie->uniquecharcount );
1499 if( cur == first ) {
1502 } else if (chars < trie->minlen) {
1504 } else if (chars > trie->maxlen) {
1508 } /* end first pass */
1509 DEBUG_TRIE_COMPILE_r(
1510 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1511 (int)depth * 2 + 2,"",
1512 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1513 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1514 (int)trie->minlen, (int)trie->maxlen )
1518 We now know what we are dealing with in terms of unique chars and
1519 string sizes so we can calculate how much memory a naive
1520 representation using a flat table will take. If it's over a reasonable
1521 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1522 conservative but potentially much slower representation using an array
1525 At the end we convert both representations into the same compressed
1526 form that will be used in regexec.c for matching with. The latter
1527 is a form that cannot be used to construct with but has memory
1528 properties similar to the list form and access properties similar
1529 to the table form making it both suitable for fast searches and
1530 small enough that its feasable to store for the duration of a program.
1532 See the comment in the code where the compressed table is produced
1533 inplace from the flat tabe representation for an explanation of how
1534 the compression works.
1539 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1542 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1544 Second Pass -- Array Of Lists Representation
1546 Each state will be represented by a list of charid:state records
1547 (reg_trie_trans_le) the first such element holds the CUR and LEN
1548 points of the allocated array. (See defines above).
1550 We build the initial structure using the lists, and then convert
1551 it into the compressed table form which allows faster lookups
1552 (but cant be modified once converted).
1555 STRLEN transcount = 1;
1557 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1558 "%*sCompiling trie using list compiler\n",
1559 (int)depth * 2 + 2, ""));
1561 trie->states = (reg_trie_state *)
1562 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1563 sizeof(reg_trie_state) );
1567 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1569 regnode * const noper = NEXTOPER( cur );
1570 U8 *uc = (U8*)STRING( noper );
1571 const U8 * const e = uc + STR_LEN( noper );
1572 U32 state = 1; /* required init */
1573 U16 charid = 0; /* sanity init */
1574 U8 *scan = (U8*)NULL; /* sanity init */
1575 STRLEN foldlen = 0; /* required init */
1576 U32 wordlen = 0; /* required init */
1577 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1579 if (OP(noper) != NOTHING) {
1580 for ( ; uc < e ; uc += len ) {
1585 charid = trie->charmap[ uvc ];
1587 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1591 charid=(U16)SvIV( *svpp );
1594 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1601 if ( !trie->states[ state ].trans.list ) {
1602 TRIE_LIST_NEW( state );
1604 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1605 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1606 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1611 newstate = next_alloc++;
1612 prev_states[newstate] = state;
1613 TRIE_LIST_PUSH( state, charid, newstate );
1618 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1622 TRIE_HANDLE_WORD(state);
1624 } /* end second pass */
1626 /* next alloc is the NEXT state to be allocated */
1627 trie->statecount = next_alloc;
1628 trie->states = (reg_trie_state *)
1629 PerlMemShared_realloc( trie->states,
1631 * sizeof(reg_trie_state) );
1633 /* and now dump it out before we compress it */
1634 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1635 revcharmap, next_alloc,
1639 trie->trans = (reg_trie_trans *)
1640 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1647 for( state=1 ; state < next_alloc ; state ++ ) {
1651 DEBUG_TRIE_COMPILE_MORE_r(
1652 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1656 if (trie->states[state].trans.list) {
1657 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1661 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1662 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1663 if ( forid < minid ) {
1665 } else if ( forid > maxid ) {
1669 if ( transcount < tp + maxid - minid + 1) {
1671 trie->trans = (reg_trie_trans *)
1672 PerlMemShared_realloc( trie->trans,
1674 * sizeof(reg_trie_trans) );
1675 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1677 base = trie->uniquecharcount + tp - minid;
1678 if ( maxid == minid ) {
1680 for ( ; zp < tp ; zp++ ) {
1681 if ( ! trie->trans[ zp ].next ) {
1682 base = trie->uniquecharcount + zp - minid;
1683 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1684 trie->trans[ zp ].check = state;
1690 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1691 trie->trans[ tp ].check = state;
1696 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1697 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1698 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1699 trie->trans[ tid ].check = state;
1701 tp += ( maxid - minid + 1 );
1703 Safefree(trie->states[ state ].trans.list);
1706 DEBUG_TRIE_COMPILE_MORE_r(
1707 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1710 trie->states[ state ].trans.base=base;
1712 trie->lasttrans = tp + 1;
1716 Second Pass -- Flat Table Representation.
1718 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1719 We know that we will need Charcount+1 trans at most to store the data
1720 (one row per char at worst case) So we preallocate both structures
1721 assuming worst case.
1723 We then construct the trie using only the .next slots of the entry
1726 We use the .check field of the first entry of the node temporarily to
1727 make compression both faster and easier by keeping track of how many non
1728 zero fields are in the node.
1730 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1733 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1734 number representing the first entry of the node, and state as a
1735 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1736 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1737 are 2 entrys per node. eg:
1745 The table is internally in the right hand, idx form. However as we also
1746 have to deal with the states array which is indexed by nodenum we have to
1747 use TRIE_NODENUM() to convert.
1750 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1751 "%*sCompiling trie using table compiler\n",
1752 (int)depth * 2 + 2, ""));
1754 trie->trans = (reg_trie_trans *)
1755 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1756 * trie->uniquecharcount + 1,
1757 sizeof(reg_trie_trans) );
1758 trie->states = (reg_trie_state *)
1759 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1760 sizeof(reg_trie_state) );
1761 next_alloc = trie->uniquecharcount + 1;
1764 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1766 regnode * const noper = NEXTOPER( cur );
1767 const U8 *uc = (U8*)STRING( noper );
1768 const U8 * const e = uc + STR_LEN( noper );
1770 U32 state = 1; /* required init */
1772 U16 charid = 0; /* sanity init */
1773 U32 accept_state = 0; /* sanity init */
1774 U8 *scan = (U8*)NULL; /* sanity init */
1776 STRLEN foldlen = 0; /* required init */
1777 U32 wordlen = 0; /* required init */
1778 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1780 if ( OP(noper) != NOTHING ) {
1781 for ( ; uc < e ; uc += len ) {
1786 charid = trie->charmap[ uvc ];
1788 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1789 charid = svpp ? (U16)SvIV(*svpp) : 0;
1793 if ( !trie->trans[ state + charid ].next ) {
1794 trie->trans[ state + charid ].next = next_alloc;
1795 trie->trans[ state ].check++;
1796 prev_states[TRIE_NODENUM(next_alloc)]
1797 = TRIE_NODENUM(state);
1798 next_alloc += trie->uniquecharcount;
1800 state = trie->trans[ state + charid ].next;
1802 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1804 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1807 accept_state = TRIE_NODENUM( state );
1808 TRIE_HANDLE_WORD(accept_state);
1810 } /* end second pass */
1812 /* and now dump it out before we compress it */
1813 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1815 next_alloc, depth+1));
1819 * Inplace compress the table.*
1821 For sparse data sets the table constructed by the trie algorithm will
1822 be mostly 0/FAIL transitions or to put it another way mostly empty.
1823 (Note that leaf nodes will not contain any transitions.)
1825 This algorithm compresses the tables by eliminating most such
1826 transitions, at the cost of a modest bit of extra work during lookup:
1828 - Each states[] entry contains a .base field which indicates the
1829 index in the state[] array wheres its transition data is stored.
1831 - If .base is 0 there are no valid transitions from that node.
1833 - If .base is nonzero then charid is added to it to find an entry in
1836 -If trans[states[state].base+charid].check!=state then the
1837 transition is taken to be a 0/Fail transition. Thus if there are fail
1838 transitions at the front of the node then the .base offset will point
1839 somewhere inside the previous nodes data (or maybe even into a node
1840 even earlier), but the .check field determines if the transition is
1844 The following process inplace converts the table to the compressed
1845 table: We first do not compress the root node 1,and mark its all its
1846 .check pointers as 1 and set its .base pointer as 1 as well. This
1847 allows to do a DFA construction from the compressed table later, and
1848 ensures that any .base pointers we calculate later are greater than
1851 - We set 'pos' to indicate the first entry of the second node.
1853 - We then iterate over the columns of the node, finding the first and
1854 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1855 and set the .check pointers accordingly, and advance pos
1856 appropriately and repreat for the next node. Note that when we copy
1857 the next pointers we have to convert them from the original
1858 NODEIDX form to NODENUM form as the former is not valid post
1861 - If a node has no transitions used we mark its base as 0 and do not
1862 advance the pos pointer.
1864 - If a node only has one transition we use a second pointer into the
1865 structure to fill in allocated fail transitions from other states.
1866 This pointer is independent of the main pointer and scans forward
1867 looking for null transitions that are allocated to a state. When it
1868 finds one it writes the single transition into the "hole". If the
1869 pointer doesnt find one the single transition is appended as normal.
1871 - Once compressed we can Renew/realloc the structures to release the
1874 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1875 specifically Fig 3.47 and the associated pseudocode.
1879 const U32 laststate = TRIE_NODENUM( next_alloc );
1882 trie->statecount = laststate;
1884 for ( state = 1 ; state < laststate ; state++ ) {
1886 const U32 stateidx = TRIE_NODEIDX( state );
1887 const U32 o_used = trie->trans[ stateidx ].check;
1888 U32 used = trie->trans[ stateidx ].check;
1889 trie->trans[ stateidx ].check = 0;
1891 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1892 if ( flag || trie->trans[ stateidx + charid ].next ) {
1893 if ( trie->trans[ stateidx + charid ].next ) {
1895 for ( ; zp < pos ; zp++ ) {
1896 if ( ! trie->trans[ zp ].next ) {
1900 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1901 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1902 trie->trans[ zp ].check = state;
1903 if ( ++zp > pos ) pos = zp;
1910 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1912 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1913 trie->trans[ pos ].check = state;
1918 trie->lasttrans = pos + 1;
1919 trie->states = (reg_trie_state *)
1920 PerlMemShared_realloc( trie->states, laststate
1921 * sizeof(reg_trie_state) );
1922 DEBUG_TRIE_COMPILE_MORE_r(
1923 PerlIO_printf( Perl_debug_log,
1924 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1925 (int)depth * 2 + 2,"",
1926 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1929 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1932 } /* end table compress */
1934 DEBUG_TRIE_COMPILE_MORE_r(
1935 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1936 (int)depth * 2 + 2, "",
1937 (UV)trie->statecount,
1938 (UV)trie->lasttrans)
1940 /* resize the trans array to remove unused space */
1941 trie->trans = (reg_trie_trans *)
1942 PerlMemShared_realloc( trie->trans, trie->lasttrans
1943 * sizeof(reg_trie_trans) );
1945 { /* Modify the program and insert the new TRIE node*/
1946 U8 nodetype =(U8)(flags & 0xFF);
1950 regnode *optimize = NULL;
1951 #ifdef RE_TRACK_PATTERN_OFFSETS
1954 U32 mjd_nodelen = 0;
1955 #endif /* RE_TRACK_PATTERN_OFFSETS */
1956 #endif /* DEBUGGING */
1958 This means we convert either the first branch or the first Exact,
1959 depending on whether the thing following (in 'last') is a branch
1960 or not and whther first is the startbranch (ie is it a sub part of
1961 the alternation or is it the whole thing.)
1962 Assuming its a sub part we conver the EXACT otherwise we convert
1963 the whole branch sequence, including the first.
1965 /* Find the node we are going to overwrite */
1966 if ( first != startbranch || OP( last ) == BRANCH ) {
1967 /* branch sub-chain */
1968 NEXT_OFF( first ) = (U16)(last - first);
1969 #ifdef RE_TRACK_PATTERN_OFFSETS
1971 mjd_offset= Node_Offset((convert));
1972 mjd_nodelen= Node_Length((convert));
1975 /* whole branch chain */
1977 #ifdef RE_TRACK_PATTERN_OFFSETS
1980 const regnode *nop = NEXTOPER( convert );
1981 mjd_offset= Node_Offset((nop));
1982 mjd_nodelen= Node_Length((nop));
1986 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1987 (int)depth * 2 + 2, "",
1988 (UV)mjd_offset, (UV)mjd_nodelen)
1991 /* But first we check to see if there is a common prefix we can
1992 split out as an EXACT and put in front of the TRIE node. */
1993 trie->startstate= 1;
1994 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1996 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2000 const U32 base = trie->states[ state ].trans.base;
2002 if ( trie->states[state].wordnum )
2005 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2006 if ( ( base + ofs >= trie->uniquecharcount ) &&
2007 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2008 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2010 if ( ++count > 1 ) {
2011 SV **tmp = av_fetch( revcharmap, ofs, 0);
2012 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2013 if ( state == 1 ) break;
2015 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2017 PerlIO_printf(Perl_debug_log,
2018 "%*sNew Start State=%"UVuf" Class: [",
2019 (int)depth * 2 + 2, "",
2022 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2023 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2025 TRIE_BITMAP_SET(trie,*ch);
2027 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2029 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2033 TRIE_BITMAP_SET(trie,*ch);
2035 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2036 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2042 SV **tmp = av_fetch( revcharmap, idx, 0);
2044 char *ch = SvPV( *tmp, len );
2046 SV *sv=sv_newmortal();
2047 PerlIO_printf( Perl_debug_log,
2048 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2049 (int)depth * 2 + 2, "",
2051 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2052 PL_colors[0], PL_colors[1],
2053 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2054 PERL_PV_ESCAPE_FIRSTCHAR
2059 OP( convert ) = nodetype;
2060 str=STRING(convert);
2063 STR_LEN(convert) += len;
2069 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2074 trie->prefixlen = (state-1);
2076 regnode *n = convert+NODE_SZ_STR(convert);
2077 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2078 trie->startstate = state;
2079 trie->minlen -= (state - 1);
2080 trie->maxlen -= (state - 1);
2082 /* At least the UNICOS C compiler choked on this
2083 * being argument to DEBUG_r(), so let's just have
2086 #ifdef PERL_EXT_RE_BUILD
2092 regnode *fix = convert;
2093 U32 word = trie->wordcount;
2095 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2096 while( ++fix < n ) {
2097 Set_Node_Offset_Length(fix, 0, 0);
2100 SV ** const tmp = av_fetch( trie_words, word, 0 );
2102 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2103 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2105 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2113 NEXT_OFF(convert) = (U16)(tail - convert);
2114 DEBUG_r(optimize= n);
2120 if ( trie->maxlen ) {
2121 NEXT_OFF( convert ) = (U16)(tail - convert);
2122 ARG_SET( convert, data_slot );
2123 /* Store the offset to the first unabsorbed branch in
2124 jump[0], which is otherwise unused by the jump logic.
2125 We use this when dumping a trie and during optimisation. */
2127 trie->jump[0] = (U16)(nextbranch - convert);
2130 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2131 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2133 OP( convert ) = TRIEC;
2134 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2135 PerlMemShared_free(trie->bitmap);
2138 OP( convert ) = TRIE;
2140 /* store the type in the flags */
2141 convert->flags = nodetype;
2145 + regarglen[ OP( convert ) ];
2147 /* XXX We really should free up the resource in trie now,
2148 as we won't use them - (which resources?) dmq */
2150 /* needed for dumping*/
2151 DEBUG_r(if (optimize) {
2152 regnode *opt = convert;
2154 while ( ++opt < optimize) {
2155 Set_Node_Offset_Length(opt,0,0);
2158 Try to clean up some of the debris left after the
2161 while( optimize < jumper ) {
2162 mjd_nodelen += Node_Length((optimize));
2163 OP( optimize ) = OPTIMIZED;
2164 Set_Node_Offset_Length(optimize,0,0);
2167 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2169 } /* end node insert */
2170 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2172 /* Finish populating the prev field of the wordinfo array. Walk back
2173 * from each accept state until we find another accept state, and if
2174 * so, point the first word's .prev field at the second word. If the
2175 * second already has a .prev field set, stop now. This will be the
2176 * case either if we've already processed that word's accept state,
2177 * or that that state had multiple words, and the overspill words
2178 * were already linked up earlier.
2185 for (word=1; word <= trie->wordcount; word++) {
2187 if (trie->wordinfo[word].prev)
2189 state = trie->wordinfo[word].accept;
2191 state = prev_states[state];
2194 prev = trie->states[state].wordnum;
2198 trie->wordinfo[word].prev = prev;
2200 Safefree(prev_states);
2204 /* and now dump out the compressed format */
2205 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2207 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2209 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2210 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2212 SvREFCNT_dec(revcharmap);
2216 : trie->startstate>1
2222 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2224 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2226 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2227 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2230 We find the fail state for each state in the trie, this state is the longest proper
2231 suffix of the current states 'word' that is also a proper prefix of another word in our
2232 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2233 the DFA not to have to restart after its tried and failed a word at a given point, it
2234 simply continues as though it had been matching the other word in the first place.
2236 'abcdgu'=~/abcdefg|cdgu/
2237 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2238 fail, which would bring use to the state representing 'd' in the second word where we would
2239 try 'g' and succeed, prodceding to match 'cdgu'.
2241 /* add a fail transition */
2242 const U32 trie_offset = ARG(source);
2243 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2245 const U32 ucharcount = trie->uniquecharcount;
2246 const U32 numstates = trie->statecount;
2247 const U32 ubound = trie->lasttrans + ucharcount;
2251 U32 base = trie->states[ 1 ].trans.base;
2254 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2255 GET_RE_DEBUG_FLAGS_DECL;
2257 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2259 PERL_UNUSED_ARG(depth);
2263 ARG_SET( stclass, data_slot );
2264 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2265 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2266 aho->trie=trie_offset;
2267 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2268 Copy( trie->states, aho->states, numstates, reg_trie_state );
2269 Newxz( q, numstates, U32);
2270 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2273 /* initialize fail[0..1] to be 1 so that we always have
2274 a valid final fail state */
2275 fail[ 0 ] = fail[ 1 ] = 1;
2277 for ( charid = 0; charid < ucharcount ; charid++ ) {
2278 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2280 q[ q_write ] = newstate;
2281 /* set to point at the root */
2282 fail[ q[ q_write++ ] ]=1;
2285 while ( q_read < q_write) {
2286 const U32 cur = q[ q_read++ % numstates ];
2287 base = trie->states[ cur ].trans.base;
2289 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2290 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2292 U32 fail_state = cur;
2295 fail_state = fail[ fail_state ];
2296 fail_base = aho->states[ fail_state ].trans.base;
2297 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2299 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2300 fail[ ch_state ] = fail_state;
2301 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2303 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2305 q[ q_write++ % numstates] = ch_state;
2309 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2310 when we fail in state 1, this allows us to use the
2311 charclass scan to find a valid start char. This is based on the principle
2312 that theres a good chance the string being searched contains lots of stuff
2313 that cant be a start char.
2315 fail[ 0 ] = fail[ 1 ] = 0;
2316 DEBUG_TRIE_COMPILE_r({
2317 PerlIO_printf(Perl_debug_log,
2318 "%*sStclass Failtable (%"UVuf" states): 0",
2319 (int)(depth * 2), "", (UV)numstates
2321 for( q_read=1; q_read<numstates; q_read++ ) {
2322 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2324 PerlIO_printf(Perl_debug_log, "\n");
2327 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2332 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2333 * These need to be revisited when a newer toolchain becomes available.
2335 #if defined(__sparc64__) && defined(__GNUC__)
2336 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2337 # undef SPARC64_GCC_WORKAROUND
2338 # define SPARC64_GCC_WORKAROUND 1
2342 #define DEBUG_PEEP(str,scan,depth) \
2343 DEBUG_OPTIMISE_r({if (scan){ \
2344 SV * const mysv=sv_newmortal(); \
2345 regnode *Next = regnext(scan); \
2346 regprop(RExC_rx, mysv, scan); \
2347 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2348 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2349 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2356 #define JOIN_EXACT(scan,min,flags) \
2357 if (PL_regkind[OP(scan)] == EXACT) \
2358 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2361 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2362 /* Merge several consecutive EXACTish nodes into one. */
2363 regnode *n = regnext(scan);
2365 regnode *next = scan + NODE_SZ_STR(scan);
2369 regnode *stop = scan;
2370 GET_RE_DEBUG_FLAGS_DECL;
2372 PERL_UNUSED_ARG(depth);
2375 PERL_ARGS_ASSERT_JOIN_EXACT;
2376 #ifndef EXPERIMENTAL_INPLACESCAN
2377 PERL_UNUSED_ARG(flags);
2378 PERL_UNUSED_ARG(val);
2380 DEBUG_PEEP("join",scan,depth);
2382 /* Skip NOTHING, merge EXACT*. */
2384 ( PL_regkind[OP(n)] == NOTHING ||
2385 (stringok && (OP(n) == OP(scan))))
2387 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2389 if (OP(n) == TAIL || n > next)
2391 if (PL_regkind[OP(n)] == NOTHING) {
2392 DEBUG_PEEP("skip:",n,depth);
2393 NEXT_OFF(scan) += NEXT_OFF(n);
2394 next = n + NODE_STEP_REGNODE;
2401 else if (stringok) {
2402 const unsigned int oldl = STR_LEN(scan);
2403 regnode * const nnext = regnext(n);
2405 DEBUG_PEEP("merg",n,depth);
2408 if (oldl + STR_LEN(n) > U8_MAX)
2410 NEXT_OFF(scan) += NEXT_OFF(n);
2411 STR_LEN(scan) += STR_LEN(n);
2412 next = n + NODE_SZ_STR(n);
2413 /* Now we can overwrite *n : */
2414 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2422 #ifdef EXPERIMENTAL_INPLACESCAN
2423 if (flags && !NEXT_OFF(n)) {
2424 DEBUG_PEEP("atch", val, depth);
2425 if (reg_off_by_arg[OP(n)]) {
2426 ARG_SET(n, val - n);
2429 NEXT_OFF(n) = val - n;
2436 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2438 Two problematic code points in Unicode casefolding of EXACT nodes:
2440 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2441 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2447 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2448 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2450 This means that in case-insensitive matching (or "loose matching",
2451 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2452 length of the above casefolded versions) can match a target string
2453 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2454 This would rather mess up the minimum length computation.
2456 What we'll do is to look for the tail four bytes, and then peek
2457 at the preceding two bytes to see whether we need to decrease
2458 the minimum length by four (six minus two).
2460 Thanks to the design of UTF-8, there cannot be false matches:
2461 A sequence of valid UTF-8 bytes cannot be a subsequence of
2462 another valid sequence of UTF-8 bytes.
2465 char * const s0 = STRING(scan), *s, *t;
2466 char * const s1 = s0 + STR_LEN(scan) - 1;
2467 char * const s2 = s1 - 4;
2468 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2469 const char t0[] = "\xaf\x49\xaf\x42";
2471 const char t0[] = "\xcc\x88\xcc\x81";
2473 const char * const t1 = t0 + 3;
2476 s < s2 && (t = ninstr(s, s1, t0, t1));
2479 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2480 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2482 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2483 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2491 n = scan + NODE_SZ_STR(scan);
2493 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2500 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2504 /* REx optimizer. Converts nodes into quickier variants "in place".
2505 Finds fixed substrings. */
2507 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2508 to the position after last scanned or to NULL. */
2510 #define INIT_AND_WITHP \
2511 assert(!and_withp); \
2512 Newx(and_withp,1,struct regnode_charclass_class); \
2513 SAVEFREEPV(and_withp)
2515 /* this is a chain of data about sub patterns we are processing that
2516 need to be handled seperately/specially in study_chunk. Its so
2517 we can simulate recursion without losing state. */
2519 typedef struct scan_frame {
2520 regnode *last; /* last node to process in this frame */
2521 regnode *next; /* next node to process when last is reached */
2522 struct scan_frame *prev; /*previous frame*/
2523 I32 stop; /* what stopparen do we use */
2527 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2529 #define CASE_SYNST_FNC(nAmE) \
2531 if (flags & SCF_DO_STCLASS_AND) { \
2532 for (value = 0; value < 256; value++) \
2533 if (!is_ ## nAmE ## _cp(value)) \
2534 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2537 for (value = 0; value < 256; value++) \
2538 if (is_ ## nAmE ## _cp(value)) \
2539 ANYOF_BITMAP_SET(data->start_class, value); \
2543 if (flags & SCF_DO_STCLASS_AND) { \
2544 for (value = 0; value < 256; value++) \
2545 if (is_ ## nAmE ## _cp(value)) \
2546 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2549 for (value = 0; value < 256; value++) \
2550 if (!is_ ## nAmE ## _cp(value)) \
2551 ANYOF_BITMAP_SET(data->start_class, value); \
2558 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2559 I32 *minlenp, I32 *deltap,
2564 struct regnode_charclass_class *and_withp,
2565 U32 flags, U32 depth)
2566 /* scanp: Start here (read-write). */
2567 /* deltap: Write maxlen-minlen here. */
2568 /* last: Stop before this one. */
2569 /* data: string data about the pattern */
2570 /* stopparen: treat close N as END */
2571 /* recursed: which subroutines have we recursed into */
2572 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2575 I32 min = 0, pars = 0, code;
2576 regnode *scan = *scanp, *next;
2578 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2579 int is_inf_internal = 0; /* The studied chunk is infinite */
2580 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2581 scan_data_t data_fake;
2582 SV *re_trie_maxbuff = NULL;
2583 regnode *first_non_open = scan;
2584 I32 stopmin = I32_MAX;
2585 scan_frame *frame = NULL;
2586 GET_RE_DEBUG_FLAGS_DECL;
2588 PERL_ARGS_ASSERT_STUDY_CHUNK;
2591 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2595 while (first_non_open && OP(first_non_open) == OPEN)
2596 first_non_open=regnext(first_non_open);
2601 while ( scan && OP(scan) != END && scan < last ){
2602 /* Peephole optimizer: */
2603 DEBUG_STUDYDATA("Peep:", data,depth);
2604 DEBUG_PEEP("Peep",scan,depth);
2605 JOIN_EXACT(scan,&min,0);
2607 /* Follow the next-chain of the current node and optimize
2608 away all the NOTHINGs from it. */
2609 if (OP(scan) != CURLYX) {
2610 const int max = (reg_off_by_arg[OP(scan)]
2612 /* I32 may be smaller than U16 on CRAYs! */
2613 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2614 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2618 /* Skip NOTHING and LONGJMP. */
2619 while ((n = regnext(n))
2620 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2621 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2622 && off + noff < max)
2624 if (reg_off_by_arg[OP(scan)])
2627 NEXT_OFF(scan) = off;
2632 /* The principal pseudo-switch. Cannot be a switch, since we
2633 look into several different things. */
2634 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2635 || OP(scan) == IFTHEN) {
2636 next = regnext(scan);
2638 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2640 if (OP(next) == code || code == IFTHEN) {
2641 /* NOTE - There is similar code to this block below for handling
2642 TRIE nodes on a re-study. If you change stuff here check there
2644 I32 max1 = 0, min1 = I32_MAX, num = 0;
2645 struct regnode_charclass_class accum;
2646 regnode * const startbranch=scan;
2648 if (flags & SCF_DO_SUBSTR)
2649 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2650 if (flags & SCF_DO_STCLASS)
2651 cl_init_zero(pRExC_state, &accum);
2653 while (OP(scan) == code) {
2654 I32 deltanext, minnext, f = 0, fake;
2655 struct regnode_charclass_class this_class;
2658 data_fake.flags = 0;
2660 data_fake.whilem_c = data->whilem_c;
2661 data_fake.last_closep = data->last_closep;
2664 data_fake.last_closep = &fake;
2666 data_fake.pos_delta = delta;
2667 next = regnext(scan);
2668 scan = NEXTOPER(scan);
2670 scan = NEXTOPER(scan);
2671 if (flags & SCF_DO_STCLASS) {
2672 cl_init(pRExC_state, &this_class);
2673 data_fake.start_class = &this_class;
2674 f = SCF_DO_STCLASS_AND;
2676 if (flags & SCF_WHILEM_VISITED_POS)
2677 f |= SCF_WHILEM_VISITED_POS;
2679 /* we suppose the run is continuous, last=next...*/
2680 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2682 stopparen, recursed, NULL, f,depth+1);
2685 if (max1 < minnext + deltanext)
2686 max1 = minnext + deltanext;
2687 if (deltanext == I32_MAX)
2688 is_inf = is_inf_internal = 1;
2690 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2692 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2693 if ( stopmin > minnext)
2694 stopmin = min + min1;
2695 flags &= ~SCF_DO_SUBSTR;
2697 data->flags |= SCF_SEEN_ACCEPT;
2700 if (data_fake.flags & SF_HAS_EVAL)
2701 data->flags |= SF_HAS_EVAL;
2702 data->whilem_c = data_fake.whilem_c;
2704 if (flags & SCF_DO_STCLASS)
2705 cl_or(pRExC_state, &accum, &this_class);
2707 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2709 if (flags & SCF_DO_SUBSTR) {
2710 data->pos_min += min1;
2711 data->pos_delta += max1 - min1;
2712 if (max1 != min1 || is_inf)
2713 data->longest = &(data->longest_float);
2716 delta += max1 - min1;
2717 if (flags & SCF_DO_STCLASS_OR) {
2718 cl_or(pRExC_state, data->start_class, &accum);
2720 cl_and(data->start_class, and_withp);
2721 flags &= ~SCF_DO_STCLASS;
2724 else if (flags & SCF_DO_STCLASS_AND) {
2726 cl_and(data->start_class, &accum);
2727 flags &= ~SCF_DO_STCLASS;
2730 /* Switch to OR mode: cache the old value of
2731 * data->start_class */
2733 StructCopy(data->start_class, and_withp,
2734 struct regnode_charclass_class);
2735 flags &= ~SCF_DO_STCLASS_AND;
2736 StructCopy(&accum, data->start_class,
2737 struct regnode_charclass_class);
2738 flags |= SCF_DO_STCLASS_OR;
2739 data->start_class->flags |= ANYOF_EOS;
2743 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2746 Assuming this was/is a branch we are dealing with: 'scan' now
2747 points at the item that follows the branch sequence, whatever
2748 it is. We now start at the beginning of the sequence and look
2755 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2757 If we can find such a subseqence we need to turn the first
2758 element into a trie and then add the subsequent branch exact
2759 strings to the trie.
2763 1. patterns where the whole set of branch can be converted.
2765 2. patterns where only a subset can be converted.
2767 In case 1 we can replace the whole set with a single regop
2768 for the trie. In case 2 we need to keep the start and end
2771 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2772 becomes BRANCH TRIE; BRANCH X;
2774 There is an additional case, that being where there is a
2775 common prefix, which gets split out into an EXACT like node
2776 preceding the TRIE node.
2778 If x(1..n)==tail then we can do a simple trie, if not we make
2779 a "jump" trie, such that when we match the appropriate word
2780 we "jump" to the appopriate tail node. Essentailly we turn
2781 a nested if into a case structure of sorts.
2786 if (!re_trie_maxbuff) {
2787 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2788 if (!SvIOK(re_trie_maxbuff))
2789 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2791 if ( SvIV(re_trie_maxbuff)>=0 ) {
2793 regnode *first = (regnode *)NULL;
2794 regnode *last = (regnode *)NULL;
2795 regnode *tail = scan;
2800 SV * const mysv = sv_newmortal(); /* for dumping */
2802 /* var tail is used because there may be a TAIL
2803 regop in the way. Ie, the exacts will point to the
2804 thing following the TAIL, but the last branch will
2805 point at the TAIL. So we advance tail. If we
2806 have nested (?:) we may have to move through several
2810 while ( OP( tail ) == TAIL ) {
2811 /* this is the TAIL generated by (?:) */
2812 tail = regnext( tail );
2817 regprop(RExC_rx, mysv, tail );
2818 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2819 (int)depth * 2 + 2, "",
2820 "Looking for TRIE'able sequences. Tail node is: ",
2821 SvPV_nolen_const( mysv )
2827 step through the branches, cur represents each
2828 branch, noper is the first thing to be matched
2829 as part of that branch and noper_next is the
2830 regnext() of that node. if noper is an EXACT
2831 and noper_next is the same as scan (our current
2832 position in the regex) then the EXACT branch is
2833 a possible optimization target. Once we have
2834 two or more consequetive such branches we can
2835 create a trie of the EXACT's contents and stich
2836 it in place. If the sequence represents all of
2837 the branches we eliminate the whole thing and
2838 replace it with a single TRIE. If it is a
2839 subsequence then we need to stitch it in. This
2840 means the first branch has to remain, and needs
2841 to be repointed at the item on the branch chain
2842 following the last branch optimized. This could
2843 be either a BRANCH, in which case the
2844 subsequence is internal, or it could be the
2845 item following the branch sequence in which
2846 case the subsequence is at the end.
2850 /* dont use tail as the end marker for this traverse */
2851 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2852 regnode * const noper = NEXTOPER( cur );
2853 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2854 regnode * const noper_next = regnext( noper );
2858 regprop(RExC_rx, mysv, cur);
2859 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2860 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2862 regprop(RExC_rx, mysv, noper);
2863 PerlIO_printf( Perl_debug_log, " -> %s",
2864 SvPV_nolen_const(mysv));
2867 regprop(RExC_rx, mysv, noper_next );
2868 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2869 SvPV_nolen_const(mysv));
2871 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2872 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2874 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2875 : PL_regkind[ OP( noper ) ] == EXACT )
2876 || OP(noper) == NOTHING )
2878 && noper_next == tail
2883 if ( !first || optype == NOTHING ) {
2884 if (!first) first = cur;
2885 optype = OP( noper );
2891 Currently we do not believe that the trie logic can
2892 handle case insensitive matching properly when the
2893 pattern is not unicode (thus forcing unicode semantics).
2895 If/when this is fixed the following define can be swapped
2896 in below to fully enable trie logic.
2898 #define TRIE_TYPE_IS_SAFE 1
2901 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2903 if ( last && TRIE_TYPE_IS_SAFE ) {
2904 make_trie( pRExC_state,
2905 startbranch, first, cur, tail, count,
2908 if ( PL_regkind[ OP( noper ) ] == EXACT
2910 && noper_next == tail
2915 optype = OP( noper );
2925 regprop(RExC_rx, mysv, cur);
2926 PerlIO_printf( Perl_debug_log,
2927 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2928 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2932 if ( last && TRIE_TYPE_IS_SAFE ) {
2933 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2934 #ifdef TRIE_STUDY_OPT
2935 if ( ((made == MADE_EXACT_TRIE &&
2936 startbranch == first)
2937 || ( first_non_open == first )) &&
2939 flags |= SCF_TRIE_RESTUDY;
2940 if ( startbranch == first
2943 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2953 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2954 scan = NEXTOPER(NEXTOPER(scan));
2955 } else /* single branch is optimized. */
2956 scan = NEXTOPER(scan);
2958 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2959 scan_frame *newframe = NULL;
2964 if (OP(scan) != SUSPEND) {
2965 /* set the pointer */
2966 if (OP(scan) == GOSUB) {
2968 RExC_recurse[ARG2L(scan)] = scan;
2969 start = RExC_open_parens[paren-1];
2970 end = RExC_close_parens[paren-1];
2973 start = RExC_rxi->program + 1;
2977 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2978 SAVEFREEPV(recursed);
2980 if (!PAREN_TEST(recursed,paren+1)) {
2981 PAREN_SET(recursed,paren+1);
2982 Newx(newframe,1,scan_frame);
2984 if (flags & SCF_DO_SUBSTR) {
2985 SCAN_COMMIT(pRExC_state,data,minlenp);
2986 data->longest = &(data->longest_float);
2988 is_inf = is_inf_internal = 1;
2989 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2990 cl_anything(pRExC_state, data->start_class);
2991 flags &= ~SCF_DO_STCLASS;
2994 Newx(newframe,1,scan_frame);
2997 end = regnext(scan);
3002 SAVEFREEPV(newframe);
3003 newframe->next = regnext(scan);
3004 newframe->last = last;
3005 newframe->stop = stopparen;
3006 newframe->prev = frame;
3016 else if (OP(scan) == EXACT) {
3017 I32 l = STR_LEN(scan);
3020 const U8 * const s = (U8*)STRING(scan);
3021 l = utf8_length(s, s + l);
3022 uc = utf8_to_uvchr(s, NULL);
3024 uc = *((U8*)STRING(scan));
3027 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3028 /* The code below prefers earlier match for fixed
3029 offset, later match for variable offset. */
3030 if (data->last_end == -1) { /* Update the start info. */
3031 data->last_start_min = data->pos_min;
3032 data->last_start_max = is_inf
3033 ? I32_MAX : data->pos_min + data->pos_delta;
3035 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3037 SvUTF8_on(data->last_found);
3039 SV * const sv = data->last_found;
3040 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3041 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3042 if (mg && mg->mg_len >= 0)
3043 mg->mg_len += utf8_length((U8*)STRING(scan),
3044 (U8*)STRING(scan)+STR_LEN(scan));
3046 data->last_end = data->pos_min + l;
3047 data->pos_min += l; /* As in the first entry. */
3048 data->flags &= ~SF_BEFORE_EOL;
3050 if (flags & SCF_DO_STCLASS_AND) {
3051 /* Check whether it is compatible with what we know already! */
3055 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3056 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3057 && (!(data->start_class->flags & ANYOF_FOLD)
3058 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3061 ANYOF_CLASS_ZERO(data->start_class);
3062 ANYOF_BITMAP_ZERO(data->start_class);
3064 ANYOF_BITMAP_SET(data->start_class, uc);
3065 data->start_class->flags &= ~ANYOF_EOS;
3067 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3069 else if (flags & SCF_DO_STCLASS_OR) {
3070 /* false positive possible if the class is case-folded */
3072 ANYOF_BITMAP_SET(data->start_class, uc);
3074 data->start_class->flags |= ANYOF_UNICODE_ALL;
3075 data->start_class->flags &= ~ANYOF_EOS;
3076 cl_and(data->start_class, and_withp);
3078 flags &= ~SCF_DO_STCLASS;
3080 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3081 I32 l = STR_LEN(scan);
3082 UV uc = *((U8*)STRING(scan));
3084 /* Search for fixed substrings supports EXACT only. */
3085 if (flags & SCF_DO_SUBSTR) {
3087 SCAN_COMMIT(pRExC_state, data, minlenp);
3090 const U8 * const s = (U8 *)STRING(scan);
3091 l = utf8_length(s, s + l);
3092 uc = utf8_to_uvchr(s, NULL);
3095 if (flags & SCF_DO_SUBSTR)
3097 if (flags & SCF_DO_STCLASS_AND) {
3098 /* Check whether it is compatible with what we know already! */
3102 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3103 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3104 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3106 ANYOF_CLASS_ZERO(data->start_class);
3107 ANYOF_BITMAP_ZERO(data->start_class);
3109 ANYOF_BITMAP_SET(data->start_class, uc);
3110 data->start_class->flags &= ~ANYOF_EOS;
3111 data->start_class->flags |= ANYOF_FOLD;
3112 if (OP(scan) == EXACTFL)
3113 data->start_class->flags |= ANYOF_LOCALE;
3116 else if (flags & SCF_DO_STCLASS_OR) {
3117 if (data->start_class->flags & ANYOF_FOLD) {
3118 /* false positive possible if the class is case-folded.
3119 Assume that the locale settings are the same... */
3121 ANYOF_BITMAP_SET(data->start_class, uc);
3122 data->start_class->flags &= ~ANYOF_EOS;
3124 cl_and(data->start_class, and_withp);
3126 flags &= ~SCF_DO_STCLASS;
3128 else if (REGNODE_VARIES(OP(scan))) {
3129 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3130 I32 f = flags, pos_before = 0;
3131 regnode * const oscan = scan;
3132 struct regnode_charclass_class this_class;
3133 struct regnode_charclass_class *oclass = NULL;
3134 I32 next_is_eval = 0;
3136 switch (PL_regkind[OP(scan)]) {
3137 case WHILEM: /* End of (?:...)* . */
3138 scan = NEXTOPER(scan);
3141 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3142 next = NEXTOPER(scan);
3143 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3145 maxcount = REG_INFTY;
3146 next = regnext(scan);
3147 scan = NEXTOPER(scan);
3151 if (flags & SCF_DO_SUBSTR)
3156 if (flags & SCF_DO_STCLASS) {
3158 maxcount = REG_INFTY;
3159 next = regnext(scan);
3160 scan = NEXTOPER(scan);
3163 is_inf = is_inf_internal = 1;
3164 scan = regnext(scan);
3165 if (flags & SCF_DO_SUBSTR) {
3166 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3167 data->longest = &(data->longest_float);
3169 goto optimize_curly_tail;
3171 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3172 && (scan->flags == stopparen))
3177 mincount = ARG1(scan);
3178 maxcount = ARG2(scan);
3180 next = regnext(scan);
3181 if (OP(scan) == CURLYX) {
3182 I32 lp = (data ? *(data->last_closep) : 0);
3183 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3185 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3186 next_is_eval = (OP(scan) == EVAL);
3188 if (flags & SCF_DO_SUBSTR) {
3189 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3190 pos_before = data->pos_min;
3194 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3196 data->flags |= SF_IS_INF;
3198 if (flags & SCF_DO_STCLASS) {
3199 cl_init(pRExC_state, &this_class);
3200 oclass = data->start_class;
3201 data->start_class = &this_class;
3202 f |= SCF_DO_STCLASS_AND;
3203 f &= ~SCF_DO_STCLASS_OR;
3205 /* These are the cases when once a subexpression
3206 fails at a particular position, it cannot succeed
3207 even after backtracking at the enclosing scope.
3209 XXXX what if minimal match and we are at the
3210 initial run of {n,m}? */
3211 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3212 f &= ~SCF_WHILEM_VISITED_POS;
3214 /* This will finish on WHILEM, setting scan, or on NULL: */
3215 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3216 last, data, stopparen, recursed, NULL,
3218 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3220 if (flags & SCF_DO_STCLASS)
3221 data->start_class = oclass;
3222 if (mincount == 0 || minnext == 0) {
3223 if (flags & SCF_DO_STCLASS_OR) {
3224 cl_or(pRExC_state, data->start_class, &this_class);
3226 else if (flags & SCF_DO_STCLASS_AND) {
3227 /* Switch to OR mode: cache the old value of
3228 * data->start_class */
3230 StructCopy(data->start_class, and_withp,
3231 struct regnode_charclass_class);
3232 flags &= ~SCF_DO_STCLASS_AND;
3233 StructCopy(&this_class, data->start_class,
3234 struct regnode_charclass_class);
3235 flags |= SCF_DO_STCLASS_OR;
3236 data->start_class->flags |= ANYOF_EOS;
3238 } else { /* Non-zero len */
3239 if (flags & SCF_DO_STCLASS_OR) {
3240 cl_or(pRExC_state, data->start_class, &this_class);
3241 cl_and(data->start_class, and_withp);
3243 else if (flags & SCF_DO_STCLASS_AND)
3244 cl_and(data->start_class, &this_class);
3245 flags &= ~SCF_DO_STCLASS;
3247 if (!scan) /* It was not CURLYX, but CURLY. */
3249 if ( /* ? quantifier ok, except for (?{ ... }) */
3250 (next_is_eval || !(mincount == 0 && maxcount == 1))
3251 && (minnext == 0) && (deltanext == 0)
3252 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3253 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3255 ckWARNreg(RExC_parse,
3256 "Quantifier unexpected on zero-length expression");
3259 min += minnext * mincount;
3260 is_inf_internal |= ((maxcount == REG_INFTY
3261 && (minnext + deltanext) > 0)
3262 || deltanext == I32_MAX);
3263 is_inf |= is_inf_internal;
3264 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3266 /* Try powerful optimization CURLYX => CURLYN. */
3267 if ( OP(oscan) == CURLYX && data
3268 && data->flags & SF_IN_PAR
3269 && !(data->flags & SF_HAS_EVAL)
3270 && !deltanext && minnext == 1 ) {
3271 /* Try to optimize to CURLYN. */
3272 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3273 regnode * const nxt1 = nxt;
3280 if (!REGNODE_SIMPLE(OP(nxt))
3281 && !(PL_regkind[OP(nxt)] == EXACT
3282 && STR_LEN(nxt) == 1))
3288 if (OP(nxt) != CLOSE)
3290 if (RExC_open_parens) {
3291 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3292 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3294 /* Now we know that nxt2 is the only contents: */
3295 oscan->flags = (U8)ARG(nxt);
3297 OP(nxt1) = NOTHING; /* was OPEN. */
3300 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3301 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3302 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3303 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3304 OP(nxt + 1) = OPTIMIZED; /* was count. */
3305 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3310 /* Try optimization CURLYX => CURLYM. */
3311 if ( OP(oscan) == CURLYX && data
3312 && !(data->flags & SF_HAS_PAR)
3313 && !(data->flags & SF_HAS_EVAL)
3314 && !deltanext /* atom is fixed width */
3315 && minnext != 0 /* CURLYM can't handle zero width */
3317 /* XXXX How to optimize if data == 0? */
3318 /* Optimize to a simpler form. */
3319 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3323 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3324 && (OP(nxt2) != WHILEM))
3326 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3327 /* Need to optimize away parenths. */
3328 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3329 /* Set the parenth number. */
3330 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3332 oscan->flags = (U8)ARG(nxt);
3333 if (RExC_open_parens) {
3334 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3335 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3337 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3338 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3341 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3342 OP(nxt + 1) = OPTIMIZED; /* was count. */
3343 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3344 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3347 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3348 regnode *nnxt = regnext(nxt1);
3351 if (reg_off_by_arg[OP(nxt1)])
3352 ARG_SET(nxt1, nxt2 - nxt1);
3353 else if (nxt2 - nxt1 < U16_MAX)
3354 NEXT_OFF(nxt1) = nxt2 - nxt1;
3356 OP(nxt) = NOTHING; /* Cannot beautify */
3361 /* Optimize again: */
3362 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3363 NULL, stopparen, recursed, NULL, 0,depth+1);
3368 else if ((OP(oscan) == CURLYX)
3369 && (flags & SCF_WHILEM_VISITED_POS)
3370 /* See the comment on a similar expression above.
3371 However, this time it not a subexpression
3372 we care about, but the expression itself. */
3373 && (maxcount == REG_INFTY)
3374 && data && ++data->whilem_c < 16) {
3375 /* This stays as CURLYX, we can put the count/of pair. */
3376 /* Find WHILEM (as in regexec.c) */
3377 regnode *nxt = oscan + NEXT_OFF(oscan);
3379 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3381 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3382 | (RExC_whilem_seen << 4)); /* On WHILEM */
3384 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3386 if (flags & SCF_DO_SUBSTR) {
3387 SV *last_str = NULL;
3388 int counted = mincount != 0;
3390 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3391 #if defined(SPARC64_GCC_WORKAROUND)
3394 const char *s = NULL;
3397 if (pos_before >= data->last_start_min)
3400 b = data->last_start_min;
3403 s = SvPV_const(data->last_found, l);
3404 old = b - data->last_start_min;
3407 I32 b = pos_before >= data->last_start_min
3408 ? pos_before : data->last_start_min;
3410 const char * const s = SvPV_const(data->last_found, l);
3411 I32 old = b - data->last_start_min;
3415 old = utf8_hop((U8*)s, old) - (U8*)s;
3418 /* Get the added string: */
3419 last_str = newSVpvn_utf8(s + old, l, UTF);
3420 if (deltanext == 0 && pos_before == b) {
3421 /* What was added is a constant string */
3423 SvGROW(last_str, (mincount * l) + 1);
3424 repeatcpy(SvPVX(last_str) + l,
3425 SvPVX_const(last_str), l, mincount - 1);
3426 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3427 /* Add additional parts. */
3428 SvCUR_set(data->last_found,
3429 SvCUR(data->last_found) - l);
3430 sv_catsv(data->last_found, last_str);
3432 SV * sv = data->last_found;
3434 SvUTF8(sv) && SvMAGICAL(sv) ?
3435 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436 if (mg && mg->mg_len >= 0)
3437 mg->mg_len += CHR_SVLEN(last_str) - l;
3439 data->last_end += l * (mincount - 1);
3442 /* start offset must point into the last copy */
3443 data->last_start_min += minnext * (mincount - 1);
3444 data->last_start_max += is_inf ? I32_MAX
3445 : (maxcount - 1) * (minnext + data->pos_delta);
3448 /* It is counted once already... */
3449 data->pos_min += minnext * (mincount - counted);
3450 data->pos_delta += - counted * deltanext +
3451 (minnext + deltanext) * maxcount - minnext * mincount;
3452 if (mincount != maxcount) {
3453 /* Cannot extend fixed substrings found inside
3455 SCAN_COMMIT(pRExC_state,data,minlenp);
3456 if (mincount && last_str) {
3457 SV * const sv = data->last_found;
3458 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3459 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3463 sv_setsv(sv, last_str);
3464 data->last_end = data->pos_min;
3465 data->last_start_min =
3466 data->pos_min - CHR_SVLEN(last_str);
3467 data->last_start_max = is_inf
3469 : data->pos_min + data->pos_delta
3470 - CHR_SVLEN(last_str);
3472 data->longest = &(data->longest_float);
3474 SvREFCNT_dec(last_str);
3476 if (data && (fl & SF_HAS_EVAL))
3477 data->flags |= SF_HAS_EVAL;
3478 optimize_curly_tail:
3479 if (OP(oscan) != CURLYX) {
3480 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3482 NEXT_OFF(oscan) += NEXT_OFF(next);
3485 default: /* REF and CLUMP only? */
3486 if (flags & SCF_DO_SUBSTR) {
3487 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3488 data->longest = &(data->longest_float);
3490 is_inf = is_inf_internal = 1;
3491 if (flags & SCF_DO_STCLASS_OR)
3492 cl_anything(pRExC_state, data->start_class);
3493 flags &= ~SCF_DO_STCLASS;
3497 else if (OP(scan) == LNBREAK) {
3498 if (flags & SCF_DO_STCLASS) {
3500 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3501 if (flags & SCF_DO_STCLASS_AND) {
3502 for (value = 0; value < 256; value++)
3503 if (!is_VERTWS_cp(value))
3504 ANYOF_BITMAP_CLEAR(data->start_class, value);
3507 for (value = 0; value < 256; value++)
3508 if (is_VERTWS_cp(value))
3509 ANYOF_BITMAP_SET(data->start_class, value);
3511 if (flags & SCF_DO_STCLASS_OR)
3512 cl_and(data->start_class, and_withp);
3513 flags &= ~SCF_DO_STCLASS;
3517 if (flags & SCF_DO_SUBSTR) {
3518 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3520 data->pos_delta += 1;
3521 data->longest = &(data->longest_float);
3525 else if (OP(scan) == FOLDCHAR) {
3526 int d = ARG(scan)==0xDF ? 1 : 2;
3527 flags &= ~SCF_DO_STCLASS;
3530 if (flags & SCF_DO_SUBSTR) {
3531 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3533 data->pos_delta += d;
3534 data->longest = &(data->longest_float);
3537 else if (REGNODE_SIMPLE(OP(scan))) {
3540 if (flags & SCF_DO_SUBSTR) {
3541 SCAN_COMMIT(pRExC_state,data,minlenp);
3545 if (flags & SCF_DO_STCLASS) {
3546 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3548 /* Some of the logic below assumes that switching
3549 locale on will only add false positives. */
3550 switch (PL_regkind[OP(scan)]) {
3554 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3555 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3556 cl_anything(pRExC_state, data->start_class);
3559 if (OP(scan) == SANY)
3561 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3562 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3563 || (data->start_class->flags & ANYOF_CLASS));
3564 cl_anything(pRExC_state, data->start_class);
3566 if (flags & SCF_DO_STCLASS_AND || !value)
3567 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3570 if (flags & SCF_DO_STCLASS_AND)
3571 cl_and(data->start_class,
3572 (struct regnode_charclass_class*)scan);
3574 cl_or(pRExC_state, data->start_class,
3575 (struct regnode_charclass_class*)scan);
3578 if (flags & SCF_DO_STCLASS_AND) {
3579 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3580 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3581 for (value = 0; value < 256; value++)
3582 if (!isALNUM(value))
3583 ANYOF_BITMAP_CLEAR(data->start_class, value);
3587 if (data->start_class->flags & ANYOF_LOCALE)
3588 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3590 for (value = 0; value < 256; value++)
3592 ANYOF_BITMAP_SET(data->start_class, value);
3597 if (flags & SCF_DO_STCLASS_AND) {
3598 if (data->start_class->flags & ANYOF_LOCALE)
3599 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3602 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3603 data->start_class->flags |= ANYOF_LOCALE;
3607 if (flags & SCF_DO_STCLASS_AND) {
3608 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3609 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3610 for (value = 0; value < 256; value++)
3612 ANYOF_BITMAP_CLEAR(data->start_class, value);
3616 if (data->start_class->flags & ANYOF_LOCALE)
3617 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3619 for (value = 0; value < 256; value++)
3620 if (!isALNUM(value))
3621 ANYOF_BITMAP_SET(data->start_class, value);
3626 if (flags & SCF_DO_STCLASS_AND) {
3627 if (data->start_class->flags & ANYOF_LOCALE)
3628 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3631 data->start_class->flags |= ANYOF_LOCALE;
3632 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3636 if (flags & SCF_DO_STCLASS_AND) {
3637 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3638 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3639 for (value = 0; value < 256; value++)
3640 if (!isSPACE(value))
3641 ANYOF_BITMAP_CLEAR(data->start_class, value);
3645 if (data->start_class->flags & ANYOF_LOCALE)
3646 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3648 for (value = 0; value < 256; value++)
3650 ANYOF_BITMAP_SET(data->start_class, value);
3655 if (flags & SCF_DO_STCLASS_AND) {
3656 if (data->start_class->flags & ANYOF_LOCALE)
3657 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3660 data->start_class->flags |= ANYOF_LOCALE;
3661 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3665 if (flags & SCF_DO_STCLASS_AND) {
3666 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3667 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3668 for (value = 0; value < 256; value++)
3670 ANYOF_BITMAP_CLEAR(data->start_class, value);
3674 if (data->start_class->flags & ANYOF_LOCALE)
3675 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3677 for (value = 0; value < 256; value++)
3678 if (!isSPACE(value))
3679 ANYOF_BITMAP_SET(data->start_class, value);
3684 if (flags & SCF_DO_STCLASS_AND) {
3685 if (data->start_class->flags & ANYOF_LOCALE) {
3686 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3687 for (value = 0; value < 256; value++)
3688 if (!isSPACE(value))
3689 ANYOF_BITMAP_CLEAR(data->start_class, value);
3693 data->start_class->flags |= ANYOF_LOCALE;
3694 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3698 if (flags & SCF_DO_STCLASS_AND) {
3699 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3700 for (value = 0; value < 256; value++)
3701 if (!isDIGIT(value))
3702 ANYOF_BITMAP_CLEAR(data->start_class, value);
3705 if (data->start_class->flags & ANYOF_LOCALE)
3706 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3708 for (value = 0; value < 256; value++)
3710 ANYOF_BITMAP_SET(data->start_class, value);
3715 if (flags & SCF_DO_STCLASS_AND) {
3716 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3717 for (value = 0; value < 256; value++)
3719 ANYOF_BITMAP_CLEAR(data->start_class, value);
3722 if (data->start_class->flags & ANYOF_LOCALE)
3723 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3725 for (value = 0; value < 256; value++)
3726 if (!isDIGIT(value))
3727 ANYOF_BITMAP_SET(data->start_class, value);
3731 CASE_SYNST_FNC(VERTWS);
3732 CASE_SYNST_FNC(HORIZWS);
3735 if (flags & SCF_DO_STCLASS_OR)
3736 cl_and(data->start_class, and_withp);
3737 flags &= ~SCF_DO_STCLASS;
3740 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3741 data->flags |= (OP(scan) == MEOL
3745 else if ( PL_regkind[OP(scan)] == BRANCHJ
3746 /* Lookbehind, or need to calculate parens/evals/stclass: */
3747 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3748 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3749 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3750 || OP(scan) == UNLESSM )
3752 /* Negative Lookahead/lookbehind
3753 In this case we can't do fixed string optimisation.
3756 I32 deltanext, minnext, fake = 0;
3758 struct regnode_charclass_class intrnl;
3761 data_fake.flags = 0;
3763 data_fake.whilem_c = data->whilem_c;
3764 data_fake.last_closep = data->last_closep;
3767 data_fake.last_closep = &fake;
3768 data_fake.pos_delta = delta;
3769 if ( flags & SCF_DO_STCLASS && !scan->flags
3770 && OP(scan) == IFMATCH ) { /* Lookahead */
3771 cl_init(pRExC_state, &intrnl);
3772 data_fake.start_class = &intrnl;
3773 f |= SCF_DO_STCLASS_AND;
3775 if (flags & SCF_WHILEM_VISITED_POS)
3776 f |= SCF_WHILEM_VISITED_POS;
3777 next = regnext(scan);
3778 nscan = NEXTOPER(NEXTOPER(scan));
3779 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3780 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3783 FAIL("Variable length lookbehind not implemented");
3785 else if (minnext > (I32)U8_MAX) {
3786 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3788 scan->flags = (U8)minnext;
3791 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3793 if (data_fake.flags & SF_HAS_EVAL)
3794 data->flags |= SF_HAS_EVAL;
3795 data->whilem_c = data_fake.whilem_c;
3797 if (f & SCF_DO_STCLASS_AND) {
3798 if (flags & SCF_DO_STCLASS_OR) {
3799 /* OR before, AND after: ideally we would recurse with
3800 * data_fake to get the AND applied by study of the
3801 * remainder of the pattern, and then derecurse;
3802 * *** HACK *** for now just treat as "no information".
3803 * See [perl #56690].
3805 cl_init(pRExC_state, data->start_class);
3807 /* AND before and after: combine and continue */
3808 const int was = (data->start_class->flags & ANYOF_EOS);
3810 cl_and(data->start_class, &intrnl);
3812 data->start_class->flags |= ANYOF_EOS;
3816 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3818 /* Positive Lookahead/lookbehind
3819 In this case we can do fixed string optimisation,
3820 but we must be careful about it. Note in the case of
3821 lookbehind the positions will be offset by the minimum
3822 length of the pattern, something we won't know about
3823 until after the recurse.
3825 I32 deltanext, fake = 0;
3827 struct regnode_charclass_class intrnl;
3829 /* We use SAVEFREEPV so that when the full compile
3830 is finished perl will clean up the allocated
3831 minlens when its all done. This was we don't
3832 have to worry about freeing them when we know
3833 they wont be used, which would be a pain.
3836 Newx( minnextp, 1, I32 );
3837 SAVEFREEPV(minnextp);
3840 StructCopy(data, &data_fake, scan_data_t);
3841 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3844 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3845 data_fake.last_found=newSVsv(data->last_found);
3849 data_fake.last_closep = &fake;
3850 data_fake.flags = 0;
3851 data_fake.pos_delta = delta;
3853 data_fake.flags |= SF_IS_INF;
3854 if ( flags & SCF_DO_STCLASS && !scan->flags
3855 && OP(scan) == IFMATCH ) { /* Lookahead */
3856 cl_init(pRExC_state, &intrnl);
3857 data_fake.start_class = &intrnl;
3858 f |= SCF_DO_STCLASS_AND;
3860 if (flags & SCF_WHILEM_VISITED_POS)
3861 f |= SCF_WHILEM_VISITED_POS;
3862 next = regnext(scan);
3863 nscan = NEXTOPER(NEXTOPER(scan));
3865 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3866 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3869 FAIL("Variable length lookbehind not implemented");
3871 else if (*minnextp > (I32)U8_MAX) {
3872 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3874 scan->flags = (U8)*minnextp;
3879 if (f & SCF_DO_STCLASS_AND) {
3880 const int was = (data->start_class->flags & ANYOF_EOS);
3882 cl_and(data->start_class, &intrnl);
3884 data->start_class->flags |= ANYOF_EOS;
3887 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3889 if (data_fake.flags & SF_HAS_EVAL)
3890 data->flags |= SF_HAS_EVAL;
3891 data->whilem_c = data_fake.whilem_c;
3892 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3893 if (RExC_rx->minlen<*minnextp)
3894 RExC_rx->minlen=*minnextp;
3895 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3896 SvREFCNT_dec(data_fake.last_found);
3898 if ( data_fake.minlen_fixed != minlenp )
3900 data->offset_fixed= data_fake.offset_fixed;
3901 data->minlen_fixed= data_fake.minlen_fixed;
3902 data->lookbehind_fixed+= scan->flags;
3904 if ( data_fake.minlen_float != minlenp )
3906 data->minlen_float= data_fake.minlen_float;
3907 data->offset_float_min=data_fake.offset_float_min;
3908 data->offset_float_max=data_fake.offset_float_max;
3909 data->lookbehind_float+= scan->flags;
3918 else if (OP(scan) == OPEN) {
3919 if (stopparen != (I32)ARG(scan))
3922 else if (OP(scan) == CLOSE) {
3923 if (stopparen == (I32)ARG(scan)) {
3926 if ((I32)ARG(scan) == is_par) {
3927 next = regnext(scan);
3929 if ( next && (OP(next) != WHILEM) && next < last)
3930 is_par = 0; /* Disable optimization */
3933 *(data->last_closep) = ARG(scan);
3935 else if (OP(scan) == EVAL) {
3937 data->flags |= SF_HAS_EVAL;
3939 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3940 if (flags & SCF_DO_SUBSTR) {
3941 SCAN_COMMIT(pRExC_state,data,minlenp);
3942 flags &= ~SCF_DO_SUBSTR;
3944 if (data && OP(scan)==ACCEPT) {
3945 data->flags |= SCF_SEEN_ACCEPT;
3950 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3952 if (flags & SCF_DO_SUBSTR) {
3953 SCAN_COMMIT(pRExC_state,data,minlenp);
3954 data->longest = &(data->longest_float);
3956 is_inf = is_inf_internal = 1;
3957 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3958 cl_anything(pRExC_state, data->start_class);
3959 flags &= ~SCF_DO_STCLASS;
3961 else if (OP(scan) == GPOS) {
3962 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3963 !(delta || is_inf || (data && data->pos_delta)))
3965 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3966 RExC_rx->extflags |= RXf_ANCH_GPOS;
3967 if (RExC_rx->gofs < (U32)min)
3968 RExC_rx->gofs = min;
3970 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3974 #ifdef TRIE_STUDY_OPT
3975 #ifdef FULL_TRIE_STUDY
3976 else if (PL_regkind[OP(scan)] == TRIE) {
3977 /* NOTE - There is similar code to this block above for handling
3978 BRANCH nodes on the initial study. If you change stuff here
3980 regnode *trie_node= scan;
3981 regnode *tail= regnext(scan);
3982 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3983 I32 max1 = 0, min1 = I32_MAX;
3984 struct regnode_charclass_class accum;
3986 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3987 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3988 if (flags & SCF_DO_STCLASS)
3989 cl_init_zero(pRExC_state, &accum);
3995 const regnode *nextbranch= NULL;
3998 for ( word=1 ; word <= trie->wordcount ; word++)
4000 I32 deltanext=0, minnext=0, f = 0, fake;
4001 struct regnode_charclass_class this_class;
4003 data_fake.flags = 0;
4005 data_fake.whilem_c = data->whilem_c;
4006 data_fake.last_closep = data->last_closep;
4009 data_fake.last_closep = &fake;
4010 data_fake.pos_delta = delta;
4011 if (flags & SCF_DO_STCLASS) {
4012 cl_init(pRExC_state, &this_class);
4013 data_fake.start_class = &this_class;
4014 f = SCF_DO_STCLASS_AND;
4016 if (flags & SCF_WHILEM_VISITED_POS)
4017 f |= SCF_WHILEM_VISITED_POS;
4019 if (trie->jump[word]) {
4021 nextbranch = trie_node + trie->jump[0];
4022 scan= trie_node + trie->jump[word];
4023 /* We go from the jump point to the branch that follows
4024 it. Note this means we need the vestigal unused branches
4025 even though they arent otherwise used.
4027 minnext = study_chunk(pRExC_state, &scan, minlenp,
4028 &deltanext, (regnode *)nextbranch, &data_fake,
4029 stopparen, recursed, NULL, f,depth+1);
4031 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4032 nextbranch= regnext((regnode*)nextbranch);
4034 if (min1 > (I32)(minnext + trie->minlen))
4035 min1 = minnext + trie->minlen;
4036 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4037 max1 = minnext + deltanext + trie->maxlen;
4038 if (deltanext == I32_MAX)
4039 is_inf = is_inf_internal = 1;
4041 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4043 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4044 if ( stopmin > min + min1)
4045 stopmin = min + min1;
4046 flags &= ~SCF_DO_SUBSTR;
4048 data->flags |= SCF_SEEN_ACCEPT;
4051 if (data_fake.flags & SF_HAS_EVAL)
4052 data->flags |= SF_HAS_EVAL;
4053 data->whilem_c = data_fake.whilem_c;
4055 if (flags & SCF_DO_STCLASS)
4056 cl_or(pRExC_state, &accum, &this_class);
4059 if (flags & SCF_DO_SUBSTR) {
4060 data->pos_min += min1;
4061 data->pos_delta += max1 - min1;
4062 if (max1 != min1 || is_inf)
4063 data->longest = &(data->longest_float);
4066 delta += max1 - min1;
4067 if (flags & SCF_DO_STCLASS_OR) {
4068 cl_or(pRExC_state, data->start_class, &accum);
4070 cl_and(data->start_class, and_withp);
4071 flags &= ~SCF_DO_STCLASS;
4074 else if (flags & SCF_DO_STCLASS_AND) {
4076 cl_and(data->start_class, &accum);
4077 flags &= ~SCF_DO_STCLASS;
4080 /* Switch to OR mode: cache the old value of
4081 * data->start_class */
4083 StructCopy(data->start_class, and_withp,
4084 struct regnode_charclass_class);
4085 flags &= ~SCF_DO_STCLASS_AND;
4086 StructCopy(&accum, data->start_class,
4087 struct regnode_charclass_class);
4088 flags |= SCF_DO_STCLASS_OR;
4089 data->start_class->flags |= ANYOF_EOS;
4096 else if (PL_regkind[OP(scan)] == TRIE) {
4097 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4100 min += trie->minlen;
4101 delta += (trie->maxlen - trie->minlen);
4102 flags &= ~SCF_DO_STCLASS; /* xxx */
4103 if (flags & SCF_DO_SUBSTR) {
4104 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4105 data->pos_min += trie->minlen;
4106 data->pos_delta += (trie->maxlen - trie->minlen);
4107 if (trie->maxlen != trie->minlen)
4108 data->longest = &(data->longest_float);
4110 if (trie->jump) /* no more substrings -- for now /grr*/
4111 flags &= ~SCF_DO_SUBSTR;
4113 #endif /* old or new */
4114 #endif /* TRIE_STUDY_OPT */
4116 /* Else: zero-length, ignore. */
4117 scan = regnext(scan);
4122 stopparen = frame->stop;
4123 frame = frame->prev;
4124 goto fake_study_recurse;
4129 DEBUG_STUDYDATA("pre-fin:",data,depth);
4132 *deltap = is_inf_internal ? I32_MAX : delta;
4133 if (flags & SCF_DO_SUBSTR && is_inf)
4134 data->pos_delta = I32_MAX - data->pos_min;
4135 if (is_par > (I32)U8_MAX)
4137 if (is_par && pars==1 && data) {
4138 data->flags |= SF_IN_PAR;
4139 data->flags &= ~SF_HAS_PAR;
4141 else if (pars && data) {
4142 data->flags |= SF_HAS_PAR;
4143 data->flags &= ~SF_IN_PAR;
4145 if (flags & SCF_DO_STCLASS_OR)
4146 cl_and(data->start_class, and_withp);
4147 if (flags & SCF_TRIE_RESTUDY)
4148 data->flags |= SCF_TRIE_RESTUDY;
4150 DEBUG_STUDYDATA("post-fin:",data,depth);
4152 return min < stopmin ? min : stopmin;
4156 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4158 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4160 PERL_ARGS_ASSERT_ADD_DATA;
4162 Renewc(RExC_rxi->data,
4163 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4164 char, struct reg_data);
4166 Renew(RExC_rxi->data->what, count + n, U8);
4168 Newx(RExC_rxi->data->what, n, U8);
4169 RExC_rxi->data->count = count + n;
4170 Copy(s, RExC_rxi->data->what + count, n, U8);
4174 /*XXX: todo make this not included in a non debugging perl */
4175 #ifndef PERL_IN_XSUB_RE
4177 Perl_reginitcolors(pTHX)
4180 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4182 char *t = savepv(s);
4186 t = strchr(t, '\t');
4192 PL_colors[i] = t = (char *)"";
4197 PL_colors[i++] = (char *)"";
4204 #ifdef TRIE_STUDY_OPT
4205 #define CHECK_RESTUDY_GOTO \
4207 (data.flags & SCF_TRIE_RESTUDY) \
4211 #define CHECK_RESTUDY_GOTO
4215 - pregcomp - compile a regular expression into internal code
4217 * We can't allocate space until we know how big the compiled form will be,
4218 * but we can't compile it (and thus know how big it is) until we've got a
4219 * place to put the code. So we cheat: we compile it twice, once with code
4220 * generation turned off and size counting turned on, and once "for real".
4221 * This also means that we don't allocate space until we are sure that the
4222 * thing really will compile successfully, and we never have to move the
4223 * code and thus invalidate pointers into it. (Note that it has to be in
4224 * one piece because free() must be able to free it all.) [NB: not true in perl]
4226 * Beware that the optimization-preparation code in here knows about some
4227 * of the structure of the compiled regexp. [I'll say.]
4232 #ifndef PERL_IN_XSUB_RE
4233 #define RE_ENGINE_PTR &reh_regexp_engine
4235 extern const struct regexp_engine my_reg_engine;
4236 #define RE_ENGINE_PTR &my_reg_engine
4239 #ifndef PERL_IN_XSUB_RE
4241 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4244 HV * const table = GvHV(PL_hintgv);
4246 PERL_ARGS_ASSERT_PREGCOMP;
4248 /* Dispatch a request to compile a regexp to correct
4251 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4252 GET_RE_DEBUG_FLAGS_DECL;
4253 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4254 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4256 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4259 return CALLREGCOMP_ENG(eng, pattern, flags);
4262 return Perl_re_compile(aTHX_ pattern, flags);
4267 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4272 register regexp_internal *ri;
4274 char *exp = SvPV(pattern, plen);
4275 char* xend = exp + plen;
4284 RExC_state_t RExC_state;
4285 RExC_state_t * const pRExC_state = &RExC_state;
4286 #ifdef TRIE_STUDY_OPT
4288 RExC_state_t copyRExC_state;
4290 GET_RE_DEBUG_FLAGS_DECL;
4292 PERL_ARGS_ASSERT_RE_COMPILE;
4294 DEBUG_r(if (!PL_colorset) reginitcolors());
4296 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4299 SV *dsv= sv_newmortal();
4300 RE_PV_QUOTED_DECL(s, RExC_utf8,
4301 dsv, exp, plen, 60);
4302 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4303 PL_colors[4],PL_colors[5],s);
4306 /* Longjmp back to here if have to switch in midstream to utf8 */
4307 if (! RExC_orig_utf8) {
4308 JMPENV_PUSH(jump_ret);
4311 if (jump_ret != 0) {
4314 /* Here, we longjmped back. If the cause was other than changing to
4315 * utf8, pop our own setjmp, and longjmp to the correct handler */
4316 if (jump_ret != UTF8_LONGJMP) {
4318 JMPENV_JUMP(jump_ret);
4323 /* It's possible to write a regexp in ascii that represents Unicode
4324 codepoints outside of the byte range, such as via \x{100}. If we
4325 detect such a sequence we have to convert the entire pattern to utf8
4326 and then recompile, as our sizing calculation will have been based
4327 on 1 byte == 1 character, but we will need to use utf8 to encode
4328 at least some part of the pattern, and therefore must convert the whole
4331 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4332 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4333 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4335 RExC_orig_utf8 = RExC_utf8 = 1;
4340 RExC_flags = pm_flags;
4344 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4345 RExC_seen_evals = 0;
4348 /* First pass: determine size, legality. */
4356 RExC_emit = &PL_regdummy;
4357 RExC_whilem_seen = 0;
4358 RExC_open_parens = NULL;
4359 RExC_close_parens = NULL;
4361 RExC_paren_names = NULL;
4363 RExC_paren_name_list = NULL;
4365 RExC_recurse = NULL;
4366 RExC_recurse_count = 0;
4368 #if 0 /* REGC() is (currently) a NOP at the first pass.
4369 * Clever compilers notice this and complain. --jhi */
4370 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4372 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4373 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4374 RExC_precomp = NULL;
4378 /* Here, finished first pass. Get rid of our setjmp, which we added for
4379 * efficiency only if the passed-in string wasn't in utf8, as shown by
4380 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4381 * 1 here even though the original string wasn't utf8, but in this case
4382 * there will have been a long jump */
4383 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4387 PerlIO_printf(Perl_debug_log,
4388 "Required size %"IVdf" nodes\n"
4389 "Starting second pass (creation)\n",
4392 RExC_lastparse=NULL;
4394 /* Small enough for pointer-storage convention?
4395 If extralen==0, this means that we will not need long jumps. */
4396 if (RExC_size >= 0x10000L && RExC_extralen)
4397 RExC_size += RExC_extralen;
4400 if (RExC_whilem_seen > 15)
4401 RExC_whilem_seen = 15;
4403 /* Allocate space and zero-initialize. Note, the two step process
4404 of zeroing when in debug mode, thus anything assigned has to
4405 happen after that */
4406 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4407 r = (struct regexp*)SvANY(rx);
4408 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4409 char, regexp_internal);
4410 if ( r == NULL || ri == NULL )
4411 FAIL("Regexp out of space");
4413 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4414 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4416 /* bulk initialize base fields with 0. */
4417 Zero(ri, sizeof(regexp_internal), char);
4420 /* non-zero initialization begins here */
4422 r->engine= RE_ENGINE_PTR;
4423 r->extflags = pm_flags;
4425 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4426 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4427 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4428 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4429 >> RXf_PMf_STD_PMMOD_SHIFT);
4430 const char *fptr = STD_PAT_MODS; /*"msix"*/
4432 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4433 + (sizeof(STD_PAT_MODS) - 1)
4434 + (sizeof("(?:)") - 1);
4436 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4437 SvCUR_set(rx, wraplen);
4439 SvFLAGS(rx) |= SvUTF8(pattern);
4442 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4444 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4445 char *colon = r + 1;
4448 while((ch = *fptr++)) {
4462 Copy(RExC_precomp, p, plen, char);
4463 assert ((RX_WRAPPED(rx) - p) < 16);
4464 r->pre_prefix = p - RX_WRAPPED(rx);
4473 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4475 if (RExC_seen & REG_SEEN_RECURSE) {
4476 Newxz(RExC_open_parens, RExC_npar,regnode *);
4477 SAVEFREEPV(RExC_open_parens);
4478 Newxz(RExC_close_parens,RExC_npar,regnode *);
4479 SAVEFREEPV(RExC_close_parens);
4482 /* Useful during FAIL. */
4483 #ifdef RE_TRACK_PATTERN_OFFSETS
4484 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4485 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4486 "%s %"UVuf" bytes for offset annotations.\n",
4487 ri->u.offsets ? "Got" : "Couldn't get",
4488 (UV)((2*RExC_size+1) * sizeof(U32))));
4490 SetProgLen(ri,RExC_size);
4494 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4496 /* Second pass: emit code. */
4497 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4502 RExC_emit_start = ri->program;
4503 RExC_emit = ri->program;
4504 RExC_emit_bound = ri->program + RExC_size + 1;
4506 /* Store the count of eval-groups for security checks: */
4507 RExC_rx->seen_evals = RExC_seen_evals;
4508 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4509 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4513 /* XXXX To minimize changes to RE engine we always allocate
4514 3-units-long substrs field. */
4515 Newx(r->substrs, 1, struct reg_substr_data);
4516 if (RExC_recurse_count) {
4517 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4518 SAVEFREEPV(RExC_recurse);
4522 r->minlen = minlen = sawplus = sawopen = 0;
4523 Zero(r->substrs, 1, struct reg_substr_data);
4525 #ifdef TRIE_STUDY_OPT
4527 StructCopy(&zero_scan_data, &data, scan_data_t);
4528 copyRExC_state = RExC_state;
4531 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4533 RExC_state = copyRExC_state;
4534 if (seen & REG_TOP_LEVEL_BRANCHES)
4535 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4537 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4538 if (data.last_found) {
4539 SvREFCNT_dec(data.longest_fixed);
4540 SvREFCNT_dec(data.longest_float);
4541 SvREFCNT_dec(data.last_found);
4543 StructCopy(&zero_scan_data, &data, scan_data_t);
4546 StructCopy(&zero_scan_data, &data, scan_data_t);
4549 /* Dig out information for optimizations. */
4550 r->extflags = RExC_flags; /* was pm_op */
4551 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4554 SvUTF8_on(rx); /* Unicode in it? */
4555 ri->regstclass = NULL;
4556 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4557 r->intflags |= PREGf_NAUGHTY;
4558 scan = ri->program + 1; /* First BRANCH. */
4560 /* testing for BRANCH here tells us whether there is "must appear"
4561 data in the pattern. If there is then we can use it for optimisations */
4562 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4564 STRLEN longest_float_length, longest_fixed_length;
4565 struct regnode_charclass_class ch_class; /* pointed to by data */
4567 I32 last_close = 0; /* pointed to by data */
4568 regnode *first= scan;
4569 regnode *first_next= regnext(first);
4572 * Skip introductions and multiplicators >= 1
4573 * so that we can extract the 'meat' of the pattern that must
4574 * match in the large if() sequence following.
4575 * NOTE that EXACT is NOT covered here, as it is normally
4576 * picked up by the optimiser separately.
4578 * This is unfortunate as the optimiser isnt handling lookahead
4579 * properly currently.
4582 while ((OP(first) == OPEN && (sawopen = 1)) ||
4583 /* An OR of *one* alternative - should not happen now. */
4584 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4585 /* for now we can't handle lookbehind IFMATCH*/
4586 (OP(first) == IFMATCH && !first->flags) ||
4587 (OP(first) == PLUS) ||
4588 (OP(first) == MINMOD) ||
4589 /* An {n,m} with n>0 */
4590 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4591 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4594 * the only op that could be a regnode is PLUS, all the rest
4595 * will be regnode_1 or regnode_2.
4598 if (OP(first) == PLUS)
4601 first += regarglen[OP(first)];
4603 first = NEXTOPER(first);
4604 first_next= regnext(first);
4607 /* Starting-point info. */
4609 DEBUG_PEEP("first:",first,0);
4610 /* Ignore EXACT as we deal with it later. */
4611 if (PL_regkind[OP(first)] == EXACT) {
4612 if (OP(first) == EXACT)
4613 NOOP; /* Empty, get anchored substr later. */
4614 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4615 ri->regstclass = first;
4618 else if (PL_regkind[OP(first)] == TRIE &&
4619 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4622 /* this can happen only on restudy */
4623 if ( OP(first) == TRIE ) {
4624 struct regnode_1 *trieop = (struct regnode_1 *)
4625 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4626 StructCopy(first,trieop,struct regnode_1);
4627 trie_op=(regnode *)trieop;
4629 struct regnode_charclass *trieop = (struct regnode_charclass *)
4630 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4631 StructCopy(first,trieop,struct regnode_charclass);
4632 trie_op=(regnode *)trieop;
4635 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4636 ri->regstclass = trie_op;
4639 else if (REGNODE_SIMPLE(OP(first)))
4640 ri->regstclass = first;
4641 else if (PL_regkind[OP(first)] == BOUND ||
4642 PL_regkind[OP(first)] == NBOUND)
4643 ri->regstclass = first;
4644 else if (PL_regkind[OP(first)] == BOL) {
4645 r->extflags |= (OP(first) == MBOL
4647 : (OP(first) == SBOL
4650 first = NEXTOPER(first);
4653 else if (OP(first) == GPOS) {
4654 r->extflags |= RXf_ANCH_GPOS;
4655 first = NEXTOPER(first);
4658 else if ((!sawopen || !RExC_sawback) &&
4659 (OP(first) == STAR &&
4660 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4661 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4663 /* turn .* into ^.* with an implied $*=1 */
4665 (OP(NEXTOPER(first)) == REG_ANY)
4668 r->extflags |= type;
4669 r->intflags |= PREGf_IMPLICIT;
4670 first = NEXTOPER(first);
4673 if (sawplus && (!sawopen || !RExC_sawback)
4674 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4675 /* x+ must match at the 1st pos of run of x's */
4676 r->intflags |= PREGf_SKIP;
4678 /* Scan is after the zeroth branch, first is atomic matcher. */
4679 #ifdef TRIE_STUDY_OPT
4682 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4683 (IV)(first - scan + 1))
4687 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4688 (IV)(first - scan + 1))
4694 * If there's something expensive in the r.e., find the
4695 * longest literal string that must appear and make it the
4696 * regmust. Resolve ties in favor of later strings, since
4697 * the regstart check works with the beginning of the r.e.
4698 * and avoiding duplication strengthens checking. Not a
4699 * strong reason, but sufficient in the absence of others.
4700 * [Now we resolve ties in favor of the earlier string if
4701 * it happens that c_offset_min has been invalidated, since the
4702 * earlier string may buy us something the later one won't.]
4705 data.longest_fixed = newSVpvs("");
4706 data.longest_float = newSVpvs("");
4707 data.last_found = newSVpvs("");
4708 data.longest = &(data.longest_fixed);
4710 if (!ri->regstclass) {
4711 cl_init(pRExC_state, &ch_class);
4712 data.start_class = &ch_class;
4713 stclass_flag = SCF_DO_STCLASS_AND;
4714 } else /* XXXX Check for BOUND? */
4716 data.last_closep = &last_close;
4718 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4719 &data, -1, NULL, NULL,
4720 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4726 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4727 && data.last_start_min == 0 && data.last_end > 0
4728 && !RExC_seen_zerolen
4729 && !(RExC_seen & REG_SEEN_VERBARG)
4730 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4731 r->extflags |= RXf_CHECK_ALL;
4732 scan_commit(pRExC_state, &data,&minlen,0);
4733 SvREFCNT_dec(data.last_found);
4735 /* Note that code very similar to this but for anchored string
4736 follows immediately below, changes may need to be made to both.
4739 longest_float_length = CHR_SVLEN(data.longest_float);
4740 if (longest_float_length
4741 || (data.flags & SF_FL_BEFORE_EOL
4742 && (!(data.flags & SF_FL_BEFORE_MEOL)
4743 || (RExC_flags & RXf_PMf_MULTILINE))))
4747 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4748 && data.offset_fixed == data.offset_float_min
4749 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4750 goto remove_float; /* As in (a)+. */
4752 /* copy the information about the longest float from the reg_scan_data
4753 over to the program. */
4754 if (SvUTF8(data.longest_float)) {
4755 r->float_utf8 = data.longest_float;
4756 r->float_substr = NULL;
4758 r->float_substr = data.longest_float;
4759 r->float_utf8 = NULL;
4761 /* float_end_shift is how many chars that must be matched that
4762 follow this item. We calculate it ahead of time as once the
4763 lookbehind offset is added in we lose the ability to correctly
4765 ml = data.minlen_float ? *(data.minlen_float)
4766 : (I32)longest_float_length;
4767 r->float_end_shift = ml - data.offset_float_min
4768 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4769 + data.lookbehind_float;
4770 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4771 r->float_max_offset = data.offset_float_max;
4772 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4773 r->float_max_offset -= data.lookbehind_float;
4775 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4776 && (!(data.flags & SF_FL_BEFORE_MEOL)
4777 || (RExC_flags & RXf_PMf_MULTILINE)));
4778 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4782 r->float_substr = r->float_utf8 = NULL;
4783 SvREFCNT_dec(data.longest_float);
4784 longest_float_length = 0;
4787 /* Note that code very similar to this but for floating string
4788 is immediately above, changes may need to be made to both.
4791 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4792 if (longest_fixed_length
4793 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4794 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4795 || (RExC_flags & RXf_PMf_MULTILINE))))
4799 /* copy the information about the longest fixed
4800 from the reg_scan_data over to the program. */
4801 if (SvUTF8(data.longest_fixed)) {
4802 r->anchored_utf8 = data.longest_fixed;
4803 r->anchored_substr = NULL;
4805 r->anchored_substr = data.longest_fixed;
4806 r->anchored_utf8 = NULL;
4808 /* fixed_end_shift is how many chars that must be matched that
4809 follow this item. We calculate it ahead of time as once the
4810 lookbehind offset is added in we lose the ability to correctly
4812 ml = data.minlen_fixed ? *(data.minlen_fixed)
4813 : (I32)longest_fixed_length;
4814 r->anchored_end_shift = ml - data.offset_fixed
4815 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4816 + data.lookbehind_fixed;
4817 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4819 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4820 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4821 || (RExC_flags & RXf_PMf_MULTILINE)));
4822 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4825 r->anchored_substr = r->anchored_utf8 = NULL;
4826 SvREFCNT_dec(data.longest_fixed);
4827 longest_fixed_length = 0;
4830 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4831 ri->regstclass = NULL;
4832 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4834 && !(data.start_class->flags & ANYOF_EOS)
4835 && !cl_is_anything(data.start_class))
4837 const U32 n = add_data(pRExC_state, 1, "f");
4839 Newx(RExC_rxi->data->data[n], 1,
4840 struct regnode_charclass_class);
4841 StructCopy(data.start_class,
4842 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4843 struct regnode_charclass_class);
4844 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4845 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4846 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4847 regprop(r, sv, (regnode*)data.start_class);
4848 PerlIO_printf(Perl_debug_log,
4849 "synthetic stclass \"%s\".\n",
4850 SvPVX_const(sv));});
4853 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4854 if (longest_fixed_length > longest_float_length) {
4855 r->check_end_shift = r->anchored_end_shift;
4856 r->check_substr = r->anchored_substr;
4857 r->check_utf8 = r->anchored_utf8;
4858 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4859 if (r->extflags & RXf_ANCH_SINGLE)
4860 r->extflags |= RXf_NOSCAN;
4863 r->check_end_shift = r->float_end_shift;
4864 r->check_substr = r->float_substr;
4865 r->check_utf8 = r->float_utf8;
4866 r->check_offset_min = r->float_min_offset;
4867 r->check_offset_max = r->float_max_offset;
4869 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4870 This should be changed ASAP! */
4871 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4872 r->extflags |= RXf_USE_INTUIT;
4873 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4874 r->extflags |= RXf_INTUIT_TAIL;
4876 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4877 if ( (STRLEN)minlen < longest_float_length )
4878 minlen= longest_float_length;
4879 if ( (STRLEN)minlen < longest_fixed_length )
4880 minlen= longest_fixed_length;
4884 /* Several toplevels. Best we can is to set minlen. */
4886 struct regnode_charclass_class ch_class;
4889 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4891 scan = ri->program + 1;
4892 cl_init(pRExC_state, &ch_class);
4893 data.start_class = &ch_class;
4894 data.last_closep = &last_close;
4897 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4898 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4902 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4903 = r->float_substr = r->float_utf8 = NULL;
4904 if (!(data.start_class->flags & ANYOF_EOS)
4905 && !cl_is_anything(data.start_class))
4907 const U32 n = add_data(pRExC_state, 1, "f");
4909 Newx(RExC_rxi->data->data[n], 1,
4910 struct regnode_charclass_class);
4911 StructCopy(data.start_class,
4912 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4913 struct regnode_charclass_class);
4914 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4915 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4916 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4917 regprop(r, sv, (regnode*)data.start_class);
4918 PerlIO_printf(Perl_debug_log,
4919 "synthetic stclass \"%s\".\n",
4920 SvPVX_const(sv));});
4924 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4925 the "real" pattern. */
4927 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4928 (IV)minlen, (IV)r->minlen);
4930 r->minlenret = minlen;
4931 if (r->minlen < minlen)
4934 if (RExC_seen & REG_SEEN_GPOS)
4935 r->extflags |= RXf_GPOS_SEEN;
4936 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4937 r->extflags |= RXf_LOOKBEHIND_SEEN;
4938 if (RExC_seen & REG_SEEN_EVAL)
4939 r->extflags |= RXf_EVAL_SEEN;
4940 if (RExC_seen & REG_SEEN_CANY)
4941 r->extflags |= RXf_CANY_SEEN;
4942 if (RExC_seen & REG_SEEN_VERBARG)
4943 r->intflags |= PREGf_VERBARG_SEEN;
4944 if (RExC_seen & REG_SEEN_CUTGROUP)
4945 r->intflags |= PREGf_CUTGROUP_SEEN;
4946 if (RExC_paren_names)
4947 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4949 RXp_PAREN_NAMES(r) = NULL;
4951 #ifdef STUPID_PATTERN_CHECKS
4952 if (RX_PRELEN(rx) == 0)
4953 r->extflags |= RXf_NULL;
4954 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4955 /* XXX: this should happen BEFORE we compile */
4956 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4957 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4958 r->extflags |= RXf_WHITE;
4959 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4960 r->extflags |= RXf_START_ONLY;
4962 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4963 /* XXX: this should happen BEFORE we compile */
4964 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4966 regnode *first = ri->program + 1;
4968 U8 nop = OP(NEXTOPER(first));
4970 if (PL_regkind[fop] == NOTHING && nop == END)
4971 r->extflags |= RXf_NULL;
4972 else if (PL_regkind[fop] == BOL && nop == END)
4973 r->extflags |= RXf_START_ONLY;
4974 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4975 r->extflags |= RXf_WHITE;
4979 if (RExC_paren_names) {
4980 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
4981 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4984 ri->name_list_idx = 0;
4986 if (RExC_recurse_count) {
4987 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4988 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4989 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4992 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4993 /* assume we don't need to swap parens around before we match */
4996 PerlIO_printf(Perl_debug_log,"Final program:\n");
4999 #ifdef RE_TRACK_PATTERN_OFFSETS
5000 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5001 const U32 len = ri->u.offsets[0];
5003 GET_RE_DEBUG_FLAGS_DECL;
5004 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5005 for (i = 1; i <= len; i++) {
5006 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5007 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5008 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5010 PerlIO_printf(Perl_debug_log, "\n");
5016 #undef RE_ENGINE_PTR
5020 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5023 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5025 PERL_UNUSED_ARG(value);
5027 if (flags & RXapif_FETCH) {
5028 return reg_named_buff_fetch(rx, key, flags);
5029 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5030 Perl_croak_no_modify(aTHX);
5032 } else if (flags & RXapif_EXISTS) {
5033 return reg_named_buff_exists(rx, key, flags)
5036 } else if (flags & RXapif_REGNAMES) {
5037 return reg_named_buff_all(rx, flags);
5038 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5039 return reg_named_buff_scalar(rx, flags);
5041 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5047 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5050 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5051 PERL_UNUSED_ARG(lastkey);
5053 if (flags & RXapif_FIRSTKEY)
5054 return reg_named_buff_firstkey(rx, flags);
5055 else if (flags & RXapif_NEXTKEY)
5056 return reg_named_buff_nextkey(rx, flags);
5058 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5064 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5067 AV *retarray = NULL;
5069 struct regexp *const rx = (struct regexp *)SvANY(r);
5071 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5073 if (flags & RXapif_ALL)
5076 if (rx && RXp_PAREN_NAMES(rx)) {
5077 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5080 SV* sv_dat=HeVAL(he_str);
5081 I32 *nums=(I32*)SvPVX(sv_dat);
5082 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5083 if ((I32)(rx->nparens) >= nums[i]
5084 && rx->offs[nums[i]].start != -1
5085 && rx->offs[nums[i]].end != -1)
5088 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5092 ret = newSVsv(&PL_sv_undef);
5095 av_push(retarray, ret);
5098 return newRV_noinc(MUTABLE_SV(retarray));
5105 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5108 struct regexp *const rx = (struct regexp *)SvANY(r);
5110 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5112 if (rx && RXp_PAREN_NAMES(rx)) {
5113 if (flags & RXapif_ALL) {
5114 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5116 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5130 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5132 struct regexp *const rx = (struct regexp *)SvANY(r);
5134 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5136 if ( rx && RXp_PAREN_NAMES(rx) ) {
5137 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5139 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5146 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5148 struct regexp *const rx = (struct regexp *)SvANY(r);
5149 GET_RE_DEBUG_FLAGS_DECL;
5151 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5153 if (rx && RXp_PAREN_NAMES(rx)) {
5154 HV *hv = RXp_PAREN_NAMES(rx);
5156 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5159 SV* sv_dat = HeVAL(temphe);
5160 I32 *nums = (I32*)SvPVX(sv_dat);
5161 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5162 if ((I32)(rx->lastparen) >= nums[i] &&
5163 rx->offs[nums[i]].start != -1 &&
5164 rx->offs[nums[i]].end != -1)
5170 if (parno || flags & RXapif_ALL) {
5171 return newSVhek(HeKEY_hek(temphe));
5179 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5184 struct regexp *const rx = (struct regexp *)SvANY(r);
5186 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5188 if (rx && RXp_PAREN_NAMES(rx)) {
5189 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5190 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5191 } else if (flags & RXapif_ONE) {
5192 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5193 av = MUTABLE_AV(SvRV(ret));
5194 length = av_len(av);
5196 return newSViv(length + 1);
5198 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5202 return &PL_sv_undef;
5206 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5208 struct regexp *const rx = (struct regexp *)SvANY(r);
5211 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5213 if (rx && RXp_PAREN_NAMES(rx)) {
5214 HV *hv= RXp_PAREN_NAMES(rx);
5216 (void)hv_iterinit(hv);
5217 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5220 SV* sv_dat = HeVAL(temphe);
5221 I32 *nums = (I32*)SvPVX(sv_dat);
5222 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5223 if ((I32)(rx->lastparen) >= nums[i] &&
5224 rx->offs[nums[i]].start != -1 &&
5225 rx->offs[nums[i]].end != -1)
5231 if (parno || flags & RXapif_ALL) {
5232 av_push(av, newSVhek(HeKEY_hek(temphe)));
5237 return newRV_noinc(MUTABLE_SV(av));
5241 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5244 struct regexp *const rx = (struct regexp *)SvANY(r);
5249 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5252 sv_setsv(sv,&PL_sv_undef);
5256 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5258 i = rx->offs[0].start;
5262 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5264 s = rx->subbeg + rx->offs[0].end;
5265 i = rx->sublen - rx->offs[0].end;
5268 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5269 (s1 = rx->offs[paren].start) != -1 &&
5270 (t1 = rx->offs[paren].end) != -1)
5274 s = rx->subbeg + s1;
5276 sv_setsv(sv,&PL_sv_undef);
5279 assert(rx->sublen >= (s - rx->subbeg) + i );
5281 const int oldtainted = PL_tainted;
5283 sv_setpvn(sv, s, i);
5284 PL_tainted = oldtainted;
5285 if ( (rx->extflags & RXf_CANY_SEEN)
5286 ? (RXp_MATCH_UTF8(rx)
5287 && (!i || is_utf8_string((U8*)s, i)))
5288 : (RXp_MATCH_UTF8(rx)) )
5295 if (RXp_MATCH_TAINTED(rx)) {
5296 if (SvTYPE(sv) >= SVt_PVMG) {
5297 MAGIC* const mg = SvMAGIC(sv);
5300 SvMAGIC_set(sv, mg->mg_moremagic);
5302 if ((mgt = SvMAGIC(sv))) {
5303 mg->mg_moremagic = mgt;
5304 SvMAGIC_set(sv, mg);
5314 sv_setsv(sv,&PL_sv_undef);
5320 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5321 SV const * const value)
5323 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5325 PERL_UNUSED_ARG(rx);
5326 PERL_UNUSED_ARG(paren);
5327 PERL_UNUSED_ARG(value);
5330 Perl_croak_no_modify(aTHX);
5334 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5337 struct regexp *const rx = (struct regexp *)SvANY(r);
5341 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5343 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5345 /* $` / ${^PREMATCH} */
5346 case RX_BUFF_IDX_PREMATCH:
5347 if (rx->offs[0].start != -1) {
5348 i = rx->offs[0].start;
5356 /* $' / ${^POSTMATCH} */
5357 case RX_BUFF_IDX_POSTMATCH:
5358 if (rx->offs[0].end != -1) {
5359 i = rx->sublen - rx->offs[0].end;
5361 s1 = rx->offs[0].end;
5367 /* $& / ${^MATCH}, $1, $2, ... */
5369 if (paren <= (I32)rx->nparens &&
5370 (s1 = rx->offs[paren].start) != -1 &&
5371 (t1 = rx->offs[paren].end) != -1)
5376 if (ckWARN(WARN_UNINITIALIZED))
5377 report_uninit((const SV *)sv);
5382 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5383 const char * const s = rx->subbeg + s1;
5388 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5395 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5397 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5398 PERL_UNUSED_ARG(rx);
5402 return newSVpvs("Regexp");
5405 /* Scans the name of a named buffer from the pattern.
5406 * If flags is REG_RSN_RETURN_NULL returns null.
5407 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5408 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5409 * to the parsed name as looked up in the RExC_paren_names hash.
5410 * If there is an error throws a vFAIL().. type exception.
5413 #define REG_RSN_RETURN_NULL 0
5414 #define REG_RSN_RETURN_NAME 1
5415 #define REG_RSN_RETURN_DATA 2
5418 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5420 char *name_start = RExC_parse;
5422 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5424 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5425 /* skip IDFIRST by using do...while */
5428 RExC_parse += UTF8SKIP(RExC_parse);
5429 } while (isALNUM_utf8((U8*)RExC_parse));
5433 } while (isALNUM(*RExC_parse));
5438 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5439 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5440 if ( flags == REG_RSN_RETURN_NAME)
5442 else if (flags==REG_RSN_RETURN_DATA) {
5445 if ( ! sv_name ) /* should not happen*/
5446 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5447 if (RExC_paren_names)
5448 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5450 sv_dat = HeVAL(he_str);
5452 vFAIL("Reference to nonexistent named group");
5456 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5463 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5464 int rem=(int)(RExC_end - RExC_parse); \
5473 if (RExC_lastparse!=RExC_parse) \
5474 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5477 iscut ? "..." : "<" \
5480 PerlIO_printf(Perl_debug_log,"%16s",""); \
5483 num = RExC_size + 1; \
5485 num=REG_NODE_NUM(RExC_emit); \
5486 if (RExC_lastnum!=num) \
5487 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5489 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5490 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5491 (int)((depth*2)), "", \
5495 RExC_lastparse=RExC_parse; \
5500 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5501 DEBUG_PARSE_MSG((funcname)); \
5502 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5504 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5505 DEBUG_PARSE_MSG((funcname)); \
5506 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5509 - reg - regular expression, i.e. main body or parenthesized thing
5511 * Caller must absorb opening parenthesis.
5513 * Combining parenthesis handling with the base level of regular expression
5514 * is a trifle forced, but the need to tie the tails of the branches to what
5515 * follows makes it hard to avoid.
5517 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5519 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5521 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5525 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5526 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5529 register regnode *ret; /* Will be the head of the group. */
5530 register regnode *br;
5531 register regnode *lastbr;
5532 register regnode *ender = NULL;
5533 register I32 parno = 0;
5535 U32 oregflags = RExC_flags;
5536 bool have_branch = 0;
5538 I32 freeze_paren = 0;
5539 I32 after_freeze = 0;
5541 /* for (?g), (?gc), and (?o) warnings; warning
5542 about (?c) will warn about (?g) -- japhy */
5544 #define WASTED_O 0x01
5545 #define WASTED_G 0x02
5546 #define WASTED_C 0x04
5547 #define WASTED_GC (0x02|0x04)
5548 I32 wastedflags = 0x00;
5550 char * parse_start = RExC_parse; /* MJD */
5551 char * const oregcomp_parse = RExC_parse;
5553 GET_RE_DEBUG_FLAGS_DECL;
5555 PERL_ARGS_ASSERT_REG;
5556 DEBUG_PARSE("reg ");
5558 *flagp = 0; /* Tentatively. */
5561 /* Make an OPEN node, if parenthesized. */
5563 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5564 char *start_verb = RExC_parse;
5565 STRLEN verb_len = 0;
5566 char *start_arg = NULL;
5567 unsigned char op = 0;
5569 int internal_argval = 0; /* internal_argval is only useful if !argok */
5570 while ( *RExC_parse && *RExC_parse != ')' ) {
5571 if ( *RExC_parse == ':' ) {
5572 start_arg = RExC_parse + 1;
5578 verb_len = RExC_parse - start_verb;
5581 while ( *RExC_parse && *RExC_parse != ')' )
5583 if ( *RExC_parse != ')' )
5584 vFAIL("Unterminated verb pattern argument");
5585 if ( RExC_parse == start_arg )
5588 if ( *RExC_parse != ')' )
5589 vFAIL("Unterminated verb pattern");
5592 switch ( *start_verb ) {
5593 case 'A': /* (*ACCEPT) */
5594 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5596 internal_argval = RExC_nestroot;
5599 case 'C': /* (*COMMIT) */
5600 if ( memEQs(start_verb,verb_len,"COMMIT") )
5603 case 'F': /* (*FAIL) */
5604 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5609 case ':': /* (*:NAME) */
5610 case 'M': /* (*MARK:NAME) */
5611 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5616 case 'P': /* (*PRUNE) */
5617 if ( memEQs(start_verb,verb_len,"PRUNE") )
5620 case 'S': /* (*SKIP) */
5621 if ( memEQs(start_verb,verb_len,"SKIP") )
5624 case 'T': /* (*THEN) */
5625 /* [19:06] <TimToady> :: is then */
5626 if ( memEQs(start_verb,verb_len,"THEN") ) {
5628 RExC_seen |= REG_SEEN_CUTGROUP;
5634 vFAIL3("Unknown verb pattern '%.*s'",
5635 verb_len, start_verb);
5638 if ( start_arg && internal_argval ) {
5639 vFAIL3("Verb pattern '%.*s' may not have an argument",
5640 verb_len, start_verb);
5641 } else if ( argok < 0 && !start_arg ) {
5642 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5643 verb_len, start_verb);
5645 ret = reganode(pRExC_state, op, internal_argval);
5646 if ( ! internal_argval && ! SIZE_ONLY ) {
5648 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5649 ARG(ret) = add_data( pRExC_state, 1, "S" );
5650 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5657 if (!internal_argval)
5658 RExC_seen |= REG_SEEN_VERBARG;
5659 } else if ( start_arg ) {
5660 vFAIL3("Verb pattern '%.*s' may not have an argument",
5661 verb_len, start_verb);
5663 ret = reg_node(pRExC_state, op);
5665 nextchar(pRExC_state);
5668 if (*RExC_parse == '?') { /* (?...) */
5669 bool is_logical = 0;
5670 const char * const seqstart = RExC_parse;
5673 paren = *RExC_parse++;
5674 ret = NULL; /* For look-ahead/behind. */
5677 case 'P': /* (?P...) variants for those used to PCRE/Python */
5678 paren = *RExC_parse++;
5679 if ( paren == '<') /* (?P<...>) named capture */
5681 else if (paren == '>') { /* (?P>name) named recursion */
5682 goto named_recursion;
5684 else if (paren == '=') { /* (?P=...) named backref */
5685 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5686 you change this make sure you change that */
5687 char* name_start = RExC_parse;
5689 SV *sv_dat = reg_scan_name(pRExC_state,
5690 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5691 if (RExC_parse == name_start || *RExC_parse != ')')
5692 vFAIL2("Sequence %.3s... not terminated",parse_start);
5695 num = add_data( pRExC_state, 1, "S" );
5696 RExC_rxi->data->data[num]=(void*)sv_dat;
5697 SvREFCNT_inc_simple_void(sv_dat);
5700 ret = reganode(pRExC_state,
5701 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5705 Set_Node_Offset(ret, parse_start+1);
5706 Set_Node_Cur_Length(ret); /* MJD */
5708 nextchar(pRExC_state);
5712 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5714 case '<': /* (?<...) */
5715 if (*RExC_parse == '!')
5717 else if (*RExC_parse != '=')
5723 case '\'': /* (?'...') */
5724 name_start= RExC_parse;
5725 svname = reg_scan_name(pRExC_state,
5726 SIZE_ONLY ? /* reverse test from the others */
5727 REG_RSN_RETURN_NAME :
5728 REG_RSN_RETURN_NULL);
5729 if (RExC_parse == name_start) {
5731 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5734 if (*RExC_parse != paren)
5735 vFAIL2("Sequence (?%c... not terminated",
5736 paren=='>' ? '<' : paren);
5740 if (!svname) /* shouldnt happen */
5742 "panic: reg_scan_name returned NULL");
5743 if (!RExC_paren_names) {
5744 RExC_paren_names= newHV();
5745 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5747 RExC_paren_name_list= newAV();
5748 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5751 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5753 sv_dat = HeVAL(he_str);
5755 /* croak baby croak */
5757 "panic: paren_name hash element allocation failed");
5758 } else if ( SvPOK(sv_dat) ) {
5759 /* (?|...) can mean we have dupes so scan to check
5760 its already been stored. Maybe a flag indicating
5761 we are inside such a construct would be useful,
5762 but the arrays are likely to be quite small, so
5763 for now we punt -- dmq */
5764 IV count = SvIV(sv_dat);
5765 I32 *pv = (I32*)SvPVX(sv_dat);
5767 for ( i = 0 ; i < count ; i++ ) {
5768 if ( pv[i] == RExC_npar ) {
5774 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5775 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5776 pv[count] = RExC_npar;
5777 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5780 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5781 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5783 SvIV_set(sv_dat, 1);
5786 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5787 SvREFCNT_dec(svname);
5790 /*sv_dump(sv_dat);*/
5792 nextchar(pRExC_state);
5794 goto capturing_parens;
5796 RExC_seen |= REG_SEEN_LOOKBEHIND;
5798 case '=': /* (?=...) */
5799 RExC_seen_zerolen++;
5801 case '!': /* (?!...) */
5802 RExC_seen_zerolen++;
5803 if (*RExC_parse == ')') {
5804 ret=reg_node(pRExC_state, OPFAIL);
5805 nextchar(pRExC_state);
5809 case '|': /* (?|...) */
5810 /* branch reset, behave like a (?:...) except that
5811 buffers in alternations share the same numbers */
5813 after_freeze = freeze_paren = RExC_npar;
5815 case ':': /* (?:...) */
5816 case '>': /* (?>...) */
5818 case '$': /* (?$...) */
5819 case '@': /* (?@...) */
5820 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5822 case '#': /* (?#...) */
5823 while (*RExC_parse && *RExC_parse != ')')
5825 if (*RExC_parse != ')')
5826 FAIL("Sequence (?#... not terminated");
5827 nextchar(pRExC_state);
5830 case '0' : /* (?0) */
5831 case 'R' : /* (?R) */
5832 if (*RExC_parse != ')')
5833 FAIL("Sequence (?R) not terminated");
5834 ret = reg_node(pRExC_state, GOSTART);
5835 *flagp |= POSTPONED;
5836 nextchar(pRExC_state);
5839 { /* named and numeric backreferences */
5841 case '&': /* (?&NAME) */
5842 parse_start = RExC_parse - 1;
5845 SV *sv_dat = reg_scan_name(pRExC_state,
5846 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5847 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5849 goto gen_recurse_regop;
5852 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5854 vFAIL("Illegal pattern");
5856 goto parse_recursion;
5858 case '-': /* (?-1) */
5859 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5860 RExC_parse--; /* rewind to let it be handled later */
5864 case '1': case '2': case '3': case '4': /* (?1) */
5865 case '5': case '6': case '7': case '8': case '9':
5868 num = atoi(RExC_parse);
5869 parse_start = RExC_parse - 1; /* MJD */
5870 if (*RExC_parse == '-')
5872 while (isDIGIT(*RExC_parse))
5874 if (*RExC_parse!=')')
5875 vFAIL("Expecting close bracket");
5878 if ( paren == '-' ) {
5880 Diagram of capture buffer numbering.
5881 Top line is the normal capture buffer numbers
5882 Botton line is the negative indexing as from
5886 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5890 num = RExC_npar + num;
5893 vFAIL("Reference to nonexistent group");
5895 } else if ( paren == '+' ) {
5896 num = RExC_npar + num - 1;
5899 ret = reganode(pRExC_state, GOSUB, num);
5901 if (num > (I32)RExC_rx->nparens) {
5903 vFAIL("Reference to nonexistent group");
5905 ARG2L_SET( ret, RExC_recurse_count++);
5907 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5908 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5912 RExC_seen |= REG_SEEN_RECURSE;
5913 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5914 Set_Node_Offset(ret, parse_start); /* MJD */
5916 *flagp |= POSTPONED;
5917 nextchar(pRExC_state);
5919 } /* named and numeric backreferences */
5922 case '?': /* (??...) */
5924 if (*RExC_parse != '{') {
5926 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5929 *flagp |= POSTPONED;
5930 paren = *RExC_parse++;
5932 case '{': /* (?{...}) */
5937 char *s = RExC_parse;
5939 RExC_seen_zerolen++;
5940 RExC_seen |= REG_SEEN_EVAL;
5941 while (count && (c = *RExC_parse)) {
5952 if (*RExC_parse != ')') {
5954 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5958 OP_4tree *sop, *rop;
5959 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5962 Perl_save_re_context(aTHX);
5963 rop = sv_compile_2op(sv, &sop, "re", &pad);
5964 sop->op_private |= OPpREFCOUNTED;
5965 /* re_dup will OpREFCNT_inc */
5966 OpREFCNT_set(sop, 1);
5969 n = add_data(pRExC_state, 3, "nop");
5970 RExC_rxi->data->data[n] = (void*)rop;
5971 RExC_rxi->data->data[n+1] = (void*)sop;
5972 RExC_rxi->data->data[n+2] = (void*)pad;
5975 else { /* First pass */
5976 if (PL_reginterp_cnt < ++RExC_seen_evals
5978 /* No compiled RE interpolated, has runtime
5979 components ===> unsafe. */
5980 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5981 if (PL_tainting && PL_tainted)
5982 FAIL("Eval-group in insecure regular expression");
5983 #if PERL_VERSION > 8
5984 if (IN_PERL_COMPILETIME)
5989 nextchar(pRExC_state);
5991 ret = reg_node(pRExC_state, LOGICAL);
5994 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5995 /* deal with the length of this later - MJD */
5998 ret = reganode(pRExC_state, EVAL, n);
5999 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6000 Set_Node_Offset(ret, parse_start);
6003 case '(': /* (?(?{...})...) and (?(?=...)...) */
6006 if (RExC_parse[0] == '?') { /* (?(?...)) */
6007 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6008 || RExC_parse[1] == '<'
6009 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6012 ret = reg_node(pRExC_state, LOGICAL);
6015 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6019 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6020 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6022 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6023 char *name_start= RExC_parse++;
6025 SV *sv_dat=reg_scan_name(pRExC_state,
6026 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6027 if (RExC_parse == name_start || *RExC_parse != ch)
6028 vFAIL2("Sequence (?(%c... not terminated",
6029 (ch == '>' ? '<' : ch));
6032 num = add_data( pRExC_state, 1, "S" );
6033 RExC_rxi->data->data[num]=(void*)sv_dat;
6034 SvREFCNT_inc_simple_void(sv_dat);
6036 ret = reganode(pRExC_state,NGROUPP,num);
6037 goto insert_if_check_paren;
6039 else if (RExC_parse[0] == 'D' &&
6040 RExC_parse[1] == 'E' &&
6041 RExC_parse[2] == 'F' &&
6042 RExC_parse[3] == 'I' &&
6043 RExC_parse[4] == 'N' &&
6044 RExC_parse[5] == 'E')
6046 ret = reganode(pRExC_state,DEFINEP,0);
6049 goto insert_if_check_paren;
6051 else if (RExC_parse[0] == 'R') {
6054 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6055 parno = atoi(RExC_parse++);
6056 while (isDIGIT(*RExC_parse))
6058 } else if (RExC_parse[0] == '&') {
6061 sv_dat = reg_scan_name(pRExC_state,
6062 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6063 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6065 ret = reganode(pRExC_state,INSUBP,parno);
6066 goto insert_if_check_paren;
6068 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6071 parno = atoi(RExC_parse++);
6073 while (isDIGIT(*RExC_parse))
6075 ret = reganode(pRExC_state, GROUPP, parno);
6077 insert_if_check_paren:
6078 if ((c = *nextchar(pRExC_state)) != ')')
6079 vFAIL("Switch condition not recognized");
6081 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6082 br = regbranch(pRExC_state, &flags, 1,depth+1);
6084 br = reganode(pRExC_state, LONGJMP, 0);
6086 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6087 c = *nextchar(pRExC_state);
6092 vFAIL("(?(DEFINE)....) does not allow branches");
6093 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6094 regbranch(pRExC_state, &flags, 1,depth+1);
6095 REGTAIL(pRExC_state, ret, lastbr);
6098 c = *nextchar(pRExC_state);
6103 vFAIL("Switch (?(condition)... contains too many branches");
6104 ender = reg_node(pRExC_state, TAIL);
6105 REGTAIL(pRExC_state, br, ender);
6107 REGTAIL(pRExC_state, lastbr, ender);
6108 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6111 REGTAIL(pRExC_state, ret, ender);
6112 RExC_size++; /* XXX WHY do we need this?!!
6113 For large programs it seems to be required
6114 but I can't figure out why. -- dmq*/
6118 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6122 RExC_parse--; /* for vFAIL to print correctly */
6123 vFAIL("Sequence (? incomplete");
6127 parse_flags: /* (?i) */
6129 U32 posflags = 0, negflags = 0;
6130 U32 *flagsp = &posflags;
6132 while (*RExC_parse) {
6133 /* && strchr("iogcmsx", *RExC_parse) */
6134 /* (?g), (?gc) and (?o) are useless here
6135 and must be globally applied -- japhy */
6136 switch (*RExC_parse) {
6137 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6138 case ONCE_PAT_MOD: /* 'o' */
6139 case GLOBAL_PAT_MOD: /* 'g' */
6140 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6141 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6142 if (! (wastedflags & wflagbit) ) {
6143 wastedflags |= wflagbit;
6146 "Useless (%s%c) - %suse /%c modifier",
6147 flagsp == &negflags ? "?-" : "?",
6149 flagsp == &negflags ? "don't " : "",
6156 case CONTINUE_PAT_MOD: /* 'c' */
6157 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6158 if (! (wastedflags & WASTED_C) ) {
6159 wastedflags |= WASTED_GC;
6162 "Useless (%sc) - %suse /gc modifier",
6163 flagsp == &negflags ? "?-" : "?",
6164 flagsp == &negflags ? "don't " : ""
6169 case KEEPCOPY_PAT_MOD: /* 'p' */
6170 if (flagsp == &negflags) {
6172 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6174 *flagsp |= RXf_PMf_KEEPCOPY;
6178 if (flagsp == &negflags) {
6180 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6184 wastedflags = 0; /* reset so (?g-c) warns twice */
6190 RExC_flags |= posflags;
6191 RExC_flags &= ~negflags;
6193 oregflags |= posflags;
6194 oregflags &= ~negflags;
6196 nextchar(pRExC_state);
6207 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6212 }} /* one for the default block, one for the switch */
6219 ret = reganode(pRExC_state, OPEN, parno);
6222 RExC_nestroot = parno;
6223 if (RExC_seen & REG_SEEN_RECURSE
6224 && !RExC_open_parens[parno-1])
6226 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6227 "Setting open paren #%"IVdf" to %d\n",
6228 (IV)parno, REG_NODE_NUM(ret)));
6229 RExC_open_parens[parno-1]= ret;
6232 Set_Node_Length(ret, 1); /* MJD */
6233 Set_Node_Offset(ret, RExC_parse); /* MJD */
6241 /* Pick up the branches, linking them together. */
6242 parse_start = RExC_parse; /* MJD */
6243 br = regbranch(pRExC_state, &flags, 1,depth+1);
6246 if (RExC_npar > after_freeze)
6247 after_freeze = RExC_npar;
6248 RExC_npar = freeze_paren;
6251 /* branch_len = (paren != 0); */
6255 if (*RExC_parse == '|') {
6256 if (!SIZE_ONLY && RExC_extralen) {
6257 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6260 reginsert(pRExC_state, BRANCH, br, depth+1);
6261 Set_Node_Length(br, paren != 0);
6262 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6266 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6268 else if (paren == ':') {
6269 *flagp |= flags&SIMPLE;
6271 if (is_open) { /* Starts with OPEN. */
6272 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6274 else if (paren != '?') /* Not Conditional */
6276 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6278 while (*RExC_parse == '|') {
6279 if (!SIZE_ONLY && RExC_extralen) {
6280 ender = reganode(pRExC_state, LONGJMP,0);
6281 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6284 RExC_extralen += 2; /* Account for LONGJMP. */
6285 nextchar(pRExC_state);
6287 if (RExC_npar > after_freeze)
6288 after_freeze = RExC_npar;
6289 RExC_npar = freeze_paren;
6291 br = regbranch(pRExC_state, &flags, 0, depth+1);
6295 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6297 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6300 if (have_branch || paren != ':') {
6301 /* Make a closing node, and hook it on the end. */
6304 ender = reg_node(pRExC_state, TAIL);
6307 ender = reganode(pRExC_state, CLOSE, parno);
6308 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6309 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6310 "Setting close paren #%"IVdf" to %d\n",
6311 (IV)parno, REG_NODE_NUM(ender)));
6312 RExC_close_parens[parno-1]= ender;
6313 if (RExC_nestroot == parno)
6316 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6317 Set_Node_Length(ender,1); /* MJD */
6323 *flagp &= ~HASWIDTH;
6326 ender = reg_node(pRExC_state, SUCCEED);
6329 ender = reg_node(pRExC_state, END);
6331 assert(!RExC_opend); /* there can only be one! */
6336 REGTAIL(pRExC_state, lastbr, ender);
6338 if (have_branch && !SIZE_ONLY) {
6340 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6342 /* Hook the tails of the branches to the closing node. */
6343 for (br = ret; br; br = regnext(br)) {
6344 const U8 op = PL_regkind[OP(br)];
6346 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6348 else if (op == BRANCHJ) {
6349 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6357 static const char parens[] = "=!<,>";
6359 if (paren && (p = strchr(parens, paren))) {
6360 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6361 int flag = (p - parens) > 1;
6364 node = SUSPEND, flag = 0;
6365 reginsert(pRExC_state, node,ret, depth+1);
6366 Set_Node_Cur_Length(ret);
6367 Set_Node_Offset(ret, parse_start + 1);
6369 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6373 /* Check for proper termination. */
6375 RExC_flags = oregflags;
6376 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6377 RExC_parse = oregcomp_parse;
6378 vFAIL("Unmatched (");
6381 else if (!paren && RExC_parse < RExC_end) {
6382 if (*RExC_parse == ')') {
6384 vFAIL("Unmatched )");
6387 FAIL("Junk on end of regexp"); /* "Can't happen". */
6391 RExC_npar = after_freeze;
6396 - regbranch - one alternative of an | operator
6398 * Implements the concatenation operator.
6401 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6404 register regnode *ret;
6405 register regnode *chain = NULL;
6406 register regnode *latest;
6407 I32 flags = 0, c = 0;
6408 GET_RE_DEBUG_FLAGS_DECL;
6410 PERL_ARGS_ASSERT_REGBRANCH;
6412 DEBUG_PARSE("brnc");
6417 if (!SIZE_ONLY && RExC_extralen)
6418 ret = reganode(pRExC_state, BRANCHJ,0);
6420 ret = reg_node(pRExC_state, BRANCH);
6421 Set_Node_Length(ret, 1);
6425 if (!first && SIZE_ONLY)
6426 RExC_extralen += 1; /* BRANCHJ */
6428 *flagp = WORST; /* Tentatively. */
6431 nextchar(pRExC_state);
6432 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6434 latest = regpiece(pRExC_state, &flags,depth+1);
6435 if (latest == NULL) {
6436 if (flags & TRYAGAIN)
6440 else if (ret == NULL)
6442 *flagp |= flags&(HASWIDTH|POSTPONED);
6443 if (chain == NULL) /* First piece. */
6444 *flagp |= flags&SPSTART;
6447 REGTAIL(pRExC_state, chain, latest);
6452 if (chain == NULL) { /* Loop ran zero times. */
6453 chain = reg_node(pRExC_state, NOTHING);
6458 *flagp |= flags&SIMPLE;
6465 - regpiece - something followed by possible [*+?]
6467 * Note that the branching code sequences used for ? and the general cases
6468 * of * and + are somewhat optimized: they use the same NOTHING node as
6469 * both the endmarker for their branch list and the body of the last branch.
6470 * It might seem that this node could be dispensed with entirely, but the
6471 * endmarker role is not redundant.
6474 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6477 register regnode *ret;
6479 register char *next;
6481 const char * const origparse = RExC_parse;
6483 I32 max = REG_INFTY;
6485 const char *maxpos = NULL;
6486 GET_RE_DEBUG_FLAGS_DECL;
6488 PERL_ARGS_ASSERT_REGPIECE;
6490 DEBUG_PARSE("piec");
6492 ret = regatom(pRExC_state, &flags,depth+1);
6494 if (flags & TRYAGAIN)
6501 if (op == '{' && regcurly(RExC_parse)) {
6503 parse_start = RExC_parse; /* MJD */
6504 next = RExC_parse + 1;
6505 while (isDIGIT(*next) || *next == ',') {
6514 if (*next == '}') { /* got one */
6518 min = atoi(RExC_parse);
6522 maxpos = RExC_parse;
6524 if (!max && *maxpos != '0')
6525 max = REG_INFTY; /* meaning "infinity" */
6526 else if (max >= REG_INFTY)
6527 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6529 nextchar(pRExC_state);
6532 if ((flags&SIMPLE)) {
6533 RExC_naughty += 2 + RExC_naughty / 2;
6534 reginsert(pRExC_state, CURLY, ret, depth+1);
6535 Set_Node_Offset(ret, parse_start+1); /* MJD */
6536 Set_Node_Cur_Length(ret);
6539 regnode * const w = reg_node(pRExC_state, WHILEM);
6542 REGTAIL(pRExC_state, ret, w);
6543 if (!SIZE_ONLY && RExC_extralen) {
6544 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6545 reginsert(pRExC_state, NOTHING,ret, depth+1);
6546 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6548 reginsert(pRExC_state, CURLYX,ret, depth+1);
6550 Set_Node_Offset(ret, parse_start+1);
6551 Set_Node_Length(ret,
6552 op == '{' ? (RExC_parse - parse_start) : 1);
6554 if (!SIZE_ONLY && RExC_extralen)
6555 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6556 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6558 RExC_whilem_seen++, RExC_extralen += 3;
6559 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6568 vFAIL("Can't do {n,m} with n > m");
6570 ARG1_SET(ret, (U16)min);
6571 ARG2_SET(ret, (U16)max);
6583 #if 0 /* Now runtime fix should be reliable. */
6585 /* if this is reinstated, don't forget to put this back into perldiag:
6587 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6589 (F) The part of the regexp subject to either the * or + quantifier
6590 could match an empty string. The {#} shows in the regular
6591 expression about where the problem was discovered.
6595 if (!(flags&HASWIDTH) && op != '?')
6596 vFAIL("Regexp *+ operand could be empty");
6599 parse_start = RExC_parse;
6600 nextchar(pRExC_state);
6602 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6604 if (op == '*' && (flags&SIMPLE)) {
6605 reginsert(pRExC_state, STAR, ret, depth+1);
6609 else if (op == '*') {
6613 else if (op == '+' && (flags&SIMPLE)) {
6614 reginsert(pRExC_state, PLUS, ret, depth+1);
6618 else if (op == '+') {
6622 else if (op == '?') {
6627 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6628 ckWARN3reg(RExC_parse,
6629 "%.*s matches null string many times",
6630 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6634 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6635 nextchar(pRExC_state);
6636 reginsert(pRExC_state, MINMOD, ret, depth+1);
6637 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6639 #ifndef REG_ALLOW_MINMOD_SUSPEND
6642 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6644 nextchar(pRExC_state);
6645 ender = reg_node(pRExC_state, SUCCEED);
6646 REGTAIL(pRExC_state, ret, ender);
6647 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6649 ender = reg_node(pRExC_state, TAIL);
6650 REGTAIL(pRExC_state, ret, ender);
6654 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6656 vFAIL("Nested quantifiers");
6663 /* reg_namedseq(pRExC_state,UVp)
6665 This is expected to be called by a parser routine that has
6666 recognized '\N' and needs to handle the rest. RExC_parse is
6667 expected to point at the first char following the N at the time
6670 The \N may be inside (indicated by valuep not being NULL) or outside a
6673 \N may begin either a named sequence, or if outside a character class, mean
6674 to match a non-newline. For non single-quoted regexes, the tokenizer has
6675 attempted to decide which, and in the case of a named sequence converted it
6676 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6677 where c1... are the characters in the sequence. For single-quoted regexes,
6678 the tokenizer passes the \N sequence through unchanged; this code will not
6679 attempt to determine this nor expand those. The net effect is that if the
6680 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6681 signals that this \N occurrence means to match a non-newline.
6683 Only the \N{U+...} form should occur in a character class, for the same
6684 reason that '.' inside a character class means to just match a period: it
6685 just doesn't make sense.
6687 If valuep is non-null then it is assumed that we are parsing inside
6688 of a charclass definition and the first codepoint in the resolved
6689 string is returned via *valuep and the routine will return NULL.
6690 In this mode if a multichar string is returned from the charnames
6691 handler, a warning will be issued, and only the first char in the
6692 sequence will be examined. If the string returned is zero length
6693 then the value of *valuep is undefined and NON-NULL will
6694 be returned to indicate failure. (This will NOT be a valid pointer
6697 If valuep is null then it is assumed that we are parsing normal text and a
6698 new EXACT node is inserted into the program containing the resolved string,
6699 and a pointer to the new node is returned. But if the string is zero length
6700 a NOTHING node is emitted instead.
6702 On success RExC_parse is set to the char following the endbrace.
6703 Parsing failures will generate a fatal error via vFAIL(...)
6706 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6708 char * endbrace; /* '}' following the name */
6709 regnode *ret = NULL;
6711 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6715 GET_RE_DEBUG_FLAGS_DECL;
6717 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6721 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6722 * modifier. The other meaning does not */
6723 p = (RExC_flags & RXf_PMf_EXTENDED)
6724 ? regwhite( pRExC_state, RExC_parse )
6727 /* Disambiguate between \N meaning a named character versus \N meaning
6728 * [^\n]. The former is assumed when it can't be the latter. */
6729 if (*p != '{' || regcurly(p)) {
6732 /* no bare \N in a charclass */
6733 vFAIL("\\N in a character class must be a named character: \\N{...}");
6735 nextchar(pRExC_state);
6736 ret = reg_node(pRExC_state, REG_ANY);
6737 *flagp |= HASWIDTH|SIMPLE;
6740 Set_Node_Length(ret, 1); /* MJD */
6744 /* Here, we have decided it should be a named sequence */
6746 /* The test above made sure that the next real character is a '{', but
6747 * under the /x modifier, it could be separated by space (or a comment and
6748 * \n) and this is not allowed (for consistency with \x{...} and the
6749 * tokenizer handling of \N{NAME}). */
6750 if (*RExC_parse != '{') {
6751 vFAIL("Missing braces on \\N{}");
6754 RExC_parse++; /* Skip past the '{' */
6756 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6757 || ! (endbrace == RExC_parse /* nothing between the {} */
6758 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6759 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6761 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6762 vFAIL("\\N{NAME} must be resolved by the lexer");
6765 if (endbrace == RExC_parse) { /* empty: \N{} */
6767 RExC_parse = endbrace + 1;
6768 return reg_node(pRExC_state,NOTHING);
6772 ckWARNreg(RExC_parse,
6773 "Ignoring zero length \\N{} in character class"
6775 RExC_parse = endbrace + 1;
6778 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6781 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
6782 RExC_parse += 2; /* Skip past the 'U+' */
6784 if (valuep) { /* In a bracketed char class */
6785 /* We only pay attention to the first char of
6786 multichar strings being returned. I kinda wonder
6787 if this makes sense as it does change the behaviour
6788 from earlier versions, OTOH that behaviour was broken
6789 as well. XXX Solution is to recharacterize as
6790 [rest-of-class]|multi1|multi2... */
6792 STRLEN length_of_hex;
6793 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6794 | PERL_SCAN_DISALLOW_PREFIX
6795 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6797 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6798 if (endchar < endbrace) {
6799 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6802 length_of_hex = (STRLEN)(endchar - RExC_parse);
6803 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6805 /* The tokenizer should have guaranteed validity, but it's possible to
6806 * bypass it by using single quoting, so check */
6807 if (length_of_hex == 0
6808 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6810 RExC_parse += length_of_hex; /* Includes all the valid */
6811 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6812 ? UTF8SKIP(RExC_parse)
6814 /* Guard against malformed utf8 */
6815 if (RExC_parse >= endchar) RExC_parse = endchar;
6816 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6819 RExC_parse = endbrace + 1;
6820 if (endchar == endbrace) return NULL;
6822 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6824 else { /* Not a char class */
6825 char *s; /* String to put in generated EXACT node */
6826 STRLEN len = 0; /* Its current byte length */
6827 char *endchar; /* Points to '.' or '}' ending cur char in the input
6830 ret = reg_node(pRExC_state,
6831 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6834 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6835 * the input which is of the form now 'c1.c2.c3...}' until find the
6836 * ending brace or exceed length 255. The characters that exceed this
6837 * limit are dropped. The limit could be relaxed should it become
6838 * desirable by reparsing this as (?:\N{NAME}), so could generate
6839 * multiple EXACT nodes, as is done for just regular input. But this
6840 * is primarily a named character, and not intended to be a huge long
6841 * string, so 255 bytes should be good enough */
6843 STRLEN length_of_hex;
6844 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6845 | PERL_SCAN_DISALLOW_PREFIX
6846 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6847 UV cp; /* Ord of current character */
6849 /* Code points are separated by dots. If none, there is only one
6850 * code point, and is terminated by the brace */
6851 endchar = RExC_parse + strcspn(RExC_parse, ".}");
6853 /* The values are Unicode even on EBCDIC machines */
6854 length_of_hex = (STRLEN)(endchar - RExC_parse);
6855 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6856 if ( length_of_hex == 0
6857 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6859 RExC_parse += length_of_hex; /* Includes all the valid */
6860 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6861 ? UTF8SKIP(RExC_parse)
6863 /* Guard against malformed utf8 */
6864 if (RExC_parse >= endchar) RExC_parse = endchar;
6865 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6868 if (! FOLD) { /* Not folding, just append to the string */
6871 /* Quit before adding this character if would exceed limit */
6872 if (len + UNISKIP(cp) > U8_MAX) break;
6874 unilen = reguni(pRExC_state, cp, s);
6879 } else { /* Folding, output the folded equivalent */
6880 STRLEN foldlen,numlen;
6881 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6882 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6884 /* Quit before exceeding size limit */
6885 if (len + foldlen > U8_MAX) break;
6887 for (foldbuf = tmpbuf;
6891 cp = utf8_to_uvchr(foldbuf, &numlen);
6893 const STRLEN unilen = reguni(pRExC_state, cp, s);
6896 /* In EBCDIC the numlen and unilen can differ. */
6898 if (numlen >= foldlen)
6902 break; /* "Can't happen." */
6906 /* Point to the beginning of the next character in the sequence. */
6907 RExC_parse = endchar + 1;
6909 /* Quit if no more characters */
6910 if (RExC_parse >= endbrace) break;
6915 if (RExC_parse < endbrace) {
6916 ckWARNreg(RExC_parse - 1,
6917 "Using just the first characters returned by \\N{}");
6920 RExC_size += STR_SZ(len);
6923 RExC_emit += STR_SZ(len);
6926 RExC_parse = endbrace + 1;
6928 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6929 with malformed in t/re/pat_advanced.t */
6931 Set_Node_Cur_Length(ret); /* MJD */
6932 nextchar(pRExC_state);
6942 * It returns the code point in utf8 for the value in *encp.
6943 * value: a code value in the source encoding
6944 * encp: a pointer to an Encode object
6946 * If the result from Encode is not a single character,
6947 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6950 S_reg_recode(pTHX_ const char value, SV **encp)
6953 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6954 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6955 const STRLEN newlen = SvCUR(sv);
6956 UV uv = UNICODE_REPLACEMENT;
6958 PERL_ARGS_ASSERT_REG_RECODE;
6962 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6965 if (!newlen || numlen != newlen) {
6966 uv = UNICODE_REPLACEMENT;
6974 - regatom - the lowest level
6976 Try to identify anything special at the start of the pattern. If there
6977 is, then handle it as required. This may involve generating a single regop,
6978 such as for an assertion; or it may involve recursing, such as to
6979 handle a () structure.
6981 If the string doesn't start with something special then we gobble up
6982 as much literal text as we can.
6984 Once we have been able to handle whatever type of thing started the
6985 sequence, we return.
6987 Note: we have to be careful with escapes, as they can be both literal
6988 and special, and in the case of \10 and friends can either, depending
6989 on context. Specifically there are two seperate switches for handling
6990 escape sequences, with the one for handling literal escapes requiring
6991 a dummy entry for all of the special escapes that are actually handled
6996 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6999 register regnode *ret = NULL;
7001 char *parse_start = RExC_parse;
7002 GET_RE_DEBUG_FLAGS_DECL;
7003 DEBUG_PARSE("atom");
7004 *flagp = WORST; /* Tentatively. */
7006 PERL_ARGS_ASSERT_REGATOM;
7009 switch ((U8)*RExC_parse) {
7011 RExC_seen_zerolen++;
7012 nextchar(pRExC_state);
7013 if (RExC_flags & RXf_PMf_MULTILINE)
7014 ret = reg_node(pRExC_state, MBOL);
7015 else if (RExC_flags & RXf_PMf_SINGLELINE)
7016 ret = reg_node(pRExC_state, SBOL);
7018 ret = reg_node(pRExC_state, BOL);
7019 Set_Node_Length(ret, 1); /* MJD */
7022 nextchar(pRExC_state);
7024 RExC_seen_zerolen++;
7025 if (RExC_flags & RXf_PMf_MULTILINE)
7026 ret = reg_node(pRExC_state, MEOL);
7027 else if (RExC_flags & RXf_PMf_SINGLELINE)
7028 ret = reg_node(pRExC_state, SEOL);
7030 ret = reg_node(pRExC_state, EOL);
7031 Set_Node_Length(ret, 1); /* MJD */
7034 nextchar(pRExC_state);
7035 if (RExC_flags & RXf_PMf_SINGLELINE)
7036 ret = reg_node(pRExC_state, SANY);
7038 ret = reg_node(pRExC_state, REG_ANY);
7039 *flagp |= HASWIDTH|SIMPLE;
7041 Set_Node_Length(ret, 1); /* MJD */
7045 char * const oregcomp_parse = ++RExC_parse;
7046 ret = regclass(pRExC_state,depth+1);
7047 if (*RExC_parse != ']') {
7048 RExC_parse = oregcomp_parse;
7049 vFAIL("Unmatched [");
7051 nextchar(pRExC_state);
7052 *flagp |= HASWIDTH|SIMPLE;
7053 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7057 nextchar(pRExC_state);
7058 ret = reg(pRExC_state, 1, &flags,depth+1);
7060 if (flags & TRYAGAIN) {
7061 if (RExC_parse == RExC_end) {
7062 /* Make parent create an empty node if needed. */
7070 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7074 if (flags & TRYAGAIN) {
7078 vFAIL("Internal urp");
7079 /* Supposed to be caught earlier. */
7082 if (!regcurly(RExC_parse)) {
7091 vFAIL("Quantifier follows nothing");
7099 len=0; /* silence a spurious compiler warning */
7100 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7101 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7102 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7103 ret = reganode(pRExC_state, FOLDCHAR, cp);
7104 Set_Node_Length(ret, 1); /* MJD */
7105 nextchar(pRExC_state); /* kill whitespace under /x */
7113 This switch handles escape sequences that resolve to some kind
7114 of special regop and not to literal text. Escape sequnces that
7115 resolve to literal text are handled below in the switch marked
7118 Every entry in this switch *must* have a corresponding entry
7119 in the literal escape switch. However, the opposite is not
7120 required, as the default for this switch is to jump to the
7121 literal text handling code.
7123 switch ((U8)*++RExC_parse) {
7128 /* Special Escapes */
7130 RExC_seen_zerolen++;
7131 ret = reg_node(pRExC_state, SBOL);
7133 goto finish_meta_pat;
7135 ret = reg_node(pRExC_state, GPOS);
7136 RExC_seen |= REG_SEEN_GPOS;
7138 goto finish_meta_pat;
7140 RExC_seen_zerolen++;
7141 ret = reg_node(pRExC_state, KEEPS);
7143 /* XXX:dmq : disabling in-place substitution seems to
7144 * be necessary here to avoid cases of memory corruption, as
7145 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7147 RExC_seen |= REG_SEEN_LOOKBEHIND;
7148 goto finish_meta_pat;
7150 ret = reg_node(pRExC_state, SEOL);
7152 RExC_seen_zerolen++; /* Do not optimize RE away */
7153 goto finish_meta_pat;
7155 ret = reg_node(pRExC_state, EOS);
7157 RExC_seen_zerolen++; /* Do not optimize RE away */
7158 goto finish_meta_pat;
7160 ret = reg_node(pRExC_state, CANY);
7161 RExC_seen |= REG_SEEN_CANY;
7162 *flagp |= HASWIDTH|SIMPLE;
7163 goto finish_meta_pat;
7165 ret = reg_node(pRExC_state, CLUMP);
7167 goto finish_meta_pat;
7169 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7170 *flagp |= HASWIDTH|SIMPLE;
7171 goto finish_meta_pat;
7173 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7174 *flagp |= HASWIDTH|SIMPLE;
7175 goto finish_meta_pat;
7177 RExC_seen_zerolen++;
7178 RExC_seen |= REG_SEEN_LOOKBEHIND;
7179 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7181 goto finish_meta_pat;
7183 RExC_seen_zerolen++;
7184 RExC_seen |= REG_SEEN_LOOKBEHIND;
7185 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7187 goto finish_meta_pat;
7189 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7190 *flagp |= HASWIDTH|SIMPLE;
7191 goto finish_meta_pat;
7193 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7194 *flagp |= HASWIDTH|SIMPLE;
7195 goto finish_meta_pat;
7197 ret = reg_node(pRExC_state, DIGIT);
7198 *flagp |= HASWIDTH|SIMPLE;
7199 goto finish_meta_pat;
7201 ret = reg_node(pRExC_state, NDIGIT);
7202 *flagp |= HASWIDTH|SIMPLE;
7203 goto finish_meta_pat;
7205 ret = reg_node(pRExC_state, LNBREAK);
7206 *flagp |= HASWIDTH|SIMPLE;
7207 goto finish_meta_pat;
7209 ret = reg_node(pRExC_state, HORIZWS);
7210 *flagp |= HASWIDTH|SIMPLE;
7211 goto finish_meta_pat;
7213 ret = reg_node(pRExC_state, NHORIZWS);
7214 *flagp |= HASWIDTH|SIMPLE;
7215 goto finish_meta_pat;
7217 ret = reg_node(pRExC_state, VERTWS);
7218 *flagp |= HASWIDTH|SIMPLE;
7219 goto finish_meta_pat;
7221 ret = reg_node(pRExC_state, NVERTWS);
7222 *flagp |= HASWIDTH|SIMPLE;
7224 nextchar(pRExC_state);
7225 Set_Node_Length(ret, 2); /* MJD */
7230 char* const oldregxend = RExC_end;
7232 char* parse_start = RExC_parse - 2;
7235 if (RExC_parse[1] == '{') {
7236 /* a lovely hack--pretend we saw [\pX] instead */
7237 RExC_end = strchr(RExC_parse, '}');
7239 const U8 c = (U8)*RExC_parse;
7241 RExC_end = oldregxend;
7242 vFAIL2("Missing right brace on \\%c{}", c);
7247 RExC_end = RExC_parse + 2;
7248 if (RExC_end > oldregxend)
7249 RExC_end = oldregxend;
7253 ret = regclass(pRExC_state,depth+1);
7255 RExC_end = oldregxend;
7258 Set_Node_Offset(ret, parse_start + 2);
7259 Set_Node_Cur_Length(ret);
7260 nextchar(pRExC_state);
7261 *flagp |= HASWIDTH|SIMPLE;
7265 /* Handle \N and \N{NAME} here and not below because it can be
7266 multicharacter. join_exact() will join them up later on.
7267 Also this makes sure that things like /\N{BLAH}+/ and
7268 \N{BLAH} being multi char Just Happen. dmq*/
7270 ret= reg_namedseq(pRExC_state, NULL, flagp);
7272 case 'k': /* Handle \k<NAME> and \k'NAME' */
7275 char ch= RExC_parse[1];
7276 if (ch != '<' && ch != '\'' && ch != '{') {
7278 vFAIL2("Sequence %.2s... not terminated",parse_start);
7280 /* this pretty much dupes the code for (?P=...) in reg(), if
7281 you change this make sure you change that */
7282 char* name_start = (RExC_parse += 2);
7284 SV *sv_dat = reg_scan_name(pRExC_state,
7285 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7286 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7287 if (RExC_parse == name_start || *RExC_parse != ch)
7288 vFAIL2("Sequence %.3s... not terminated",parse_start);
7291 num = add_data( pRExC_state, 1, "S" );
7292 RExC_rxi->data->data[num]=(void*)sv_dat;
7293 SvREFCNT_inc_simple_void(sv_dat);
7297 ret = reganode(pRExC_state,
7298 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7302 /* override incorrect value set in reganode MJD */
7303 Set_Node_Offset(ret, parse_start+1);
7304 Set_Node_Cur_Length(ret); /* MJD */
7305 nextchar(pRExC_state);
7311 case '1': case '2': case '3': case '4':
7312 case '5': case '6': case '7': case '8': case '9':
7315 bool isg = *RExC_parse == 'g';
7320 if (*RExC_parse == '{') {
7324 if (*RExC_parse == '-') {
7328 if (hasbrace && !isDIGIT(*RExC_parse)) {
7329 if (isrel) RExC_parse--;
7331 goto parse_named_seq;
7333 num = atoi(RExC_parse);
7334 if (isg && num == 0)
7335 vFAIL("Reference to invalid group 0");
7337 num = RExC_npar - num;
7339 vFAIL("Reference to nonexistent or unclosed group");
7341 if (!isg && num > 9 && num >= RExC_npar)
7344 char * const parse_start = RExC_parse - 1; /* MJD */
7345 while (isDIGIT(*RExC_parse))
7347 if (parse_start == RExC_parse - 1)
7348 vFAIL("Unterminated \\g... pattern");
7350 if (*RExC_parse != '}')
7351 vFAIL("Unterminated \\g{...} pattern");
7355 if (num > (I32)RExC_rx->nparens)
7356 vFAIL("Reference to nonexistent group");
7359 ret = reganode(pRExC_state,
7360 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7364 /* override incorrect value set in reganode MJD */
7365 Set_Node_Offset(ret, parse_start+1);
7366 Set_Node_Cur_Length(ret); /* MJD */
7368 nextchar(pRExC_state);
7373 if (RExC_parse >= RExC_end)
7374 FAIL("Trailing \\");
7377 /* Do not generate "unrecognized" warnings here, we fall
7378 back into the quick-grab loop below */
7385 if (RExC_flags & RXf_PMf_EXTENDED) {
7386 if ( reg_skipcomment( pRExC_state ) )
7393 register STRLEN len;
7398 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7400 parse_start = RExC_parse - 1;
7406 ret = reg_node(pRExC_state,
7407 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7409 for (len = 0, p = RExC_parse - 1;
7410 len < 127 && p < RExC_end;
7413 char * const oldp = p;
7415 if (RExC_flags & RXf_PMf_EXTENDED)
7416 p = regwhite( pRExC_state, p );
7421 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7422 goto normal_default;
7432 /* Literal Escapes Switch
7434 This switch is meant to handle escape sequences that
7435 resolve to a literal character.
7437 Every escape sequence that represents something
7438 else, like an assertion or a char class, is handled
7439 in the switch marked 'Special Escapes' above in this
7440 routine, but also has an entry here as anything that
7441 isn't explicitly mentioned here will be treated as
7442 an unescaped equivalent literal.
7446 /* These are all the special escapes. */
7450 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7451 goto normal_default;
7452 case 'A': /* Start assertion */
7453 case 'b': case 'B': /* Word-boundary assertion*/
7454 case 'C': /* Single char !DANGEROUS! */
7455 case 'd': case 'D': /* digit class */
7456 case 'g': case 'G': /* generic-backref, pos assertion */
7457 case 'h': case 'H': /* HORIZWS */
7458 case 'k': case 'K': /* named backref, keep marker */
7459 case 'N': /* named char sequence */
7460 case 'p': case 'P': /* Unicode property */
7461 case 'R': /* LNBREAK */
7462 case 's': case 'S': /* space class */
7463 case 'v': case 'V': /* VERTWS */
7464 case 'w': case 'W': /* word class */
7465 case 'X': /* eXtended Unicode "combining character sequence" */
7466 case 'z': case 'Z': /* End of line/string assertion */
7470 /* Anything after here is an escape that resolves to a
7471 literal. (Except digits, which may or may not)
7490 ender = ASCII_TO_NATIVE('\033');
7494 ender = ASCII_TO_NATIVE('\007');
7499 STRLEN brace_len = len;
7501 const char* error_msg;
7503 bool valid = grok_bslash_o(p,
7510 RExC_parse = p; /* going to die anyway; point
7511 to exact spot of failure */
7518 if (PL_encoding && ender < 0x100) {
7519 goto recode_encoding;
7528 char* const e = strchr(p, '}');
7532 vFAIL("Missing right brace on \\x{}");
7535 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7536 | PERL_SCAN_DISALLOW_PREFIX;
7537 STRLEN numlen = e - p - 1;
7538 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7545 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7547 ender = grok_hex(p, &numlen, &flags, NULL);
7550 if (PL_encoding && ender < 0x100)
7551 goto recode_encoding;
7555 ender = grok_bslash_c(*p++, SIZE_ONLY);
7557 case '0': case '1': case '2': case '3':case '4':
7558 case '5': case '6': case '7': case '8':case '9':
7560 (isOCTAL(p[1]) && atoi(p) >= RExC_npar))
7562 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7564 ender = grok_oct(p, &numlen, &flags, NULL);
7574 if (PL_encoding && ender < 0x100)
7575 goto recode_encoding;
7579 SV* enc = PL_encoding;
7580 ender = reg_recode((const char)(U8)ender, &enc);
7581 if (!enc && SIZE_ONLY)
7582 ckWARNreg(p, "Invalid escape in the specified encoding");
7588 FAIL("Trailing \\");
7591 if (!SIZE_ONLY&& isALPHA(*p))
7592 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7593 goto normal_default;
7598 if (UTF8_IS_START(*p) && UTF) {
7600 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7601 &numlen, UTF8_ALLOW_DEFAULT);
7608 if ( RExC_flags & RXf_PMf_EXTENDED)
7609 p = regwhite( pRExC_state, p );
7611 /* Prime the casefolded buffer. */
7612 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7614 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7619 /* Emit all the Unicode characters. */
7621 for (foldbuf = tmpbuf;
7623 foldlen -= numlen) {
7624 ender = utf8_to_uvchr(foldbuf, &numlen);
7626 const STRLEN unilen = reguni(pRExC_state, ender, s);
7629 /* In EBCDIC the numlen
7630 * and unilen can differ. */
7632 if (numlen >= foldlen)
7636 break; /* "Can't happen." */
7640 const STRLEN unilen = reguni(pRExC_state, ender, s);
7649 REGC((char)ender, s++);
7655 /* Emit all the Unicode characters. */
7657 for (foldbuf = tmpbuf;
7659 foldlen -= numlen) {
7660 ender = utf8_to_uvchr(foldbuf, &numlen);
7662 const STRLEN unilen = reguni(pRExC_state, ender, s);
7665 /* In EBCDIC the numlen
7666 * and unilen can differ. */
7668 if (numlen >= foldlen)
7676 const STRLEN unilen = reguni(pRExC_state, ender, s);
7685 REGC((char)ender, s++);
7689 Set_Node_Cur_Length(ret); /* MJD */
7690 nextchar(pRExC_state);
7692 /* len is STRLEN which is unsigned, need to copy to signed */
7695 vFAIL("Internal disaster");
7699 if (len == 1 && UNI_IS_INVARIANT(ender))
7703 RExC_size += STR_SZ(len);
7706 RExC_emit += STR_SZ(len);
7716 S_regwhite( RExC_state_t *pRExC_state, char *p )
7718 const char *e = RExC_end;
7720 PERL_ARGS_ASSERT_REGWHITE;
7725 else if (*p == '#') {
7734 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7742 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7743 Character classes ([:foo:]) can also be negated ([:^foo:]).
7744 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7745 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7746 but trigger failures because they are currently unimplemented. */
7748 #define POSIXCC_DONE(c) ((c) == ':')
7749 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7750 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7753 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7756 I32 namedclass = OOB_NAMEDCLASS;
7758 PERL_ARGS_ASSERT_REGPPOSIXCC;
7760 if (value == '[' && RExC_parse + 1 < RExC_end &&
7761 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7762 POSIXCC(UCHARAT(RExC_parse))) {
7763 const char c = UCHARAT(RExC_parse);
7764 char* const s = RExC_parse++;
7766 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7768 if (RExC_parse == RExC_end)
7769 /* Grandfather lone [:, [=, [. */
7772 const char* const t = RExC_parse++; /* skip over the c */
7775 if (UCHARAT(RExC_parse) == ']') {
7776 const char *posixcc = s + 1;
7777 RExC_parse++; /* skip over the ending ] */
7780 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7781 const I32 skip = t - posixcc;
7783 /* Initially switch on the length of the name. */
7786 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7787 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7790 /* Names all of length 5. */
7791 /* alnum alpha ascii blank cntrl digit graph lower
7792 print punct space upper */
7793 /* Offset 4 gives the best switch position. */
7794 switch (posixcc[4]) {
7796 if (memEQ(posixcc, "alph", 4)) /* alpha */
7797 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7800 if (memEQ(posixcc, "spac", 4)) /* space */
7801 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7804 if (memEQ(posixcc, "grap", 4)) /* graph */
7805 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7808 if (memEQ(posixcc, "asci", 4)) /* ascii */
7809 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7812 if (memEQ(posixcc, "blan", 4)) /* blank */
7813 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7816 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7817 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7820 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7821 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7824 if (memEQ(posixcc, "lowe", 4)) /* lower */
7825 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7826 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7827 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7830 if (memEQ(posixcc, "digi", 4)) /* digit */
7831 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7832 else if (memEQ(posixcc, "prin", 4)) /* print */
7833 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7834 else if (memEQ(posixcc, "punc", 4)) /* punct */
7835 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7840 if (memEQ(posixcc, "xdigit", 6))
7841 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7845 if (namedclass == OOB_NAMEDCLASS)
7846 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7848 assert (posixcc[skip] == ':');
7849 assert (posixcc[skip+1] == ']');
7850 } else if (!SIZE_ONLY) {
7851 /* [[=foo=]] and [[.foo.]] are still future. */
7853 /* adjust RExC_parse so the warning shows after
7855 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7857 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7860 /* Maternal grandfather:
7861 * "[:" ending in ":" but not in ":]" */
7871 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7875 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7877 if (POSIXCC(UCHARAT(RExC_parse))) {
7878 const char *s = RExC_parse;
7879 const char c = *s++;
7883 if (*s && c == *s && s[1] == ']') {
7885 "POSIX syntax [%c %c] belongs inside character classes",
7888 /* [[=foo=]] and [[.foo.]] are still future. */
7889 if (POSIXCC_NOTYET(c)) {
7890 /* adjust RExC_parse so the error shows after
7892 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7894 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7901 #define _C_C_T_(NAME,TEST,WORD) \
7904 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7906 for (value = 0; value < 256; value++) \
7908 ANYOF_BITMAP_SET(ret, value); \
7913 case ANYOF_N##NAME: \
7915 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7917 for (value = 0; value < 256; value++) \
7919 ANYOF_BITMAP_SET(ret, value); \
7925 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7927 for (value = 0; value < 256; value++) \
7929 ANYOF_BITMAP_SET(ret, value); \
7933 case ANYOF_N##NAME: \
7934 for (value = 0; value < 256; value++) \
7936 ANYOF_BITMAP_SET(ret, value); \
7942 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7943 so that it is possible to override the option here without having to
7944 rebuild the entire core. as we are required to do if we change regcomp.h
7945 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7947 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7948 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7951 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7952 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7954 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7958 parse a class specification and produce either an ANYOF node that
7959 matches the pattern or if the pattern matches a single char only and
7960 that char is < 256 and we are case insensitive then we produce an
7965 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7968 register UV nextvalue;
7969 register IV prevvalue = OOB_UNICODE;
7970 register IV range = 0;
7971 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7972 register regnode *ret;
7975 char *rangebegin = NULL;
7976 bool need_class = 0;
7979 bool optimize_invert = TRUE;
7980 AV* unicode_alternate = NULL;
7982 UV literal_endpoint = 0;
7984 UV stored = 0; /* number of chars stored in the class */
7986 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7987 case we need to change the emitted regop to an EXACT. */
7988 const char * orig_parse = RExC_parse;
7989 GET_RE_DEBUG_FLAGS_DECL;
7991 PERL_ARGS_ASSERT_REGCLASS;
7993 PERL_UNUSED_ARG(depth);
7996 DEBUG_PARSE("clas");
7998 /* Assume we are going to generate an ANYOF node. */
7999 ret = reganode(pRExC_state, ANYOF, 0);
8002 ANYOF_FLAGS(ret) = 0;
8004 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
8008 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8012 RExC_size += ANYOF_SKIP;
8013 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8016 RExC_emit += ANYOF_SKIP;
8018 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8020 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8021 ANYOF_BITMAP_ZERO(ret);
8022 listsv = newSVpvs("# comment\n");
8025 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8027 if (!SIZE_ONLY && POSIXCC(nextvalue))
8028 checkposixcc(pRExC_state);
8030 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8031 if (UCHARAT(RExC_parse) == ']')
8035 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8039 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8042 rangebegin = RExC_parse;
8044 value = utf8n_to_uvchr((U8*)RExC_parse,
8045 RExC_end - RExC_parse,
8046 &numlen, UTF8_ALLOW_DEFAULT);
8047 RExC_parse += numlen;
8050 value = UCHARAT(RExC_parse++);
8052 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8053 if (value == '[' && POSIXCC(nextvalue))
8054 namedclass = regpposixcc(pRExC_state, value);
8055 else if (value == '\\') {
8057 value = utf8n_to_uvchr((U8*)RExC_parse,
8058 RExC_end - RExC_parse,
8059 &numlen, UTF8_ALLOW_DEFAULT);
8060 RExC_parse += numlen;
8063 value = UCHARAT(RExC_parse++);
8064 /* Some compilers cannot handle switching on 64-bit integer
8065 * values, therefore value cannot be an UV. Yes, this will
8066 * be a problem later if we want switch on Unicode.
8067 * A similar issue a little bit later when switching on
8068 * namedclass. --jhi */
8069 switch ((I32)value) {
8070 case 'w': namedclass = ANYOF_ALNUM; break;
8071 case 'W': namedclass = ANYOF_NALNUM; break;
8072 case 's': namedclass = ANYOF_SPACE; break;
8073 case 'S': namedclass = ANYOF_NSPACE; break;
8074 case 'd': namedclass = ANYOF_DIGIT; break;
8075 case 'D': namedclass = ANYOF_NDIGIT; break;
8076 case 'v': namedclass = ANYOF_VERTWS; break;
8077 case 'V': namedclass = ANYOF_NVERTWS; break;
8078 case 'h': namedclass = ANYOF_HORIZWS; break;
8079 case 'H': namedclass = ANYOF_NHORIZWS; break;
8080 case 'N': /* Handle \N{NAME} in class */
8082 /* We only pay attention to the first char of
8083 multichar strings being returned. I kinda wonder
8084 if this makes sense as it does change the behaviour
8085 from earlier versions, OTOH that behaviour was broken
8087 UV v; /* value is register so we cant & it /grrr */
8088 if (reg_namedseq(pRExC_state, &v, NULL)) {
8098 if (RExC_parse >= RExC_end)
8099 vFAIL2("Empty \\%c{}", (U8)value);
8100 if (*RExC_parse == '{') {
8101 const U8 c = (U8)value;
8102 e = strchr(RExC_parse++, '}');
8104 vFAIL2("Missing right brace on \\%c{}", c);
8105 while (isSPACE(UCHARAT(RExC_parse)))
8107 if (e == RExC_parse)
8108 vFAIL2("Empty \\%c{}", c);
8110 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8118 if (UCHARAT(RExC_parse) == '^') {
8121 value = value == 'p' ? 'P' : 'p'; /* toggle */
8122 while (isSPACE(UCHARAT(RExC_parse))) {
8127 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8128 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8131 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8132 namedclass = ANYOF_MAX; /* no official name, but it's named */
8135 case 'n': value = '\n'; break;
8136 case 'r': value = '\r'; break;
8137 case 't': value = '\t'; break;
8138 case 'f': value = '\f'; break;
8139 case 'b': value = '\b'; break;
8140 case 'e': value = ASCII_TO_NATIVE('\033');break;
8141 case 'a': value = ASCII_TO_NATIVE('\007');break;
8143 RExC_parse--; /* function expects to be pointed at the 'o' */
8145 const char* error_msg;
8146 bool valid = grok_bslash_o(RExC_parse,
8151 RExC_parse += numlen;
8156 if (PL_encoding && value < 0x100) {
8157 goto recode_encoding;
8161 if (*RExC_parse == '{') {
8162 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8163 | PERL_SCAN_DISALLOW_PREFIX;
8164 char * const e = strchr(RExC_parse++, '}');
8166 vFAIL("Missing right brace on \\x{}");
8168 numlen = e - RExC_parse;
8169 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8173 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8175 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8176 RExC_parse += numlen;
8178 if (PL_encoding && value < 0x100)
8179 goto recode_encoding;
8182 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8184 case '0': case '1': case '2': case '3': case '4':
8185 case '5': case '6': case '7':
8187 /* Take 1-3 octal digits */
8188 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8190 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8191 RExC_parse += numlen;
8192 if (PL_encoding && value < 0x100)
8193 goto recode_encoding;
8198 SV* enc = PL_encoding;
8199 value = reg_recode((const char)(U8)value, &enc);
8200 if (!enc && SIZE_ONLY)
8201 ckWARNreg(RExC_parse,
8202 "Invalid escape in the specified encoding");
8206 /* Allow \_ to not give an error */
8207 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8208 ckWARN2reg(RExC_parse,
8209 "Unrecognized escape \\%c in character class passed through",
8214 } /* end of \blah */
8220 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8222 if (!SIZE_ONLY && !need_class)
8223 ANYOF_CLASS_ZERO(ret);
8227 /* a bad range like a-\d, a-[:digit:] ? */
8231 RExC_parse >= rangebegin ?
8232 RExC_parse - rangebegin : 0;
8233 ckWARN4reg(RExC_parse,
8234 "False [] range \"%*.*s\"",
8237 if (prevvalue < 256) {
8238 ANYOF_BITMAP_SET(ret, prevvalue);
8239 ANYOF_BITMAP_SET(ret, '-');
8242 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8243 Perl_sv_catpvf(aTHX_ listsv,
8244 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8248 range = 0; /* this was not a true range */
8254 const char *what = NULL;
8257 if (namedclass > OOB_NAMEDCLASS)
8258 optimize_invert = FALSE;
8259 /* Possible truncation here but in some 64-bit environments
8260 * the compiler gets heartburn about switch on 64-bit values.
8261 * A similar issue a little earlier when switching on value.
8263 switch ((I32)namedclass) {
8265 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8266 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8267 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8268 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8269 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8270 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8271 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8272 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8273 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8274 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8275 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8276 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8277 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8279 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8280 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8282 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8283 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8284 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8287 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8290 for (value = 0; value < 128; value++)
8291 ANYOF_BITMAP_SET(ret, value);
8293 for (value = 0; value < 256; value++) {
8295 ANYOF_BITMAP_SET(ret, value);
8304 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8307 for (value = 128; value < 256; value++)
8308 ANYOF_BITMAP_SET(ret, value);
8310 for (value = 0; value < 256; value++) {
8311 if (!isASCII(value))
8312 ANYOF_BITMAP_SET(ret, value);
8321 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8323 /* consecutive digits assumed */
8324 for (value = '0'; value <= '9'; value++)
8325 ANYOF_BITMAP_SET(ret, value);
8328 what = POSIX_CC_UNI_NAME("Digit");
8332 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8334 /* consecutive digits assumed */
8335 for (value = 0; value < '0'; value++)
8336 ANYOF_BITMAP_SET(ret, value);
8337 for (value = '9' + 1; value < 256; value++)
8338 ANYOF_BITMAP_SET(ret, value);
8341 what = POSIX_CC_UNI_NAME("Digit");
8344 /* this is to handle \p and \P */
8347 vFAIL("Invalid [::] class");
8351 /* Strings such as "+utf8::isWord\n" */
8352 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8355 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8358 } /* end of namedclass \blah */
8361 if (prevvalue > (IV)value) /* b-a */ {
8362 const int w = RExC_parse - rangebegin;
8363 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8364 range = 0; /* not a valid range */
8368 prevvalue = value; /* save the beginning of the range */
8369 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8370 RExC_parse[1] != ']') {
8373 /* a bad range like \w-, [:word:]- ? */
8374 if (namedclass > OOB_NAMEDCLASS) {
8375 if (ckWARN(WARN_REGEXP)) {
8377 RExC_parse >= rangebegin ?
8378 RExC_parse - rangebegin : 0;
8380 "False [] range \"%*.*s\"",
8384 ANYOF_BITMAP_SET(ret, '-');
8386 range = 1; /* yeah, it's a range! */
8387 continue; /* but do it the next time */
8391 /* now is the next time */
8392 /*stored += (value - prevvalue + 1);*/
8394 if (prevvalue < 256) {
8395 const IV ceilvalue = value < 256 ? value : 255;
8398 /* In EBCDIC [\x89-\x91] should include
8399 * the \x8e but [i-j] should not. */
8400 if (literal_endpoint == 2 &&
8401 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8402 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8404 if (isLOWER(prevvalue)) {
8405 for (i = prevvalue; i <= ceilvalue; i++)
8406 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8408 ANYOF_BITMAP_SET(ret, i);
8411 for (i = prevvalue; i <= ceilvalue; i++)
8412 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8414 ANYOF_BITMAP_SET(ret, i);
8420 for (i = prevvalue; i <= ceilvalue; i++) {
8421 if (!ANYOF_BITMAP_TEST(ret,i)) {
8423 ANYOF_BITMAP_SET(ret, i);
8427 if (value > 255 || UTF) {
8428 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8429 const UV natvalue = NATIVE_TO_UNI(value);
8430 stored+=2; /* can't optimize this class */
8431 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8432 if (prevnatvalue < natvalue) { /* what about > ? */
8433 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8434 prevnatvalue, natvalue);
8436 else if (prevnatvalue == natvalue) {
8437 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8439 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8441 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8443 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8444 if (RExC_precomp[0] == ':' &&
8445 RExC_precomp[1] == '[' &&
8446 (f == 0xDF || f == 0x92)) {
8447 f = NATIVE_TO_UNI(f);
8450 /* If folding and foldable and a single
8451 * character, insert also the folded version
8452 * to the charclass. */
8454 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8455 if ((RExC_precomp[0] == ':' &&
8456 RExC_precomp[1] == '[' &&
8458 (value == 0xFB05 || value == 0xFB06))) ?
8459 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8460 foldlen == (STRLEN)UNISKIP(f) )
8462 if (foldlen == (STRLEN)UNISKIP(f))
8464 Perl_sv_catpvf(aTHX_ listsv,
8467 /* Any multicharacter foldings
8468 * require the following transform:
8469 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8470 * where E folds into "pq" and F folds
8471 * into "rst", all other characters
8472 * fold to single characters. We save
8473 * away these multicharacter foldings,
8474 * to be later saved as part of the
8475 * additional "s" data. */
8478 if (!unicode_alternate)
8479 unicode_alternate = newAV();
8480 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8482 av_push(unicode_alternate, sv);
8486 /* If folding and the value is one of the Greek
8487 * sigmas insert a few more sigmas to make the
8488 * folding rules of the sigmas to work right.
8489 * Note that not all the possible combinations
8490 * are handled here: some of them are handled
8491 * by the standard folding rules, and some of
8492 * them (literal or EXACTF cases) are handled
8493 * during runtime in regexec.c:S_find_byclass(). */
8494 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8495 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8496 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8497 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8498 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8500 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8501 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8502 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8507 literal_endpoint = 0;
8511 range = 0; /* this range (if it was one) is done now */
8515 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8517 RExC_size += ANYOF_CLASS_ADD_SKIP;
8519 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8525 /****** !SIZE_ONLY AFTER HERE *********/
8527 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8528 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8530 /* optimize single char class to an EXACT node
8531 but *only* when its not a UTF/high char */
8532 const char * cur_parse= RExC_parse;
8533 RExC_emit = (regnode *)orig_emit;
8534 RExC_parse = (char *)orig_parse;
8535 ret = reg_node(pRExC_state,
8536 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8537 RExC_parse = (char *)cur_parse;
8538 *STRING(ret)= (char)value;
8540 RExC_emit += STR_SZ(1);
8541 SvREFCNT_dec(listsv);
8544 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8545 if ( /* If the only flag is folding (plus possibly inversion). */
8546 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8548 for (value = 0; value < 256; ++value) {
8549 if (ANYOF_BITMAP_TEST(ret, value)) {
8550 UV fold = PL_fold[value];
8553 ANYOF_BITMAP_SET(ret, fold);
8556 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8559 /* optimize inverted simple patterns (e.g. [^a-z]) */
8560 if (optimize_invert &&
8561 /* If the only flag is inversion. */
8562 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8563 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8564 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8565 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8568 AV * const av = newAV();
8570 /* The 0th element stores the character class description
8571 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8572 * to initialize the appropriate swash (which gets stored in
8573 * the 1st element), and also useful for dumping the regnode.
8574 * The 2nd element stores the multicharacter foldings,
8575 * used later (regexec.c:S_reginclass()). */
8576 av_store(av, 0, listsv);
8577 av_store(av, 1, NULL);
8578 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8579 rv = newRV_noinc(MUTABLE_SV(av));
8580 n = add_data(pRExC_state, 1, "s");
8581 RExC_rxi->data->data[n] = (void*)rv;
8589 /* reg_skipcomment()
8591 Absorbs an /x style # comments from the input stream.
8592 Returns true if there is more text remaining in the stream.
8593 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8594 terminates the pattern without including a newline.
8596 Note its the callers responsibility to ensure that we are
8602 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8606 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8608 while (RExC_parse < RExC_end)
8609 if (*RExC_parse++ == '\n') {
8614 /* we ran off the end of the pattern without ending
8615 the comment, so we have to add an \n when wrapping */
8616 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8624 Advance that parse position, and optionall absorbs
8625 "whitespace" from the inputstream.
8627 Without /x "whitespace" means (?#...) style comments only,
8628 with /x this means (?#...) and # comments and whitespace proper.
8630 Returns the RExC_parse point from BEFORE the scan occurs.
8632 This is the /x friendly way of saying RExC_parse++.
8636 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8638 char* const retval = RExC_parse++;
8640 PERL_ARGS_ASSERT_NEXTCHAR;
8643 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8644 RExC_parse[2] == '#') {
8645 while (*RExC_parse != ')') {
8646 if (RExC_parse == RExC_end)
8647 FAIL("Sequence (?#... not terminated");
8653 if (RExC_flags & RXf_PMf_EXTENDED) {
8654 if (isSPACE(*RExC_parse)) {
8658 else if (*RExC_parse == '#') {
8659 if ( reg_skipcomment( pRExC_state ) )
8668 - reg_node - emit a node
8670 STATIC regnode * /* Location. */
8671 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8674 register regnode *ptr;
8675 regnode * const ret = RExC_emit;
8676 GET_RE_DEBUG_FLAGS_DECL;
8678 PERL_ARGS_ASSERT_REG_NODE;
8681 SIZE_ALIGN(RExC_size);
8685 if (RExC_emit >= RExC_emit_bound)
8686 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8688 NODE_ALIGN_FILL(ret);
8690 FILL_ADVANCE_NODE(ptr, op);
8691 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8692 #ifdef RE_TRACK_PATTERN_OFFSETS
8693 if (RExC_offsets) { /* MJD */
8694 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8695 "reg_node", __LINE__,
8697 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8698 ? "Overwriting end of array!\n" : "OK",
8699 (UV)(RExC_emit - RExC_emit_start),
8700 (UV)(RExC_parse - RExC_start),
8701 (UV)RExC_offsets[0]));
8702 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8710 - reganode - emit a node with an argument
8712 STATIC regnode * /* Location. */
8713 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8716 register regnode *ptr;
8717 regnode * const ret = RExC_emit;
8718 GET_RE_DEBUG_FLAGS_DECL;
8720 PERL_ARGS_ASSERT_REGANODE;
8723 SIZE_ALIGN(RExC_size);
8728 assert(2==regarglen[op]+1);
8730 Anything larger than this has to allocate the extra amount.
8731 If we changed this to be:
8733 RExC_size += (1 + regarglen[op]);
8735 then it wouldn't matter. Its not clear what side effect
8736 might come from that so its not done so far.
8741 if (RExC_emit >= RExC_emit_bound)
8742 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8744 NODE_ALIGN_FILL(ret);
8746 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8747 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8748 #ifdef RE_TRACK_PATTERN_OFFSETS
8749 if (RExC_offsets) { /* MJD */
8750 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8754 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8755 "Overwriting end of array!\n" : "OK",
8756 (UV)(RExC_emit - RExC_emit_start),
8757 (UV)(RExC_parse - RExC_start),
8758 (UV)RExC_offsets[0]));
8759 Set_Cur_Node_Offset;
8767 - reguni - emit (if appropriate) a Unicode character
8770 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8774 PERL_ARGS_ASSERT_REGUNI;
8776 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8780 - reginsert - insert an operator in front of already-emitted operand
8782 * Means relocating the operand.
8785 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8788 register regnode *src;
8789 register regnode *dst;
8790 register regnode *place;
8791 const int offset = regarglen[(U8)op];
8792 const int size = NODE_STEP_REGNODE + offset;
8793 GET_RE_DEBUG_FLAGS_DECL;
8795 PERL_ARGS_ASSERT_REGINSERT;
8796 PERL_UNUSED_ARG(depth);
8797 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8798 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8807 if (RExC_open_parens) {
8809 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8810 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8811 if ( RExC_open_parens[paren] >= opnd ) {
8812 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8813 RExC_open_parens[paren] += size;
8815 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8817 if ( RExC_close_parens[paren] >= opnd ) {
8818 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8819 RExC_close_parens[paren] += size;
8821 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8826 while (src > opnd) {
8827 StructCopy(--src, --dst, regnode);
8828 #ifdef RE_TRACK_PATTERN_OFFSETS
8829 if (RExC_offsets) { /* MJD 20010112 */
8830 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8834 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8835 ? "Overwriting end of array!\n" : "OK",
8836 (UV)(src - RExC_emit_start),
8837 (UV)(dst - RExC_emit_start),
8838 (UV)RExC_offsets[0]));
8839 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8840 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8846 place = opnd; /* Op node, where operand used to be. */
8847 #ifdef RE_TRACK_PATTERN_OFFSETS
8848 if (RExC_offsets) { /* MJD */
8849 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8853 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8854 ? "Overwriting end of array!\n" : "OK",
8855 (UV)(place - RExC_emit_start),
8856 (UV)(RExC_parse - RExC_start),
8857 (UV)RExC_offsets[0]));
8858 Set_Node_Offset(place, RExC_parse);
8859 Set_Node_Length(place, 1);
8862 src = NEXTOPER(place);
8863 FILL_ADVANCE_NODE(place, op);
8864 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8865 Zero(src, offset, regnode);
8869 - regtail - set the next-pointer at the end of a node chain of p to val.
8870 - SEE ALSO: regtail_study
8872 /* TODO: All three parms should be const */
8874 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8877 register regnode *scan;
8878 GET_RE_DEBUG_FLAGS_DECL;
8880 PERL_ARGS_ASSERT_REGTAIL;
8882 PERL_UNUSED_ARG(depth);
8888 /* Find last node. */
8891 regnode * const temp = regnext(scan);
8893 SV * const mysv=sv_newmortal();
8894 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8895 regprop(RExC_rx, mysv, scan);
8896 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8897 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8898 (temp == NULL ? "->" : ""),
8899 (temp == NULL ? PL_reg_name[OP(val)] : "")
8907 if (reg_off_by_arg[OP(scan)]) {
8908 ARG_SET(scan, val - scan);
8911 NEXT_OFF(scan) = val - scan;
8917 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8918 - Look for optimizable sequences at the same time.
8919 - currently only looks for EXACT chains.
8921 This is expermental code. The idea is to use this routine to perform
8922 in place optimizations on branches and groups as they are constructed,
8923 with the long term intention of removing optimization from study_chunk so
8924 that it is purely analytical.
8926 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8927 to control which is which.
8930 /* TODO: All four parms should be const */
8933 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8936 register regnode *scan;
8938 #ifdef EXPERIMENTAL_INPLACESCAN
8941 GET_RE_DEBUG_FLAGS_DECL;
8943 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8949 /* Find last node. */
8953 regnode * const temp = regnext(scan);
8954 #ifdef EXPERIMENTAL_INPLACESCAN
8955 if (PL_regkind[OP(scan)] == EXACT)
8956 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8964 if( exact == PSEUDO )
8966 else if ( exact != OP(scan) )
8975 SV * const mysv=sv_newmortal();
8976 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8977 regprop(RExC_rx, mysv, scan);
8978 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8979 SvPV_nolen_const(mysv),
8981 PL_reg_name[exact]);
8988 SV * const mysv_val=sv_newmortal();
8989 DEBUG_PARSE_MSG("");
8990 regprop(RExC_rx, mysv_val, val);
8991 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8992 SvPV_nolen_const(mysv_val),
8993 (IV)REG_NODE_NUM(val),
8997 if (reg_off_by_arg[OP(scan)]) {
8998 ARG_SET(scan, val - scan);
9001 NEXT_OFF(scan) = val - scan;
9009 - regcurly - a little FSA that accepts {\d+,?\d*}
9011 #ifndef PERL_IN_XSUB_RE
9013 Perl_regcurly(register const char *s)
9015 PERL_ARGS_ASSERT_REGCURLY;
9034 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9038 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9043 for (bit=0; bit<32; bit++) {
9044 if (flags & (1<<bit)) {
9046 PerlIO_printf(Perl_debug_log, "%s",lead);
9047 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9052 PerlIO_printf(Perl_debug_log, "\n");
9054 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9060 Perl_regdump(pTHX_ const regexp *r)
9064 SV * const sv = sv_newmortal();
9065 SV *dsv= sv_newmortal();
9067 GET_RE_DEBUG_FLAGS_DECL;
9069 PERL_ARGS_ASSERT_REGDUMP;
9071 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9073 /* Header fields of interest. */
9074 if (r->anchored_substr) {
9075 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9076 RE_SV_DUMPLEN(r->anchored_substr), 30);
9077 PerlIO_printf(Perl_debug_log,
9078 "anchored %s%s at %"IVdf" ",
9079 s, RE_SV_TAIL(r->anchored_substr),
9080 (IV)r->anchored_offset);
9081 } else if (r->anchored_utf8) {
9082 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9083 RE_SV_DUMPLEN(r->anchored_utf8), 30);
9084 PerlIO_printf(Perl_debug_log,
9085 "anchored utf8 %s%s at %"IVdf" ",
9086 s, RE_SV_TAIL(r->anchored_utf8),
9087 (IV)r->anchored_offset);
9089 if (r->float_substr) {
9090 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9091 RE_SV_DUMPLEN(r->float_substr), 30);
9092 PerlIO_printf(Perl_debug_log,
9093 "floating %s%s at %"IVdf"..%"UVuf" ",
9094 s, RE_SV_TAIL(r->float_substr),
9095 (IV)r->float_min_offset, (UV)r->float_max_offset);
9096 } else if (r->float_utf8) {
9097 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9098 RE_SV_DUMPLEN(r->float_utf8), 30);
9099 PerlIO_printf(Perl_debug_log,
9100 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9101 s, RE_SV_TAIL(r->float_utf8),
9102 (IV)r->float_min_offset, (UV)r->float_max_offset);
9104 if (r->check_substr || r->check_utf8)
9105 PerlIO_printf(Perl_debug_log,
9107 (r->check_substr == r->float_substr
9108 && r->check_utf8 == r->float_utf8
9109 ? "(checking floating" : "(checking anchored"));
9110 if (r->extflags & RXf_NOSCAN)
9111 PerlIO_printf(Perl_debug_log, " noscan");
9112 if (r->extflags & RXf_CHECK_ALL)
9113 PerlIO_printf(Perl_debug_log, " isall");
9114 if (r->check_substr || r->check_utf8)
9115 PerlIO_printf(Perl_debug_log, ") ");
9117 if (ri->regstclass) {
9118 regprop(r, sv, ri->regstclass);
9119 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9121 if (r->extflags & RXf_ANCH) {
9122 PerlIO_printf(Perl_debug_log, "anchored");
9123 if (r->extflags & RXf_ANCH_BOL)
9124 PerlIO_printf(Perl_debug_log, "(BOL)");
9125 if (r->extflags & RXf_ANCH_MBOL)
9126 PerlIO_printf(Perl_debug_log, "(MBOL)");
9127 if (r->extflags & RXf_ANCH_SBOL)
9128 PerlIO_printf(Perl_debug_log, "(SBOL)");
9129 if (r->extflags & RXf_ANCH_GPOS)
9130 PerlIO_printf(Perl_debug_log, "(GPOS)");
9131 PerlIO_putc(Perl_debug_log, ' ');
9133 if (r->extflags & RXf_GPOS_SEEN)
9134 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9135 if (r->intflags & PREGf_SKIP)
9136 PerlIO_printf(Perl_debug_log, "plus ");
9137 if (r->intflags & PREGf_IMPLICIT)
9138 PerlIO_printf(Perl_debug_log, "implicit ");
9139 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9140 if (r->extflags & RXf_EVAL_SEEN)
9141 PerlIO_printf(Perl_debug_log, "with eval ");
9142 PerlIO_printf(Perl_debug_log, "\n");
9143 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9145 PERL_ARGS_ASSERT_REGDUMP;
9146 PERL_UNUSED_CONTEXT;
9148 #endif /* DEBUGGING */
9152 - regprop - printable representation of opcode
9154 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9157 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9158 if (flags & ANYOF_INVERT) \
9159 /*make sure the invert info is in each */ \
9160 sv_catpvs(sv, "^"); \
9166 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9171 RXi_GET_DECL(prog,progi);
9172 GET_RE_DEBUG_FLAGS_DECL;
9174 PERL_ARGS_ASSERT_REGPROP;
9178 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9179 /* It would be nice to FAIL() here, but this may be called from
9180 regexec.c, and it would be hard to supply pRExC_state. */
9181 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9182 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9184 k = PL_regkind[OP(o)];
9188 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9189 * is a crude hack but it may be the best for now since
9190 * we have no flag "this EXACTish node was UTF-8"
9192 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9193 PERL_PV_ESCAPE_UNI_DETECT |
9194 PERL_PV_PRETTY_ELLIPSES |
9195 PERL_PV_PRETTY_LTGT |
9196 PERL_PV_PRETTY_NOCLEAR
9198 } else if (k == TRIE) {
9199 /* print the details of the trie in dumpuntil instead, as
9200 * progi->data isn't available here */
9201 const char op = OP(o);
9202 const U32 n = ARG(o);
9203 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9204 (reg_ac_data *)progi->data->data[n] :
9206 const reg_trie_data * const trie
9207 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9209 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9210 DEBUG_TRIE_COMPILE_r(
9211 Perl_sv_catpvf(aTHX_ sv,
9212 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9213 (UV)trie->startstate,
9214 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9215 (UV)trie->wordcount,
9218 (UV)TRIE_CHARCOUNT(trie),
9219 (UV)trie->uniquecharcount
9222 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9224 int rangestart = -1;
9225 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9227 for (i = 0; i <= 256; i++) {
9228 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9229 if (rangestart == -1)
9231 } else if (rangestart != -1) {
9232 if (i <= rangestart + 3)
9233 for (; rangestart < i; rangestart++)
9234 put_byte(sv, rangestart);
9236 put_byte(sv, rangestart);
9238 put_byte(sv, i - 1);
9246 } else if (k == CURLY) {
9247 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9248 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9249 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9251 else if (k == WHILEM && o->flags) /* Ordinal/of */
9252 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9253 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9254 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9255 if ( RXp_PAREN_NAMES(prog) ) {
9256 if ( k != REF || OP(o) < NREF) {
9257 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9258 SV **name= av_fetch(list, ARG(o), 0 );
9260 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9263 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9264 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9265 I32 *nums=(I32*)SvPVX(sv_dat);
9266 SV **name= av_fetch(list, nums[0], 0 );
9269 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9270 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9271 (n ? "," : ""), (IV)nums[n]);
9273 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9277 } else if (k == GOSUB)
9278 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9279 else if (k == VERB) {
9281 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9282 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9283 } else if (k == LOGICAL)
9284 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9285 else if (k == FOLDCHAR)
9286 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9287 else if (k == ANYOF) {
9288 int i, rangestart = -1;
9289 const U8 flags = ANYOF_FLAGS(o);
9292 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9293 static const char * const anyofs[] = {
9326 if (flags & ANYOF_LOCALE)
9327 sv_catpvs(sv, "{loc}");
9328 if (flags & ANYOF_FOLD)
9329 sv_catpvs(sv, "{i}");
9330 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9331 if (flags & ANYOF_INVERT)
9334 /* output what the standard cp 0-255 bitmap matches */
9335 for (i = 0; i <= 256; i++) {
9336 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9337 if (rangestart == -1)
9339 } else if (rangestart != -1) {
9340 if (i <= rangestart + 3)
9341 for (; rangestart < i; rangestart++)
9342 put_byte(sv, rangestart);
9344 put_byte(sv, rangestart);
9346 put_byte(sv, i - 1);
9353 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9354 /* output any special charclass tests (used mostly under use locale) */
9355 if (o->flags & ANYOF_CLASS)
9356 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9357 if (ANYOF_CLASS_TEST(o,i)) {
9358 sv_catpv(sv, anyofs[i]);
9362 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9364 /* output information about the unicode matching */
9365 if (flags & ANYOF_UNICODE)
9366 sv_catpvs(sv, "{unicode}");
9367 else if (flags & ANYOF_UNICODE_ALL)
9368 sv_catpvs(sv, "{unicode_all}");
9372 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9376 U8 s[UTF8_MAXBYTES_CASE+1];
9378 for (i = 0; i <= 256; i++) { /* just the first 256 */
9379 uvchr_to_utf8(s, i);
9381 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9382 if (rangestart == -1)
9384 } else if (rangestart != -1) {
9385 if (i <= rangestart + 3)
9386 for (; rangestart < i; rangestart++) {
9387 const U8 * const e = uvchr_to_utf8(s,rangestart);
9389 for(p = s; p < e; p++)
9393 const U8 *e = uvchr_to_utf8(s,rangestart);
9395 for (p = s; p < e; p++)
9398 e = uvchr_to_utf8(s, i-1);
9399 for (p = s; p < e; p++)
9406 sv_catpvs(sv, "..."); /* et cetera */
9410 char *s = savesvpv(lv);
9411 char * const origs = s;
9413 while (*s && *s != '\n')
9417 const char * const t = ++s;
9435 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9437 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9438 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9440 PERL_UNUSED_CONTEXT;
9441 PERL_UNUSED_ARG(sv);
9443 PERL_UNUSED_ARG(prog);
9444 #endif /* DEBUGGING */
9448 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9449 { /* Assume that RE_INTUIT is set */
9451 struct regexp *const prog = (struct regexp *)SvANY(r);
9452 GET_RE_DEBUG_FLAGS_DECL;
9454 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9455 PERL_UNUSED_CONTEXT;
9459 const char * const s = SvPV_nolen_const(prog->check_substr
9460 ? prog->check_substr : prog->check_utf8);
9462 if (!PL_colorset) reginitcolors();
9463 PerlIO_printf(Perl_debug_log,
9464 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9466 prog->check_substr ? "" : "utf8 ",
9467 PL_colors[5],PL_colors[0],
9470 (strlen(s) > 60 ? "..." : ""));
9473 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9479 handles refcounting and freeing the perl core regexp structure. When
9480 it is necessary to actually free the structure the first thing it
9481 does is call the 'free' method of the regexp_engine associated to to
9482 the regexp, allowing the handling of the void *pprivate; member
9483 first. (This routine is not overridable by extensions, which is why
9484 the extensions free is called first.)
9486 See regdupe and regdupe_internal if you change anything here.
9488 #ifndef PERL_IN_XSUB_RE
9490 Perl_pregfree(pTHX_ REGEXP *r)
9496 Perl_pregfree2(pTHX_ REGEXP *rx)
9499 struct regexp *const r = (struct regexp *)SvANY(rx);
9500 GET_RE_DEBUG_FLAGS_DECL;
9502 PERL_ARGS_ASSERT_PREGFREE2;
9505 ReREFCNT_dec(r->mother_re);
9507 CALLREGFREE_PVT(rx); /* free the private data */
9508 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9511 SvREFCNT_dec(r->anchored_substr);
9512 SvREFCNT_dec(r->anchored_utf8);
9513 SvREFCNT_dec(r->float_substr);
9514 SvREFCNT_dec(r->float_utf8);
9515 Safefree(r->substrs);
9517 RX_MATCH_COPY_FREE(rx);
9518 #ifdef PERL_OLD_COPY_ON_WRITE
9519 SvREFCNT_dec(r->saved_copy);
9526 This is a hacky workaround to the structural issue of match results
9527 being stored in the regexp structure which is in turn stored in
9528 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9529 could be PL_curpm in multiple contexts, and could require multiple
9530 result sets being associated with the pattern simultaneously, such
9531 as when doing a recursive match with (??{$qr})
9533 The solution is to make a lightweight copy of the regexp structure
9534 when a qr// is returned from the code executed by (??{$qr}) this
9535 lightweight copy doesnt actually own any of its data except for
9536 the starp/end and the actual regexp structure itself.
9542 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9545 struct regexp *const r = (struct regexp *)SvANY(rx);
9546 register const I32 npar = r->nparens+1;
9548 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9551 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9552 ret = (struct regexp *)SvANY(ret_x);
9554 (void)ReREFCNT_inc(rx);
9555 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9556 by pointing directly at the buffer, but flagging that the allocated
9557 space in the copy is zero. As we've just done a struct copy, it's now
9558 a case of zero-ing that, rather than copying the current length. */
9559 SvPV_set(ret_x, RX_WRAPPED(rx));
9560 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9561 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9562 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9563 SvLEN_set(ret_x, 0);
9564 SvSTASH_set(ret_x, NULL);
9565 SvMAGIC_set(ret_x, NULL);
9566 Newx(ret->offs, npar, regexp_paren_pair);
9567 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9569 Newx(ret->substrs, 1, struct reg_substr_data);
9570 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9572 SvREFCNT_inc_void(ret->anchored_substr);
9573 SvREFCNT_inc_void(ret->anchored_utf8);
9574 SvREFCNT_inc_void(ret->float_substr);
9575 SvREFCNT_inc_void(ret->float_utf8);
9577 /* check_substr and check_utf8, if non-NULL, point to either their
9578 anchored or float namesakes, and don't hold a second reference. */
9580 RX_MATCH_COPIED_off(ret_x);
9581 #ifdef PERL_OLD_COPY_ON_WRITE
9582 ret->saved_copy = NULL;
9584 ret->mother_re = rx;
9590 /* regfree_internal()
9592 Free the private data in a regexp. This is overloadable by
9593 extensions. Perl takes care of the regexp structure in pregfree(),
9594 this covers the *pprivate pointer which technically perldoesnt
9595 know about, however of course we have to handle the
9596 regexp_internal structure when no extension is in use.
9598 Note this is called before freeing anything in the regexp
9603 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9606 struct regexp *const r = (struct regexp *)SvANY(rx);
9608 GET_RE_DEBUG_FLAGS_DECL;
9610 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9616 SV *dsv= sv_newmortal();
9617 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9618 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9619 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9620 PL_colors[4],PL_colors[5],s);
9623 #ifdef RE_TRACK_PATTERN_OFFSETS
9625 Safefree(ri->u.offsets); /* 20010421 MJD */
9628 int n = ri->data->count;
9629 PAD* new_comppad = NULL;
9634 /* If you add a ->what type here, update the comment in regcomp.h */
9635 switch (ri->data->what[n]) {
9640 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9643 Safefree(ri->data->data[n]);
9646 new_comppad = MUTABLE_AV(ri->data->data[n]);
9649 if (new_comppad == NULL)
9650 Perl_croak(aTHX_ "panic: pregfree comppad");
9651 PAD_SAVE_LOCAL(old_comppad,
9652 /* Watch out for global destruction's random ordering. */
9653 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9656 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9659 op_free((OP_4tree*)ri->data->data[n]);
9661 PAD_RESTORE_LOCAL(old_comppad);
9662 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9668 { /* Aho Corasick add-on structure for a trie node.
9669 Used in stclass optimization only */
9671 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9673 refcount = --aho->refcount;
9676 PerlMemShared_free(aho->states);
9677 PerlMemShared_free(aho->fail);
9678 /* do this last!!!! */
9679 PerlMemShared_free(ri->data->data[n]);
9680 PerlMemShared_free(ri->regstclass);
9686 /* trie structure. */
9688 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9690 refcount = --trie->refcount;
9693 PerlMemShared_free(trie->charmap);
9694 PerlMemShared_free(trie->states);
9695 PerlMemShared_free(trie->trans);
9697 PerlMemShared_free(trie->bitmap);
9699 PerlMemShared_free(trie->jump);
9700 PerlMemShared_free(trie->wordinfo);
9701 /* do this last!!!! */
9702 PerlMemShared_free(ri->data->data[n]);
9707 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9710 Safefree(ri->data->what);
9717 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9718 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9719 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9722 re_dup - duplicate a regexp.
9724 This routine is expected to clone a given regexp structure. It is only
9725 compiled under USE_ITHREADS.
9727 After all of the core data stored in struct regexp is duplicated
9728 the regexp_engine.dupe method is used to copy any private data
9729 stored in the *pprivate pointer. This allows extensions to handle
9730 any duplication it needs to do.
9732 See pregfree() and regfree_internal() if you change anything here.
9734 #if defined(USE_ITHREADS)
9735 #ifndef PERL_IN_XSUB_RE
9737 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9741 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9742 struct regexp *ret = (struct regexp *)SvANY(dstr);
9744 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9746 npar = r->nparens+1;
9747 Newx(ret->offs, npar, regexp_paren_pair);
9748 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9750 /* no need to copy these */
9751 Newx(ret->swap, npar, regexp_paren_pair);
9755 /* Do it this way to avoid reading from *r after the StructCopy().
9756 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9757 cache, it doesn't matter. */
9758 const bool anchored = r->check_substr
9759 ? r->check_substr == r->anchored_substr
9760 : r->check_utf8 == r->anchored_utf8;
9761 Newx(ret->substrs, 1, struct reg_substr_data);
9762 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9764 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9765 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9766 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9767 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9769 /* check_substr and check_utf8, if non-NULL, point to either their
9770 anchored or float namesakes, and don't hold a second reference. */
9772 if (ret->check_substr) {
9774 assert(r->check_utf8 == r->anchored_utf8);
9775 ret->check_substr = ret->anchored_substr;
9776 ret->check_utf8 = ret->anchored_utf8;
9778 assert(r->check_substr == r->float_substr);
9779 assert(r->check_utf8 == r->float_utf8);
9780 ret->check_substr = ret->float_substr;
9781 ret->check_utf8 = ret->float_utf8;
9783 } else if (ret->check_utf8) {
9785 ret->check_utf8 = ret->anchored_utf8;
9787 ret->check_utf8 = ret->float_utf8;
9792 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9795 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9797 if (RX_MATCH_COPIED(dstr))
9798 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9801 #ifdef PERL_OLD_COPY_ON_WRITE
9802 ret->saved_copy = NULL;
9805 if (ret->mother_re) {
9806 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9807 /* Our storage points directly to our mother regexp, but that's
9808 1: a buffer in a different thread
9809 2: something we no longer hold a reference on
9810 so we need to copy it locally. */
9811 /* Note we need to sue SvCUR() on our mother_re, because it, in
9812 turn, may well be pointing to its own mother_re. */
9813 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9814 SvCUR(ret->mother_re)+1));
9815 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9817 ret->mother_re = NULL;
9821 #endif /* PERL_IN_XSUB_RE */
9826 This is the internal complement to regdupe() which is used to copy
9827 the structure pointed to by the *pprivate pointer in the regexp.
9828 This is the core version of the extension overridable cloning hook.
9829 The regexp structure being duplicated will be copied by perl prior
9830 to this and will be provided as the regexp *r argument, however
9831 with the /old/ structures pprivate pointer value. Thus this routine
9832 may override any copying normally done by perl.
9834 It returns a pointer to the new regexp_internal structure.
9838 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9841 struct regexp *const r = (struct regexp *)SvANY(rx);
9842 regexp_internal *reti;
9846 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9848 npar = r->nparens+1;
9851 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9852 Copy(ri->program, reti->program, len+1, regnode);
9855 reti->regstclass = NULL;
9859 const int count = ri->data->count;
9862 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9863 char, struct reg_data);
9864 Newx(d->what, count, U8);
9867 for (i = 0; i < count; i++) {
9868 d->what[i] = ri->data->what[i];
9869 switch (d->what[i]) {
9870 /* legal options are one of: sSfpontTua
9871 see also regcomp.h and pregfree() */
9872 case 'a': /* actually an AV, but the dup function is identical. */
9875 case 'p': /* actually an AV, but the dup function is identical. */
9876 case 'u': /* actually an HV, but the dup function is identical. */
9877 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9880 /* This is cheating. */
9881 Newx(d->data[i], 1, struct regnode_charclass_class);
9882 StructCopy(ri->data->data[i], d->data[i],
9883 struct regnode_charclass_class);
9884 reti->regstclass = (regnode*)d->data[i];
9887 /* Compiled op trees are readonly and in shared memory,
9888 and can thus be shared without duplication. */
9890 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9894 /* Trie stclasses are readonly and can thus be shared
9895 * without duplication. We free the stclass in pregfree
9896 * when the corresponding reg_ac_data struct is freed.
9898 reti->regstclass= ri->regstclass;
9902 ((reg_trie_data*)ri->data->data[i])->refcount++;
9906 d->data[i] = ri->data->data[i];
9909 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9918 reti->name_list_idx = ri->name_list_idx;
9920 #ifdef RE_TRACK_PATTERN_OFFSETS
9921 if (ri->u.offsets) {
9922 Newx(reti->u.offsets, 2*len+1, U32);
9923 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9926 SetProgLen(reti,len);
9932 #endif /* USE_ITHREADS */
9934 #ifndef PERL_IN_XSUB_RE
9937 - regnext - dig the "next" pointer out of a node
9940 Perl_regnext(pTHX_ register regnode *p)
9943 register I32 offset;
9948 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
9949 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
9952 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9961 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9964 STRLEN l1 = strlen(pat1);
9965 STRLEN l2 = strlen(pat2);
9968 const char *message;
9970 PERL_ARGS_ASSERT_RE_CROAK2;
9976 Copy(pat1, buf, l1 , char);
9977 Copy(pat2, buf + l1, l2 , char);
9978 buf[l1 + l2] = '\n';
9979 buf[l1 + l2 + 1] = '\0';
9981 /* ANSI variant takes additional second argument */
9982 va_start(args, pat2);
9986 msv = vmess(buf, &args);
9988 message = SvPV_const(msv,l1);
9991 Copy(message, buf, l1 , char);
9992 buf[l1-1] = '\0'; /* Overwrite \n */
9993 Perl_croak(aTHX_ "%s", buf);
9996 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9998 #ifndef PERL_IN_XSUB_RE
10000 Perl_save_re_context(pTHX)
10004 struct re_save_state *state;
10006 SAVEVPTR(PL_curcop);
10007 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10009 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10010 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10011 SSPUSHUV(SAVEt_RE_STATE);
10013 Copy(&PL_reg_state, state, 1, struct re_save_state);
10015 PL_reg_start_tmp = 0;
10016 PL_reg_start_tmpl = 0;
10017 PL_reg_oldsaved = NULL;
10018 PL_reg_oldsavedlen = 0;
10019 PL_reg_maxiter = 0;
10020 PL_reg_leftiter = 0;
10021 PL_reg_poscache = NULL;
10022 PL_reg_poscache_size = 0;
10023 #ifdef PERL_OLD_COPY_ON_WRITE
10027 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10029 const REGEXP * const rx = PM_GETRE(PL_curpm);
10032 for (i = 1; i <= RX_NPARENS(rx); i++) {
10033 char digits[TYPE_CHARS(long)];
10034 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10035 GV *const *const gvp
10036 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10039 GV * const gv = *gvp;
10040 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10050 clear_re(pTHX_ void *r)
10053 ReREFCNT_dec((REGEXP *)r);
10059 S_put_byte(pTHX_ SV *sv, int c)
10061 PERL_ARGS_ASSERT_PUT_BYTE;
10063 /* Our definition of isPRINT() ignores locales, so only bytes that are
10064 not part of UTF-8 are considered printable. I assume that the same
10065 holds for UTF-EBCDIC.
10066 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10067 which Wikipedia says:
10069 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10070 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10071 identical, to the ASCII delete (DEL) or rubout control character.
10072 ) So the old condition can be simplified to !isPRINT(c) */
10074 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10076 const char string = c;
10077 if (c == '-' || c == ']' || c == '\\' || c == '^')
10078 sv_catpvs(sv, "\\");
10079 sv_catpvn(sv, &string, 1);
10084 #define CLEAR_OPTSTART \
10085 if (optstart) STMT_START { \
10086 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10090 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10092 STATIC const regnode *
10093 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10094 const regnode *last, const regnode *plast,
10095 SV* sv, I32 indent, U32 depth)
10098 register U8 op = PSEUDO; /* Arbitrary non-END op. */
10099 register const regnode *next;
10100 const regnode *optstart= NULL;
10102 RXi_GET_DECL(r,ri);
10103 GET_RE_DEBUG_FLAGS_DECL;
10105 PERL_ARGS_ASSERT_DUMPUNTIL;
10107 #ifdef DEBUG_DUMPUNTIL
10108 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10109 last ? last-start : 0,plast ? plast-start : 0);
10112 if (plast && plast < last)
10115 while (PL_regkind[op] != END && (!last || node < last)) {
10116 /* While that wasn't END last time... */
10119 if (op == CLOSE || op == WHILEM)
10121 next = regnext((regnode *)node);
10124 if (OP(node) == OPTIMIZED) {
10125 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10132 regprop(r, sv, node);
10133 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10134 (int)(2*indent + 1), "", SvPVX_const(sv));
10136 if (OP(node) != OPTIMIZED) {
10137 if (next == NULL) /* Next ptr. */
10138 PerlIO_printf(Perl_debug_log, " (0)");
10139 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10140 PerlIO_printf(Perl_debug_log, " (FAIL)");
10142 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10143 (void)PerlIO_putc(Perl_debug_log, '\n');
10147 if (PL_regkind[(U8)op] == BRANCHJ) {
10150 register const regnode *nnode = (OP(next) == LONGJMP
10151 ? regnext((regnode *)next)
10153 if (last && nnode > last)
10155 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10158 else if (PL_regkind[(U8)op] == BRANCH) {
10160 DUMPUNTIL(NEXTOPER(node), next);
10162 else if ( PL_regkind[(U8)op] == TRIE ) {
10163 const regnode *this_trie = node;
10164 const char op = OP(node);
10165 const U32 n = ARG(node);
10166 const reg_ac_data * const ac = op>=AHOCORASICK ?
10167 (reg_ac_data *)ri->data->data[n] :
10169 const reg_trie_data * const trie =
10170 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10172 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10174 const regnode *nextbranch= NULL;
10177 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10178 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10180 PerlIO_printf(Perl_debug_log, "%*s%s ",
10181 (int)(2*(indent+3)), "",
10182 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10183 PL_colors[0], PL_colors[1],
10184 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10185 PERL_PV_PRETTY_ELLIPSES |
10186 PERL_PV_PRETTY_LTGT
10191 U16 dist= trie->jump[word_idx+1];
10192 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10193 (UV)((dist ? this_trie + dist : next) - start));
10196 nextbranch= this_trie + trie->jump[0];
10197 DUMPUNTIL(this_trie + dist, nextbranch);
10199 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10200 nextbranch= regnext((regnode *)nextbranch);
10202 PerlIO_printf(Perl_debug_log, "\n");
10205 if (last && next > last)
10210 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10211 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10212 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10214 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10216 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10218 else if ( op == PLUS || op == STAR) {
10219 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10221 else if (op == ANYOF) {
10222 /* arglen 1 + class block */
10223 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10224 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10225 node = NEXTOPER(node);
10227 else if (PL_regkind[(U8)op] == EXACT) {
10228 /* Literal string, where present. */
10229 node += NODE_SZ_STR(node) - 1;
10230 node = NEXTOPER(node);
10233 node = NEXTOPER(node);
10234 node += regarglen[(U8)op];
10236 if (op == CURLYX || op == OPEN)
10240 #ifdef DEBUG_DUMPUNTIL
10241 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10246 #endif /* DEBUGGING */
10250 * c-indentation-style: bsd
10251 * c-basic-offset: 4
10252 * indent-tabs-mode: t
10255 * ex: set ts=8 sts=4 sw=4 noet: