5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *paren_names; /* Paren names */
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
144 const char *lastparse;
146 AV *paren_name_list; /* idx -> name */
147 #define RExC_lastparse (pRExC_state->lastparse)
148 #define RExC_lastnum (pRExC_state->lastnum)
149 #define RExC_paren_name_list (pRExC_state->paren_name_list)
153 #define RExC_flags (pRExC_state->flags)
154 #define RExC_precomp (pRExC_state->precomp)
155 #define RExC_rx_sv (pRExC_state->rx_sv)
156 #define RExC_rx (pRExC_state->rx)
157 #define RExC_rxi (pRExC_state->rxi)
158 #define RExC_start (pRExC_state->start)
159 #define RExC_end (pRExC_state->end)
160 #define RExC_parse (pRExC_state->parse)
161 #define RExC_whilem_seen (pRExC_state->whilem_seen)
162 #ifdef RE_TRACK_PATTERN_OFFSETS
163 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165 #define RExC_emit (pRExC_state->emit)
166 #define RExC_emit_start (pRExC_state->emit_start)
167 #define RExC_emit_bound (pRExC_state->emit_bound)
168 #define RExC_naughty (pRExC_state->naughty)
169 #define RExC_sawback (pRExC_state->sawback)
170 #define RExC_seen (pRExC_state->seen)
171 #define RExC_size (pRExC_state->size)
172 #define RExC_npar (pRExC_state->npar)
173 #define RExC_nestroot (pRExC_state->nestroot)
174 #define RExC_extralen (pRExC_state->extralen)
175 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176 #define RExC_seen_evals (pRExC_state->seen_evals)
177 #define RExC_utf8 (pRExC_state->utf8)
178 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
179 #define RExC_open_parens (pRExC_state->open_parens)
180 #define RExC_close_parens (pRExC_state->close_parens)
181 #define RExC_opend (pRExC_state->opend)
182 #define RExC_paren_names (pRExC_state->paren_names)
183 #define RExC_recurse (pRExC_state->recurse)
184 #define RExC_recurse_count (pRExC_state->recurse_count)
187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
192 #undef SPSTART /* dratted cpp namespace... */
195 * Flags to be passed up and down.
197 #define WORST 0 /* Worst case. */
198 #define HASWIDTH 0x01 /* Known to match non-null strings. */
199 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200 #define SPSTART 0x04 /* Starts with * or +. */
201 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
202 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
206 /* whether trie related optimizations are enabled */
207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208 #define TRIE_STUDY_OPT
209 #define FULL_TRIE_STUDY
215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216 #define PBITVAL(paren) (1 << ((paren) & 7))
217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222 /* About scan_data_t.
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
238 The strings can be composites, for instance
242 will result in a composite fixed substring 'foo'.
244 For each string some basic information is maintained:
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
297 typedef struct scan_data_t {
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
303 I32 last_end; /* min value, <0 unless valid. */
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
319 struct regnode_charclass_class *start_class;
323 * Forward declarations for pregcomp()'s friends.
326 static const scan_data_t zero_scan_data =
327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
329 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330 #define SF_BEFORE_SEOL 0x0001
331 #define SF_BEFORE_MEOL 0x0002
332 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336 # define SF_FIX_SHIFT_EOL (0+2)
337 # define SF_FL_SHIFT_EOL (0+4)
339 # define SF_FIX_SHIFT_EOL (+2)
340 # define SF_FL_SHIFT_EOL (+4)
343 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
348 #define SF_IS_INF 0x0040
349 #define SF_HAS_PAR 0x0080
350 #define SF_IN_PAR 0x0100
351 #define SF_HAS_EVAL 0x0200
352 #define SCF_DO_SUBSTR 0x0400
353 #define SCF_DO_STCLASS_AND 0x0800
354 #define SCF_DO_STCLASS_OR 0x1000
355 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356 #define SCF_WHILEM_VISITED_POS 0x2000
358 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
359 #define SCF_SEEN_ACCEPT 0x8000
361 #define UTF (RExC_utf8 != 0)
362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
365 #define OOB_UNICODE 12345678
366 #define OOB_NAMEDCLASS -1
368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372 /* length of regex to show in messages that don't mark a position within */
373 #define RegexLengthToShowInErrorMessages 127
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
380 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
381 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 #define _FAIL(code) STMT_START { \
391 const char *ellipses = ""; \
392 IV len = RExC_end - RExC_precomp; \
395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
404 #define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
408 #define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
415 #define Simple_vFAIL(m) STMT_START { \
416 const IV offset = RExC_parse - RExC_precomp; \
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
424 #define vFAIL(m) STMT_START { \
426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
431 * Like Simple_vFAIL(), but accepts two arguments.
433 #define Simple_vFAIL2(m,a1) STMT_START { \
434 const IV offset = RExC_parse - RExC_precomp; \
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
442 #define vFAIL2(m,a1) STMT_START { \
444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
445 Simple_vFAIL2(m, a1); \
450 * Like Simple_vFAIL(), but accepts three arguments.
452 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
453 const IV offset = RExC_parse - RExC_precomp; \
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
461 #define vFAIL3(m,a1,a2) STMT_START { \
463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
464 Simple_vFAIL3(m, a1, a2); \
468 * Like Simple_vFAIL(), but accepts four arguments.
470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
476 #define ckWARNreg(loc,m) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 (int)offset, RExC_precomp, RExC_precomp + offset); \
482 #define ckWARNregdep(loc,m) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
486 (int)offset, RExC_precomp, RExC_precomp + offset); \
489 #define ckWARN2reg(loc, m, a1) STMT_START { \
490 const IV offset = loc - RExC_precomp; \
491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
495 #define vWARN3(loc, m, a1, a2) STMT_START { \
496 const IV offset = loc - RExC_precomp; \
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
501 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
507 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
508 const IV offset = loc - RExC_precomp; \
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
513 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
519 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
520 const IV offset = loc - RExC_precomp; \
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
526 /* Allow for side effects in s */
527 #define REGC(c,s) STMT_START { \
528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
531 /* Macros for recording node offsets. 20001227 mjd@plover.com
532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
533 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
534 * Element 0 holds the number n.
535 * Position is 1 indexed.
537 #ifndef RE_TRACK_PATTERN_OFFSETS
538 #define Set_Node_Offset_To_R(node,byte)
539 #define Set_Node_Offset(node,byte)
540 #define Set_Cur_Node_Offset
541 #define Set_Node_Length_To_R(node,len)
542 #define Set_Node_Length(node,len)
543 #define Set_Node_Cur_Length(node)
544 #define Node_Offset(n)
545 #define Node_Length(n)
546 #define Set_Node_Offset_Length(node,offset,len)
547 #define ProgLen(ri) ri->u.proglen
548 #define SetProgLen(ri,x) ri->u.proglen = x
550 #define ProgLen(ri) ri->u.offsets[0]
551 #define SetProgLen(ri,x) ri->u.offsets[0] = x
552 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
555 __LINE__, (int)(node), (int)(byte))); \
557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
559 RExC_offsets[2*(node)-1] = (byte); \
564 #define Set_Node_Offset(node,byte) \
565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
568 #define Set_Node_Length_To_R(node,len) STMT_START { \
570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
571 __LINE__, (int)(node), (int)(len))); \
573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
575 RExC_offsets[2*(node)] = (len); \
580 #define Set_Node_Length(node,len) \
581 Set_Node_Length_To_R((node)-RExC_emit_start, len)
582 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583 #define Set_Node_Cur_Length(node) \
584 Set_Node_Length(node, RExC_parse - parse_start)
586 /* Get offsets and lengths */
587 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
590 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
596 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597 #define EXPERIMENTAL_INPLACESCAN
598 #endif /*RE_TRACK_PATTERN_OFFSETS*/
600 #define DEBUG_STUDYDATA(str,data,depth) \
601 DEBUG_OPTIMISE_MORE_r(if(data){ \
602 PerlIO_printf(Perl_debug_log, \
603 "%*s" str "Pos:%"IVdf"/%"IVdf \
604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
605 (int)(depth)*2, "", \
606 (IV)((data)->pos_min), \
607 (IV)((data)->pos_delta), \
608 (UV)((data)->flags), \
609 (IV)((data)->whilem_c), \
610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
611 is_inf ? "INF " : "" \
613 if ((data)->last_found) \
614 PerlIO_printf(Perl_debug_log, \
615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
617 SvPVX_const((data)->last_found), \
618 (IV)((data)->last_end), \
619 (IV)((data)->last_start_min), \
620 (IV)((data)->last_start_max), \
621 ((data)->longest && \
622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
623 SvPVX_const((data)->longest_fixed), \
624 (IV)((data)->offset_fixed), \
625 ((data)->longest && \
626 (data)->longest==&((data)->longest_float)) ? "*" : "", \
627 SvPVX_const((data)->longest_float), \
628 (IV)((data)->offset_float_min), \
629 (IV)((data)->offset_float_max) \
631 PerlIO_printf(Perl_debug_log,"\n"); \
634 static void clear_re(pTHX_ void *r);
636 /* Mark that we cannot extend a found fixed substring at this point.
637 Update the longest found anchored substring and the longest found
638 floating substrings if needed. */
641 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
643 const STRLEN l = CHR_SVLEN(data->last_found);
644 const STRLEN old_l = CHR_SVLEN(*data->longest);
645 GET_RE_DEBUG_FLAGS_DECL;
647 PERL_ARGS_ASSERT_SCAN_COMMIT;
649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
650 SvSetMagicSV(*data->longest, data->last_found);
651 if (*data->longest == data->longest_fixed) {
652 data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 if (data->flags & SF_BEFORE_EOL)
655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
657 data->flags &= ~SF_FIX_BEFORE_EOL;
658 data->minlen_fixed=minlenp;
659 data->lookbehind_fixed=0;
661 else { /* *data->longest == data->longest_float */
662 data->offset_float_min = l ? data->last_start_min : data->pos_min;
663 data->offset_float_max = (l
664 ? data->last_start_max
665 : data->pos_min + data->pos_delta);
666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
667 data->offset_float_max = I32_MAX;
668 if (data->flags & SF_BEFORE_EOL)
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
672 data->flags &= ~SF_FL_BEFORE_EOL;
673 data->minlen_float=minlenp;
674 data->lookbehind_float=0;
677 SvCUR_set(data->last_found, 0);
679 SV * const sv = data->last_found;
680 if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
687 data->flags &= ~SF_BEFORE_EOL;
688 DEBUG_STUDYDATA("commit: ",data,0);
691 /* Can match anything (initialization) */
693 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
695 PERL_ARGS_ASSERT_CL_ANYTHING;
697 ANYOF_CLASS_ZERO(cl);
698 ANYOF_BITMAP_SETALL(cl);
699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
701 cl->flags |= ANYOF_LOCALE;
704 /* Can match anything (initialization) */
706 S_cl_is_anything(const struct regnode_charclass_class *cl)
710 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
712 for (value = 0; value <= ANYOF_MAX; value += 2)
713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
715 if (!(cl->flags & ANYOF_UNICODE_ALL))
717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
722 /* Can match anything (initialization) */
724 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
726 PERL_ARGS_ASSERT_CL_INIT;
728 Zero(cl, 1, struct regnode_charclass_class);
730 cl_anything(pRExC_state, cl);
734 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
736 PERL_ARGS_ASSERT_CL_INIT_ZERO;
738 Zero(cl, 1, struct regnode_charclass_class);
740 cl_anything(pRExC_state, cl);
742 cl->flags |= ANYOF_LOCALE;
745 /* 'And' a given class with another one. Can create false positives */
746 /* We assume that cl is not inverted */
748 S_cl_and(struct regnode_charclass_class *cl,
749 const struct regnode_charclass_class *and_with)
751 PERL_ARGS_ASSERT_CL_AND;
753 assert(and_with->type == ANYOF);
754 if (!(and_with->flags & ANYOF_CLASS)
755 && !(cl->flags & ANYOF_CLASS)
756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 && !(and_with->flags & ANYOF_FOLD)
758 && !(cl->flags & ANYOF_FOLD)) {
761 if (and_with->flags & ANYOF_INVERT)
762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 cl->bitmap[i] &= ~and_with->bitmap[i];
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= and_with->bitmap[i];
767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768 if (!(and_with->flags & ANYOF_EOS))
769 cl->flags &= ~ANYOF_EOS;
771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 !(and_with->flags & ANYOF_INVERT)) {
773 cl->flags &= ~ANYOF_UNICODE_ALL;
774 cl->flags |= ANYOF_UNICODE;
775 ARG_SET(cl, ARG(and_with));
777 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 !(and_with->flags & ANYOF_INVERT))
779 cl->flags &= ~ANYOF_UNICODE_ALL;
780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 !(and_with->flags & ANYOF_INVERT))
782 cl->flags &= ~ANYOF_UNICODE;
785 /* 'OR' a given class with another one. Can create false positives */
786 /* We assume that cl is not inverted */
788 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
790 PERL_ARGS_ASSERT_CL_OR;
792 if (or_with->flags & ANYOF_INVERT) {
794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 * <= (B1 | !B2) | (CL1 | !CL2)
796 * which is wasteful if CL2 is small, but we ignore CL2:
797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 * XXXX Can we handle case-fold? Unclear:
799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 && !(or_with->flags & ANYOF_FOLD)
804 && !(cl->flags & ANYOF_FOLD) ) {
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] |= ~or_with->bitmap[i];
809 } /* XXXX: logic is complicated otherwise */
811 cl_anything(pRExC_state, cl);
814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
816 && (!(or_with->flags & ANYOF_FOLD)
817 || (cl->flags & ANYOF_FOLD)) ) {
820 /* OR char bitmap and class bitmap separately */
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] |= or_with->bitmap[i];
823 if (or_with->flags & ANYOF_CLASS) {
824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 cl->classflags[i] |= or_with->classflags[i];
826 cl->flags |= ANYOF_CLASS;
829 else { /* XXXX: logic is complicated, leave it along for a moment. */
830 cl_anything(pRExC_state, cl);
833 if (or_with->flags & ANYOF_EOS)
834 cl->flags |= ANYOF_EOS;
836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 ARG(cl) != ARG(or_with)) {
838 cl->flags |= ANYOF_UNICODE_ALL;
839 cl->flags &= ~ANYOF_UNICODE;
841 if (or_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags |= ANYOF_UNICODE_ALL;
843 cl->flags &= ~ANYOF_UNICODE;
847 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
849 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
855 dump_trie(trie,widecharmap,revcharmap)
856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
859 These routines dump out a trie in a somewhat readable format.
860 The _interim_ variants are used for debugging the interim
861 tables that are used to generate the final compressed
862 representation which is what dump_trie expects.
864 Part of the reason for their existance is to provide a form
865 of documentation as to how the different representations function.
870 Dumps the final compressed table form of the trie to Perl_debug_log.
871 Used for debugging make_trie().
875 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 AV *revcharmap, U32 depth)
879 SV *sv=sv_newmortal();
880 int colwidth= widecharmap ? 6 : 4;
881 GET_RE_DEBUG_FLAGS_DECL;
883 PERL_ARGS_ASSERT_DUMP_TRIE;
885 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
886 (int)depth * 2 + 2,"",
887 "Match","Base","Ofs" );
889 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
890 SV ** const tmp = av_fetch( revcharmap, state, 0);
892 PerlIO_printf( Perl_debug_log, "%*s",
894 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
895 PL_colors[0], PL_colors[1],
896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
897 PERL_PV_ESCAPE_FIRSTCHAR
902 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
903 (int)depth * 2 + 2,"");
905 for( state = 0 ; state < trie->uniquecharcount ; state++ )
906 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
907 PerlIO_printf( Perl_debug_log, "\n");
909 for( state = 1 ; state < trie->statecount ; state++ ) {
910 const U32 base = trie->states[ state ].trans.base;
912 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
914 if ( trie->states[ state ].wordnum ) {
915 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
917 PerlIO_printf( Perl_debug_log, "%6s", "" );
920 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
925 while( ( base + ofs < trie->uniquecharcount ) ||
926 ( base + ofs - trie->uniquecharcount < trie->lasttrans
927 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
930 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
932 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
933 if ( ( base + ofs >= trie->uniquecharcount ) &&
934 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
935 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
937 PerlIO_printf( Perl_debug_log, "%*"UVXf,
939 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
941 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
945 PerlIO_printf( Perl_debug_log, "]");
948 PerlIO_printf( Perl_debug_log, "\n" );
952 Dumps a fully constructed but uncompressed trie in list form.
953 List tries normally only are used for construction when the number of
954 possible chars (trie->uniquecharcount) is very high.
955 Used for debugging make_trie().
958 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
959 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 SV *sv=sv_newmortal();
964 int colwidth= widecharmap ? 6 : 4;
965 GET_RE_DEBUG_FLAGS_DECL;
967 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
969 /* print out the table precompression. */
970 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
971 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
972 "------:-----+-----------------\n" );
974 for( state=1 ; state < next_alloc ; state ++ ) {
977 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
978 (int)depth * 2 + 2,"", (UV)state );
979 if ( ! trie->states[ state ].wordnum ) {
980 PerlIO_printf( Perl_debug_log, "%5s| ","");
982 PerlIO_printf( Perl_debug_log, "W%4x| ",
983 trie->states[ state ].wordnum
986 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
987 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
989 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
991 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
992 PL_colors[0], PL_colors[1],
993 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
994 PERL_PV_ESCAPE_FIRSTCHAR
996 TRIE_LIST_ITEM(state,charid).forid,
997 (UV)TRIE_LIST_ITEM(state,charid).newstate
1000 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1001 (int)((depth * 2) + 14), "");
1004 PerlIO_printf( Perl_debug_log, "\n");
1009 Dumps a fully constructed but uncompressed trie in table form.
1010 This is the normal DFA style state transition table, with a few
1011 twists to facilitate compression later.
1012 Used for debugging make_trie().
1015 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1016 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1021 SV *sv=sv_newmortal();
1022 int colwidth= widecharmap ? 6 : 4;
1023 GET_RE_DEBUG_FLAGS_DECL;
1025 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1028 print out the table precompression so that we can do a visual check
1029 that they are identical.
1032 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1034 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1035 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1037 PerlIO_printf( Perl_debug_log, "%*s",
1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1048 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1050 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1051 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1054 PerlIO_printf( Perl_debug_log, "\n" );
1056 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1058 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1059 (int)depth * 2 + 2,"",
1060 (UV)TRIE_NODENUM( state ) );
1062 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1063 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1065 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1067 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1069 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1070 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1073 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1080 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1081 startbranch: the first branch in the whole branch sequence
1082 first : start branch of sequence of branch-exact nodes.
1083 May be the same as startbranch
1084 last : Thing following the last branch.
1085 May be the same as tail.
1086 tail : item following the branch sequence
1087 count : words in the sequence
1088 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1089 depth : indent depth
1091 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1093 A trie is an N'ary tree where the branches are determined by digital
1094 decomposition of the key. IE, at the root node you look up the 1st character and
1095 follow that branch repeat until you find the end of the branches. Nodes can be
1096 marked as "accepting" meaning they represent a complete word. Eg:
1100 would convert into the following structure. Numbers represent states, letters
1101 following numbers represent valid transitions on the letter from that state, if
1102 the number is in square brackets it represents an accepting state, otherwise it
1103 will be in parenthesis.
1105 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1109 (1) +-i->(6)-+-s->[7]
1111 +-s->(3)-+-h->(4)-+-e->[5]
1113 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1115 This shows that when matching against the string 'hers' we will begin at state 1
1116 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1117 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1118 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1119 single traverse. We store a mapping from accepting to state to which word was
1120 matched, and then when we have multiple possibilities we try to complete the
1121 rest of the regex in the order in which they occured in the alternation.
1123 The only prior NFA like behaviour that would be changed by the TRIE support is
1124 the silent ignoring of duplicate alternations which are of the form:
1126 / (DUPE|DUPE) X? (?{ ... }) Y /x
1128 Thus EVAL blocks follwing a trie may be called a different number of times with
1129 and without the optimisation. With the optimisations dupes will be silently
1130 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1131 the following demonstrates:
1133 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1135 which prints out 'word' three times, but
1137 'words'=~/(word|word|word)(?{ print $1 })S/
1139 which doesnt print it out at all. This is due to other optimisations kicking in.
1141 Example of what happens on a structural level:
1143 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1145 1: CURLYM[1] {1,32767}(18)
1156 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1157 and should turn into:
1159 1: CURLYM[1] {1,32767}(18)
1161 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1169 Cases where tail != last would be like /(?foo|bar)baz/:
1179 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1180 and would end up looking like:
1183 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1190 d = uvuni_to_utf8_flags(d, uv, 0);
1192 is the recommended Unicode-aware way of saying
1197 #define TRIE_STORE_REVCHAR \
1200 SV *zlopp = newSV(2); \
1201 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1202 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1203 SvCUR_set(zlopp, kapow - flrbbbbb); \
1206 av_push(revcharmap, zlopp); \
1208 char ooooff = (char)uvc; \
1209 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1213 #define TRIE_READ_CHAR STMT_START { \
1217 if ( foldlen > 0 ) { \
1218 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1223 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1224 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1225 foldlen -= UNISKIP( uvc ); \
1226 scan = foldbuf + UNISKIP( uvc ); \
1229 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1239 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1240 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1241 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1242 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1244 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1245 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1246 TRIE_LIST_CUR( state )++; \
1249 #define TRIE_LIST_NEW(state) STMT_START { \
1250 Newxz( trie->states[ state ].trans.list, \
1251 4, reg_trie_trans_le ); \
1252 TRIE_LIST_CUR( state ) = 1; \
1253 TRIE_LIST_LEN( state ) = 4; \
1256 #define TRIE_HANDLE_WORD(state) STMT_START { \
1257 U16 dupe= trie->states[ state ].wordnum; \
1258 regnode * const noper_next = regnext( noper ); \
1260 if (trie->wordlen) \
1261 trie->wordlen[ curword ] = wordlen; \
1263 /* store the word for dumping */ \
1265 if (OP(noper) != NOTHING) \
1266 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1268 tmp = newSVpvn_utf8( "", 0, UTF ); \
1269 av_push( trie_words, tmp ); \
1274 if ( noper_next < tail ) { \
1276 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1277 trie->jump[curword] = (U16)(noper_next - convert); \
1279 jumper = noper_next; \
1281 nextbranch= regnext(cur); \
1285 /* So it's a dupe. This means we need to maintain a */\
1286 /* linked-list from the first to the next. */\
1287 /* we only allocate the nextword buffer when there */\
1288 /* a dupe, so first time we have to do the allocation */\
1289 if (!trie->nextword) \
1290 trie->nextword = (U16 *) \
1291 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1292 while ( trie->nextword[dupe] ) \
1293 dupe= trie->nextword[dupe]; \
1294 trie->nextword[dupe]= curword; \
1296 /* we haven't inserted this word yet. */ \
1297 trie->states[ state ].wordnum = curword; \
1302 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1303 ( ( base + charid >= ucharcount \
1304 && base + charid < ubound \
1305 && state == trie->trans[ base - ucharcount + charid ].check \
1306 && trie->trans[ base - ucharcount + charid ].next ) \
1307 ? trie->trans[ base - ucharcount + charid ].next \
1308 : ( state==1 ? special : 0 ) \
1312 #define MADE_JUMP_TRIE 2
1313 #define MADE_EXACT_TRIE 4
1316 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1319 /* first pass, loop through and scan words */
1320 reg_trie_data *trie;
1321 HV *widecharmap = NULL;
1322 AV *revcharmap = newAV();
1324 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1329 regnode *jumper = NULL;
1330 regnode *nextbranch = NULL;
1331 regnode *convert = NULL;
1332 /* we just use folder as a flag in utf8 */
1333 const U8 * const folder = ( flags == EXACTF
1335 : ( flags == EXACTFL
1342 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1343 AV *trie_words = NULL;
1344 /* along with revcharmap, this only used during construction but both are
1345 * useful during debugging so we store them in the struct when debugging.
1348 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1349 STRLEN trie_charcount=0;
1351 SV *re_trie_maxbuff;
1352 GET_RE_DEBUG_FLAGS_DECL;
1354 PERL_ARGS_ASSERT_MAKE_TRIE;
1356 PERL_UNUSED_ARG(depth);
1359 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1361 trie->startstate = 1;
1362 trie->wordcount = word_count;
1363 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1364 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1365 if (!(UTF && folder))
1366 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1368 trie_words = newAV();
1371 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1372 if (!SvIOK(re_trie_maxbuff)) {
1373 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1376 PerlIO_printf( Perl_debug_log,
1377 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1378 (int)depth * 2 + 2, "",
1379 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1380 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1384 /* Find the node we are going to overwrite */
1385 if ( first == startbranch && OP( last ) != BRANCH ) {
1386 /* whole branch chain */
1389 /* branch sub-chain */
1390 convert = NEXTOPER( first );
1393 /* -- First loop and Setup --
1395 We first traverse the branches and scan each word to determine if it
1396 contains widechars, and how many unique chars there are, this is
1397 important as we have to build a table with at least as many columns as we
1400 We use an array of integers to represent the character codes 0..255
1401 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1402 native representation of the character value as the key and IV's for the
1405 *TODO* If we keep track of how many times each character is used we can
1406 remap the columns so that the table compression later on is more
1407 efficient in terms of memory by ensuring most common value is in the
1408 middle and the least common are on the outside. IMO this would be better
1409 than a most to least common mapping as theres a decent chance the most
1410 common letter will share a node with the least common, meaning the node
1411 will not be compressable. With a middle is most common approach the worst
1412 case is when we have the least common nodes twice.
1416 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1417 regnode * const noper = NEXTOPER( cur );
1418 const U8 *uc = (U8*)STRING( noper );
1419 const U8 * const e = uc + STR_LEN( noper );
1421 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1422 const U8 *scan = (U8*)NULL;
1423 U32 wordlen = 0; /* required init */
1425 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1427 if (OP(noper) == NOTHING) {
1431 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1432 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1433 regardless of encoding */
1435 for ( ; uc < e ; uc += len ) {
1436 TRIE_CHARCOUNT(trie)++;
1440 if ( !trie->charmap[ uvc ] ) {
1441 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1443 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1447 /* store the codepoint in the bitmap, and if its ascii
1448 also store its folded equivelent. */
1449 TRIE_BITMAP_SET(trie,uvc);
1451 /* store the folded codepoint */
1452 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1455 /* store first byte of utf8 representation of
1456 codepoints in the 127 < uvc < 256 range */
1457 if (127 < uvc && uvc < 192) {
1458 TRIE_BITMAP_SET(trie,194);
1459 } else if (191 < uvc ) {
1460 TRIE_BITMAP_SET(trie,195);
1461 /* && uvc < 256 -- we know uvc is < 256 already */
1464 set_bit = 0; /* We've done our bit :-) */
1469 widecharmap = newHV();
1471 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1474 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1476 if ( !SvTRUE( *svpp ) ) {
1477 sv_setiv( *svpp, ++trie->uniquecharcount );
1482 if( cur == first ) {
1485 } else if (chars < trie->minlen) {
1487 } else if (chars > trie->maxlen) {
1491 } /* end first pass */
1492 DEBUG_TRIE_COMPILE_r(
1493 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1494 (int)depth * 2 + 2,"",
1495 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1496 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1497 (int)trie->minlen, (int)trie->maxlen )
1499 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1502 We now know what we are dealing with in terms of unique chars and
1503 string sizes so we can calculate how much memory a naive
1504 representation using a flat table will take. If it's over a reasonable
1505 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1506 conservative but potentially much slower representation using an array
1509 At the end we convert both representations into the same compressed
1510 form that will be used in regexec.c for matching with. The latter
1511 is a form that cannot be used to construct with but has memory
1512 properties similar to the list form and access properties similar
1513 to the table form making it both suitable for fast searches and
1514 small enough that its feasable to store for the duration of a program.
1516 See the comment in the code where the compressed table is produced
1517 inplace from the flat tabe representation for an explanation of how
1518 the compression works.
1523 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1525 Second Pass -- Array Of Lists Representation
1527 Each state will be represented by a list of charid:state records
1528 (reg_trie_trans_le) the first such element holds the CUR and LEN
1529 points of the allocated array. (See defines above).
1531 We build the initial structure using the lists, and then convert
1532 it into the compressed table form which allows faster lookups
1533 (but cant be modified once converted).
1536 STRLEN transcount = 1;
1538 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1539 "%*sCompiling trie using list compiler\n",
1540 (int)depth * 2 + 2, ""));
1542 trie->states = (reg_trie_state *)
1543 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1544 sizeof(reg_trie_state) );
1548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1550 regnode * const noper = NEXTOPER( cur );
1551 U8 *uc = (U8*)STRING( noper );
1552 const U8 * const e = uc + STR_LEN( noper );
1553 U32 state = 1; /* required init */
1554 U16 charid = 0; /* sanity init */
1555 U8 *scan = (U8*)NULL; /* sanity init */
1556 STRLEN foldlen = 0; /* required init */
1557 U32 wordlen = 0; /* required init */
1558 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1560 if (OP(noper) != NOTHING) {
1561 for ( ; uc < e ; uc += len ) {
1566 charid = trie->charmap[ uvc ];
1568 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1572 charid=(U16)SvIV( *svpp );
1575 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1582 if ( !trie->states[ state ].trans.list ) {
1583 TRIE_LIST_NEW( state );
1585 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1586 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1587 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1592 newstate = next_alloc++;
1593 TRIE_LIST_PUSH( state, charid, newstate );
1598 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1602 TRIE_HANDLE_WORD(state);
1604 } /* end second pass */
1606 /* next alloc is the NEXT state to be allocated */
1607 trie->statecount = next_alloc;
1608 trie->states = (reg_trie_state *)
1609 PerlMemShared_realloc( trie->states,
1611 * sizeof(reg_trie_state) );
1613 /* and now dump it out before we compress it */
1614 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1615 revcharmap, next_alloc,
1619 trie->trans = (reg_trie_trans *)
1620 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1627 for( state=1 ; state < next_alloc ; state ++ ) {
1631 DEBUG_TRIE_COMPILE_MORE_r(
1632 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1636 if (trie->states[state].trans.list) {
1637 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1641 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1642 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1643 if ( forid < minid ) {
1645 } else if ( forid > maxid ) {
1649 if ( transcount < tp + maxid - minid + 1) {
1651 trie->trans = (reg_trie_trans *)
1652 PerlMemShared_realloc( trie->trans,
1654 * sizeof(reg_trie_trans) );
1655 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1657 base = trie->uniquecharcount + tp - minid;
1658 if ( maxid == minid ) {
1660 for ( ; zp < tp ; zp++ ) {
1661 if ( ! trie->trans[ zp ].next ) {
1662 base = trie->uniquecharcount + zp - minid;
1663 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1664 trie->trans[ zp ].check = state;
1670 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1671 trie->trans[ tp ].check = state;
1676 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1677 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1678 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1679 trie->trans[ tid ].check = state;
1681 tp += ( maxid - minid + 1 );
1683 Safefree(trie->states[ state ].trans.list);
1686 DEBUG_TRIE_COMPILE_MORE_r(
1687 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1690 trie->states[ state ].trans.base=base;
1692 trie->lasttrans = tp + 1;
1696 Second Pass -- Flat Table Representation.
1698 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1699 We know that we will need Charcount+1 trans at most to store the data
1700 (one row per char at worst case) So we preallocate both structures
1701 assuming worst case.
1703 We then construct the trie using only the .next slots of the entry
1706 We use the .check field of the first entry of the node temporarily to
1707 make compression both faster and easier by keeping track of how many non
1708 zero fields are in the node.
1710 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1713 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1714 number representing the first entry of the node, and state as a
1715 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1716 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1717 are 2 entrys per node. eg:
1725 The table is internally in the right hand, idx form. However as we also
1726 have to deal with the states array which is indexed by nodenum we have to
1727 use TRIE_NODENUM() to convert.
1730 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1731 "%*sCompiling trie using table compiler\n",
1732 (int)depth * 2 + 2, ""));
1734 trie->trans = (reg_trie_trans *)
1735 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1736 * trie->uniquecharcount + 1,
1737 sizeof(reg_trie_trans) );
1738 trie->states = (reg_trie_state *)
1739 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1740 sizeof(reg_trie_state) );
1741 next_alloc = trie->uniquecharcount + 1;
1744 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1746 regnode * const noper = NEXTOPER( cur );
1747 const U8 *uc = (U8*)STRING( noper );
1748 const U8 * const e = uc + STR_LEN( noper );
1750 U32 state = 1; /* required init */
1752 U16 charid = 0; /* sanity init */
1753 U32 accept_state = 0; /* sanity init */
1754 U8 *scan = (U8*)NULL; /* sanity init */
1756 STRLEN foldlen = 0; /* required init */
1757 U32 wordlen = 0; /* required init */
1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1760 if ( OP(noper) != NOTHING ) {
1761 for ( ; uc < e ; uc += len ) {
1766 charid = trie->charmap[ uvc ];
1768 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1769 charid = svpp ? (U16)SvIV(*svpp) : 0;
1773 if ( !trie->trans[ state + charid ].next ) {
1774 trie->trans[ state + charid ].next = next_alloc;
1775 trie->trans[ state ].check++;
1776 next_alloc += trie->uniquecharcount;
1778 state = trie->trans[ state + charid ].next;
1780 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1782 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1785 accept_state = TRIE_NODENUM( state );
1786 TRIE_HANDLE_WORD(accept_state);
1788 } /* end second pass */
1790 /* and now dump it out before we compress it */
1791 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1793 next_alloc, depth+1));
1797 * Inplace compress the table.*
1799 For sparse data sets the table constructed by the trie algorithm will
1800 be mostly 0/FAIL transitions or to put it another way mostly empty.
1801 (Note that leaf nodes will not contain any transitions.)
1803 This algorithm compresses the tables by eliminating most such
1804 transitions, at the cost of a modest bit of extra work during lookup:
1806 - Each states[] entry contains a .base field which indicates the
1807 index in the state[] array wheres its transition data is stored.
1809 - If .base is 0 there are no valid transitions from that node.
1811 - If .base is nonzero then charid is added to it to find an entry in
1814 -If trans[states[state].base+charid].check!=state then the
1815 transition is taken to be a 0/Fail transition. Thus if there are fail
1816 transitions at the front of the node then the .base offset will point
1817 somewhere inside the previous nodes data (or maybe even into a node
1818 even earlier), but the .check field determines if the transition is
1822 The following process inplace converts the table to the compressed
1823 table: We first do not compress the root node 1,and mark its all its
1824 .check pointers as 1 and set its .base pointer as 1 as well. This
1825 allows to do a DFA construction from the compressed table later, and
1826 ensures that any .base pointers we calculate later are greater than
1829 - We set 'pos' to indicate the first entry of the second node.
1831 - We then iterate over the columns of the node, finding the first and
1832 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1833 and set the .check pointers accordingly, and advance pos
1834 appropriately and repreat for the next node. Note that when we copy
1835 the next pointers we have to convert them from the original
1836 NODEIDX form to NODENUM form as the former is not valid post
1839 - If a node has no transitions used we mark its base as 0 and do not
1840 advance the pos pointer.
1842 - If a node only has one transition we use a second pointer into the
1843 structure to fill in allocated fail transitions from other states.
1844 This pointer is independent of the main pointer and scans forward
1845 looking for null transitions that are allocated to a state. When it
1846 finds one it writes the single transition into the "hole". If the
1847 pointer doesnt find one the single transition is appended as normal.
1849 - Once compressed we can Renew/realloc the structures to release the
1852 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1853 specifically Fig 3.47 and the associated pseudocode.
1857 const U32 laststate = TRIE_NODENUM( next_alloc );
1860 trie->statecount = laststate;
1862 for ( state = 1 ; state < laststate ; state++ ) {
1864 const U32 stateidx = TRIE_NODEIDX( state );
1865 const U32 o_used = trie->trans[ stateidx ].check;
1866 U32 used = trie->trans[ stateidx ].check;
1867 trie->trans[ stateidx ].check = 0;
1869 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1870 if ( flag || trie->trans[ stateidx + charid ].next ) {
1871 if ( trie->trans[ stateidx + charid ].next ) {
1873 for ( ; zp < pos ; zp++ ) {
1874 if ( ! trie->trans[ zp ].next ) {
1878 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1879 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ zp ].check = state;
1881 if ( ++zp > pos ) pos = zp;
1888 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1890 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1891 trie->trans[ pos ].check = state;
1896 trie->lasttrans = pos + 1;
1897 trie->states = (reg_trie_state *)
1898 PerlMemShared_realloc( trie->states, laststate
1899 * sizeof(reg_trie_state) );
1900 DEBUG_TRIE_COMPILE_MORE_r(
1901 PerlIO_printf( Perl_debug_log,
1902 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1903 (int)depth * 2 + 2,"",
1904 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1907 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1910 } /* end table compress */
1912 DEBUG_TRIE_COMPILE_MORE_r(
1913 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1914 (int)depth * 2 + 2, "",
1915 (UV)trie->statecount,
1916 (UV)trie->lasttrans)
1918 /* resize the trans array to remove unused space */
1919 trie->trans = (reg_trie_trans *)
1920 PerlMemShared_realloc( trie->trans, trie->lasttrans
1921 * sizeof(reg_trie_trans) );
1923 /* and now dump out the compressed format */
1924 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1926 { /* Modify the program and insert the new TRIE node*/
1927 U8 nodetype =(U8)(flags & 0xFF);
1931 regnode *optimize = NULL;
1932 #ifdef RE_TRACK_PATTERN_OFFSETS
1935 U32 mjd_nodelen = 0;
1936 #endif /* RE_TRACK_PATTERN_OFFSETS */
1937 #endif /* DEBUGGING */
1939 This means we convert either the first branch or the first Exact,
1940 depending on whether the thing following (in 'last') is a branch
1941 or not and whther first is the startbranch (ie is it a sub part of
1942 the alternation or is it the whole thing.)
1943 Assuming its a sub part we conver the EXACT otherwise we convert
1944 the whole branch sequence, including the first.
1946 /* Find the node we are going to overwrite */
1947 if ( first != startbranch || OP( last ) == BRANCH ) {
1948 /* branch sub-chain */
1949 NEXT_OFF( first ) = (U16)(last - first);
1950 #ifdef RE_TRACK_PATTERN_OFFSETS
1952 mjd_offset= Node_Offset((convert));
1953 mjd_nodelen= Node_Length((convert));
1956 /* whole branch chain */
1958 #ifdef RE_TRACK_PATTERN_OFFSETS
1961 const regnode *nop = NEXTOPER( convert );
1962 mjd_offset= Node_Offset((nop));
1963 mjd_nodelen= Node_Length((nop));
1967 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1968 (int)depth * 2 + 2, "",
1969 (UV)mjd_offset, (UV)mjd_nodelen)
1972 /* But first we check to see if there is a common prefix we can
1973 split out as an EXACT and put in front of the TRIE node. */
1974 trie->startstate= 1;
1975 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1977 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1981 const U32 base = trie->states[ state ].trans.base;
1983 if ( trie->states[state].wordnum )
1986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1987 if ( ( base + ofs >= trie->uniquecharcount ) &&
1988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1991 if ( ++count > 1 ) {
1992 SV **tmp = av_fetch( revcharmap, ofs, 0);
1993 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1994 if ( state == 1 ) break;
1996 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1998 PerlIO_printf(Perl_debug_log,
1999 "%*sNew Start State=%"UVuf" Class: [",
2000 (int)depth * 2 + 2, "",
2003 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2004 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2006 TRIE_BITMAP_SET(trie,*ch);
2008 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2010 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2014 TRIE_BITMAP_SET(trie,*ch);
2016 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2023 SV **tmp = av_fetch( revcharmap, idx, 0);
2025 char *ch = SvPV( *tmp, len );
2027 SV *sv=sv_newmortal();
2028 PerlIO_printf( Perl_debug_log,
2029 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2030 (int)depth * 2 + 2, "",
2032 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2033 PL_colors[0], PL_colors[1],
2034 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2035 PERL_PV_ESCAPE_FIRSTCHAR
2040 OP( convert ) = nodetype;
2041 str=STRING(convert);
2044 STR_LEN(convert) += len;
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2056 regnode *n = convert+NODE_SZ_STR(convert);
2057 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2058 trie->startstate = state;
2059 trie->minlen -= (state - 1);
2060 trie->maxlen -= (state - 1);
2062 /* At least the UNICOS C compiler choked on this
2063 * being argument to DEBUG_r(), so let's just have
2066 #ifdef PERL_EXT_RE_BUILD
2072 regnode *fix = convert;
2073 U32 word = trie->wordcount;
2075 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2076 while( ++fix < n ) {
2077 Set_Node_Offset_Length(fix, 0, 0);
2080 SV ** const tmp = av_fetch( trie_words, word, 0 );
2082 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2083 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2093 NEXT_OFF(convert) = (U16)(tail - convert);
2094 DEBUG_r(optimize= n);
2100 if ( trie->maxlen ) {
2101 NEXT_OFF( convert ) = (U16)(tail - convert);
2102 ARG_SET( convert, data_slot );
2103 /* Store the offset to the first unabsorbed branch in
2104 jump[0], which is otherwise unused by the jump logic.
2105 We use this when dumping a trie and during optimisation. */
2107 trie->jump[0] = (U16)(nextbranch - convert);
2110 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2111 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2113 OP( convert ) = TRIEC;
2114 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2115 PerlMemShared_free(trie->bitmap);
2118 OP( convert ) = TRIE;
2120 /* store the type in the flags */
2121 convert->flags = nodetype;
2125 + regarglen[ OP( convert ) ];
2127 /* XXX We really should free up the resource in trie now,
2128 as we won't use them - (which resources?) dmq */
2130 /* needed for dumping*/
2131 DEBUG_r(if (optimize) {
2132 regnode *opt = convert;
2134 while ( ++opt < optimize) {
2135 Set_Node_Offset_Length(opt,0,0);
2138 Try to clean up some of the debris left after the
2141 while( optimize < jumper ) {
2142 mjd_nodelen += Node_Length((optimize));
2143 OP( optimize ) = OPTIMIZED;
2144 Set_Node_Offset_Length(optimize,0,0);
2147 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2149 } /* end node insert */
2150 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2152 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2153 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2155 SvREFCNT_dec(revcharmap);
2159 : trie->startstate>1
2165 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2167 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2169 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2170 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2173 We find the fail state for each state in the trie, this state is the longest proper
2174 suffix of the current states 'word' that is also a proper prefix of another word in our
2175 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2176 the DFA not to have to restart after its tried and failed a word at a given point, it
2177 simply continues as though it had been matching the other word in the first place.
2179 'abcdgu'=~/abcdefg|cdgu/
2180 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2181 fail, which would bring use to the state representing 'd' in the second word where we would
2182 try 'g' and succeed, prodceding to match 'cdgu'.
2184 /* add a fail transition */
2185 const U32 trie_offset = ARG(source);
2186 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2188 const U32 ucharcount = trie->uniquecharcount;
2189 const U32 numstates = trie->statecount;
2190 const U32 ubound = trie->lasttrans + ucharcount;
2194 U32 base = trie->states[ 1 ].trans.base;
2197 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2198 GET_RE_DEBUG_FLAGS_DECL;
2200 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2202 PERL_UNUSED_ARG(depth);
2206 ARG_SET( stclass, data_slot );
2207 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2208 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2209 aho->trie=trie_offset;
2210 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2211 Copy( trie->states, aho->states, numstates, reg_trie_state );
2212 Newxz( q, numstates, U32);
2213 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2216 /* initialize fail[0..1] to be 1 so that we always have
2217 a valid final fail state */
2218 fail[ 0 ] = fail[ 1 ] = 1;
2220 for ( charid = 0; charid < ucharcount ; charid++ ) {
2221 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2223 q[ q_write ] = newstate;
2224 /* set to point at the root */
2225 fail[ q[ q_write++ ] ]=1;
2228 while ( q_read < q_write) {
2229 const U32 cur = q[ q_read++ % numstates ];
2230 base = trie->states[ cur ].trans.base;
2232 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2233 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2235 U32 fail_state = cur;
2238 fail_state = fail[ fail_state ];
2239 fail_base = aho->states[ fail_state ].trans.base;
2240 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2242 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2243 fail[ ch_state ] = fail_state;
2244 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2246 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2248 q[ q_write++ % numstates] = ch_state;
2252 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2253 when we fail in state 1, this allows us to use the
2254 charclass scan to find a valid start char. This is based on the principle
2255 that theres a good chance the string being searched contains lots of stuff
2256 that cant be a start char.
2258 fail[ 0 ] = fail[ 1 ] = 0;
2259 DEBUG_TRIE_COMPILE_r({
2260 PerlIO_printf(Perl_debug_log,
2261 "%*sStclass Failtable (%"UVuf" states): 0",
2262 (int)(depth * 2), "", (UV)numstates
2264 for( q_read=1; q_read<numstates; q_read++ ) {
2265 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2267 PerlIO_printf(Perl_debug_log, "\n");
2270 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2275 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2276 * These need to be revisited when a newer toolchain becomes available.
2278 #if defined(__sparc64__) && defined(__GNUC__)
2279 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2280 # undef SPARC64_GCC_WORKAROUND
2281 # define SPARC64_GCC_WORKAROUND 1
2285 #define DEBUG_PEEP(str,scan,depth) \
2286 DEBUG_OPTIMISE_r({if (scan){ \
2287 SV * const mysv=sv_newmortal(); \
2288 regnode *Next = regnext(scan); \
2289 regprop(RExC_rx, mysv, scan); \
2290 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2291 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2292 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2299 #define JOIN_EXACT(scan,min,flags) \
2300 if (PL_regkind[OP(scan)] == EXACT) \
2301 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2304 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2305 /* Merge several consecutive EXACTish nodes into one. */
2306 regnode *n = regnext(scan);
2308 regnode *next = scan + NODE_SZ_STR(scan);
2312 regnode *stop = scan;
2313 GET_RE_DEBUG_FLAGS_DECL;
2315 PERL_UNUSED_ARG(depth);
2318 PERL_ARGS_ASSERT_JOIN_EXACT;
2319 #ifndef EXPERIMENTAL_INPLACESCAN
2320 PERL_UNUSED_ARG(flags);
2321 PERL_UNUSED_ARG(val);
2323 DEBUG_PEEP("join",scan,depth);
2325 /* Skip NOTHING, merge EXACT*. */
2327 ( PL_regkind[OP(n)] == NOTHING ||
2328 (stringok && (OP(n) == OP(scan))))
2330 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2332 if (OP(n) == TAIL || n > next)
2334 if (PL_regkind[OP(n)] == NOTHING) {
2335 DEBUG_PEEP("skip:",n,depth);
2336 NEXT_OFF(scan) += NEXT_OFF(n);
2337 next = n + NODE_STEP_REGNODE;
2344 else if (stringok) {
2345 const unsigned int oldl = STR_LEN(scan);
2346 regnode * const nnext = regnext(n);
2348 DEBUG_PEEP("merg",n,depth);
2351 if (oldl + STR_LEN(n) > U8_MAX)
2353 NEXT_OFF(scan) += NEXT_OFF(n);
2354 STR_LEN(scan) += STR_LEN(n);
2355 next = n + NODE_SZ_STR(n);
2356 /* Now we can overwrite *n : */
2357 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2365 #ifdef EXPERIMENTAL_INPLACESCAN
2366 if (flags && !NEXT_OFF(n)) {
2367 DEBUG_PEEP("atch", val, depth);
2368 if (reg_off_by_arg[OP(n)]) {
2369 ARG_SET(n, val - n);
2372 NEXT_OFF(n) = val - n;
2379 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2381 Two problematic code points in Unicode casefolding of EXACT nodes:
2383 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2384 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2390 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2391 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2393 This means that in case-insensitive matching (or "loose matching",
2394 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2395 length of the above casefolded versions) can match a target string
2396 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2397 This would rather mess up the minimum length computation.
2399 What we'll do is to look for the tail four bytes, and then peek
2400 at the preceding two bytes to see whether we need to decrease
2401 the minimum length by four (six minus two).
2403 Thanks to the design of UTF-8, there cannot be false matches:
2404 A sequence of valid UTF-8 bytes cannot be a subsequence of
2405 another valid sequence of UTF-8 bytes.
2408 char * const s0 = STRING(scan), *s, *t;
2409 char * const s1 = s0 + STR_LEN(scan) - 1;
2410 char * const s2 = s1 - 4;
2411 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2412 const char t0[] = "\xaf\x49\xaf\x42";
2414 const char t0[] = "\xcc\x88\xcc\x81";
2416 const char * const t1 = t0 + 3;
2419 s < s2 && (t = ninstr(s, s1, t0, t1));
2422 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2423 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2425 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2426 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2434 n = scan + NODE_SZ_STR(scan);
2436 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2443 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2447 /* REx optimizer. Converts nodes into quickier variants "in place".
2448 Finds fixed substrings. */
2450 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2451 to the position after last scanned or to NULL. */
2453 #define INIT_AND_WITHP \
2454 assert(!and_withp); \
2455 Newx(and_withp,1,struct regnode_charclass_class); \
2456 SAVEFREEPV(and_withp)
2458 /* this is a chain of data about sub patterns we are processing that
2459 need to be handled seperately/specially in study_chunk. Its so
2460 we can simulate recursion without losing state. */
2462 typedef struct scan_frame {
2463 regnode *last; /* last node to process in this frame */
2464 regnode *next; /* next node to process when last is reached */
2465 struct scan_frame *prev; /*previous frame*/
2466 I32 stop; /* what stopparen do we use */
2470 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2472 #define CASE_SYNST_FNC(nAmE) \
2474 if (flags & SCF_DO_STCLASS_AND) { \
2475 for (value = 0; value < 256; value++) \
2476 if (!is_ ## nAmE ## _cp(value)) \
2477 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2480 for (value = 0; value < 256; value++) \
2481 if (is_ ## nAmE ## _cp(value)) \
2482 ANYOF_BITMAP_SET(data->start_class, value); \
2486 if (flags & SCF_DO_STCLASS_AND) { \
2487 for (value = 0; value < 256; value++) \
2488 if (is_ ## nAmE ## _cp(value)) \
2489 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2492 for (value = 0; value < 256; value++) \
2493 if (!is_ ## nAmE ## _cp(value)) \
2494 ANYOF_BITMAP_SET(data->start_class, value); \
2501 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2502 I32 *minlenp, I32 *deltap,
2507 struct regnode_charclass_class *and_withp,
2508 U32 flags, U32 depth)
2509 /* scanp: Start here (read-write). */
2510 /* deltap: Write maxlen-minlen here. */
2511 /* last: Stop before this one. */
2512 /* data: string data about the pattern */
2513 /* stopparen: treat close N as END */
2514 /* recursed: which subroutines have we recursed into */
2515 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2518 I32 min = 0, pars = 0, code;
2519 regnode *scan = *scanp, *next;
2521 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2522 int is_inf_internal = 0; /* The studied chunk is infinite */
2523 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2524 scan_data_t data_fake;
2525 SV *re_trie_maxbuff = NULL;
2526 regnode *first_non_open = scan;
2527 I32 stopmin = I32_MAX;
2528 scan_frame *frame = NULL;
2529 GET_RE_DEBUG_FLAGS_DECL;
2531 PERL_ARGS_ASSERT_STUDY_CHUNK;
2534 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2538 while (first_non_open && OP(first_non_open) == OPEN)
2539 first_non_open=regnext(first_non_open);
2544 while ( scan && OP(scan) != END && scan < last ){
2545 /* Peephole optimizer: */
2546 DEBUG_STUDYDATA("Peep:", data,depth);
2547 DEBUG_PEEP("Peep",scan,depth);
2548 JOIN_EXACT(scan,&min,0);
2550 /* Follow the next-chain of the current node and optimize
2551 away all the NOTHINGs from it. */
2552 if (OP(scan) != CURLYX) {
2553 const int max = (reg_off_by_arg[OP(scan)]
2555 /* I32 may be smaller than U16 on CRAYs! */
2556 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2557 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2561 /* Skip NOTHING and LONGJMP. */
2562 while ((n = regnext(n))
2563 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2564 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2565 && off + noff < max)
2567 if (reg_off_by_arg[OP(scan)])
2570 NEXT_OFF(scan) = off;
2575 /* The principal pseudo-switch. Cannot be a switch, since we
2576 look into several different things. */
2577 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2578 || OP(scan) == IFTHEN) {
2579 next = regnext(scan);
2581 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2583 if (OP(next) == code || code == IFTHEN) {
2584 /* NOTE - There is similar code to this block below for handling
2585 TRIE nodes on a re-study. If you change stuff here check there
2587 I32 max1 = 0, min1 = I32_MAX, num = 0;
2588 struct regnode_charclass_class accum;
2589 regnode * const startbranch=scan;
2591 if (flags & SCF_DO_SUBSTR)
2592 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2593 if (flags & SCF_DO_STCLASS)
2594 cl_init_zero(pRExC_state, &accum);
2596 while (OP(scan) == code) {
2597 I32 deltanext, minnext, f = 0, fake;
2598 struct regnode_charclass_class this_class;
2601 data_fake.flags = 0;
2603 data_fake.whilem_c = data->whilem_c;
2604 data_fake.last_closep = data->last_closep;
2607 data_fake.last_closep = &fake;
2609 data_fake.pos_delta = delta;
2610 next = regnext(scan);
2611 scan = NEXTOPER(scan);
2613 scan = NEXTOPER(scan);
2614 if (flags & SCF_DO_STCLASS) {
2615 cl_init(pRExC_state, &this_class);
2616 data_fake.start_class = &this_class;
2617 f = SCF_DO_STCLASS_AND;
2619 if (flags & SCF_WHILEM_VISITED_POS)
2620 f |= SCF_WHILEM_VISITED_POS;
2622 /* we suppose the run is continuous, last=next...*/
2623 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2625 stopparen, recursed, NULL, f,depth+1);
2628 if (max1 < minnext + deltanext)
2629 max1 = minnext + deltanext;
2630 if (deltanext == I32_MAX)
2631 is_inf = is_inf_internal = 1;
2633 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2635 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2636 if ( stopmin > minnext)
2637 stopmin = min + min1;
2638 flags &= ~SCF_DO_SUBSTR;
2640 data->flags |= SCF_SEEN_ACCEPT;
2643 if (data_fake.flags & SF_HAS_EVAL)
2644 data->flags |= SF_HAS_EVAL;
2645 data->whilem_c = data_fake.whilem_c;
2647 if (flags & SCF_DO_STCLASS)
2648 cl_or(pRExC_state, &accum, &this_class);
2650 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2652 if (flags & SCF_DO_SUBSTR) {
2653 data->pos_min += min1;
2654 data->pos_delta += max1 - min1;
2655 if (max1 != min1 || is_inf)
2656 data->longest = &(data->longest_float);
2659 delta += max1 - min1;
2660 if (flags & SCF_DO_STCLASS_OR) {
2661 cl_or(pRExC_state, data->start_class, &accum);
2663 cl_and(data->start_class, and_withp);
2664 flags &= ~SCF_DO_STCLASS;
2667 else if (flags & SCF_DO_STCLASS_AND) {
2669 cl_and(data->start_class, &accum);
2670 flags &= ~SCF_DO_STCLASS;
2673 /* Switch to OR mode: cache the old value of
2674 * data->start_class */
2676 StructCopy(data->start_class, and_withp,
2677 struct regnode_charclass_class);
2678 flags &= ~SCF_DO_STCLASS_AND;
2679 StructCopy(&accum, data->start_class,
2680 struct regnode_charclass_class);
2681 flags |= SCF_DO_STCLASS_OR;
2682 data->start_class->flags |= ANYOF_EOS;
2686 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2689 Assuming this was/is a branch we are dealing with: 'scan' now
2690 points at the item that follows the branch sequence, whatever
2691 it is. We now start at the beginning of the sequence and look
2698 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2700 If we can find such a subseqence we need to turn the first
2701 element into a trie and then add the subsequent branch exact
2702 strings to the trie.
2706 1. patterns where the whole set of branch can be converted.
2708 2. patterns where only a subset can be converted.
2710 In case 1 we can replace the whole set with a single regop
2711 for the trie. In case 2 we need to keep the start and end
2714 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2715 becomes BRANCH TRIE; BRANCH X;
2717 There is an additional case, that being where there is a
2718 common prefix, which gets split out into an EXACT like node
2719 preceding the TRIE node.
2721 If x(1..n)==tail then we can do a simple trie, if not we make
2722 a "jump" trie, such that when we match the appropriate word
2723 we "jump" to the appopriate tail node. Essentailly we turn
2724 a nested if into a case structure of sorts.
2729 if (!re_trie_maxbuff) {
2730 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2731 if (!SvIOK(re_trie_maxbuff))
2732 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2734 if ( SvIV(re_trie_maxbuff)>=0 ) {
2736 regnode *first = (regnode *)NULL;
2737 regnode *last = (regnode *)NULL;
2738 regnode *tail = scan;
2743 SV * const mysv = sv_newmortal(); /* for dumping */
2745 /* var tail is used because there may be a TAIL
2746 regop in the way. Ie, the exacts will point to the
2747 thing following the TAIL, but the last branch will
2748 point at the TAIL. So we advance tail. If we
2749 have nested (?:) we may have to move through several
2753 while ( OP( tail ) == TAIL ) {
2754 /* this is the TAIL generated by (?:) */
2755 tail = regnext( tail );
2760 regprop(RExC_rx, mysv, tail );
2761 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2762 (int)depth * 2 + 2, "",
2763 "Looking for TRIE'able sequences. Tail node is: ",
2764 SvPV_nolen_const( mysv )
2770 step through the branches, cur represents each
2771 branch, noper is the first thing to be matched
2772 as part of that branch and noper_next is the
2773 regnext() of that node. if noper is an EXACT
2774 and noper_next is the same as scan (our current
2775 position in the regex) then the EXACT branch is
2776 a possible optimization target. Once we have
2777 two or more consequetive such branches we can
2778 create a trie of the EXACT's contents and stich
2779 it in place. If the sequence represents all of
2780 the branches we eliminate the whole thing and
2781 replace it with a single TRIE. If it is a
2782 subsequence then we need to stitch it in. This
2783 means the first branch has to remain, and needs
2784 to be repointed at the item on the branch chain
2785 following the last branch optimized. This could
2786 be either a BRANCH, in which case the
2787 subsequence is internal, or it could be the
2788 item following the branch sequence in which
2789 case the subsequence is at the end.
2793 /* dont use tail as the end marker for this traverse */
2794 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2795 regnode * const noper = NEXTOPER( cur );
2796 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2797 regnode * const noper_next = regnext( noper );
2801 regprop(RExC_rx, mysv, cur);
2802 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2803 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2805 regprop(RExC_rx, mysv, noper);
2806 PerlIO_printf( Perl_debug_log, " -> %s",
2807 SvPV_nolen_const(mysv));
2810 regprop(RExC_rx, mysv, noper_next );
2811 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2812 SvPV_nolen_const(mysv));
2814 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2815 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2817 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2818 : PL_regkind[ OP( noper ) ] == EXACT )
2819 || OP(noper) == NOTHING )
2821 && noper_next == tail
2826 if ( !first || optype == NOTHING ) {
2827 if (!first) first = cur;
2828 optype = OP( noper );
2834 Currently we do not believe that the trie logic can
2835 handle case insensitive matching properly when the
2836 pattern is not unicode (thus forcing unicode semantics).
2838 If/when this is fixed the following define can be swapped
2839 in below to fully enable trie logic.
2841 #define TRIE_TYPE_IS_SAFE 1
2844 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2846 if ( last && TRIE_TYPE_IS_SAFE ) {
2847 make_trie( pRExC_state,
2848 startbranch, first, cur, tail, count,
2851 if ( PL_regkind[ OP( noper ) ] == EXACT
2853 && noper_next == tail
2858 optype = OP( noper );
2868 regprop(RExC_rx, mysv, cur);
2869 PerlIO_printf( Perl_debug_log,
2870 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2871 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2875 if ( last && TRIE_TYPE_IS_SAFE ) {
2876 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2877 #ifdef TRIE_STUDY_OPT
2878 if ( ((made == MADE_EXACT_TRIE &&
2879 startbranch == first)
2880 || ( first_non_open == first )) &&
2882 flags |= SCF_TRIE_RESTUDY;
2883 if ( startbranch == first
2886 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2896 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2897 scan = NEXTOPER(NEXTOPER(scan));
2898 } else /* single branch is optimized. */
2899 scan = NEXTOPER(scan);
2901 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2902 scan_frame *newframe = NULL;
2907 if (OP(scan) != SUSPEND) {
2908 /* set the pointer */
2909 if (OP(scan) == GOSUB) {
2911 RExC_recurse[ARG2L(scan)] = scan;
2912 start = RExC_open_parens[paren-1];
2913 end = RExC_close_parens[paren-1];
2916 start = RExC_rxi->program + 1;
2920 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2921 SAVEFREEPV(recursed);
2923 if (!PAREN_TEST(recursed,paren+1)) {
2924 PAREN_SET(recursed,paren+1);
2925 Newx(newframe,1,scan_frame);
2927 if (flags & SCF_DO_SUBSTR) {
2928 SCAN_COMMIT(pRExC_state,data,minlenp);
2929 data->longest = &(data->longest_float);
2931 is_inf = is_inf_internal = 1;
2932 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2933 cl_anything(pRExC_state, data->start_class);
2934 flags &= ~SCF_DO_STCLASS;
2937 Newx(newframe,1,scan_frame);
2940 end = regnext(scan);
2945 SAVEFREEPV(newframe);
2946 newframe->next = regnext(scan);
2947 newframe->last = last;
2948 newframe->stop = stopparen;
2949 newframe->prev = frame;
2959 else if (OP(scan) == EXACT) {
2960 I32 l = STR_LEN(scan);
2963 const U8 * const s = (U8*)STRING(scan);
2964 l = utf8_length(s, s + l);
2965 uc = utf8_to_uvchr(s, NULL);
2967 uc = *((U8*)STRING(scan));
2970 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2971 /* The code below prefers earlier match for fixed
2972 offset, later match for variable offset. */
2973 if (data->last_end == -1) { /* Update the start info. */
2974 data->last_start_min = data->pos_min;
2975 data->last_start_max = is_inf
2976 ? I32_MAX : data->pos_min + data->pos_delta;
2978 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2980 SvUTF8_on(data->last_found);
2982 SV * const sv = data->last_found;
2983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2985 if (mg && mg->mg_len >= 0)
2986 mg->mg_len += utf8_length((U8*)STRING(scan),
2987 (U8*)STRING(scan)+STR_LEN(scan));
2989 data->last_end = data->pos_min + l;
2990 data->pos_min += l; /* As in the first entry. */
2991 data->flags &= ~SF_BEFORE_EOL;
2993 if (flags & SCF_DO_STCLASS_AND) {
2994 /* Check whether it is compatible with what we know already! */
2998 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2999 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3000 && (!(data->start_class->flags & ANYOF_FOLD)
3001 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3004 ANYOF_CLASS_ZERO(data->start_class);
3005 ANYOF_BITMAP_ZERO(data->start_class);
3007 ANYOF_BITMAP_SET(data->start_class, uc);
3008 data->start_class->flags &= ~ANYOF_EOS;
3010 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3012 else if (flags & SCF_DO_STCLASS_OR) {
3013 /* false positive possible if the class is case-folded */
3015 ANYOF_BITMAP_SET(data->start_class, uc);
3017 data->start_class->flags |= ANYOF_UNICODE_ALL;
3018 data->start_class->flags &= ~ANYOF_EOS;
3019 cl_and(data->start_class, and_withp);
3021 flags &= ~SCF_DO_STCLASS;
3023 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3024 I32 l = STR_LEN(scan);
3025 UV uc = *((U8*)STRING(scan));
3027 /* Search for fixed substrings supports EXACT only. */
3028 if (flags & SCF_DO_SUBSTR) {
3030 SCAN_COMMIT(pRExC_state, data, minlenp);
3033 const U8 * const s = (U8 *)STRING(scan);
3034 l = utf8_length(s, s + l);
3035 uc = utf8_to_uvchr(s, NULL);
3038 if (flags & SCF_DO_SUBSTR)
3040 if (flags & SCF_DO_STCLASS_AND) {
3041 /* Check whether it is compatible with what we know already! */
3045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3049 ANYOF_CLASS_ZERO(data->start_class);
3050 ANYOF_BITMAP_ZERO(data->start_class);
3052 ANYOF_BITMAP_SET(data->start_class, uc);
3053 data->start_class->flags &= ~ANYOF_EOS;
3054 data->start_class->flags |= ANYOF_FOLD;
3055 if (OP(scan) == EXACTFL)
3056 data->start_class->flags |= ANYOF_LOCALE;
3059 else if (flags & SCF_DO_STCLASS_OR) {
3060 if (data->start_class->flags & ANYOF_FOLD) {
3061 /* false positive possible if the class is case-folded.
3062 Assume that the locale settings are the same... */
3064 ANYOF_BITMAP_SET(data->start_class, uc);
3065 data->start_class->flags &= ~ANYOF_EOS;
3067 cl_and(data->start_class, and_withp);
3069 flags &= ~SCF_DO_STCLASS;
3071 else if (strchr((const char*)PL_varies,OP(scan))) {
3072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3073 I32 f = flags, pos_before = 0;
3074 regnode * const oscan = scan;
3075 struct regnode_charclass_class this_class;
3076 struct regnode_charclass_class *oclass = NULL;
3077 I32 next_is_eval = 0;
3079 switch (PL_regkind[OP(scan)]) {
3080 case WHILEM: /* End of (?:...)* . */
3081 scan = NEXTOPER(scan);
3084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3085 next = NEXTOPER(scan);
3086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3088 maxcount = REG_INFTY;
3089 next = regnext(scan);
3090 scan = NEXTOPER(scan);
3094 if (flags & SCF_DO_SUBSTR)
3099 if (flags & SCF_DO_STCLASS) {
3101 maxcount = REG_INFTY;
3102 next = regnext(scan);
3103 scan = NEXTOPER(scan);
3106 is_inf = is_inf_internal = 1;
3107 scan = regnext(scan);
3108 if (flags & SCF_DO_SUBSTR) {
3109 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3110 data->longest = &(data->longest_float);
3112 goto optimize_curly_tail;
3114 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3115 && (scan->flags == stopparen))
3120 mincount = ARG1(scan);
3121 maxcount = ARG2(scan);
3123 next = regnext(scan);
3124 if (OP(scan) == CURLYX) {
3125 I32 lp = (data ? *(data->last_closep) : 0);
3126 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3128 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3129 next_is_eval = (OP(scan) == EVAL);
3131 if (flags & SCF_DO_SUBSTR) {
3132 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3133 pos_before = data->pos_min;
3137 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3139 data->flags |= SF_IS_INF;
3141 if (flags & SCF_DO_STCLASS) {
3142 cl_init(pRExC_state, &this_class);
3143 oclass = data->start_class;
3144 data->start_class = &this_class;
3145 f |= SCF_DO_STCLASS_AND;
3146 f &= ~SCF_DO_STCLASS_OR;
3148 /* These are the cases when once a subexpression
3149 fails at a particular position, it cannot succeed
3150 even after backtracking at the enclosing scope.
3152 XXXX what if minimal match and we are at the
3153 initial run of {n,m}? */
3154 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3155 f &= ~SCF_WHILEM_VISITED_POS;
3157 /* This will finish on WHILEM, setting scan, or on NULL: */
3158 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3159 last, data, stopparen, recursed, NULL,
3161 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3163 if (flags & SCF_DO_STCLASS)
3164 data->start_class = oclass;
3165 if (mincount == 0 || minnext == 0) {
3166 if (flags & SCF_DO_STCLASS_OR) {
3167 cl_or(pRExC_state, data->start_class, &this_class);
3169 else if (flags & SCF_DO_STCLASS_AND) {
3170 /* Switch to OR mode: cache the old value of
3171 * data->start_class */
3173 StructCopy(data->start_class, and_withp,
3174 struct regnode_charclass_class);
3175 flags &= ~SCF_DO_STCLASS_AND;
3176 StructCopy(&this_class, data->start_class,
3177 struct regnode_charclass_class);
3178 flags |= SCF_DO_STCLASS_OR;
3179 data->start_class->flags |= ANYOF_EOS;
3181 } else { /* Non-zero len */
3182 if (flags & SCF_DO_STCLASS_OR) {
3183 cl_or(pRExC_state, data->start_class, &this_class);
3184 cl_and(data->start_class, and_withp);
3186 else if (flags & SCF_DO_STCLASS_AND)
3187 cl_and(data->start_class, &this_class);
3188 flags &= ~SCF_DO_STCLASS;
3190 if (!scan) /* It was not CURLYX, but CURLY. */
3192 if ( /* ? quantifier ok, except for (?{ ... }) */
3193 (next_is_eval || !(mincount == 0 && maxcount == 1))
3194 && (minnext == 0) && (deltanext == 0)
3195 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3196 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3198 ckWARNreg(RExC_parse,
3199 "Quantifier unexpected on zero-length expression");
3202 min += minnext * mincount;
3203 is_inf_internal |= ((maxcount == REG_INFTY
3204 && (minnext + deltanext) > 0)
3205 || deltanext == I32_MAX);
3206 is_inf |= is_inf_internal;
3207 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3209 /* Try powerful optimization CURLYX => CURLYN. */
3210 if ( OP(oscan) == CURLYX && data
3211 && data->flags & SF_IN_PAR
3212 && !(data->flags & SF_HAS_EVAL)
3213 && !deltanext && minnext == 1 ) {
3214 /* Try to optimize to CURLYN. */
3215 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3216 regnode * const nxt1 = nxt;
3223 if (!strchr((const char*)PL_simple,OP(nxt))
3224 && !(PL_regkind[OP(nxt)] == EXACT
3225 && STR_LEN(nxt) == 1))
3231 if (OP(nxt) != CLOSE)
3233 if (RExC_open_parens) {
3234 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3235 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3237 /* Now we know that nxt2 is the only contents: */
3238 oscan->flags = (U8)ARG(nxt);
3240 OP(nxt1) = NOTHING; /* was OPEN. */
3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3245 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3246 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3247 OP(nxt + 1) = OPTIMIZED; /* was count. */
3248 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3253 /* Try optimization CURLYX => CURLYM. */
3254 if ( OP(oscan) == CURLYX && data
3255 && !(data->flags & SF_HAS_PAR)
3256 && !(data->flags & SF_HAS_EVAL)
3257 && !deltanext /* atom is fixed width */
3258 && minnext != 0 /* CURLYM can't handle zero width */
3260 /* XXXX How to optimize if data == 0? */
3261 /* Optimize to a simpler form. */
3262 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3266 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3267 && (OP(nxt2) != WHILEM))
3269 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3270 /* Need to optimize away parenths. */
3271 if (data->flags & SF_IN_PAR) {
3272 /* Set the parenth number. */
3273 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3275 if (OP(nxt) != CLOSE)
3276 FAIL("Panic opt close");
3277 oscan->flags = (U8)ARG(nxt);
3278 if (RExC_open_parens) {
3279 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3280 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3282 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3283 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3286 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3287 OP(nxt + 1) = OPTIMIZED; /* was count. */
3288 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3289 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3292 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3293 regnode *nnxt = regnext(nxt1);
3296 if (reg_off_by_arg[OP(nxt1)])
3297 ARG_SET(nxt1, nxt2 - nxt1);
3298 else if (nxt2 - nxt1 < U16_MAX)
3299 NEXT_OFF(nxt1) = nxt2 - nxt1;
3301 OP(nxt) = NOTHING; /* Cannot beautify */
3306 /* Optimize again: */
3307 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3308 NULL, stopparen, recursed, NULL, 0,depth+1);
3313 else if ((OP(oscan) == CURLYX)
3314 && (flags & SCF_WHILEM_VISITED_POS)
3315 /* See the comment on a similar expression above.
3316 However, this time it not a subexpression
3317 we care about, but the expression itself. */
3318 && (maxcount == REG_INFTY)
3319 && data && ++data->whilem_c < 16) {
3320 /* This stays as CURLYX, we can put the count/of pair. */
3321 /* Find WHILEM (as in regexec.c) */
3322 regnode *nxt = oscan + NEXT_OFF(oscan);
3324 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3326 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3327 | (RExC_whilem_seen << 4)); /* On WHILEM */
3329 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3331 if (flags & SCF_DO_SUBSTR) {
3332 SV *last_str = NULL;
3333 int counted = mincount != 0;
3335 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3336 #if defined(SPARC64_GCC_WORKAROUND)
3339 const char *s = NULL;
3342 if (pos_before >= data->last_start_min)
3345 b = data->last_start_min;
3348 s = SvPV_const(data->last_found, l);
3349 old = b - data->last_start_min;
3352 I32 b = pos_before >= data->last_start_min
3353 ? pos_before : data->last_start_min;
3355 const char * const s = SvPV_const(data->last_found, l);
3356 I32 old = b - data->last_start_min;
3360 old = utf8_hop((U8*)s, old) - (U8*)s;
3363 /* Get the added string: */
3364 last_str = newSVpvn_utf8(s + old, l, UTF);
3365 if (deltanext == 0 && pos_before == b) {
3366 /* What was added is a constant string */
3368 SvGROW(last_str, (mincount * l) + 1);
3369 repeatcpy(SvPVX(last_str) + l,
3370 SvPVX_const(last_str), l, mincount - 1);
3371 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3372 /* Add additional parts. */
3373 SvCUR_set(data->last_found,
3374 SvCUR(data->last_found) - l);
3375 sv_catsv(data->last_found, last_str);
3377 SV * sv = data->last_found;
3379 SvUTF8(sv) && SvMAGICAL(sv) ?
3380 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3381 if (mg && mg->mg_len >= 0)
3382 mg->mg_len += CHR_SVLEN(last_str) - l;
3384 data->last_end += l * (mincount - 1);
3387 /* start offset must point into the last copy */
3388 data->last_start_min += minnext * (mincount - 1);
3389 data->last_start_max += is_inf ? I32_MAX
3390 : (maxcount - 1) * (minnext + data->pos_delta);
3393 /* It is counted once already... */
3394 data->pos_min += minnext * (mincount - counted);
3395 data->pos_delta += - counted * deltanext +
3396 (minnext + deltanext) * maxcount - minnext * mincount;
3397 if (mincount != maxcount) {
3398 /* Cannot extend fixed substrings found inside
3400 SCAN_COMMIT(pRExC_state,data,minlenp);
3401 if (mincount && last_str) {
3402 SV * const sv = data->last_found;
3403 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3404 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3408 sv_setsv(sv, last_str);
3409 data->last_end = data->pos_min;
3410 data->last_start_min =
3411 data->pos_min - CHR_SVLEN(last_str);
3412 data->last_start_max = is_inf
3414 : data->pos_min + data->pos_delta
3415 - CHR_SVLEN(last_str);
3417 data->longest = &(data->longest_float);
3419 SvREFCNT_dec(last_str);
3421 if (data && (fl & SF_HAS_EVAL))
3422 data->flags |= SF_HAS_EVAL;
3423 optimize_curly_tail:
3424 if (OP(oscan) != CURLYX) {
3425 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3427 NEXT_OFF(oscan) += NEXT_OFF(next);
3430 default: /* REF and CLUMP only? */
3431 if (flags & SCF_DO_SUBSTR) {
3432 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3433 data->longest = &(data->longest_float);
3435 is_inf = is_inf_internal = 1;
3436 if (flags & SCF_DO_STCLASS_OR)
3437 cl_anything(pRExC_state, data->start_class);
3438 flags &= ~SCF_DO_STCLASS;
3442 else if (OP(scan) == LNBREAK) {
3443 if (flags & SCF_DO_STCLASS) {
3445 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3446 if (flags & SCF_DO_STCLASS_AND) {
3447 for (value = 0; value < 256; value++)
3448 if (!is_VERTWS_cp(value))
3449 ANYOF_BITMAP_CLEAR(data->start_class, value);
3452 for (value = 0; value < 256; value++)
3453 if (is_VERTWS_cp(value))
3454 ANYOF_BITMAP_SET(data->start_class, value);
3456 if (flags & SCF_DO_STCLASS_OR)
3457 cl_and(data->start_class, and_withp);
3458 flags &= ~SCF_DO_STCLASS;
3462 if (flags & SCF_DO_SUBSTR) {
3463 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3465 data->pos_delta += 1;
3466 data->longest = &(data->longest_float);
3470 else if (OP(scan) == FOLDCHAR) {
3471 int d = ARG(scan)==0xDF ? 1 : 2;
3472 flags &= ~SCF_DO_STCLASS;
3475 if (flags & SCF_DO_SUBSTR) {
3476 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3478 data->pos_delta += d;
3479 data->longest = &(data->longest_float);
3482 else if (strchr((const char*)PL_simple,OP(scan))) {
3485 if (flags & SCF_DO_SUBSTR) {
3486 SCAN_COMMIT(pRExC_state,data,minlenp);
3490 if (flags & SCF_DO_STCLASS) {
3491 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3493 /* Some of the logic below assumes that switching
3494 locale on will only add false positives. */
3495 switch (PL_regkind[OP(scan)]) {
3499 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3500 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3501 cl_anything(pRExC_state, data->start_class);
3504 if (OP(scan) == SANY)
3506 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3507 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3508 || (data->start_class->flags & ANYOF_CLASS));
3509 cl_anything(pRExC_state, data->start_class);
3511 if (flags & SCF_DO_STCLASS_AND || !value)
3512 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3515 if (flags & SCF_DO_STCLASS_AND)
3516 cl_and(data->start_class,
3517 (struct regnode_charclass_class*)scan);
3519 cl_or(pRExC_state, data->start_class,
3520 (struct regnode_charclass_class*)scan);
3523 if (flags & SCF_DO_STCLASS_AND) {
3524 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3525 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3526 for (value = 0; value < 256; value++)
3527 if (!isALNUM(value))
3528 ANYOF_BITMAP_CLEAR(data->start_class, value);
3532 if (data->start_class->flags & ANYOF_LOCALE)
3533 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3535 for (value = 0; value < 256; value++)
3537 ANYOF_BITMAP_SET(data->start_class, value);
3542 if (flags & SCF_DO_STCLASS_AND) {
3543 if (data->start_class->flags & ANYOF_LOCALE)
3544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3547 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3548 data->start_class->flags |= ANYOF_LOCALE;
3552 if (flags & SCF_DO_STCLASS_AND) {
3553 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3554 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3555 for (value = 0; value < 256; value++)
3557 ANYOF_BITMAP_CLEAR(data->start_class, value);
3561 if (data->start_class->flags & ANYOF_LOCALE)
3562 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3564 for (value = 0; value < 256; value++)
3565 if (!isALNUM(value))
3566 ANYOF_BITMAP_SET(data->start_class, value);
3571 if (flags & SCF_DO_STCLASS_AND) {
3572 if (data->start_class->flags & ANYOF_LOCALE)
3573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3576 data->start_class->flags |= ANYOF_LOCALE;
3577 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3581 if (flags & SCF_DO_STCLASS_AND) {
3582 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3583 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3584 for (value = 0; value < 256; value++)
3585 if (!isSPACE(value))
3586 ANYOF_BITMAP_CLEAR(data->start_class, value);
3590 if (data->start_class->flags & ANYOF_LOCALE)
3591 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3593 for (value = 0; value < 256; value++)
3595 ANYOF_BITMAP_SET(data->start_class, value);
3600 if (flags & SCF_DO_STCLASS_AND) {
3601 if (data->start_class->flags & ANYOF_LOCALE)
3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3605 data->start_class->flags |= ANYOF_LOCALE;
3606 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3610 if (flags & SCF_DO_STCLASS_AND) {
3611 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3612 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3613 for (value = 0; value < 256; value++)
3615 ANYOF_BITMAP_CLEAR(data->start_class, value);
3619 if (data->start_class->flags & ANYOF_LOCALE)
3620 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3622 for (value = 0; value < 256; value++)
3623 if (!isSPACE(value))
3624 ANYOF_BITMAP_SET(data->start_class, value);
3629 if (flags & SCF_DO_STCLASS_AND) {
3630 if (data->start_class->flags & ANYOF_LOCALE) {
3631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3632 for (value = 0; value < 256; value++)
3633 if (!isSPACE(value))
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
3638 data->start_class->flags |= ANYOF_LOCALE;
3639 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3643 if (flags & SCF_DO_STCLASS_AND) {
3644 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3645 for (value = 0; value < 256; value++)
3646 if (!isDIGIT(value))
3647 ANYOF_BITMAP_CLEAR(data->start_class, value);
3650 if (data->start_class->flags & ANYOF_LOCALE)
3651 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3653 for (value = 0; value < 256; value++)
3655 ANYOF_BITMAP_SET(data->start_class, value);
3660 if (flags & SCF_DO_STCLASS_AND) {
3661 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3662 for (value = 0; value < 256; value++)
3664 ANYOF_BITMAP_CLEAR(data->start_class, value);
3667 if (data->start_class->flags & ANYOF_LOCALE)
3668 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3670 for (value = 0; value < 256; value++)
3671 if (!isDIGIT(value))
3672 ANYOF_BITMAP_SET(data->start_class, value);
3676 CASE_SYNST_FNC(VERTWS);
3677 CASE_SYNST_FNC(HORIZWS);
3680 if (flags & SCF_DO_STCLASS_OR)
3681 cl_and(data->start_class, and_withp);
3682 flags &= ~SCF_DO_STCLASS;
3685 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3686 data->flags |= (OP(scan) == MEOL
3690 else if ( PL_regkind[OP(scan)] == BRANCHJ
3691 /* Lookbehind, or need to calculate parens/evals/stclass: */
3692 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3693 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3694 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3695 || OP(scan) == UNLESSM )
3697 /* Negative Lookahead/lookbehind
3698 In this case we can't do fixed string optimisation.
3701 I32 deltanext, minnext, fake = 0;
3703 struct regnode_charclass_class intrnl;
3706 data_fake.flags = 0;
3708 data_fake.whilem_c = data->whilem_c;
3709 data_fake.last_closep = data->last_closep;
3712 data_fake.last_closep = &fake;
3713 data_fake.pos_delta = delta;
3714 if ( flags & SCF_DO_STCLASS && !scan->flags
3715 && OP(scan) == IFMATCH ) { /* Lookahead */
3716 cl_init(pRExC_state, &intrnl);
3717 data_fake.start_class = &intrnl;
3718 f |= SCF_DO_STCLASS_AND;
3720 if (flags & SCF_WHILEM_VISITED_POS)
3721 f |= SCF_WHILEM_VISITED_POS;
3722 next = regnext(scan);
3723 nscan = NEXTOPER(NEXTOPER(scan));
3724 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3725 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3728 FAIL("Variable length lookbehind not implemented");
3730 else if (minnext > (I32)U8_MAX) {
3731 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3733 scan->flags = (U8)minnext;
3736 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3738 if (data_fake.flags & SF_HAS_EVAL)
3739 data->flags |= SF_HAS_EVAL;
3740 data->whilem_c = data_fake.whilem_c;
3742 if (f & SCF_DO_STCLASS_AND) {
3743 if (flags & SCF_DO_STCLASS_OR) {
3744 /* OR before, AND after: ideally we would recurse with
3745 * data_fake to get the AND applied by study of the
3746 * remainder of the pattern, and then derecurse;
3747 * *** HACK *** for now just treat as "no information".
3748 * See [perl #56690].
3750 cl_init(pRExC_state, data->start_class);
3752 /* AND before and after: combine and continue */
3753 const int was = (data->start_class->flags & ANYOF_EOS);
3755 cl_and(data->start_class, &intrnl);
3757 data->start_class->flags |= ANYOF_EOS;
3761 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3763 /* Positive Lookahead/lookbehind
3764 In this case we can do fixed string optimisation,
3765 but we must be careful about it. Note in the case of
3766 lookbehind the positions will be offset by the minimum
3767 length of the pattern, something we won't know about
3768 until after the recurse.
3770 I32 deltanext, fake = 0;
3772 struct regnode_charclass_class intrnl;
3774 /* We use SAVEFREEPV so that when the full compile
3775 is finished perl will clean up the allocated
3776 minlens when its all done. This was we don't
3777 have to worry about freeing them when we know
3778 they wont be used, which would be a pain.
3781 Newx( minnextp, 1, I32 );
3782 SAVEFREEPV(minnextp);
3785 StructCopy(data, &data_fake, scan_data_t);
3786 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3789 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3790 data_fake.last_found=newSVsv(data->last_found);
3794 data_fake.last_closep = &fake;
3795 data_fake.flags = 0;
3796 data_fake.pos_delta = delta;
3798 data_fake.flags |= SF_IS_INF;
3799 if ( flags & SCF_DO_STCLASS && !scan->flags
3800 && OP(scan) == IFMATCH ) { /* Lookahead */
3801 cl_init(pRExC_state, &intrnl);
3802 data_fake.start_class = &intrnl;
3803 f |= SCF_DO_STCLASS_AND;
3805 if (flags & SCF_WHILEM_VISITED_POS)
3806 f |= SCF_WHILEM_VISITED_POS;
3807 next = regnext(scan);
3808 nscan = NEXTOPER(NEXTOPER(scan));
3810 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3811 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3814 FAIL("Variable length lookbehind not implemented");
3816 else if (*minnextp > (I32)U8_MAX) {
3817 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3819 scan->flags = (U8)*minnextp;
3824 if (f & SCF_DO_STCLASS_AND) {
3825 const int was = (data->start_class->flags & ANYOF_EOS);
3827 cl_and(data->start_class, &intrnl);
3829 data->start_class->flags |= ANYOF_EOS;
3832 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3834 if (data_fake.flags & SF_HAS_EVAL)
3835 data->flags |= SF_HAS_EVAL;
3836 data->whilem_c = data_fake.whilem_c;
3837 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3838 if (RExC_rx->minlen<*minnextp)
3839 RExC_rx->minlen=*minnextp;
3840 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3841 SvREFCNT_dec(data_fake.last_found);
3843 if ( data_fake.minlen_fixed != minlenp )
3845 data->offset_fixed= data_fake.offset_fixed;
3846 data->minlen_fixed= data_fake.minlen_fixed;
3847 data->lookbehind_fixed+= scan->flags;
3849 if ( data_fake.minlen_float != minlenp )
3851 data->minlen_float= data_fake.minlen_float;
3852 data->offset_float_min=data_fake.offset_float_min;
3853 data->offset_float_max=data_fake.offset_float_max;
3854 data->lookbehind_float+= scan->flags;
3863 else if (OP(scan) == OPEN) {
3864 if (stopparen != (I32)ARG(scan))
3867 else if (OP(scan) == CLOSE) {
3868 if (stopparen == (I32)ARG(scan)) {
3871 if ((I32)ARG(scan) == is_par) {
3872 next = regnext(scan);
3874 if ( next && (OP(next) != WHILEM) && next < last)
3875 is_par = 0; /* Disable optimization */
3878 *(data->last_closep) = ARG(scan);
3880 else if (OP(scan) == EVAL) {
3882 data->flags |= SF_HAS_EVAL;
3884 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3885 if (flags & SCF_DO_SUBSTR) {
3886 SCAN_COMMIT(pRExC_state,data,minlenp);
3887 flags &= ~SCF_DO_SUBSTR;
3889 if (data && OP(scan)==ACCEPT) {
3890 data->flags |= SCF_SEEN_ACCEPT;
3895 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3897 if (flags & SCF_DO_SUBSTR) {
3898 SCAN_COMMIT(pRExC_state,data,minlenp);
3899 data->longest = &(data->longest_float);
3901 is_inf = is_inf_internal = 1;
3902 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3903 cl_anything(pRExC_state, data->start_class);
3904 flags &= ~SCF_DO_STCLASS;
3906 else if (OP(scan) == GPOS) {
3907 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3908 !(delta || is_inf || (data && data->pos_delta)))
3910 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3911 RExC_rx->extflags |= RXf_ANCH_GPOS;
3912 if (RExC_rx->gofs < (U32)min)
3913 RExC_rx->gofs = min;
3915 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3919 #ifdef TRIE_STUDY_OPT
3920 #ifdef FULL_TRIE_STUDY
3921 else if (PL_regkind[OP(scan)] == TRIE) {
3922 /* NOTE - There is similar code to this block above for handling
3923 BRANCH nodes on the initial study. If you change stuff here
3925 regnode *trie_node= scan;
3926 regnode *tail= regnext(scan);
3927 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3928 I32 max1 = 0, min1 = I32_MAX;
3929 struct regnode_charclass_class accum;
3931 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3932 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3933 if (flags & SCF_DO_STCLASS)
3934 cl_init_zero(pRExC_state, &accum);
3940 const regnode *nextbranch= NULL;
3943 for ( word=1 ; word <= trie->wordcount ; word++)
3945 I32 deltanext=0, minnext=0, f = 0, fake;
3946 struct regnode_charclass_class this_class;
3948 data_fake.flags = 0;
3950 data_fake.whilem_c = data->whilem_c;
3951 data_fake.last_closep = data->last_closep;
3954 data_fake.last_closep = &fake;
3955 data_fake.pos_delta = delta;
3956 if (flags & SCF_DO_STCLASS) {
3957 cl_init(pRExC_state, &this_class);
3958 data_fake.start_class = &this_class;
3959 f = SCF_DO_STCLASS_AND;
3961 if (flags & SCF_WHILEM_VISITED_POS)
3962 f |= SCF_WHILEM_VISITED_POS;
3964 if (trie->jump[word]) {
3966 nextbranch = trie_node + trie->jump[0];
3967 scan= trie_node + trie->jump[word];
3968 /* We go from the jump point to the branch that follows
3969 it. Note this means we need the vestigal unused branches
3970 even though they arent otherwise used.
3972 minnext = study_chunk(pRExC_state, &scan, minlenp,
3973 &deltanext, (regnode *)nextbranch, &data_fake,
3974 stopparen, recursed, NULL, f,depth+1);
3976 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3977 nextbranch= regnext((regnode*)nextbranch);
3979 if (min1 > (I32)(minnext + trie->minlen))
3980 min1 = minnext + trie->minlen;
3981 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3982 max1 = minnext + deltanext + trie->maxlen;
3983 if (deltanext == I32_MAX)
3984 is_inf = is_inf_internal = 1;
3986 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3988 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3989 if ( stopmin > min + min1)
3990 stopmin = min + min1;
3991 flags &= ~SCF_DO_SUBSTR;
3993 data->flags |= SCF_SEEN_ACCEPT;
3996 if (data_fake.flags & SF_HAS_EVAL)
3997 data->flags |= SF_HAS_EVAL;
3998 data->whilem_c = data_fake.whilem_c;
4000 if (flags & SCF_DO_STCLASS)
4001 cl_or(pRExC_state, &accum, &this_class);
4004 if (flags & SCF_DO_SUBSTR) {
4005 data->pos_min += min1;
4006 data->pos_delta += max1 - min1;
4007 if (max1 != min1 || is_inf)
4008 data->longest = &(data->longest_float);
4011 delta += max1 - min1;
4012 if (flags & SCF_DO_STCLASS_OR) {
4013 cl_or(pRExC_state, data->start_class, &accum);
4015 cl_and(data->start_class, and_withp);
4016 flags &= ~SCF_DO_STCLASS;
4019 else if (flags & SCF_DO_STCLASS_AND) {
4021 cl_and(data->start_class, &accum);
4022 flags &= ~SCF_DO_STCLASS;
4025 /* Switch to OR mode: cache the old value of
4026 * data->start_class */
4028 StructCopy(data->start_class, and_withp,
4029 struct regnode_charclass_class);
4030 flags &= ~SCF_DO_STCLASS_AND;
4031 StructCopy(&accum, data->start_class,
4032 struct regnode_charclass_class);
4033 flags |= SCF_DO_STCLASS_OR;
4034 data->start_class->flags |= ANYOF_EOS;
4041 else if (PL_regkind[OP(scan)] == TRIE) {
4042 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4045 min += trie->minlen;
4046 delta += (trie->maxlen - trie->minlen);
4047 flags &= ~SCF_DO_STCLASS; /* xxx */
4048 if (flags & SCF_DO_SUBSTR) {
4049 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4050 data->pos_min += trie->minlen;
4051 data->pos_delta += (trie->maxlen - trie->minlen);
4052 if (trie->maxlen != trie->minlen)
4053 data->longest = &(data->longest_float);
4055 if (trie->jump) /* no more substrings -- for now /grr*/
4056 flags &= ~SCF_DO_SUBSTR;
4058 #endif /* old or new */
4059 #endif /* TRIE_STUDY_OPT */
4061 /* Else: zero-length, ignore. */
4062 scan = regnext(scan);
4067 stopparen = frame->stop;
4068 frame = frame->prev;
4069 goto fake_study_recurse;
4074 DEBUG_STUDYDATA("pre-fin:",data,depth);
4077 *deltap = is_inf_internal ? I32_MAX : delta;
4078 if (flags & SCF_DO_SUBSTR && is_inf)
4079 data->pos_delta = I32_MAX - data->pos_min;
4080 if (is_par > (I32)U8_MAX)
4082 if (is_par && pars==1 && data) {
4083 data->flags |= SF_IN_PAR;
4084 data->flags &= ~SF_HAS_PAR;
4086 else if (pars && data) {
4087 data->flags |= SF_HAS_PAR;
4088 data->flags &= ~SF_IN_PAR;
4090 if (flags & SCF_DO_STCLASS_OR)
4091 cl_and(data->start_class, and_withp);
4092 if (flags & SCF_TRIE_RESTUDY)
4093 data->flags |= SCF_TRIE_RESTUDY;
4095 DEBUG_STUDYDATA("post-fin:",data,depth);
4097 return min < stopmin ? min : stopmin;
4101 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4103 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4105 PERL_ARGS_ASSERT_ADD_DATA;
4107 Renewc(RExC_rxi->data,
4108 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4109 char, struct reg_data);
4111 Renew(RExC_rxi->data->what, count + n, U8);
4113 Newx(RExC_rxi->data->what, n, U8);
4114 RExC_rxi->data->count = count + n;
4115 Copy(s, RExC_rxi->data->what + count, n, U8);
4119 /*XXX: todo make this not included in a non debugging perl */
4120 #ifndef PERL_IN_XSUB_RE
4122 Perl_reginitcolors(pTHX)
4125 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4127 char *t = savepv(s);
4131 t = strchr(t, '\t');
4137 PL_colors[i] = t = (char *)"";
4142 PL_colors[i++] = (char *)"";
4149 #ifdef TRIE_STUDY_OPT
4150 #define CHECK_RESTUDY_GOTO \
4152 (data.flags & SCF_TRIE_RESTUDY) \
4156 #define CHECK_RESTUDY_GOTO
4160 - pregcomp - compile a regular expression into internal code
4162 * We can't allocate space until we know how big the compiled form will be,
4163 * but we can't compile it (and thus know how big it is) until we've got a
4164 * place to put the code. So we cheat: we compile it twice, once with code
4165 * generation turned off and size counting turned on, and once "for real".
4166 * This also means that we don't allocate space until we are sure that the
4167 * thing really will compile successfully, and we never have to move the
4168 * code and thus invalidate pointers into it. (Note that it has to be in
4169 * one piece because free() must be able to free it all.) [NB: not true in perl]
4171 * Beware that the optimization-preparation code in here knows about some
4172 * of the structure of the compiled regexp. [I'll say.]
4177 #ifndef PERL_IN_XSUB_RE
4178 #define RE_ENGINE_PTR &PL_core_reg_engine
4180 extern const struct regexp_engine my_reg_engine;
4181 #define RE_ENGINE_PTR &my_reg_engine
4184 #ifndef PERL_IN_XSUB_RE
4186 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4189 HV * const table = GvHV(PL_hintgv);
4191 PERL_ARGS_ASSERT_PREGCOMP;
4193 /* Dispatch a request to compile a regexp to correct
4196 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4197 GET_RE_DEBUG_FLAGS_DECL;
4198 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4199 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4201 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4204 return CALLREGCOMP_ENG(eng, pattern, flags);
4207 return Perl_re_compile(aTHX_ pattern, flags);
4212 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4217 register regexp_internal *ri;
4219 char *exp = SvPV(pattern, plen);
4220 char* xend = exp + plen;
4227 RExC_state_t RExC_state;
4228 RExC_state_t * const pRExC_state = &RExC_state;
4229 #ifdef TRIE_STUDY_OPT
4231 RExC_state_t copyRExC_state;
4233 GET_RE_DEBUG_FLAGS_DECL;
4235 PERL_ARGS_ASSERT_RE_COMPILE;
4237 DEBUG_r(if (!PL_colorset) reginitcolors());
4239 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4242 SV *dsv= sv_newmortal();
4243 RE_PV_QUOTED_DECL(s, RExC_utf8,
4244 dsv, exp, plen, 60);
4245 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4246 PL_colors[4],PL_colors[5],s);
4251 RExC_flags = pm_flags;
4255 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4256 RExC_seen_evals = 0;
4259 /* First pass: determine size, legality. */
4267 RExC_emit = &PL_regdummy;
4268 RExC_whilem_seen = 0;
4269 RExC_open_parens = NULL;
4270 RExC_close_parens = NULL;
4272 RExC_paren_names = NULL;
4274 RExC_paren_name_list = NULL;
4276 RExC_recurse = NULL;
4277 RExC_recurse_count = 0;
4279 #if 0 /* REGC() is (currently) a NOP at the first pass.
4280 * Clever compilers notice this and complain. --jhi */
4281 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4283 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4284 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4285 RExC_precomp = NULL;
4288 if (RExC_utf8 && !RExC_orig_utf8) {
4289 /* It's possible to write a regexp in ascii that represents Unicode
4290 codepoints outside of the byte range, such as via \x{100}. If we
4291 detect such a sequence we have to convert the entire pattern to utf8
4292 and then recompile, as our sizing calculation will have been based
4293 on 1 byte == 1 character, but we will need to use utf8 to encode
4294 at least some part of the pattern, and therefore must convert the whole
4296 XXX: somehow figure out how to make this less expensive...
4299 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4300 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4301 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4303 RExC_orig_utf8 = RExC_utf8;
4305 goto redo_first_pass;
4308 PerlIO_printf(Perl_debug_log,
4309 "Required size %"IVdf" nodes\n"
4310 "Starting second pass (creation)\n",
4313 RExC_lastparse=NULL;
4315 /* Small enough for pointer-storage convention?
4316 If extralen==0, this means that we will not need long jumps. */
4317 if (RExC_size >= 0x10000L && RExC_extralen)
4318 RExC_size += RExC_extralen;
4321 if (RExC_whilem_seen > 15)
4322 RExC_whilem_seen = 15;
4324 /* Allocate space and zero-initialize. Note, the two step process
4325 of zeroing when in debug mode, thus anything assigned has to
4326 happen after that */
4327 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4328 r = (struct regexp*)SvANY(rx);
4329 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4330 char, regexp_internal);
4331 if ( r == NULL || ri == NULL )
4332 FAIL("Regexp out of space");
4334 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4335 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4337 /* bulk initialize base fields with 0. */
4338 Zero(ri, sizeof(regexp_internal), char);
4341 /* non-zero initialization begins here */
4343 r->engine= RE_ENGINE_PTR;
4344 r->extflags = pm_flags;
4346 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4347 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4348 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4349 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4350 >> RXf_PMf_STD_PMMOD_SHIFT);
4351 const char *fptr = STD_PAT_MODS; /*"msix"*/
4353 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4354 + (sizeof(STD_PAT_MODS) - 1)
4355 + (sizeof("(?:)") - 1);
4357 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4358 SvCUR_set(rx, wraplen);
4360 SvFLAGS(rx) |= SvUTF8(pattern);
4363 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4365 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4366 char *colon = r + 1;
4369 while((ch = *fptr++)) {
4383 Copy(RExC_precomp, p, plen, char);
4384 assert ((RX_WRAPPED(rx) - p) < 16);
4385 r->pre_prefix = p - RX_WRAPPED(rx);
4394 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4396 if (RExC_seen & REG_SEEN_RECURSE) {
4397 Newxz(RExC_open_parens, RExC_npar,regnode *);
4398 SAVEFREEPV(RExC_open_parens);
4399 Newxz(RExC_close_parens,RExC_npar,regnode *);
4400 SAVEFREEPV(RExC_close_parens);
4403 /* Useful during FAIL. */
4404 #ifdef RE_TRACK_PATTERN_OFFSETS
4405 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4406 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4407 "%s %"UVuf" bytes for offset annotations.\n",
4408 ri->u.offsets ? "Got" : "Couldn't get",
4409 (UV)((2*RExC_size+1) * sizeof(U32))));
4411 SetProgLen(ri,RExC_size);
4415 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4417 /* Second pass: emit code. */
4418 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4423 RExC_emit_start = ri->program;
4424 RExC_emit = ri->program;
4425 RExC_emit_bound = ri->program + RExC_size + 1;
4427 /* Store the count of eval-groups for security checks: */
4428 RExC_rx->seen_evals = RExC_seen_evals;
4429 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4430 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4434 /* XXXX To minimize changes to RE engine we always allocate
4435 3-units-long substrs field. */
4436 Newx(r->substrs, 1, struct reg_substr_data);
4437 if (RExC_recurse_count) {
4438 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4439 SAVEFREEPV(RExC_recurse);
4443 r->minlen = minlen = sawplus = sawopen = 0;
4444 Zero(r->substrs, 1, struct reg_substr_data);
4446 #ifdef TRIE_STUDY_OPT
4448 StructCopy(&zero_scan_data, &data, scan_data_t);
4449 copyRExC_state = RExC_state;
4452 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4454 RExC_state = copyRExC_state;
4455 if (seen & REG_TOP_LEVEL_BRANCHES)
4456 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4458 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4459 if (data.last_found) {
4460 SvREFCNT_dec(data.longest_fixed);
4461 SvREFCNT_dec(data.longest_float);
4462 SvREFCNT_dec(data.last_found);
4464 StructCopy(&zero_scan_data, &data, scan_data_t);
4467 StructCopy(&zero_scan_data, &data, scan_data_t);
4470 /* Dig out information for optimizations. */
4471 r->extflags = RExC_flags; /* was pm_op */
4472 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4475 SvUTF8_on(rx); /* Unicode in it? */
4476 ri->regstclass = NULL;
4477 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4478 r->intflags |= PREGf_NAUGHTY;
4479 scan = ri->program + 1; /* First BRANCH. */
4481 /* testing for BRANCH here tells us whether there is "must appear"
4482 data in the pattern. If there is then we can use it for optimisations */
4483 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4485 STRLEN longest_float_length, longest_fixed_length;
4486 struct regnode_charclass_class ch_class; /* pointed to by data */
4488 I32 last_close = 0; /* pointed to by data */
4489 regnode *first= scan;
4490 regnode *first_next= regnext(first);
4493 * Skip introductions and multiplicators >= 1
4494 * so that we can extract the 'meat' of the pattern that must
4495 * match in the large if() sequence following.
4496 * NOTE that EXACT is NOT covered here, as it is normally
4497 * picked up by the optimiser separately.
4499 * This is unfortunate as the optimiser isnt handling lookahead
4500 * properly currently.
4503 while ((OP(first) == OPEN && (sawopen = 1)) ||
4504 /* An OR of *one* alternative - should not happen now. */
4505 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4506 /* for now we can't handle lookbehind IFMATCH*/
4507 (OP(first) == IFMATCH && !first->flags) ||
4508 (OP(first) == PLUS) ||
4509 (OP(first) == MINMOD) ||
4510 /* An {n,m} with n>0 */
4511 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4512 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4515 * the only op that could be a regnode is PLUS, all the rest
4516 * will be regnode_1 or regnode_2.
4519 if (OP(first) == PLUS)
4522 first += regarglen[OP(first)];
4524 first = NEXTOPER(first);
4525 first_next= regnext(first);
4528 /* Starting-point info. */
4530 DEBUG_PEEP("first:",first,0);
4531 /* Ignore EXACT as we deal with it later. */
4532 if (PL_regkind[OP(first)] == EXACT) {
4533 if (OP(first) == EXACT)
4534 NOOP; /* Empty, get anchored substr later. */
4535 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4536 ri->regstclass = first;
4539 else if (PL_regkind[OP(first)] == TRIE &&
4540 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4543 /* this can happen only on restudy */
4544 if ( OP(first) == TRIE ) {
4545 struct regnode_1 *trieop = (struct regnode_1 *)
4546 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4547 StructCopy(first,trieop,struct regnode_1);
4548 trie_op=(regnode *)trieop;
4550 struct regnode_charclass *trieop = (struct regnode_charclass *)
4551 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4552 StructCopy(first,trieop,struct regnode_charclass);
4553 trie_op=(regnode *)trieop;
4556 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4557 ri->regstclass = trie_op;
4560 else if (strchr((const char*)PL_simple,OP(first)))
4561 ri->regstclass = first;
4562 else if (PL_regkind[OP(first)] == BOUND ||
4563 PL_regkind[OP(first)] == NBOUND)
4564 ri->regstclass = first;
4565 else if (PL_regkind[OP(first)] == BOL) {
4566 r->extflags |= (OP(first) == MBOL
4568 : (OP(first) == SBOL
4571 first = NEXTOPER(first);
4574 else if (OP(first) == GPOS) {
4575 r->extflags |= RXf_ANCH_GPOS;
4576 first = NEXTOPER(first);
4579 else if ((!sawopen || !RExC_sawback) &&
4580 (OP(first) == STAR &&
4581 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4582 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4584 /* turn .* into ^.* with an implied $*=1 */
4586 (OP(NEXTOPER(first)) == REG_ANY)
4589 r->extflags |= type;
4590 r->intflags |= PREGf_IMPLICIT;
4591 first = NEXTOPER(first);
4594 if (sawplus && (!sawopen || !RExC_sawback)
4595 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4596 /* x+ must match at the 1st pos of run of x's */
4597 r->intflags |= PREGf_SKIP;
4599 /* Scan is after the zeroth branch, first is atomic matcher. */
4600 #ifdef TRIE_STUDY_OPT
4603 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4604 (IV)(first - scan + 1))
4608 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4609 (IV)(first - scan + 1))
4615 * If there's something expensive in the r.e., find the
4616 * longest literal string that must appear and make it the
4617 * regmust. Resolve ties in favor of later strings, since
4618 * the regstart check works with the beginning of the r.e.
4619 * and avoiding duplication strengthens checking. Not a
4620 * strong reason, but sufficient in the absence of others.
4621 * [Now we resolve ties in favor of the earlier string if
4622 * it happens that c_offset_min has been invalidated, since the
4623 * earlier string may buy us something the later one won't.]
4626 data.longest_fixed = newSVpvs("");
4627 data.longest_float = newSVpvs("");
4628 data.last_found = newSVpvs("");
4629 data.longest = &(data.longest_fixed);
4631 if (!ri->regstclass) {
4632 cl_init(pRExC_state, &ch_class);
4633 data.start_class = &ch_class;
4634 stclass_flag = SCF_DO_STCLASS_AND;
4635 } else /* XXXX Check for BOUND? */
4637 data.last_closep = &last_close;
4639 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4640 &data, -1, NULL, NULL,
4641 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4647 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4648 && data.last_start_min == 0 && data.last_end > 0
4649 && !RExC_seen_zerolen
4650 && !(RExC_seen & REG_SEEN_VERBARG)
4651 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4652 r->extflags |= RXf_CHECK_ALL;
4653 scan_commit(pRExC_state, &data,&minlen,0);
4654 SvREFCNT_dec(data.last_found);
4656 /* Note that code very similar to this but for anchored string
4657 follows immediately below, changes may need to be made to both.
4660 longest_float_length = CHR_SVLEN(data.longest_float);
4661 if (longest_float_length
4662 || (data.flags & SF_FL_BEFORE_EOL
4663 && (!(data.flags & SF_FL_BEFORE_MEOL)
4664 || (RExC_flags & RXf_PMf_MULTILINE))))
4668 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4669 && data.offset_fixed == data.offset_float_min
4670 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4671 goto remove_float; /* As in (a)+. */
4673 /* copy the information about the longest float from the reg_scan_data
4674 over to the program. */
4675 if (SvUTF8(data.longest_float)) {
4676 r->float_utf8 = data.longest_float;
4677 r->float_substr = NULL;
4679 r->float_substr = data.longest_float;
4680 r->float_utf8 = NULL;
4682 /* float_end_shift is how many chars that must be matched that
4683 follow this item. We calculate it ahead of time as once the
4684 lookbehind offset is added in we lose the ability to correctly
4686 ml = data.minlen_float ? *(data.minlen_float)
4687 : (I32)longest_float_length;
4688 r->float_end_shift = ml - data.offset_float_min
4689 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4690 + data.lookbehind_float;
4691 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4692 r->float_max_offset = data.offset_float_max;
4693 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4694 r->float_max_offset -= data.lookbehind_float;
4696 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4697 && (!(data.flags & SF_FL_BEFORE_MEOL)
4698 || (RExC_flags & RXf_PMf_MULTILINE)));
4699 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4703 r->float_substr = r->float_utf8 = NULL;
4704 SvREFCNT_dec(data.longest_float);
4705 longest_float_length = 0;
4708 /* Note that code very similar to this but for floating string
4709 is immediately above, changes may need to be made to both.
4712 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4713 if (longest_fixed_length
4714 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4715 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4716 || (RExC_flags & RXf_PMf_MULTILINE))))
4720 /* copy the information about the longest fixed
4721 from the reg_scan_data over to the program. */
4722 if (SvUTF8(data.longest_fixed)) {
4723 r->anchored_utf8 = data.longest_fixed;
4724 r->anchored_substr = NULL;
4726 r->anchored_substr = data.longest_fixed;
4727 r->anchored_utf8 = NULL;
4729 /* fixed_end_shift is how many chars that must be matched that
4730 follow this item. We calculate it ahead of time as once the
4731 lookbehind offset is added in we lose the ability to correctly
4733 ml = data.minlen_fixed ? *(data.minlen_fixed)
4734 : (I32)longest_fixed_length;
4735 r->anchored_end_shift = ml - data.offset_fixed
4736 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4737 + data.lookbehind_fixed;
4738 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4740 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4741 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4742 || (RExC_flags & RXf_PMf_MULTILINE)));
4743 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4746 r->anchored_substr = r->anchored_utf8 = NULL;
4747 SvREFCNT_dec(data.longest_fixed);
4748 longest_fixed_length = 0;
4751 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4752 ri->regstclass = NULL;
4753 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4755 && !(data.start_class->flags & ANYOF_EOS)
4756 && !cl_is_anything(data.start_class))
4758 const U32 n = add_data(pRExC_state, 1, "f");
4760 Newx(RExC_rxi->data->data[n], 1,
4761 struct regnode_charclass_class);
4762 StructCopy(data.start_class,
4763 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4764 struct regnode_charclass_class);
4765 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4766 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4767 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4768 regprop(r, sv, (regnode*)data.start_class);
4769 PerlIO_printf(Perl_debug_log,
4770 "synthetic stclass \"%s\".\n",
4771 SvPVX_const(sv));});
4774 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4775 if (longest_fixed_length > longest_float_length) {
4776 r->check_end_shift = r->anchored_end_shift;
4777 r->check_substr = r->anchored_substr;
4778 r->check_utf8 = r->anchored_utf8;
4779 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4780 if (r->extflags & RXf_ANCH_SINGLE)
4781 r->extflags |= RXf_NOSCAN;
4784 r->check_end_shift = r->float_end_shift;
4785 r->check_substr = r->float_substr;
4786 r->check_utf8 = r->float_utf8;
4787 r->check_offset_min = r->float_min_offset;
4788 r->check_offset_max = r->float_max_offset;
4790 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4791 This should be changed ASAP! */
4792 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4793 r->extflags |= RXf_USE_INTUIT;
4794 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4795 r->extflags |= RXf_INTUIT_TAIL;
4797 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4798 if ( (STRLEN)minlen < longest_float_length )
4799 minlen= longest_float_length;
4800 if ( (STRLEN)minlen < longest_fixed_length )
4801 minlen= longest_fixed_length;
4805 /* Several toplevels. Best we can is to set minlen. */
4807 struct regnode_charclass_class ch_class;
4810 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4812 scan = ri->program + 1;
4813 cl_init(pRExC_state, &ch_class);
4814 data.start_class = &ch_class;
4815 data.last_closep = &last_close;
4818 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4819 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4823 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4824 = r->float_substr = r->float_utf8 = NULL;
4825 if (!(data.start_class->flags & ANYOF_EOS)
4826 && !cl_is_anything(data.start_class))
4828 const U32 n = add_data(pRExC_state, 1, "f");
4830 Newx(RExC_rxi->data->data[n], 1,
4831 struct regnode_charclass_class);
4832 StructCopy(data.start_class,
4833 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4834 struct regnode_charclass_class);
4835 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4836 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4837 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4838 regprop(r, sv, (regnode*)data.start_class);
4839 PerlIO_printf(Perl_debug_log,
4840 "synthetic stclass \"%s\".\n",
4841 SvPVX_const(sv));});
4845 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4846 the "real" pattern. */
4848 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4849 (IV)minlen, (IV)r->minlen);
4851 r->minlenret = minlen;
4852 if (r->minlen < minlen)
4855 if (RExC_seen & REG_SEEN_GPOS)
4856 r->extflags |= RXf_GPOS_SEEN;
4857 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4858 r->extflags |= RXf_LOOKBEHIND_SEEN;
4859 if (RExC_seen & REG_SEEN_EVAL)
4860 r->extflags |= RXf_EVAL_SEEN;
4861 if (RExC_seen & REG_SEEN_CANY)
4862 r->extflags |= RXf_CANY_SEEN;
4863 if (RExC_seen & REG_SEEN_VERBARG)
4864 r->intflags |= PREGf_VERBARG_SEEN;
4865 if (RExC_seen & REG_SEEN_CUTGROUP)
4866 r->intflags |= PREGf_CUTGROUP_SEEN;
4867 if (RExC_paren_names)
4868 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4870 RXp_PAREN_NAMES(r) = NULL;
4872 #ifdef STUPID_PATTERN_CHECKS
4873 if (RX_PRELEN(rx) == 0)
4874 r->extflags |= RXf_NULL;
4875 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4876 /* XXX: this should happen BEFORE we compile */
4877 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4878 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4879 r->extflags |= RXf_WHITE;
4880 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4881 r->extflags |= RXf_START_ONLY;
4883 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4884 /* XXX: this should happen BEFORE we compile */
4885 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4887 regnode *first = ri->program + 1;
4889 U8 nop = OP(NEXTOPER(first));
4891 if (PL_regkind[fop] == NOTHING && nop == END)
4892 r->extflags |= RXf_NULL;
4893 else if (PL_regkind[fop] == BOL && nop == END)
4894 r->extflags |= RXf_START_ONLY;
4895 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4896 r->extflags |= RXf_WHITE;
4900 if (RExC_paren_names) {
4901 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4902 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4905 ri->name_list_idx = 0;
4907 if (RExC_recurse_count) {
4908 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4909 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4910 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4913 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4914 /* assume we don't need to swap parens around before we match */
4917 PerlIO_printf(Perl_debug_log,"Final program:\n");
4920 #ifdef RE_TRACK_PATTERN_OFFSETS
4921 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4922 const U32 len = ri->u.offsets[0];
4924 GET_RE_DEBUG_FLAGS_DECL;
4925 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4926 for (i = 1; i <= len; i++) {
4927 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4928 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4929 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4931 PerlIO_printf(Perl_debug_log, "\n");
4937 #undef RE_ENGINE_PTR
4941 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4944 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4946 PERL_UNUSED_ARG(value);
4948 if (flags & RXapif_FETCH) {
4949 return reg_named_buff_fetch(rx, key, flags);
4950 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4951 Perl_croak(aTHX_ "%s", PL_no_modify);
4953 } else if (flags & RXapif_EXISTS) {
4954 return reg_named_buff_exists(rx, key, flags)
4957 } else if (flags & RXapif_REGNAMES) {
4958 return reg_named_buff_all(rx, flags);
4959 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4960 return reg_named_buff_scalar(rx, flags);
4962 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4968 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4971 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4972 PERL_UNUSED_ARG(lastkey);
4974 if (flags & RXapif_FIRSTKEY)
4975 return reg_named_buff_firstkey(rx, flags);
4976 else if (flags & RXapif_NEXTKEY)
4977 return reg_named_buff_nextkey(rx, flags);
4979 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4985 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4988 AV *retarray = NULL;
4990 struct regexp *const rx = (struct regexp *)SvANY(r);
4992 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4994 if (flags & RXapif_ALL)
4997 if (rx && RXp_PAREN_NAMES(rx)) {
4998 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5001 SV* sv_dat=HeVAL(he_str);
5002 I32 *nums=(I32*)SvPVX(sv_dat);
5003 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5004 if ((I32)(rx->nparens) >= nums[i]
5005 && rx->offs[nums[i]].start != -1
5006 && rx->offs[nums[i]].end != -1)
5009 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5013 ret = newSVsv(&PL_sv_undef);
5016 av_push(retarray, ret);
5019 return newRV_noinc(MUTABLE_SV(retarray));
5026 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5029 struct regexp *const rx = (struct regexp *)SvANY(r);
5031 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5033 if (rx && RXp_PAREN_NAMES(rx)) {
5034 if (flags & RXapif_ALL) {
5035 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5037 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5051 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5053 struct regexp *const rx = (struct regexp *)SvANY(r);
5055 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5057 if ( rx && RXp_PAREN_NAMES(rx) ) {
5058 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5060 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5067 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5069 struct regexp *const rx = (struct regexp *)SvANY(r);
5070 GET_RE_DEBUG_FLAGS_DECL;
5072 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5074 if (rx && RXp_PAREN_NAMES(rx)) {
5075 HV *hv = RXp_PAREN_NAMES(rx);
5077 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5080 SV* sv_dat = HeVAL(temphe);
5081 I32 *nums = (I32*)SvPVX(sv_dat);
5082 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5083 if ((I32)(rx->lastparen) >= nums[i] &&
5084 rx->offs[nums[i]].start != -1 &&
5085 rx->offs[nums[i]].end != -1)
5091 if (parno || flags & RXapif_ALL) {
5092 return newSVhek(HeKEY_hek(temphe));
5100 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5105 struct regexp *const rx = (struct regexp *)SvANY(r);
5107 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5109 if (rx && RXp_PAREN_NAMES(rx)) {
5110 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5111 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5112 } else if (flags & RXapif_ONE) {
5113 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5114 av = MUTABLE_AV(SvRV(ret));
5115 length = av_len(av);
5117 return newSViv(length + 1);
5119 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5123 return &PL_sv_undef;
5127 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5129 struct regexp *const rx = (struct regexp *)SvANY(r);
5132 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5134 if (rx && RXp_PAREN_NAMES(rx)) {
5135 HV *hv= RXp_PAREN_NAMES(rx);
5137 (void)hv_iterinit(hv);
5138 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5141 SV* sv_dat = HeVAL(temphe);
5142 I32 *nums = (I32*)SvPVX(sv_dat);
5143 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5144 if ((I32)(rx->lastparen) >= nums[i] &&
5145 rx->offs[nums[i]].start != -1 &&
5146 rx->offs[nums[i]].end != -1)
5152 if (parno || flags & RXapif_ALL) {
5153 av_push(av, newSVhek(HeKEY_hek(temphe)));
5158 return newRV_noinc(MUTABLE_SV(av));
5162 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5165 struct regexp *const rx = (struct regexp *)SvANY(r);
5170 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5173 sv_setsv(sv,&PL_sv_undef);
5177 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5179 i = rx->offs[0].start;
5183 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5185 s = rx->subbeg + rx->offs[0].end;
5186 i = rx->sublen - rx->offs[0].end;
5189 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5190 (s1 = rx->offs[paren].start) != -1 &&
5191 (t1 = rx->offs[paren].end) != -1)
5195 s = rx->subbeg + s1;
5197 sv_setsv(sv,&PL_sv_undef);
5200 assert(rx->sublen >= (s - rx->subbeg) + i );
5202 const int oldtainted = PL_tainted;
5204 sv_setpvn(sv, s, i);
5205 PL_tainted = oldtainted;
5206 if ( (rx->extflags & RXf_CANY_SEEN)
5207 ? (RXp_MATCH_UTF8(rx)
5208 && (!i || is_utf8_string((U8*)s, i)))
5209 : (RXp_MATCH_UTF8(rx)) )
5216 if (RXp_MATCH_TAINTED(rx)) {
5217 if (SvTYPE(sv) >= SVt_PVMG) {
5218 MAGIC* const mg = SvMAGIC(sv);
5221 SvMAGIC_set(sv, mg->mg_moremagic);
5223 if ((mgt = SvMAGIC(sv))) {
5224 mg->mg_moremagic = mgt;
5225 SvMAGIC_set(sv, mg);
5235 sv_setsv(sv,&PL_sv_undef);
5241 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5242 SV const * const value)
5244 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5246 PERL_UNUSED_ARG(rx);
5247 PERL_UNUSED_ARG(paren);
5248 PERL_UNUSED_ARG(value);
5251 Perl_croak(aTHX_ "%s", PL_no_modify);
5255 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5258 struct regexp *const rx = (struct regexp *)SvANY(r);
5262 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5264 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5266 /* $` / ${^PREMATCH} */
5267 case RX_BUFF_IDX_PREMATCH:
5268 if (rx->offs[0].start != -1) {
5269 i = rx->offs[0].start;
5277 /* $' / ${^POSTMATCH} */
5278 case RX_BUFF_IDX_POSTMATCH:
5279 if (rx->offs[0].end != -1) {
5280 i = rx->sublen - rx->offs[0].end;
5282 s1 = rx->offs[0].end;
5288 /* $& / ${^MATCH}, $1, $2, ... */
5290 if (paren <= (I32)rx->nparens &&
5291 (s1 = rx->offs[paren].start) != -1 &&
5292 (t1 = rx->offs[paren].end) != -1)
5297 if (ckWARN(WARN_UNINITIALIZED))
5298 report_uninit((const SV *)sv);
5303 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5304 const char * const s = rx->subbeg + s1;
5309 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5316 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5318 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5319 PERL_UNUSED_ARG(rx);
5323 return newSVpvs("Regexp");
5326 /* Scans the name of a named buffer from the pattern.
5327 * If flags is REG_RSN_RETURN_NULL returns null.
5328 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5329 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5330 * to the parsed name as looked up in the RExC_paren_names hash.
5331 * If there is an error throws a vFAIL().. type exception.
5334 #define REG_RSN_RETURN_NULL 0
5335 #define REG_RSN_RETURN_NAME 1
5336 #define REG_RSN_RETURN_DATA 2
5339 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5341 char *name_start = RExC_parse;
5343 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5345 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5346 /* skip IDFIRST by using do...while */
5349 RExC_parse += UTF8SKIP(RExC_parse);
5350 } while (isALNUM_utf8((U8*)RExC_parse));
5354 } while (isALNUM(*RExC_parse));
5359 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5360 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5361 if ( flags == REG_RSN_RETURN_NAME)
5363 else if (flags==REG_RSN_RETURN_DATA) {
5366 if ( ! sv_name ) /* should not happen*/
5367 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5368 if (RExC_paren_names)
5369 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5371 sv_dat = HeVAL(he_str);
5373 vFAIL("Reference to nonexistent named group");
5377 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5384 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5385 int rem=(int)(RExC_end - RExC_parse); \
5394 if (RExC_lastparse!=RExC_parse) \
5395 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5398 iscut ? "..." : "<" \
5401 PerlIO_printf(Perl_debug_log,"%16s",""); \
5404 num = RExC_size + 1; \
5406 num=REG_NODE_NUM(RExC_emit); \
5407 if (RExC_lastnum!=num) \
5408 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5410 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5411 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5412 (int)((depth*2)), "", \
5416 RExC_lastparse=RExC_parse; \
5421 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5422 DEBUG_PARSE_MSG((funcname)); \
5423 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5425 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5426 DEBUG_PARSE_MSG((funcname)); \
5427 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5430 - reg - regular expression, i.e. main body or parenthesized thing
5432 * Caller must absorb opening parenthesis.
5434 * Combining parenthesis handling with the base level of regular expression
5435 * is a trifle forced, but the need to tie the tails of the branches to what
5436 * follows makes it hard to avoid.
5438 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5440 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5442 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5446 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5447 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5450 register regnode *ret; /* Will be the head of the group. */
5451 register regnode *br;
5452 register regnode *lastbr;
5453 register regnode *ender = NULL;
5454 register I32 parno = 0;
5456 U32 oregflags = RExC_flags;
5457 bool have_branch = 0;
5459 I32 freeze_paren = 0;
5460 I32 after_freeze = 0;
5462 /* for (?g), (?gc), and (?o) warnings; warning
5463 about (?c) will warn about (?g) -- japhy */
5465 #define WASTED_O 0x01
5466 #define WASTED_G 0x02
5467 #define WASTED_C 0x04
5468 #define WASTED_GC (0x02|0x04)
5469 I32 wastedflags = 0x00;
5471 char * parse_start = RExC_parse; /* MJD */
5472 char * const oregcomp_parse = RExC_parse;
5474 GET_RE_DEBUG_FLAGS_DECL;
5476 PERL_ARGS_ASSERT_REG;
5477 DEBUG_PARSE("reg ");
5479 *flagp = 0; /* Tentatively. */
5482 /* Make an OPEN node, if parenthesized. */
5484 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5485 char *start_verb = RExC_parse;
5486 STRLEN verb_len = 0;
5487 char *start_arg = NULL;
5488 unsigned char op = 0;
5490 int internal_argval = 0; /* internal_argval is only useful if !argok */
5491 while ( *RExC_parse && *RExC_parse != ')' ) {
5492 if ( *RExC_parse == ':' ) {
5493 start_arg = RExC_parse + 1;
5499 verb_len = RExC_parse - start_verb;
5502 while ( *RExC_parse && *RExC_parse != ')' )
5504 if ( *RExC_parse != ')' )
5505 vFAIL("Unterminated verb pattern argument");
5506 if ( RExC_parse == start_arg )
5509 if ( *RExC_parse != ')' )
5510 vFAIL("Unterminated verb pattern");
5513 switch ( *start_verb ) {
5514 case 'A': /* (*ACCEPT) */
5515 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5517 internal_argval = RExC_nestroot;
5520 case 'C': /* (*COMMIT) */
5521 if ( memEQs(start_verb,verb_len,"COMMIT") )
5524 case 'F': /* (*FAIL) */
5525 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5530 case ':': /* (*:NAME) */
5531 case 'M': /* (*MARK:NAME) */
5532 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5537 case 'P': /* (*PRUNE) */
5538 if ( memEQs(start_verb,verb_len,"PRUNE") )
5541 case 'S': /* (*SKIP) */
5542 if ( memEQs(start_verb,verb_len,"SKIP") )
5545 case 'T': /* (*THEN) */
5546 /* [19:06] <TimToady> :: is then */
5547 if ( memEQs(start_verb,verb_len,"THEN") ) {
5549 RExC_seen |= REG_SEEN_CUTGROUP;
5555 vFAIL3("Unknown verb pattern '%.*s'",
5556 verb_len, start_verb);
5559 if ( start_arg && internal_argval ) {
5560 vFAIL3("Verb pattern '%.*s' may not have an argument",
5561 verb_len, start_verb);
5562 } else if ( argok < 0 && !start_arg ) {
5563 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5564 verb_len, start_verb);
5566 ret = reganode(pRExC_state, op, internal_argval);
5567 if ( ! internal_argval && ! SIZE_ONLY ) {
5569 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5570 ARG(ret) = add_data( pRExC_state, 1, "S" );
5571 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5578 if (!internal_argval)
5579 RExC_seen |= REG_SEEN_VERBARG;
5580 } else if ( start_arg ) {
5581 vFAIL3("Verb pattern '%.*s' may not have an argument",
5582 verb_len, start_verb);
5584 ret = reg_node(pRExC_state, op);
5586 nextchar(pRExC_state);
5589 if (*RExC_parse == '?') { /* (?...) */
5590 bool is_logical = 0;
5591 const char * const seqstart = RExC_parse;
5594 paren = *RExC_parse++;
5595 ret = NULL; /* For look-ahead/behind. */
5598 case 'P': /* (?P...) variants for those used to PCRE/Python */
5599 paren = *RExC_parse++;
5600 if ( paren == '<') /* (?P<...>) named capture */
5602 else if (paren == '>') { /* (?P>name) named recursion */
5603 goto named_recursion;
5605 else if (paren == '=') { /* (?P=...) named backref */
5606 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5607 you change this make sure you change that */
5608 char* name_start = RExC_parse;
5610 SV *sv_dat = reg_scan_name(pRExC_state,
5611 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5612 if (RExC_parse == name_start || *RExC_parse != ')')
5613 vFAIL2("Sequence %.3s... not terminated",parse_start);
5616 num = add_data( pRExC_state, 1, "S" );
5617 RExC_rxi->data->data[num]=(void*)sv_dat;
5618 SvREFCNT_inc_simple_void(sv_dat);
5621 ret = reganode(pRExC_state,
5622 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5626 Set_Node_Offset(ret, parse_start+1);
5627 Set_Node_Cur_Length(ret); /* MJD */
5629 nextchar(pRExC_state);
5633 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5635 case '<': /* (?<...) */
5636 if (*RExC_parse == '!')
5638 else if (*RExC_parse != '=')
5644 case '\'': /* (?'...') */
5645 name_start= RExC_parse;
5646 svname = reg_scan_name(pRExC_state,
5647 SIZE_ONLY ? /* reverse test from the others */
5648 REG_RSN_RETURN_NAME :
5649 REG_RSN_RETURN_NULL);
5650 if (RExC_parse == name_start) {
5652 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5655 if (*RExC_parse != paren)
5656 vFAIL2("Sequence (?%c... not terminated",
5657 paren=='>' ? '<' : paren);
5661 if (!svname) /* shouldnt happen */
5663 "panic: reg_scan_name returned NULL");
5664 if (!RExC_paren_names) {
5665 RExC_paren_names= newHV();
5666 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5668 RExC_paren_name_list= newAV();
5669 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5672 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5674 sv_dat = HeVAL(he_str);
5676 /* croak baby croak */
5678 "panic: paren_name hash element allocation failed");
5679 } else if ( SvPOK(sv_dat) ) {
5680 /* (?|...) can mean we have dupes so scan to check
5681 its already been stored. Maybe a flag indicating
5682 we are inside such a construct would be useful,
5683 but the arrays are likely to be quite small, so
5684 for now we punt -- dmq */
5685 IV count = SvIV(sv_dat);
5686 I32 *pv = (I32*)SvPVX(sv_dat);
5688 for ( i = 0 ; i < count ; i++ ) {
5689 if ( pv[i] == RExC_npar ) {
5695 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5696 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5697 pv[count] = RExC_npar;
5698 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5701 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5702 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5704 SvIV_set(sv_dat, 1);
5707 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5708 SvREFCNT_dec(svname);
5711 /*sv_dump(sv_dat);*/
5713 nextchar(pRExC_state);
5715 goto capturing_parens;
5717 RExC_seen |= REG_SEEN_LOOKBEHIND;
5719 case '=': /* (?=...) */
5720 RExC_seen_zerolen++;
5722 case '!': /* (?!...) */
5723 RExC_seen_zerolen++;
5724 if (*RExC_parse == ')') {
5725 ret=reg_node(pRExC_state, OPFAIL);
5726 nextchar(pRExC_state);
5730 case '|': /* (?|...) */
5731 /* branch reset, behave like a (?:...) except that
5732 buffers in alternations share the same numbers */
5734 after_freeze = freeze_paren = RExC_npar;
5736 case ':': /* (?:...) */
5737 case '>': /* (?>...) */
5739 case '$': /* (?$...) */
5740 case '@': /* (?@...) */
5741 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5743 case '#': /* (?#...) */
5744 while (*RExC_parse && *RExC_parse != ')')
5746 if (*RExC_parse != ')')
5747 FAIL("Sequence (?#... not terminated");
5748 nextchar(pRExC_state);
5751 case '0' : /* (?0) */
5752 case 'R' : /* (?R) */
5753 if (*RExC_parse != ')')
5754 FAIL("Sequence (?R) not terminated");
5755 ret = reg_node(pRExC_state, GOSTART);
5756 *flagp |= POSTPONED;
5757 nextchar(pRExC_state);
5760 { /* named and numeric backreferences */
5762 case '&': /* (?&NAME) */
5763 parse_start = RExC_parse - 1;
5766 SV *sv_dat = reg_scan_name(pRExC_state,
5767 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5768 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5770 goto gen_recurse_regop;
5773 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5775 vFAIL("Illegal pattern");
5777 goto parse_recursion;
5779 case '-': /* (?-1) */
5780 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5781 RExC_parse--; /* rewind to let it be handled later */
5785 case '1': case '2': case '3': case '4': /* (?1) */
5786 case '5': case '6': case '7': case '8': case '9':
5789 num = atoi(RExC_parse);
5790 parse_start = RExC_parse - 1; /* MJD */
5791 if (*RExC_parse == '-')
5793 while (isDIGIT(*RExC_parse))
5795 if (*RExC_parse!=')')
5796 vFAIL("Expecting close bracket");
5799 if ( paren == '-' ) {
5801 Diagram of capture buffer numbering.
5802 Top line is the normal capture buffer numbers
5803 Botton line is the negative indexing as from
5807 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5811 num = RExC_npar + num;
5814 vFAIL("Reference to nonexistent group");
5816 } else if ( paren == '+' ) {
5817 num = RExC_npar + num - 1;
5820 ret = reganode(pRExC_state, GOSUB, num);
5822 if (num > (I32)RExC_rx->nparens) {
5824 vFAIL("Reference to nonexistent group");
5826 ARG2L_SET( ret, RExC_recurse_count++);
5828 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5829 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5833 RExC_seen |= REG_SEEN_RECURSE;
5834 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5835 Set_Node_Offset(ret, parse_start); /* MJD */
5837 *flagp |= POSTPONED;
5838 nextchar(pRExC_state);
5840 } /* named and numeric backreferences */
5843 case '?': /* (??...) */
5845 if (*RExC_parse != '{') {
5847 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5850 *flagp |= POSTPONED;
5851 paren = *RExC_parse++;
5853 case '{': /* (?{...}) */
5858 char *s = RExC_parse;
5860 RExC_seen_zerolen++;
5861 RExC_seen |= REG_SEEN_EVAL;
5862 while (count && (c = *RExC_parse)) {
5873 if (*RExC_parse != ')') {
5875 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5879 OP_4tree *sop, *rop;
5880 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5883 Perl_save_re_context(aTHX);
5884 rop = sv_compile_2op(sv, &sop, "re", &pad);
5885 sop->op_private |= OPpREFCOUNTED;
5886 /* re_dup will OpREFCNT_inc */
5887 OpREFCNT_set(sop, 1);
5890 n = add_data(pRExC_state, 3, "nop");
5891 RExC_rxi->data->data[n] = (void*)rop;
5892 RExC_rxi->data->data[n+1] = (void*)sop;
5893 RExC_rxi->data->data[n+2] = (void*)pad;
5896 else { /* First pass */
5897 if (PL_reginterp_cnt < ++RExC_seen_evals
5899 /* No compiled RE interpolated, has runtime
5900 components ===> unsafe. */
5901 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5902 if (PL_tainting && PL_tainted)
5903 FAIL("Eval-group in insecure regular expression");
5904 #if PERL_VERSION > 8
5905 if (IN_PERL_COMPILETIME)
5910 nextchar(pRExC_state);
5912 ret = reg_node(pRExC_state, LOGICAL);
5915 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5916 /* deal with the length of this later - MJD */
5919 ret = reganode(pRExC_state, EVAL, n);
5920 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5921 Set_Node_Offset(ret, parse_start);
5924 case '(': /* (?(?{...})...) and (?(?=...)...) */
5927 if (RExC_parse[0] == '?') { /* (?(?...)) */
5928 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5929 || RExC_parse[1] == '<'
5930 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5933 ret = reg_node(pRExC_state, LOGICAL);
5936 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5940 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5941 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5943 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5944 char *name_start= RExC_parse++;
5946 SV *sv_dat=reg_scan_name(pRExC_state,
5947 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5948 if (RExC_parse == name_start || *RExC_parse != ch)
5949 vFAIL2("Sequence (?(%c... not terminated",
5950 (ch == '>' ? '<' : ch));
5953 num = add_data( pRExC_state, 1, "S" );
5954 RExC_rxi->data->data[num]=(void*)sv_dat;
5955 SvREFCNT_inc_simple_void(sv_dat);
5957 ret = reganode(pRExC_state,NGROUPP,num);
5958 goto insert_if_check_paren;
5960 else if (RExC_parse[0] == 'D' &&
5961 RExC_parse[1] == 'E' &&
5962 RExC_parse[2] == 'F' &&
5963 RExC_parse[3] == 'I' &&
5964 RExC_parse[4] == 'N' &&
5965 RExC_parse[5] == 'E')
5967 ret = reganode(pRExC_state,DEFINEP,0);
5970 goto insert_if_check_paren;
5972 else if (RExC_parse[0] == 'R') {
5975 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5976 parno = atoi(RExC_parse++);
5977 while (isDIGIT(*RExC_parse))
5979 } else if (RExC_parse[0] == '&') {
5982 sv_dat = reg_scan_name(pRExC_state,
5983 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5984 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5986 ret = reganode(pRExC_state,INSUBP,parno);
5987 goto insert_if_check_paren;
5989 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5992 parno = atoi(RExC_parse++);
5994 while (isDIGIT(*RExC_parse))
5996 ret = reganode(pRExC_state, GROUPP, parno);
5998 insert_if_check_paren:
5999 if ((c = *nextchar(pRExC_state)) != ')')
6000 vFAIL("Switch condition not recognized");
6002 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6003 br = regbranch(pRExC_state, &flags, 1,depth+1);
6005 br = reganode(pRExC_state, LONGJMP, 0);
6007 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6008 c = *nextchar(pRExC_state);
6013 vFAIL("(?(DEFINE)....) does not allow branches");
6014 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6015 regbranch(pRExC_state, &flags, 1,depth+1);
6016 REGTAIL(pRExC_state, ret, lastbr);
6019 c = *nextchar(pRExC_state);
6024 vFAIL("Switch (?(condition)... contains too many branches");
6025 ender = reg_node(pRExC_state, TAIL);
6026 REGTAIL(pRExC_state, br, ender);
6028 REGTAIL(pRExC_state, lastbr, ender);
6029 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6032 REGTAIL(pRExC_state, ret, ender);
6033 RExC_size++; /* XXX WHY do we need this?!!
6034 For large programs it seems to be required
6035 but I can't figure out why. -- dmq*/
6039 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6043 RExC_parse--; /* for vFAIL to print correctly */
6044 vFAIL("Sequence (? incomplete");
6048 parse_flags: /* (?i) */
6050 U32 posflags = 0, negflags = 0;
6051 U32 *flagsp = &posflags;
6053 while (*RExC_parse) {
6054 /* && strchr("iogcmsx", *RExC_parse) */
6055 /* (?g), (?gc) and (?o) are useless here
6056 and must be globally applied -- japhy */
6057 switch (*RExC_parse) {
6058 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6059 case ONCE_PAT_MOD: /* 'o' */
6060 case GLOBAL_PAT_MOD: /* 'g' */
6061 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6062 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6063 if (! (wastedflags & wflagbit) ) {
6064 wastedflags |= wflagbit;
6067 "Useless (%s%c) - %suse /%c modifier",
6068 flagsp == &negflags ? "?-" : "?",
6070 flagsp == &negflags ? "don't " : "",
6077 case CONTINUE_PAT_MOD: /* 'c' */
6078 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6079 if (! (wastedflags & WASTED_C) ) {
6080 wastedflags |= WASTED_GC;
6083 "Useless (%sc) - %suse /gc modifier",
6084 flagsp == &negflags ? "?-" : "?",
6085 flagsp == &negflags ? "don't " : ""
6090 case KEEPCOPY_PAT_MOD: /* 'p' */
6091 if (flagsp == &negflags) {
6093 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6095 *flagsp |= RXf_PMf_KEEPCOPY;
6099 if (flagsp == &negflags) {
6101 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6105 wastedflags = 0; /* reset so (?g-c) warns twice */
6111 RExC_flags |= posflags;
6112 RExC_flags &= ~negflags;
6114 oregflags |= posflags;
6115 oregflags &= ~negflags;
6117 nextchar(pRExC_state);
6128 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6133 }} /* one for the default block, one for the switch */
6140 ret = reganode(pRExC_state, OPEN, parno);
6143 RExC_nestroot = parno;
6144 if (RExC_seen & REG_SEEN_RECURSE
6145 && !RExC_open_parens[parno-1])
6147 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6148 "Setting open paren #%"IVdf" to %d\n",
6149 (IV)parno, REG_NODE_NUM(ret)));
6150 RExC_open_parens[parno-1]= ret;
6153 Set_Node_Length(ret, 1); /* MJD */
6154 Set_Node_Offset(ret, RExC_parse); /* MJD */
6162 /* Pick up the branches, linking them together. */
6163 parse_start = RExC_parse; /* MJD */
6164 br = regbranch(pRExC_state, &flags, 1,depth+1);
6167 if (RExC_npar > after_freeze)
6168 after_freeze = RExC_npar;
6169 RExC_npar = freeze_paren;
6172 /* branch_len = (paren != 0); */
6176 if (*RExC_parse == '|') {
6177 if (!SIZE_ONLY && RExC_extralen) {
6178 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6181 reginsert(pRExC_state, BRANCH, br, depth+1);
6182 Set_Node_Length(br, paren != 0);
6183 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6187 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6189 else if (paren == ':') {
6190 *flagp |= flags&SIMPLE;
6192 if (is_open) { /* Starts with OPEN. */
6193 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6195 else if (paren != '?') /* Not Conditional */
6197 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6199 while (*RExC_parse == '|') {
6200 if (!SIZE_ONLY && RExC_extralen) {
6201 ender = reganode(pRExC_state, LONGJMP,0);
6202 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6205 RExC_extralen += 2; /* Account for LONGJMP. */
6206 nextchar(pRExC_state);
6208 if (RExC_npar > after_freeze)
6209 after_freeze = RExC_npar;
6210 RExC_npar = freeze_paren;
6212 br = regbranch(pRExC_state, &flags, 0, depth+1);
6216 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6218 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6221 if (have_branch || paren != ':') {
6222 /* Make a closing node, and hook it on the end. */
6225 ender = reg_node(pRExC_state, TAIL);
6228 ender = reganode(pRExC_state, CLOSE, parno);
6229 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6230 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6231 "Setting close paren #%"IVdf" to %d\n",
6232 (IV)parno, REG_NODE_NUM(ender)));
6233 RExC_close_parens[parno-1]= ender;
6234 if (RExC_nestroot == parno)
6237 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6238 Set_Node_Length(ender,1); /* MJD */
6244 *flagp &= ~HASWIDTH;
6247 ender = reg_node(pRExC_state, SUCCEED);
6250 ender = reg_node(pRExC_state, END);
6252 assert(!RExC_opend); /* there can only be one! */
6257 REGTAIL(pRExC_state, lastbr, ender);
6259 if (have_branch && !SIZE_ONLY) {
6261 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6263 /* Hook the tails of the branches to the closing node. */
6264 for (br = ret; br; br = regnext(br)) {
6265 const U8 op = PL_regkind[OP(br)];
6267 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6269 else if (op == BRANCHJ) {
6270 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6278 static const char parens[] = "=!<,>";
6280 if (paren && (p = strchr(parens, paren))) {
6281 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6282 int flag = (p - parens) > 1;
6285 node = SUSPEND, flag = 0;
6286 reginsert(pRExC_state, node,ret, depth+1);
6287 Set_Node_Cur_Length(ret);
6288 Set_Node_Offset(ret, parse_start + 1);
6290 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6294 /* Check for proper termination. */
6296 RExC_flags = oregflags;
6297 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6298 RExC_parse = oregcomp_parse;
6299 vFAIL("Unmatched (");
6302 else if (!paren && RExC_parse < RExC_end) {
6303 if (*RExC_parse == ')') {
6305 vFAIL("Unmatched )");
6308 FAIL("Junk on end of regexp"); /* "Can't happen". */
6312 RExC_npar = after_freeze;
6317 - regbranch - one alternative of an | operator
6319 * Implements the concatenation operator.
6322 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6325 register regnode *ret;
6326 register regnode *chain = NULL;
6327 register regnode *latest;
6328 I32 flags = 0, c = 0;
6329 GET_RE_DEBUG_FLAGS_DECL;
6331 PERL_ARGS_ASSERT_REGBRANCH;
6333 DEBUG_PARSE("brnc");
6338 if (!SIZE_ONLY && RExC_extralen)
6339 ret = reganode(pRExC_state, BRANCHJ,0);
6341 ret = reg_node(pRExC_state, BRANCH);
6342 Set_Node_Length(ret, 1);
6346 if (!first && SIZE_ONLY)
6347 RExC_extralen += 1; /* BRANCHJ */
6349 *flagp = WORST; /* Tentatively. */
6352 nextchar(pRExC_state);
6353 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6355 latest = regpiece(pRExC_state, &flags,depth+1);
6356 if (latest == NULL) {
6357 if (flags & TRYAGAIN)
6361 else if (ret == NULL)
6363 *flagp |= flags&(HASWIDTH|POSTPONED);
6364 if (chain == NULL) /* First piece. */
6365 *flagp |= flags&SPSTART;
6368 REGTAIL(pRExC_state, chain, latest);
6373 if (chain == NULL) { /* Loop ran zero times. */
6374 chain = reg_node(pRExC_state, NOTHING);
6379 *flagp |= flags&SIMPLE;
6386 - regpiece - something followed by possible [*+?]
6388 * Note that the branching code sequences used for ? and the general cases
6389 * of * and + are somewhat optimized: they use the same NOTHING node as
6390 * both the endmarker for their branch list and the body of the last branch.
6391 * It might seem that this node could be dispensed with entirely, but the
6392 * endmarker role is not redundant.
6395 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6398 register regnode *ret;
6400 register char *next;
6402 const char * const origparse = RExC_parse;
6404 I32 max = REG_INFTY;
6406 const char *maxpos = NULL;
6407 GET_RE_DEBUG_FLAGS_DECL;
6409 PERL_ARGS_ASSERT_REGPIECE;
6411 DEBUG_PARSE("piec");
6413 ret = regatom(pRExC_state, &flags,depth+1);
6415 if (flags & TRYAGAIN)
6422 if (op == '{' && regcurly(RExC_parse)) {
6424 parse_start = RExC_parse; /* MJD */
6425 next = RExC_parse + 1;
6426 while (isDIGIT(*next) || *next == ',') {
6435 if (*next == '}') { /* got one */
6439 min = atoi(RExC_parse);
6443 maxpos = RExC_parse;
6445 if (!max && *maxpos != '0')
6446 max = REG_INFTY; /* meaning "infinity" */
6447 else if (max >= REG_INFTY)
6448 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6450 nextchar(pRExC_state);
6453 if ((flags&SIMPLE)) {
6454 RExC_naughty += 2 + RExC_naughty / 2;
6455 reginsert(pRExC_state, CURLY, ret, depth+1);
6456 Set_Node_Offset(ret, parse_start+1); /* MJD */
6457 Set_Node_Cur_Length(ret);
6460 regnode * const w = reg_node(pRExC_state, WHILEM);
6463 REGTAIL(pRExC_state, ret, w);
6464 if (!SIZE_ONLY && RExC_extralen) {
6465 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6466 reginsert(pRExC_state, NOTHING,ret, depth+1);
6467 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6469 reginsert(pRExC_state, CURLYX,ret, depth+1);
6471 Set_Node_Offset(ret, parse_start+1);
6472 Set_Node_Length(ret,
6473 op == '{' ? (RExC_parse - parse_start) : 1);
6475 if (!SIZE_ONLY && RExC_extralen)
6476 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6477 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6479 RExC_whilem_seen++, RExC_extralen += 3;
6480 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6489 vFAIL("Can't do {n,m} with n > m");
6491 ARG1_SET(ret, (U16)min);
6492 ARG2_SET(ret, (U16)max);
6504 #if 0 /* Now runtime fix should be reliable. */
6506 /* if this is reinstated, don't forget to put this back into perldiag:
6508 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6510 (F) The part of the regexp subject to either the * or + quantifier
6511 could match an empty string. The {#} shows in the regular
6512 expression about where the problem was discovered.
6516 if (!(flags&HASWIDTH) && op != '?')
6517 vFAIL("Regexp *+ operand could be empty");
6520 parse_start = RExC_parse;
6521 nextchar(pRExC_state);
6523 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6525 if (op == '*' && (flags&SIMPLE)) {
6526 reginsert(pRExC_state, STAR, ret, depth+1);
6530 else if (op == '*') {
6534 else if (op == '+' && (flags&SIMPLE)) {
6535 reginsert(pRExC_state, PLUS, ret, depth+1);
6539 else if (op == '+') {
6543 else if (op == '?') {
6548 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6549 ckWARN3reg(RExC_parse,
6550 "%.*s matches null string many times",
6551 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6555 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6556 nextchar(pRExC_state);
6557 reginsert(pRExC_state, MINMOD, ret, depth+1);
6558 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6560 #ifndef REG_ALLOW_MINMOD_SUSPEND
6563 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6565 nextchar(pRExC_state);
6566 ender = reg_node(pRExC_state, SUCCEED);
6567 REGTAIL(pRExC_state, ret, ender);
6568 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6570 ender = reg_node(pRExC_state, TAIL);
6571 REGTAIL(pRExC_state, ret, ender);
6575 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6577 vFAIL("Nested quantifiers");
6584 /* reg_namedseq(pRExC_state,UVp)
6586 This is expected to be called by a parser routine that has
6587 recognized '\N' and needs to handle the rest. RExC_parse is
6588 expected to point at the first char following the N at the time
6591 The \N may be inside (indicated by valuep not being NULL) or outside a
6594 \N may begin either a named sequence, or if outside a character class, mean
6595 to match a non-newline. For non single-quoted regexes, the tokenizer has
6596 attempted to decide which, and in the case of a named sequence converted it
6597 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6598 where c1... are the characters in the sequence. For single-quoted regexes,
6599 the tokenizer passes the \N sequence through unchanged; this code will not
6600 attempt to determine this nor expand those. The net effect is that if the
6601 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6602 signals that this \N occurrence means to match a non-newline.
6604 Only the \N{U+...} form should occur in a character class, for the same
6605 reason that '.' inside a character class means to just match a period: it
6606 just doesn't make sense.
6608 If valuep is non-null then it is assumed that we are parsing inside
6609 of a charclass definition and the first codepoint in the resolved
6610 string is returned via *valuep and the routine will return NULL.
6611 In this mode if a multichar string is returned from the charnames
6612 handler, a warning will be issued, and only the first char in the
6613 sequence will be examined. If the string returned is zero length
6614 then the value of *valuep is undefined and NON-NULL will
6615 be returned to indicate failure. (This will NOT be a valid pointer
6618 If valuep is null then it is assumed that we are parsing normal text and a
6619 new EXACT node is inserted into the program containing the resolved string,
6620 and a pointer to the new node is returned. But if the string is zero length
6621 a NOTHING node is emitted instead.
6623 On success RExC_parse is set to the char following the endbrace.
6624 Parsing failures will generate a fatal error via vFAIL(...)
6627 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6629 char * endbrace; /* '}' following the name */
6630 regnode *ret = NULL;
6632 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6636 GET_RE_DEBUG_FLAGS_DECL;
6638 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6642 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6643 * modifier. The other meaning does not */
6644 p = (RExC_flags & RXf_PMf_EXTENDED)
6645 ? regwhite( pRExC_state, RExC_parse )
6648 /* Disambiguate between \N meaning a named character versus \N meaning
6649 * [^\n]. The former is assumed when it can't be the latter. */
6650 if (*p != '{' || regcurly(p)) {
6653 /* no bare \N in a charclass */
6654 vFAIL("\\N in a character class must be a named character: \\N{...}");
6656 nextchar(pRExC_state);
6657 ret = reg_node(pRExC_state, REG_ANY);
6658 *flagp |= HASWIDTH|SIMPLE;
6661 Set_Node_Length(ret, 1); /* MJD */
6665 /* Here, we have decided it should be a named sequence */
6667 /* The test above made sure that the next real character is a '{', but
6668 * under the /x modifier, it could be separated by space (or a comment and
6669 * \n) and this is not allowed (for consistency with \x{...} and the
6670 * tokenizer handling of \N{NAME}). */
6671 if (*RExC_parse != '{') {
6672 vFAIL("Missing braces on \\N{}");
6675 RExC_parse++; /* Skip past the '{' */
6677 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6678 || ! (endbrace == RExC_parse /* nothing between the {} */
6679 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6680 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6682 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6683 vFAIL("\\N{NAME} must be resolved by the lexer");
6686 if (endbrace == RExC_parse) { /* empty: \N{} */
6688 RExC_parse = endbrace + 1;
6689 return reg_node(pRExC_state,NOTHING);
6693 ckWARNreg(RExC_parse,
6694 "Ignoring zero length \\N{} in character class"
6696 RExC_parse = endbrace + 1;
6699 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6702 RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6703 RExC_parse += 2; /* Skip past the 'U+' */
6705 if (valuep) { /* In a bracketed char class */
6706 /* We only pay attention to the first char of
6707 multichar strings being returned. I kinda wonder
6708 if this makes sense as it does change the behaviour
6709 from earlier versions, OTOH that behaviour was broken
6710 as well. XXX Solution is to recharacterize as
6711 [rest-of-class]|multi1|multi2... */
6713 STRLEN length_of_hex;
6714 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6715 | PERL_SCAN_DISALLOW_PREFIX
6716 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6718 char * endchar = strchr(RExC_parse, '.');
6720 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6722 else endchar = endbrace;
6724 length_of_hex = (STRLEN)(endchar - RExC_parse);
6725 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6727 /* The tokenizer should have guaranteed validity, but it's possible to
6728 * bypass it by using single quoting, so check */
6729 if (length_of_hex == 0
6730 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6732 RExC_parse += length_of_hex; /* Includes all the valid */
6733 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6734 ? UTF8SKIP(RExC_parse)
6736 /* Guard against malformed utf8 */
6737 if (RExC_parse >= endchar) RExC_parse = endchar;
6738 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6741 RExC_parse = endbrace + 1;
6742 if (endchar == endbrace) return NULL;
6744 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6746 else { /* Not a char class */
6747 char *s; /* String to put in generated EXACT node */
6748 STRLEN len = 0; /* Its current length */
6749 char *endchar; /* Points to '.' or '}' ending cur char in the input
6752 ret = reg_node(pRExC_state,
6753 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6756 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6757 * the input which is of the form now 'c1.c2.c3...}' until find the
6758 * ending brace or exeed length 255. The characters that exceed this
6759 * limit are dropped. The limit could be relaxed should it become
6760 * desirable by reparsing this as (?:\N{NAME}), so could generate
6761 * multiple EXACT nodes, as is done for just regular input. But this
6762 * is primarily a named character, and not intended to be a huge long
6763 * string, so 255 bytes should be good enough */
6765 STRLEN length_of_hex;
6766 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6767 | PERL_SCAN_DISALLOW_PREFIX
6768 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6769 UV cp; /* Ord of current character */
6771 /* Code points are separated by dots. If none, there is only one
6772 * code point, and is terminated by the brace */
6773 endchar = strchr(RExC_parse, '.');
6774 if (! endchar) endchar = endbrace;
6776 /* The values are Unicode even on EBCDIC machines */
6777 length_of_hex = (STRLEN)(endchar - RExC_parse);
6778 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6779 if ( length_of_hex == 0
6780 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6782 RExC_parse += length_of_hex; /* Includes all the valid */
6783 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6784 ? UTF8SKIP(RExC_parse)
6786 /* Guard against malformed utf8 */
6787 if (RExC_parse >= endchar) RExC_parse = endchar;
6788 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6791 if (! FOLD) { /* Not folding, just append to the string */
6794 /* Quit before adding this character if would exceed limit */
6795 if (len + UNISKIP(cp) > U8_MAX) break;
6797 unilen = reguni(pRExC_state, cp, s);
6802 } else { /* Folding, output the folded equivalent */
6803 STRLEN foldlen,numlen;
6804 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6805 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6807 /* Quit before exceeding size limit */
6808 if (len + foldlen > U8_MAX) break;
6810 for (foldbuf = tmpbuf;
6814 cp = utf8_to_uvchr(foldbuf, &numlen);
6816 const STRLEN unilen = reguni(pRExC_state, cp, s);
6819 /* In EBCDIC the numlen and unilen can differ. */
6821 if (numlen >= foldlen)
6825 break; /* "Can't happen." */
6829 /* Point to the beginning of the next character in the sequence. */
6830 RExC_parse = endchar + 1;
6832 /* Quit if no more characters */
6833 if (RExC_parse >= endbrace) break;
6838 if (RExC_parse < endbrace) {
6839 ckWARNreg(RExC_parse - 1,
6840 "Using just the first characters returned by \\N{}");
6843 RExC_size += STR_SZ(len);
6846 RExC_emit += STR_SZ(len);
6849 RExC_parse = endbrace + 1;
6851 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6852 with malformed in t/re/pat_advanced.t */
6854 Set_Node_Cur_Length(ret); /* MJD */
6855 nextchar(pRExC_state);
6865 * It returns the code point in utf8 for the value in *encp.
6866 * value: a code value in the source encoding
6867 * encp: a pointer to an Encode object
6869 * If the result from Encode is not a single character,
6870 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6873 S_reg_recode(pTHX_ const char value, SV **encp)
6876 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6877 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6878 const STRLEN newlen = SvCUR(sv);
6879 UV uv = UNICODE_REPLACEMENT;
6881 PERL_ARGS_ASSERT_REG_RECODE;
6885 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6888 if (!newlen || numlen != newlen) {
6889 uv = UNICODE_REPLACEMENT;
6897 - regatom - the lowest level
6899 Try to identify anything special at the start of the pattern. If there
6900 is, then handle it as required. This may involve generating a single regop,
6901 such as for an assertion; or it may involve recursing, such as to
6902 handle a () structure.
6904 If the string doesn't start with something special then we gobble up
6905 as much literal text as we can.
6907 Once we have been able to handle whatever type of thing started the
6908 sequence, we return.
6910 Note: we have to be careful with escapes, as they can be both literal
6911 and special, and in the case of \10 and friends can either, depending
6912 on context. Specifically there are two seperate switches for handling
6913 escape sequences, with the one for handling literal escapes requiring
6914 a dummy entry for all of the special escapes that are actually handled
6919 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6922 register regnode *ret = NULL;
6924 char *parse_start = RExC_parse;
6925 GET_RE_DEBUG_FLAGS_DECL;
6926 DEBUG_PARSE("atom");
6927 *flagp = WORST; /* Tentatively. */
6929 PERL_ARGS_ASSERT_REGATOM;
6932 switch ((U8)*RExC_parse) {
6934 RExC_seen_zerolen++;
6935 nextchar(pRExC_state);
6936 if (RExC_flags & RXf_PMf_MULTILINE)
6937 ret = reg_node(pRExC_state, MBOL);
6938 else if (RExC_flags & RXf_PMf_SINGLELINE)
6939 ret = reg_node(pRExC_state, SBOL);
6941 ret = reg_node(pRExC_state, BOL);
6942 Set_Node_Length(ret, 1); /* MJD */
6945 nextchar(pRExC_state);
6947 RExC_seen_zerolen++;
6948 if (RExC_flags & RXf_PMf_MULTILINE)
6949 ret = reg_node(pRExC_state, MEOL);
6950 else if (RExC_flags & RXf_PMf_SINGLELINE)
6951 ret = reg_node(pRExC_state, SEOL);
6953 ret = reg_node(pRExC_state, EOL);
6954 Set_Node_Length(ret, 1); /* MJD */
6957 nextchar(pRExC_state);
6958 if (RExC_flags & RXf_PMf_SINGLELINE)
6959 ret = reg_node(pRExC_state, SANY);
6961 ret = reg_node(pRExC_state, REG_ANY);
6962 *flagp |= HASWIDTH|SIMPLE;
6964 Set_Node_Length(ret, 1); /* MJD */
6968 char * const oregcomp_parse = ++RExC_parse;
6969 ret = regclass(pRExC_state,depth+1);
6970 if (*RExC_parse != ']') {
6971 RExC_parse = oregcomp_parse;
6972 vFAIL("Unmatched [");
6974 nextchar(pRExC_state);
6975 *flagp |= HASWIDTH|SIMPLE;
6976 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6980 nextchar(pRExC_state);
6981 ret = reg(pRExC_state, 1, &flags,depth+1);
6983 if (flags & TRYAGAIN) {
6984 if (RExC_parse == RExC_end) {
6985 /* Make parent create an empty node if needed. */
6993 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6997 if (flags & TRYAGAIN) {
7001 vFAIL("Internal urp");
7002 /* Supposed to be caught earlier. */
7005 if (!regcurly(RExC_parse)) {
7014 vFAIL("Quantifier follows nothing");
7022 len=0; /* silence a spurious compiler warning */
7023 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7024 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7025 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7026 ret = reganode(pRExC_state, FOLDCHAR, cp);
7027 Set_Node_Length(ret, 1); /* MJD */
7028 nextchar(pRExC_state); /* kill whitespace under /x */
7036 This switch handles escape sequences that resolve to some kind
7037 of special regop and not to literal text. Escape sequnces that
7038 resolve to literal text are handled below in the switch marked
7041 Every entry in this switch *must* have a corresponding entry
7042 in the literal escape switch. However, the opposite is not
7043 required, as the default for this switch is to jump to the
7044 literal text handling code.
7046 switch ((U8)*++RExC_parse) {
7051 /* Special Escapes */
7053 RExC_seen_zerolen++;
7054 ret = reg_node(pRExC_state, SBOL);
7056 goto finish_meta_pat;
7058 ret = reg_node(pRExC_state, GPOS);
7059 RExC_seen |= REG_SEEN_GPOS;
7061 goto finish_meta_pat;
7063 RExC_seen_zerolen++;
7064 ret = reg_node(pRExC_state, KEEPS);
7066 /* XXX:dmq : disabling in-place substitution seems to
7067 * be necessary here to avoid cases of memory corruption, as
7068 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7070 RExC_seen |= REG_SEEN_LOOKBEHIND;
7071 goto finish_meta_pat;
7073 ret = reg_node(pRExC_state, SEOL);
7075 RExC_seen_zerolen++; /* Do not optimize RE away */
7076 goto finish_meta_pat;
7078 ret = reg_node(pRExC_state, EOS);
7080 RExC_seen_zerolen++; /* Do not optimize RE away */
7081 goto finish_meta_pat;
7083 ret = reg_node(pRExC_state, CANY);
7084 RExC_seen |= REG_SEEN_CANY;
7085 *flagp |= HASWIDTH|SIMPLE;
7086 goto finish_meta_pat;
7088 ret = reg_node(pRExC_state, CLUMP);
7090 goto finish_meta_pat;
7092 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7093 *flagp |= HASWIDTH|SIMPLE;
7094 goto finish_meta_pat;
7096 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7097 *flagp |= HASWIDTH|SIMPLE;
7098 goto finish_meta_pat;
7100 RExC_seen_zerolen++;
7101 RExC_seen |= REG_SEEN_LOOKBEHIND;
7102 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7104 goto finish_meta_pat;
7106 RExC_seen_zerolen++;
7107 RExC_seen |= REG_SEEN_LOOKBEHIND;
7108 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7110 goto finish_meta_pat;
7112 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7113 *flagp |= HASWIDTH|SIMPLE;
7114 goto finish_meta_pat;
7116 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7117 *flagp |= HASWIDTH|SIMPLE;
7118 goto finish_meta_pat;
7120 ret = reg_node(pRExC_state, DIGIT);
7121 *flagp |= HASWIDTH|SIMPLE;
7122 goto finish_meta_pat;
7124 ret = reg_node(pRExC_state, NDIGIT);
7125 *flagp |= HASWIDTH|SIMPLE;
7126 goto finish_meta_pat;
7128 ret = reg_node(pRExC_state, LNBREAK);
7129 *flagp |= HASWIDTH|SIMPLE;
7130 goto finish_meta_pat;
7132 ret = reg_node(pRExC_state, HORIZWS);
7133 *flagp |= HASWIDTH|SIMPLE;
7134 goto finish_meta_pat;
7136 ret = reg_node(pRExC_state, NHORIZWS);
7137 *flagp |= HASWIDTH|SIMPLE;
7138 goto finish_meta_pat;
7140 ret = reg_node(pRExC_state, VERTWS);
7141 *flagp |= HASWIDTH|SIMPLE;
7142 goto finish_meta_pat;
7144 ret = reg_node(pRExC_state, NVERTWS);
7145 *flagp |= HASWIDTH|SIMPLE;
7147 nextchar(pRExC_state);
7148 Set_Node_Length(ret, 2); /* MJD */
7153 char* const oldregxend = RExC_end;
7155 char* parse_start = RExC_parse - 2;
7158 if (RExC_parse[1] == '{') {
7159 /* a lovely hack--pretend we saw [\pX] instead */
7160 RExC_end = strchr(RExC_parse, '}');
7162 const U8 c = (U8)*RExC_parse;
7164 RExC_end = oldregxend;
7165 vFAIL2("Missing right brace on \\%c{}", c);
7170 RExC_end = RExC_parse + 2;
7171 if (RExC_end > oldregxend)
7172 RExC_end = oldregxend;
7176 ret = regclass(pRExC_state,depth+1);
7178 RExC_end = oldregxend;
7181 Set_Node_Offset(ret, parse_start + 2);
7182 Set_Node_Cur_Length(ret);
7183 nextchar(pRExC_state);
7184 *flagp |= HASWIDTH|SIMPLE;
7188 /* Handle \N and \N{NAME} here and not below because it can be
7189 multicharacter. join_exact() will join them up later on.
7190 Also this makes sure that things like /\N{BLAH}+/ and
7191 \N{BLAH} being multi char Just Happen. dmq*/
7193 ret= reg_namedseq(pRExC_state, NULL, flagp);
7195 case 'k': /* Handle \k<NAME> and \k'NAME' */
7198 char ch= RExC_parse[1];
7199 if (ch != '<' && ch != '\'' && ch != '{') {
7201 vFAIL2("Sequence %.2s... not terminated",parse_start);
7203 /* this pretty much dupes the code for (?P=...) in reg(), if
7204 you change this make sure you change that */
7205 char* name_start = (RExC_parse += 2);
7207 SV *sv_dat = reg_scan_name(pRExC_state,
7208 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7209 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7210 if (RExC_parse == name_start || *RExC_parse != ch)
7211 vFAIL2("Sequence %.3s... not terminated",parse_start);
7214 num = add_data( pRExC_state, 1, "S" );
7215 RExC_rxi->data->data[num]=(void*)sv_dat;
7216 SvREFCNT_inc_simple_void(sv_dat);
7220 ret = reganode(pRExC_state,
7221 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7225 /* override incorrect value set in reganode MJD */
7226 Set_Node_Offset(ret, parse_start+1);
7227 Set_Node_Cur_Length(ret); /* MJD */
7228 nextchar(pRExC_state);
7234 case '1': case '2': case '3': case '4':
7235 case '5': case '6': case '7': case '8': case '9':
7238 bool isg = *RExC_parse == 'g';
7243 if (*RExC_parse == '{') {
7247 if (*RExC_parse == '-') {
7251 if (hasbrace && !isDIGIT(*RExC_parse)) {
7252 if (isrel) RExC_parse--;
7254 goto parse_named_seq;
7256 num = atoi(RExC_parse);
7257 if (isg && num == 0)
7258 vFAIL("Reference to invalid group 0");
7260 num = RExC_npar - num;
7262 vFAIL("Reference to nonexistent or unclosed group");
7264 if (!isg && num > 9 && num >= RExC_npar)
7267 char * const parse_start = RExC_parse - 1; /* MJD */
7268 while (isDIGIT(*RExC_parse))
7270 if (parse_start == RExC_parse - 1)
7271 vFAIL("Unterminated \\g... pattern");
7273 if (*RExC_parse != '}')
7274 vFAIL("Unterminated \\g{...} pattern");
7278 if (num > (I32)RExC_rx->nparens)
7279 vFAIL("Reference to nonexistent group");
7282 ret = reganode(pRExC_state,
7283 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7287 /* override incorrect value set in reganode MJD */
7288 Set_Node_Offset(ret, parse_start+1);
7289 Set_Node_Cur_Length(ret); /* MJD */
7291 nextchar(pRExC_state);
7296 if (RExC_parse >= RExC_end)
7297 FAIL("Trailing \\");
7300 /* Do not generate "unrecognized" warnings here, we fall
7301 back into the quick-grab loop below */
7308 if (RExC_flags & RXf_PMf_EXTENDED) {
7309 if ( reg_skipcomment( pRExC_state ) )
7316 register STRLEN len;
7321 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7323 parse_start = RExC_parse - 1;
7329 ret = reg_node(pRExC_state,
7330 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7332 for (len = 0, p = RExC_parse - 1;
7333 len < 127 && p < RExC_end;
7336 char * const oldp = p;
7338 if (RExC_flags & RXf_PMf_EXTENDED)
7339 p = regwhite( pRExC_state, p );
7344 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7345 goto normal_default;
7355 /* Literal Escapes Switch
7357 This switch is meant to handle escape sequences that
7358 resolve to a literal character.
7360 Every escape sequence that represents something
7361 else, like an assertion or a char class, is handled
7362 in the switch marked 'Special Escapes' above in this
7363 routine, but also has an entry here as anything that
7364 isn't explicitly mentioned here will be treated as
7365 an unescaped equivalent literal.
7369 /* These are all the special escapes. */
7373 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7374 goto normal_default;
7375 case 'A': /* Start assertion */
7376 case 'b': case 'B': /* Word-boundary assertion*/
7377 case 'C': /* Single char !DANGEROUS! */
7378 case 'd': case 'D': /* digit class */
7379 case 'g': case 'G': /* generic-backref, pos assertion */
7380 case 'h': case 'H': /* HORIZWS */
7381 case 'k': case 'K': /* named backref, keep marker */
7382 case 'N': /* named char sequence */
7383 case 'p': case 'P': /* Unicode property */
7384 case 'R': /* LNBREAK */
7385 case 's': case 'S': /* space class */
7386 case 'v': case 'V': /* VERTWS */
7387 case 'w': case 'W': /* word class */
7388 case 'X': /* eXtended Unicode "combining character sequence" */
7389 case 'z': case 'Z': /* End of line/string assertion */
7393 /* Anything after here is an escape that resolves to a
7394 literal. (Except digits, which may or may not)
7413 ender = ASCII_TO_NATIVE('\033');
7417 ender = ASCII_TO_NATIVE('\007');
7422 char* const e = strchr(p, '}');
7426 vFAIL("Missing right brace on \\x{}");
7429 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7430 | PERL_SCAN_DISALLOW_PREFIX;
7431 STRLEN numlen = e - p - 1;
7432 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7439 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7441 ender = grok_hex(p, &numlen, &flags, NULL);
7444 if (PL_encoding && ender < 0x100)
7445 goto recode_encoding;
7449 ender = UCHARAT(p++);
7450 ender = toCTRL(ender);
7452 case '0': case '1': case '2': case '3':case '4':
7453 case '5': case '6': case '7': case '8':case '9':
7455 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7458 ender = grok_oct(p, &numlen, &flags, NULL);
7460 /* An octal above 0xff is interpreted differently
7461 * depending on if the re is in utf8 or not. If it
7462 * is in utf8, the value will be itself, otherwise
7463 * it is interpreted as modulo 0x100. It has been
7464 * decided to discourage the use of octal above the
7465 * single-byte range. For now, warn only when
7466 * it ends up modulo */
7467 if (SIZE_ONLY && ender >= 0x100
7468 && ! UTF && ! PL_encoding) {
7469 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7477 if (PL_encoding && ender < 0x100)
7478 goto recode_encoding;
7482 SV* enc = PL_encoding;
7483 ender = reg_recode((const char)(U8)ender, &enc);
7484 if (!enc && SIZE_ONLY)
7485 ckWARNreg(p, "Invalid escape in the specified encoding");
7491 FAIL("Trailing \\");
7494 if (!SIZE_ONLY&& isALPHA(*p))
7495 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7496 goto normal_default;
7501 if (UTF8_IS_START(*p) && UTF) {
7503 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7504 &numlen, UTF8_ALLOW_DEFAULT);
7511 if ( RExC_flags & RXf_PMf_EXTENDED)
7512 p = regwhite( pRExC_state, p );
7514 /* Prime the casefolded buffer. */
7515 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7517 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7522 /* Emit all the Unicode characters. */
7524 for (foldbuf = tmpbuf;
7526 foldlen -= numlen) {
7527 ender = utf8_to_uvchr(foldbuf, &numlen);
7529 const STRLEN unilen = reguni(pRExC_state, ender, s);
7532 /* In EBCDIC the numlen
7533 * and unilen can differ. */
7535 if (numlen >= foldlen)
7539 break; /* "Can't happen." */
7543 const STRLEN unilen = reguni(pRExC_state, ender, s);
7552 REGC((char)ender, s++);
7558 /* Emit all the Unicode characters. */
7560 for (foldbuf = tmpbuf;
7562 foldlen -= numlen) {
7563 ender = utf8_to_uvchr(foldbuf, &numlen);
7565 const STRLEN unilen = reguni(pRExC_state, ender, s);
7568 /* In EBCDIC the numlen
7569 * and unilen can differ. */
7571 if (numlen >= foldlen)
7579 const STRLEN unilen = reguni(pRExC_state, ender, s);
7588 REGC((char)ender, s++);
7592 Set_Node_Cur_Length(ret); /* MJD */
7593 nextchar(pRExC_state);
7595 /* len is STRLEN which is unsigned, need to copy to signed */
7598 vFAIL("Internal disaster");
7602 if (len == 1 && UNI_IS_INVARIANT(ender))
7606 RExC_size += STR_SZ(len);
7609 RExC_emit += STR_SZ(len);
7619 S_regwhite( RExC_state_t *pRExC_state, char *p )
7621 const char *e = RExC_end;
7623 PERL_ARGS_ASSERT_REGWHITE;
7628 else if (*p == '#') {
7637 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7645 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7646 Character classes ([:foo:]) can also be negated ([:^foo:]).
7647 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7648 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7649 but trigger failures because they are currently unimplemented. */
7651 #define POSIXCC_DONE(c) ((c) == ':')
7652 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7653 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7656 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7659 I32 namedclass = OOB_NAMEDCLASS;
7661 PERL_ARGS_ASSERT_REGPPOSIXCC;
7663 if (value == '[' && RExC_parse + 1 < RExC_end &&
7664 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7665 POSIXCC(UCHARAT(RExC_parse))) {
7666 const char c = UCHARAT(RExC_parse);
7667 char* const s = RExC_parse++;
7669 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7671 if (RExC_parse == RExC_end)
7672 /* Grandfather lone [:, [=, [. */
7675 const char* const t = RExC_parse++; /* skip over the c */
7678 if (UCHARAT(RExC_parse) == ']') {
7679 const char *posixcc = s + 1;
7680 RExC_parse++; /* skip over the ending ] */
7683 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7684 const I32 skip = t - posixcc;
7686 /* Initially switch on the length of the name. */
7689 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7690 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7693 /* Names all of length 5. */
7694 /* alnum alpha ascii blank cntrl digit graph lower
7695 print punct space upper */
7696 /* Offset 4 gives the best switch position. */
7697 switch (posixcc[4]) {
7699 if (memEQ(posixcc, "alph", 4)) /* alpha */
7700 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7703 if (memEQ(posixcc, "spac", 4)) /* space */
7704 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7707 if (memEQ(posixcc, "grap", 4)) /* graph */
7708 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7711 if (memEQ(posixcc, "asci", 4)) /* ascii */
7712 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7715 if (memEQ(posixcc, "blan", 4)) /* blank */
7716 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7719 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7720 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7723 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7724 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7727 if (memEQ(posixcc, "lowe", 4)) /* lower */
7728 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7729 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7730 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7733 if (memEQ(posixcc, "digi", 4)) /* digit */
7734 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7735 else if (memEQ(posixcc, "prin", 4)) /* print */
7736 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7737 else if (memEQ(posixcc, "punc", 4)) /* punct */
7738 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7743 if (memEQ(posixcc, "xdigit", 6))
7744 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7748 if (namedclass == OOB_NAMEDCLASS)
7749 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7751 assert (posixcc[skip] == ':');
7752 assert (posixcc[skip+1] == ']');
7753 } else if (!SIZE_ONLY) {
7754 /* [[=foo=]] and [[.foo.]] are still future. */
7756 /* adjust RExC_parse so the warning shows after
7758 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7760 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7763 /* Maternal grandfather:
7764 * "[:" ending in ":" but not in ":]" */
7774 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7778 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7780 if (POSIXCC(UCHARAT(RExC_parse))) {
7781 const char *s = RExC_parse;
7782 const char c = *s++;
7786 if (*s && c == *s && s[1] == ']') {
7788 "POSIX syntax [%c %c] belongs inside character classes",
7791 /* [[=foo=]] and [[.foo.]] are still future. */
7792 if (POSIXCC_NOTYET(c)) {
7793 /* adjust RExC_parse so the error shows after
7795 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7797 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7804 #define _C_C_T_(NAME,TEST,WORD) \
7807 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7809 for (value = 0; value < 256; value++) \
7811 ANYOF_BITMAP_SET(ret, value); \
7816 case ANYOF_N##NAME: \
7818 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7820 for (value = 0; value < 256; value++) \
7822 ANYOF_BITMAP_SET(ret, value); \
7828 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7830 for (value = 0; value < 256; value++) \
7832 ANYOF_BITMAP_SET(ret, value); \
7836 case ANYOF_N##NAME: \
7837 for (value = 0; value < 256; value++) \
7839 ANYOF_BITMAP_SET(ret, value); \
7845 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7846 so that it is possible to override the option here without having to
7847 rebuild the entire core. as we are required to do if we change regcomp.h
7848 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7850 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7851 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7854 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7855 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7857 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7861 parse a class specification and produce either an ANYOF node that
7862 matches the pattern or if the pattern matches a single char only and
7863 that char is < 256 and we are case insensitive then we produce an
7868 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7871 register UV nextvalue;
7872 register IV prevvalue = OOB_UNICODE;
7873 register IV range = 0;
7874 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7875 register regnode *ret;
7878 char *rangebegin = NULL;
7879 bool need_class = 0;
7882 bool optimize_invert = TRUE;
7883 AV* unicode_alternate = NULL;
7885 UV literal_endpoint = 0;
7887 UV stored = 0; /* number of chars stored in the class */
7889 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7890 case we need to change the emitted regop to an EXACT. */
7891 const char * orig_parse = RExC_parse;
7892 GET_RE_DEBUG_FLAGS_DECL;
7894 PERL_ARGS_ASSERT_REGCLASS;
7896 PERL_UNUSED_ARG(depth);
7899 DEBUG_PARSE("clas");
7901 /* Assume we are going to generate an ANYOF node. */
7902 ret = reganode(pRExC_state, ANYOF, 0);
7905 ANYOF_FLAGS(ret) = 0;
7907 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7911 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7915 RExC_size += ANYOF_SKIP;
7916 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7919 RExC_emit += ANYOF_SKIP;
7921 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7923 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7924 ANYOF_BITMAP_ZERO(ret);
7925 listsv = newSVpvs("# comment\n");
7928 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7930 if (!SIZE_ONLY && POSIXCC(nextvalue))
7931 checkposixcc(pRExC_state);
7933 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7934 if (UCHARAT(RExC_parse) == ']')
7938 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7942 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7945 rangebegin = RExC_parse;
7947 value = utf8n_to_uvchr((U8*)RExC_parse,
7948 RExC_end - RExC_parse,
7949 &numlen, UTF8_ALLOW_DEFAULT);
7950 RExC_parse += numlen;
7953 value = UCHARAT(RExC_parse++);
7955 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7956 if (value == '[' && POSIXCC(nextvalue))
7957 namedclass = regpposixcc(pRExC_state, value);
7958 else if (value == '\\') {
7960 value = utf8n_to_uvchr((U8*)RExC_parse,
7961 RExC_end - RExC_parse,
7962 &numlen, UTF8_ALLOW_DEFAULT);
7963 RExC_parse += numlen;
7966 value = UCHARAT(RExC_parse++);
7967 /* Some compilers cannot handle switching on 64-bit integer
7968 * values, therefore value cannot be an UV. Yes, this will
7969 * be a problem later if we want switch on Unicode.
7970 * A similar issue a little bit later when switching on
7971 * namedclass. --jhi */
7972 switch ((I32)value) {
7973 case 'w': namedclass = ANYOF_ALNUM; break;
7974 case 'W': namedclass = ANYOF_NALNUM; break;
7975 case 's': namedclass = ANYOF_SPACE; break;
7976 case 'S': namedclass = ANYOF_NSPACE; break;
7977 case 'd': namedclass = ANYOF_DIGIT; break;
7978 case 'D': namedclass = ANYOF_NDIGIT; break;
7979 case 'v': namedclass = ANYOF_VERTWS; break;
7980 case 'V': namedclass = ANYOF_NVERTWS; break;
7981 case 'h': namedclass = ANYOF_HORIZWS; break;
7982 case 'H': namedclass = ANYOF_NHORIZWS; break;
7983 case 'N': /* Handle \N{NAME} in class */
7985 /* We only pay attention to the first char of
7986 multichar strings being returned. I kinda wonder
7987 if this makes sense as it does change the behaviour
7988 from earlier versions, OTOH that behaviour was broken
7990 UV v; /* value is register so we cant & it /grrr */
7991 if (reg_namedseq(pRExC_state, &v, NULL)) {
8001 if (RExC_parse >= RExC_end)
8002 vFAIL2("Empty \\%c{}", (U8)value);
8003 if (*RExC_parse == '{') {
8004 const U8 c = (U8)value;
8005 e = strchr(RExC_parse++, '}');
8007 vFAIL2("Missing right brace on \\%c{}", c);
8008 while (isSPACE(UCHARAT(RExC_parse)))
8010 if (e == RExC_parse)
8011 vFAIL2("Empty \\%c{}", c);
8013 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8021 if (UCHARAT(RExC_parse) == '^') {
8024 value = value == 'p' ? 'P' : 'p'; /* toggle */
8025 while (isSPACE(UCHARAT(RExC_parse))) {
8030 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8031 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8034 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8035 namedclass = ANYOF_MAX; /* no official name, but it's named */
8038 case 'n': value = '\n'; break;
8039 case 'r': value = '\r'; break;
8040 case 't': value = '\t'; break;
8041 case 'f': value = '\f'; break;
8042 case 'b': value = '\b'; break;
8043 case 'e': value = ASCII_TO_NATIVE('\033');break;
8044 case 'a': value = ASCII_TO_NATIVE('\007');break;
8046 if (*RExC_parse == '{') {
8047 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8048 | PERL_SCAN_DISALLOW_PREFIX;
8049 char * const e = strchr(RExC_parse++, '}');
8051 vFAIL("Missing right brace on \\x{}");
8053 numlen = e - RExC_parse;
8054 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8058 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8060 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8061 RExC_parse += numlen;
8063 if (PL_encoding && value < 0x100)
8064 goto recode_encoding;
8067 value = UCHARAT(RExC_parse++);
8068 value = toCTRL(value);
8070 case '0': case '1': case '2': case '3': case '4':
8071 case '5': case '6': case '7': case '8': case '9':
8075 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8076 RExC_parse += numlen;
8077 if (PL_encoding && value < 0x100)
8078 goto recode_encoding;
8083 SV* enc = PL_encoding;
8084 value = reg_recode((const char)(U8)value, &enc);
8085 if (!enc && SIZE_ONLY)
8086 ckWARNreg(RExC_parse,
8087 "Invalid escape in the specified encoding");
8091 if (!SIZE_ONLY && isALPHA(value))
8092 ckWARN2reg(RExC_parse,
8093 "Unrecognized escape \\%c in character class passed through",
8097 } /* end of \blah */
8103 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8105 if (!SIZE_ONLY && !need_class)
8106 ANYOF_CLASS_ZERO(ret);
8110 /* a bad range like a-\d, a-[:digit:] ? */
8114 RExC_parse >= rangebegin ?
8115 RExC_parse - rangebegin : 0;
8116 ckWARN4reg(RExC_parse,
8117 "False [] range \"%*.*s\"",
8120 if (prevvalue < 256) {
8121 ANYOF_BITMAP_SET(ret, prevvalue);
8122 ANYOF_BITMAP_SET(ret, '-');
8125 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8126 Perl_sv_catpvf(aTHX_ listsv,
8127 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8131 range = 0; /* this was not a true range */
8137 const char *what = NULL;
8140 if (namedclass > OOB_NAMEDCLASS)
8141 optimize_invert = FALSE;
8142 /* Possible truncation here but in some 64-bit environments
8143 * the compiler gets heartburn about switch on 64-bit values.
8144 * A similar issue a little earlier when switching on value.
8146 switch ((I32)namedclass) {
8148 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8149 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8150 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8151 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8152 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8153 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8154 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8155 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8156 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8157 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8158 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8159 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8160 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8162 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8163 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8165 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8166 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8167 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8170 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8173 for (value = 0; value < 128; value++)
8174 ANYOF_BITMAP_SET(ret, value);
8176 for (value = 0; value < 256; value++) {
8178 ANYOF_BITMAP_SET(ret, value);
8187 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8190 for (value = 128; value < 256; value++)
8191 ANYOF_BITMAP_SET(ret, value);
8193 for (value = 0; value < 256; value++) {
8194 if (!isASCII(value))
8195 ANYOF_BITMAP_SET(ret, value);
8204 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8206 /* consecutive digits assumed */
8207 for (value = '0'; value <= '9'; value++)
8208 ANYOF_BITMAP_SET(ret, value);
8211 what = POSIX_CC_UNI_NAME("Digit");
8215 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8217 /* consecutive digits assumed */
8218 for (value = 0; value < '0'; value++)
8219 ANYOF_BITMAP_SET(ret, value);
8220 for (value = '9' + 1; value < 256; value++)
8221 ANYOF_BITMAP_SET(ret, value);
8224 what = POSIX_CC_UNI_NAME("Digit");
8227 /* this is to handle \p and \P */
8230 vFAIL("Invalid [::] class");
8234 /* Strings such as "+utf8::isWord\n" */
8235 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8238 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8241 } /* end of namedclass \blah */
8244 if (prevvalue > (IV)value) /* b-a */ {
8245 const int w = RExC_parse - rangebegin;
8246 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8247 range = 0; /* not a valid range */
8251 prevvalue = value; /* save the beginning of the range */
8252 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8253 RExC_parse[1] != ']') {
8256 /* a bad range like \w-, [:word:]- ? */
8257 if (namedclass > OOB_NAMEDCLASS) {
8258 if (ckWARN(WARN_REGEXP)) {
8260 RExC_parse >= rangebegin ?
8261 RExC_parse - rangebegin : 0;
8263 "False [] range \"%*.*s\"",
8267 ANYOF_BITMAP_SET(ret, '-');
8269 range = 1; /* yeah, it's a range! */
8270 continue; /* but do it the next time */
8274 /* now is the next time */
8275 /*stored += (value - prevvalue + 1);*/
8277 if (prevvalue < 256) {
8278 const IV ceilvalue = value < 256 ? value : 255;
8281 /* In EBCDIC [\x89-\x91] should include
8282 * the \x8e but [i-j] should not. */
8283 if (literal_endpoint == 2 &&
8284 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8285 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8287 if (isLOWER(prevvalue)) {
8288 for (i = prevvalue; i <= ceilvalue; i++)
8289 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8291 ANYOF_BITMAP_SET(ret, i);
8294 for (i = prevvalue; i <= ceilvalue; i++)
8295 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8297 ANYOF_BITMAP_SET(ret, i);
8303 for (i = prevvalue; i <= ceilvalue; i++) {
8304 if (!ANYOF_BITMAP_TEST(ret,i)) {
8306 ANYOF_BITMAP_SET(ret, i);
8310 if (value > 255 || UTF) {
8311 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8312 const UV natvalue = NATIVE_TO_UNI(value);
8313 stored+=2; /* can't optimize this class */
8314 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8315 if (prevnatvalue < natvalue) { /* what about > ? */
8316 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8317 prevnatvalue, natvalue);
8319 else if (prevnatvalue == natvalue) {
8320 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8322 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8324 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8326 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8327 if (RExC_precomp[0] == ':' &&
8328 RExC_precomp[1] == '[' &&
8329 (f == 0xDF || f == 0x92)) {
8330 f = NATIVE_TO_UNI(f);
8333 /* If folding and foldable and a single
8334 * character, insert also the folded version
8335 * to the charclass. */
8337 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8338 if ((RExC_precomp[0] == ':' &&
8339 RExC_precomp[1] == '[' &&
8341 (value == 0xFB05 || value == 0xFB06))) ?
8342 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8343 foldlen == (STRLEN)UNISKIP(f) )
8345 if (foldlen == (STRLEN)UNISKIP(f))
8347 Perl_sv_catpvf(aTHX_ listsv,
8350 /* Any multicharacter foldings
8351 * require the following transform:
8352 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8353 * where E folds into "pq" and F folds
8354 * into "rst", all other characters
8355 * fold to single characters. We save
8356 * away these multicharacter foldings,
8357 * to be later saved as part of the
8358 * additional "s" data. */
8361 if (!unicode_alternate)
8362 unicode_alternate = newAV();
8363 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8365 av_push(unicode_alternate, sv);
8369 /* If folding and the value is one of the Greek
8370 * sigmas insert a few more sigmas to make the
8371 * folding rules of the sigmas to work right.
8372 * Note that not all the possible combinations
8373 * are handled here: some of them are handled
8374 * by the standard folding rules, and some of
8375 * them (literal or EXACTF cases) are handled
8376 * during runtime in regexec.c:S_find_byclass(). */
8377 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8378 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8379 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8380 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8381 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8383 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8384 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8385 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8390 literal_endpoint = 0;
8394 range = 0; /* this range (if it was one) is done now */
8398 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8400 RExC_size += ANYOF_CLASS_ADD_SKIP;
8402 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8408 /****** !SIZE_ONLY AFTER HERE *********/
8410 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8411 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8413 /* optimize single char class to an EXACT node
8414 but *only* when its not a UTF/high char */
8415 const char * cur_parse= RExC_parse;
8416 RExC_emit = (regnode *)orig_emit;
8417 RExC_parse = (char *)orig_parse;
8418 ret = reg_node(pRExC_state,
8419 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8420 RExC_parse = (char *)cur_parse;
8421 *STRING(ret)= (char)value;
8423 RExC_emit += STR_SZ(1);
8424 SvREFCNT_dec(listsv);
8427 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8428 if ( /* If the only flag is folding (plus possibly inversion). */
8429 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8431 for (value = 0; value < 256; ++value) {
8432 if (ANYOF_BITMAP_TEST(ret, value)) {
8433 UV fold = PL_fold[value];
8436 ANYOF_BITMAP_SET(ret, fold);
8439 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8442 /* optimize inverted simple patterns (e.g. [^a-z]) */
8443 if (optimize_invert &&
8444 /* If the only flag is inversion. */
8445 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8446 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8447 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8448 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8451 AV * const av = newAV();
8453 /* The 0th element stores the character class description
8454 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8455 * to initialize the appropriate swash (which gets stored in
8456 * the 1st element), and also useful for dumping the regnode.
8457 * The 2nd element stores the multicharacter foldings,
8458 * used later (regexec.c:S_reginclass()). */
8459 av_store(av, 0, listsv);
8460 av_store(av, 1, NULL);
8461 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8462 rv = newRV_noinc(MUTABLE_SV(av));
8463 n = add_data(pRExC_state, 1, "s");
8464 RExC_rxi->data->data[n] = (void*)rv;
8472 /* reg_skipcomment()
8474 Absorbs an /x style # comments from the input stream.
8475 Returns true if there is more text remaining in the stream.
8476 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8477 terminates the pattern without including a newline.
8479 Note its the callers responsibility to ensure that we are
8485 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8489 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8491 while (RExC_parse < RExC_end)
8492 if (*RExC_parse++ == '\n') {
8497 /* we ran off the end of the pattern without ending
8498 the comment, so we have to add an \n when wrapping */
8499 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8507 Advance that parse position, and optionall absorbs
8508 "whitespace" from the inputstream.
8510 Without /x "whitespace" means (?#...) style comments only,
8511 with /x this means (?#...) and # comments and whitespace proper.
8513 Returns the RExC_parse point from BEFORE the scan occurs.
8515 This is the /x friendly way of saying RExC_parse++.
8519 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8521 char* const retval = RExC_parse++;
8523 PERL_ARGS_ASSERT_NEXTCHAR;
8526 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8527 RExC_parse[2] == '#') {
8528 while (*RExC_parse != ')') {
8529 if (RExC_parse == RExC_end)
8530 FAIL("Sequence (?#... not terminated");
8536 if (RExC_flags & RXf_PMf_EXTENDED) {
8537 if (isSPACE(*RExC_parse)) {
8541 else if (*RExC_parse == '#') {
8542 if ( reg_skipcomment( pRExC_state ) )
8551 - reg_node - emit a node
8553 STATIC regnode * /* Location. */
8554 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8557 register regnode *ptr;
8558 regnode * const ret = RExC_emit;
8559 GET_RE_DEBUG_FLAGS_DECL;
8561 PERL_ARGS_ASSERT_REG_NODE;
8564 SIZE_ALIGN(RExC_size);
8568 if (RExC_emit >= RExC_emit_bound)
8569 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8571 NODE_ALIGN_FILL(ret);
8573 FILL_ADVANCE_NODE(ptr, op);
8574 REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
8575 #ifdef RE_TRACK_PATTERN_OFFSETS
8576 if (RExC_offsets) { /* MJD */
8577 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8578 "reg_node", __LINE__,
8580 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8581 ? "Overwriting end of array!\n" : "OK",
8582 (UV)(RExC_emit - RExC_emit_start),
8583 (UV)(RExC_parse - RExC_start),
8584 (UV)RExC_offsets[0]));
8585 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8593 - reganode - emit a node with an argument
8595 STATIC regnode * /* Location. */
8596 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8599 register regnode *ptr;
8600 regnode * const ret = RExC_emit;
8601 GET_RE_DEBUG_FLAGS_DECL;
8603 PERL_ARGS_ASSERT_REGANODE;
8606 SIZE_ALIGN(RExC_size);
8611 assert(2==regarglen[op]+1);
8613 Anything larger than this has to allocate the extra amount.
8614 If we changed this to be:
8616 RExC_size += (1 + regarglen[op]);
8618 then it wouldn't matter. Its not clear what side effect
8619 might come from that so its not done so far.
8624 if (RExC_emit >= RExC_emit_bound)
8625 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8627 NODE_ALIGN_FILL(ret);
8629 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8630 REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
8631 #ifdef RE_TRACK_PATTERN_OFFSETS
8632 if (RExC_offsets) { /* MJD */
8633 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8637 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8638 "Overwriting end of array!\n" : "OK",
8639 (UV)(RExC_emit - RExC_emit_start),
8640 (UV)(RExC_parse - RExC_start),
8641 (UV)RExC_offsets[0]));
8642 Set_Cur_Node_Offset;
8650 - reguni - emit (if appropriate) a Unicode character
8653 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8657 PERL_ARGS_ASSERT_REGUNI;
8659 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8663 - reginsert - insert an operator in front of already-emitted operand
8665 * Means relocating the operand.
8668 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8671 register regnode *src;
8672 register regnode *dst;
8673 register regnode *place;
8674 const int offset = regarglen[(U8)op];
8675 const int size = NODE_STEP_REGNODE + offset;
8676 GET_RE_DEBUG_FLAGS_DECL;
8678 PERL_ARGS_ASSERT_REGINSERT;
8679 PERL_UNUSED_ARG(depth);
8680 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8681 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8690 if (RExC_open_parens) {
8692 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8693 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8694 if ( RExC_open_parens[paren] >= opnd ) {
8695 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8696 RExC_open_parens[paren] += size;
8698 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8700 if ( RExC_close_parens[paren] >= opnd ) {
8701 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8702 RExC_close_parens[paren] += size;
8704 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8709 while (src > opnd) {
8710 StructCopy(--src, --dst, regnode);
8711 #ifdef RE_TRACK_PATTERN_OFFSETS
8712 if (RExC_offsets) { /* MJD 20010112 */
8713 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8717 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8718 ? "Overwriting end of array!\n" : "OK",
8719 (UV)(src - RExC_emit_start),
8720 (UV)(dst - RExC_emit_start),
8721 (UV)RExC_offsets[0]));
8722 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8723 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8729 place = opnd; /* Op node, where operand used to be. */
8730 #ifdef RE_TRACK_PATTERN_OFFSETS
8731 if (RExC_offsets) { /* MJD */
8732 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8736 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8737 ? "Overwriting end of array!\n" : "OK",
8738 (UV)(place - RExC_emit_start),
8739 (UV)(RExC_parse - RExC_start),
8740 (UV)RExC_offsets[0]));
8741 Set_Node_Offset(place, RExC_parse);
8742 Set_Node_Length(place, 1);
8745 src = NEXTOPER(place);
8746 FILL_ADVANCE_NODE(place, op);
8747 REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
8748 Zero(src, offset, regnode);
8752 - regtail - set the next-pointer at the end of a node chain of p to val.
8753 - SEE ALSO: regtail_study
8755 /* TODO: All three parms should be const */
8757 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8760 register regnode *scan;
8761 GET_RE_DEBUG_FLAGS_DECL;
8763 PERL_ARGS_ASSERT_REGTAIL;
8765 PERL_UNUSED_ARG(depth);
8771 /* Find last node. */
8774 regnode * const temp = regnext(scan);
8776 SV * const mysv=sv_newmortal();
8777 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8778 regprop(RExC_rx, mysv, scan);
8779 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8780 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8781 (temp == NULL ? "->" : ""),
8782 (temp == NULL ? PL_reg_name[OP(val)] : "")
8790 if (reg_off_by_arg[OP(scan)]) {
8791 ARG_SET(scan, val - scan);
8794 NEXT_OFF(scan) = val - scan;
8800 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8801 - Look for optimizable sequences at the same time.
8802 - currently only looks for EXACT chains.
8804 This is expermental code. The idea is to use this routine to perform
8805 in place optimizations on branches and groups as they are constructed,
8806 with the long term intention of removing optimization from study_chunk so
8807 that it is purely analytical.
8809 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8810 to control which is which.
8813 /* TODO: All four parms should be const */
8816 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8819 register regnode *scan;
8821 #ifdef EXPERIMENTAL_INPLACESCAN
8824 GET_RE_DEBUG_FLAGS_DECL;
8826 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8832 /* Find last node. */
8836 regnode * const temp = regnext(scan);
8837 #ifdef EXPERIMENTAL_INPLACESCAN
8838 if (PL_regkind[OP(scan)] == EXACT)
8839 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8847 if( exact == PSEUDO )
8849 else if ( exact != OP(scan) )
8858 SV * const mysv=sv_newmortal();
8859 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8860 regprop(RExC_rx, mysv, scan);
8861 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8862 SvPV_nolen_const(mysv),
8864 PL_reg_name[exact]);
8871 SV * const mysv_val=sv_newmortal();
8872 DEBUG_PARSE_MSG("");
8873 regprop(RExC_rx, mysv_val, val);
8874 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8875 SvPV_nolen_const(mysv_val),
8876 (IV)REG_NODE_NUM(val),
8880 if (reg_off_by_arg[OP(scan)]) {
8881 ARG_SET(scan, val - scan);
8884 NEXT_OFF(scan) = val - scan;
8892 - regcurly - a little FSA that accepts {\d+,?\d*}
8894 #ifndef PERL_IN_XSUB_RE
8896 Perl_regcurly(register const char *s)
8898 PERL_ARGS_ASSERT_REGCURLY;
8917 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8921 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8926 for (bit=0; bit<32; bit++) {
8927 if (flags & (1<<bit)) {
8929 PerlIO_printf(Perl_debug_log, "%s",lead);
8930 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8935 PerlIO_printf(Perl_debug_log, "\n");
8937 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8943 Perl_regdump(pTHX_ const regexp *r)
8947 SV * const sv = sv_newmortal();
8948 SV *dsv= sv_newmortal();
8950 GET_RE_DEBUG_FLAGS_DECL;
8952 PERL_ARGS_ASSERT_REGDUMP;
8954 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8956 /* Header fields of interest. */
8957 if (r->anchored_substr) {
8958 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8959 RE_SV_DUMPLEN(r->anchored_substr), 30);
8960 PerlIO_printf(Perl_debug_log,
8961 "anchored %s%s at %"IVdf" ",
8962 s, RE_SV_TAIL(r->anchored_substr),
8963 (IV)r->anchored_offset);
8964 } else if (r->anchored_utf8) {
8965 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8966 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8967 PerlIO_printf(Perl_debug_log,
8968 "anchored utf8 %s%s at %"IVdf" ",
8969 s, RE_SV_TAIL(r->anchored_utf8),
8970 (IV)r->anchored_offset);
8972 if (r->float_substr) {
8973 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8974 RE_SV_DUMPLEN(r->float_substr), 30);
8975 PerlIO_printf(Perl_debug_log,
8976 "floating %s%s at %"IVdf"..%"UVuf" ",
8977 s, RE_SV_TAIL(r->float_substr),
8978 (IV)r->float_min_offset, (UV)r->float_max_offset);
8979 } else if (r->float_utf8) {
8980 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8981 RE_SV_DUMPLEN(r->float_utf8), 30);
8982 PerlIO_printf(Perl_debug_log,
8983 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8984 s, RE_SV_TAIL(r->float_utf8),
8985 (IV)r->float_min_offset, (UV)r->float_max_offset);
8987 if (r->check_substr || r->check_utf8)
8988 PerlIO_printf(Perl_debug_log,
8990 (r->check_substr == r->float_substr
8991 && r->check_utf8 == r->float_utf8
8992 ? "(checking floating" : "(checking anchored"));
8993 if (r->extflags & RXf_NOSCAN)
8994 PerlIO_printf(Perl_debug_log, " noscan");
8995 if (r->extflags & RXf_CHECK_ALL)
8996 PerlIO_printf(Perl_debug_log, " isall");
8997 if (r->check_substr || r->check_utf8)
8998 PerlIO_printf(Perl_debug_log, ") ");
9000 if (ri->regstclass) {
9001 regprop(r, sv, ri->regstclass);
9002 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9004 if (r->extflags & RXf_ANCH) {
9005 PerlIO_printf(Perl_debug_log, "anchored");
9006 if (r->extflags & RXf_ANCH_BOL)
9007 PerlIO_printf(Perl_debug_log, "(BOL)");
9008 if (r->extflags & RXf_ANCH_MBOL)
9009 PerlIO_printf(Perl_debug_log, "(MBOL)");
9010 if (r->extflags & RXf_ANCH_SBOL)
9011 PerlIO_printf(Perl_debug_log, "(SBOL)");
9012 if (r->extflags & RXf_ANCH_GPOS)
9013 PerlIO_printf(Perl_debug_log, "(GPOS)");
9014 PerlIO_putc(Perl_debug_log, ' ');
9016 if (r->extflags & RXf_GPOS_SEEN)
9017 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9018 if (r->intflags & PREGf_SKIP)
9019 PerlIO_printf(Perl_debug_log, "plus ");
9020 if (r->intflags & PREGf_IMPLICIT)
9021 PerlIO_printf(Perl_debug_log, "implicit ");
9022 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9023 if (r->extflags & RXf_EVAL_SEEN)
9024 PerlIO_printf(Perl_debug_log, "with eval ");
9025 PerlIO_printf(Perl_debug_log, "\n");
9026 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9028 PERL_ARGS_ASSERT_REGDUMP;
9029 PERL_UNUSED_CONTEXT;
9031 #endif /* DEBUGGING */
9035 - regprop - printable representation of opcode
9037 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9040 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9041 if (flags & ANYOF_INVERT) \
9042 /*make sure the invert info is in each */ \
9043 sv_catpvs(sv, "^"); \
9049 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9054 RXi_GET_DECL(prog,progi);
9055 GET_RE_DEBUG_FLAGS_DECL;
9057 PERL_ARGS_ASSERT_REGPROP;
9061 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9062 /* It would be nice to FAIL() here, but this may be called from
9063 regexec.c, and it would be hard to supply pRExC_state. */
9064 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9065 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9067 k = PL_regkind[OP(o)];
9071 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9072 * is a crude hack but it may be the best for now since
9073 * we have no flag "this EXACTish node was UTF-8"
9075 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9076 PERL_PV_ESCAPE_UNI_DETECT |
9077 PERL_PV_PRETTY_ELLIPSES |
9078 PERL_PV_PRETTY_LTGT |
9079 PERL_PV_PRETTY_NOCLEAR
9081 } else if (k == TRIE) {
9082 /* print the details of the trie in dumpuntil instead, as
9083 * progi->data isn't available here */
9084 const char op = OP(o);
9085 const U32 n = ARG(o);
9086 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9087 (reg_ac_data *)progi->data->data[n] :
9089 const reg_trie_data * const trie
9090 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9092 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9093 DEBUG_TRIE_COMPILE_r(
9094 Perl_sv_catpvf(aTHX_ sv,
9095 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9096 (UV)trie->startstate,
9097 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9098 (UV)trie->wordcount,
9101 (UV)TRIE_CHARCOUNT(trie),
9102 (UV)trie->uniquecharcount
9105 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9107 int rangestart = -1;
9108 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9110 for (i = 0; i <= 256; i++) {
9111 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9112 if (rangestart == -1)
9114 } else if (rangestart != -1) {
9115 if (i <= rangestart + 3)
9116 for (; rangestart < i; rangestart++)
9117 put_byte(sv, rangestart);
9119 put_byte(sv, rangestart);
9121 put_byte(sv, i - 1);
9129 } else if (k == CURLY) {
9130 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9131 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9132 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9134 else if (k == WHILEM && o->flags) /* Ordinal/of */
9135 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9136 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9137 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9138 if ( RXp_PAREN_NAMES(prog) ) {
9139 if ( k != REF || OP(o) < NREF) {
9140 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9141 SV **name= av_fetch(list, ARG(o), 0 );
9143 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9146 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9147 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9148 I32 *nums=(I32*)SvPVX(sv_dat);
9149 SV **name= av_fetch(list, nums[0], 0 );
9152 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9153 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9154 (n ? "," : ""), (IV)nums[n]);
9156 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9160 } else if (k == GOSUB)
9161 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9162 else if (k == VERB) {
9164 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9165 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9166 } else if (k == LOGICAL)
9167 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9168 else if (k == FOLDCHAR)
9169 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9170 else if (k == ANYOF) {
9171 int i, rangestart = -1;
9172 const U8 flags = ANYOF_FLAGS(o);
9175 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9176 static const char * const anyofs[] = {
9209 if (flags & ANYOF_LOCALE)
9210 sv_catpvs(sv, "{loc}");
9211 if (flags & ANYOF_FOLD)
9212 sv_catpvs(sv, "{i}");
9213 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9214 if (flags & ANYOF_INVERT)
9217 /* output what the standard cp 0-255 bitmap matches */
9218 for (i = 0; i <= 256; i++) {
9219 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9220 if (rangestart == -1)
9222 } else if (rangestart != -1) {
9223 if (i <= rangestart + 3)
9224 for (; rangestart < i; rangestart++)
9225 put_byte(sv, rangestart);
9227 put_byte(sv, rangestart);
9229 put_byte(sv, i - 1);
9236 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9237 /* output any special charclass tests (used mostly under use locale) */
9238 if (o->flags & ANYOF_CLASS)
9239 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9240 if (ANYOF_CLASS_TEST(o,i)) {
9241 sv_catpv(sv, anyofs[i]);
9245 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9247 /* output information about the unicode matching */
9248 if (flags & ANYOF_UNICODE)
9249 sv_catpvs(sv, "{unicode}");
9250 else if (flags & ANYOF_UNICODE_ALL)
9251 sv_catpvs(sv, "{unicode_all}");
9255 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9259 U8 s[UTF8_MAXBYTES_CASE+1];
9261 for (i = 0; i <= 256; i++) { /* just the first 256 */
9262 uvchr_to_utf8(s, i);
9264 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9265 if (rangestart == -1)
9267 } else if (rangestart != -1) {
9268 if (i <= rangestart + 3)
9269 for (; rangestart < i; rangestart++) {
9270 const U8 * const e = uvchr_to_utf8(s,rangestart);
9272 for(p = s; p < e; p++)
9276 const U8 *e = uvchr_to_utf8(s,rangestart);
9278 for (p = s; p < e; p++)
9281 e = uvchr_to_utf8(s, i-1);
9282 for (p = s; p < e; p++)
9289 sv_catpvs(sv, "..."); /* et cetera */
9293 char *s = savesvpv(lv);
9294 char * const origs = s;
9296 while (*s && *s != '\n')
9300 const char * const t = ++s;
9318 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9320 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9321 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9323 PERL_UNUSED_CONTEXT;
9324 PERL_UNUSED_ARG(sv);
9326 PERL_UNUSED_ARG(prog);
9327 #endif /* DEBUGGING */
9331 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9332 { /* Assume that RE_INTUIT is set */
9334 struct regexp *const prog = (struct regexp *)SvANY(r);
9335 GET_RE_DEBUG_FLAGS_DECL;
9337 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9338 PERL_UNUSED_CONTEXT;
9342 const char * const s = SvPV_nolen_const(prog->check_substr
9343 ? prog->check_substr : prog->check_utf8);
9345 if (!PL_colorset) reginitcolors();
9346 PerlIO_printf(Perl_debug_log,
9347 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9349 prog->check_substr ? "" : "utf8 ",
9350 PL_colors[5],PL_colors[0],
9353 (strlen(s) > 60 ? "..." : ""));
9356 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9362 handles refcounting and freeing the perl core regexp structure. When
9363 it is necessary to actually free the structure the first thing it
9364 does is call the 'free' method of the regexp_engine associated to to
9365 the regexp, allowing the handling of the void *pprivate; member
9366 first. (This routine is not overridable by extensions, which is why
9367 the extensions free is called first.)
9369 See regdupe and regdupe_internal if you change anything here.
9371 #ifndef PERL_IN_XSUB_RE
9373 Perl_pregfree(pTHX_ REGEXP *r)
9379 Perl_pregfree2(pTHX_ REGEXP *rx)
9382 struct regexp *const r = (struct regexp *)SvANY(rx);
9383 GET_RE_DEBUG_FLAGS_DECL;
9385 PERL_ARGS_ASSERT_PREGFREE2;
9388 ReREFCNT_dec(r->mother_re);
9390 CALLREGFREE_PVT(rx); /* free the private data */
9391 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9394 SvREFCNT_dec(r->anchored_substr);
9395 SvREFCNT_dec(r->anchored_utf8);
9396 SvREFCNT_dec(r->float_substr);
9397 SvREFCNT_dec(r->float_utf8);
9398 Safefree(r->substrs);
9400 RX_MATCH_COPY_FREE(rx);
9401 #ifdef PERL_OLD_COPY_ON_WRITE
9402 SvREFCNT_dec(r->saved_copy);
9409 This is a hacky workaround to the structural issue of match results
9410 being stored in the regexp structure which is in turn stored in
9411 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9412 could be PL_curpm in multiple contexts, and could require multiple
9413 result sets being associated with the pattern simultaneously, such
9414 as when doing a recursive match with (??{$qr})
9416 The solution is to make a lightweight copy of the regexp structure
9417 when a qr// is returned from the code executed by (??{$qr}) this
9418 lightweight copy doesnt actually own any of its data except for
9419 the starp/end and the actual regexp structure itself.
9425 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9428 struct regexp *const r = (struct regexp *)SvANY(rx);
9429 register const I32 npar = r->nparens+1;
9431 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9434 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9435 ret = (struct regexp *)SvANY(ret_x);
9437 (void)ReREFCNT_inc(rx);
9438 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9439 by pointing directly at the buffer, but flagging that the allocated
9440 space in the copy is zero. As we've just done a struct copy, it's now
9441 a case of zero-ing that, rather than copying the current length. */
9442 SvPV_set(ret_x, RX_WRAPPED(rx));
9443 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9444 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9445 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9446 SvLEN_set(ret_x, 0);
9447 SvSTASH_set(ret_x, NULL);
9448 SvMAGIC_set(ret_x, NULL);
9449 Newx(ret->offs, npar, regexp_paren_pair);
9450 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9452 Newx(ret->substrs, 1, struct reg_substr_data);
9453 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9455 SvREFCNT_inc_void(ret->anchored_substr);
9456 SvREFCNT_inc_void(ret->anchored_utf8);
9457 SvREFCNT_inc_void(ret->float_substr);
9458 SvREFCNT_inc_void(ret->float_utf8);
9460 /* check_substr and check_utf8, if non-NULL, point to either their
9461 anchored or float namesakes, and don't hold a second reference. */
9463 RX_MATCH_COPIED_off(ret_x);
9464 #ifdef PERL_OLD_COPY_ON_WRITE
9465 ret->saved_copy = NULL;
9467 ret->mother_re = rx;
9473 /* regfree_internal()
9475 Free the private data in a regexp. This is overloadable by
9476 extensions. Perl takes care of the regexp structure in pregfree(),
9477 this covers the *pprivate pointer which technically perldoesnt
9478 know about, however of course we have to handle the
9479 regexp_internal structure when no extension is in use.
9481 Note this is called before freeing anything in the regexp
9486 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9489 struct regexp *const r = (struct regexp *)SvANY(rx);
9491 GET_RE_DEBUG_FLAGS_DECL;
9493 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9499 SV *dsv= sv_newmortal();
9500 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9501 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9502 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9503 PL_colors[4],PL_colors[5],s);
9506 #ifdef RE_TRACK_PATTERN_OFFSETS
9508 Safefree(ri->u.offsets); /* 20010421 MJD */
9511 int n = ri->data->count;
9512 PAD* new_comppad = NULL;
9517 /* If you add a ->what type here, update the comment in regcomp.h */
9518 switch (ri->data->what[n]) {
9522 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9525 Safefree(ri->data->data[n]);
9528 new_comppad = MUTABLE_AV(ri->data->data[n]);
9531 if (new_comppad == NULL)
9532 Perl_croak(aTHX_ "panic: pregfree comppad");
9533 PAD_SAVE_LOCAL(old_comppad,
9534 /* Watch out for global destruction's random ordering. */
9535 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9538 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9541 op_free((OP_4tree*)ri->data->data[n]);
9543 PAD_RESTORE_LOCAL(old_comppad);
9544 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9550 { /* Aho Corasick add-on structure for a trie node.
9551 Used in stclass optimization only */
9553 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9555 refcount = --aho->refcount;
9558 PerlMemShared_free(aho->states);
9559 PerlMemShared_free(aho->fail);
9560 /* do this last!!!! */
9561 PerlMemShared_free(ri->data->data[n]);
9562 PerlMemShared_free(ri->regstclass);
9568 /* trie structure. */
9570 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9572 refcount = --trie->refcount;
9575 PerlMemShared_free(trie->charmap);
9576 PerlMemShared_free(trie->states);
9577 PerlMemShared_free(trie->trans);
9579 PerlMemShared_free(trie->bitmap);
9581 PerlMemShared_free(trie->wordlen);
9583 PerlMemShared_free(trie->jump);
9585 PerlMemShared_free(trie->nextword);
9586 /* do this last!!!! */
9587 PerlMemShared_free(ri->data->data[n]);
9592 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9595 Safefree(ri->data->what);
9602 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9603 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9604 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9605 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9608 re_dup - duplicate a regexp.
9610 This routine is expected to clone a given regexp structure. It is only
9611 compiled under USE_ITHREADS.
9613 After all of the core data stored in struct regexp is duplicated
9614 the regexp_engine.dupe method is used to copy any private data
9615 stored in the *pprivate pointer. This allows extensions to handle
9616 any duplication it needs to do.
9618 See pregfree() and regfree_internal() if you change anything here.
9620 #if defined(USE_ITHREADS)
9621 #ifndef PERL_IN_XSUB_RE
9623 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9627 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9628 struct regexp *ret = (struct regexp *)SvANY(dstr);
9630 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9632 npar = r->nparens+1;
9633 Newx(ret->offs, npar, regexp_paren_pair);
9634 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9636 /* no need to copy these */
9637 Newx(ret->swap, npar, regexp_paren_pair);
9641 /* Do it this way to avoid reading from *r after the StructCopy().
9642 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9643 cache, it doesn't matter. */
9644 const bool anchored = r->check_substr
9645 ? r->check_substr == r->anchored_substr
9646 : r->check_utf8 == r->anchored_utf8;
9647 Newx(ret->substrs, 1, struct reg_substr_data);
9648 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9650 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9651 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9652 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9653 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9655 /* check_substr and check_utf8, if non-NULL, point to either their
9656 anchored or float namesakes, and don't hold a second reference. */
9658 if (ret->check_substr) {
9660 assert(r->check_utf8 == r->anchored_utf8);
9661 ret->check_substr = ret->anchored_substr;
9662 ret->check_utf8 = ret->anchored_utf8;
9664 assert(r->check_substr == r->float_substr);
9665 assert(r->check_utf8 == r->float_utf8);
9666 ret->check_substr = ret->float_substr;
9667 ret->check_utf8 = ret->float_utf8;
9669 } else if (ret->check_utf8) {
9671 ret->check_utf8 = ret->anchored_utf8;
9673 ret->check_utf8 = ret->float_utf8;
9678 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9681 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9683 if (RX_MATCH_COPIED(dstr))
9684 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9687 #ifdef PERL_OLD_COPY_ON_WRITE
9688 ret->saved_copy = NULL;
9691 if (ret->mother_re) {
9692 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9693 /* Our storage points directly to our mother regexp, but that's
9694 1: a buffer in a different thread
9695 2: something we no longer hold a reference on
9696 so we need to copy it locally. */
9697 /* Note we need to sue SvCUR() on our mother_re, because it, in
9698 turn, may well be pointing to its own mother_re. */
9699 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9700 SvCUR(ret->mother_re)+1));
9701 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9703 ret->mother_re = NULL;
9707 #endif /* PERL_IN_XSUB_RE */
9712 This is the internal complement to regdupe() which is used to copy
9713 the structure pointed to by the *pprivate pointer in the regexp.
9714 This is the core version of the extension overridable cloning hook.
9715 The regexp structure being duplicated will be copied by perl prior
9716 to this and will be provided as the regexp *r argument, however
9717 with the /old/ structures pprivate pointer value. Thus this routine
9718 may override any copying normally done by perl.
9720 It returns a pointer to the new regexp_internal structure.
9724 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9727 struct regexp *const r = (struct regexp *)SvANY(rx);
9728 regexp_internal *reti;
9732 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9734 npar = r->nparens+1;
9737 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9738 Copy(ri->program, reti->program, len+1, regnode);
9741 reti->regstclass = NULL;
9745 const int count = ri->data->count;
9748 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9749 char, struct reg_data);
9750 Newx(d->what, count, U8);
9753 for (i = 0; i < count; i++) {
9754 d->what[i] = ri->data->what[i];
9755 switch (d->what[i]) {
9756 /* legal options are one of: sSfpontTu
9757 see also regcomp.h and pregfree() */
9760 case 'p': /* actually an AV, but the dup function is identical. */
9761 case 'u': /* actually an HV, but the dup function is identical. */
9762 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9765 /* This is cheating. */
9766 Newx(d->data[i], 1, struct regnode_charclass_class);
9767 StructCopy(ri->data->data[i], d->data[i],
9768 struct regnode_charclass_class);
9769 reti->regstclass = (regnode*)d->data[i];
9772 /* Compiled op trees are readonly and in shared memory,
9773 and can thus be shared without duplication. */
9775 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9779 /* Trie stclasses are readonly and can thus be shared
9780 * without duplication. We free the stclass in pregfree
9781 * when the corresponding reg_ac_data struct is freed.
9783 reti->regstclass= ri->regstclass;
9787 ((reg_trie_data*)ri->data->data[i])->refcount++;
9791 d->data[i] = ri->data->data[i];
9794 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9803 reti->name_list_idx = ri->name_list_idx;
9805 #ifdef RE_TRACK_PATTERN_OFFSETS
9806 if (ri->u.offsets) {
9807 Newx(reti->u.offsets, 2*len+1, U32);
9808 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9811 SetProgLen(reti,len);
9817 #endif /* USE_ITHREADS */
9819 #ifndef PERL_IN_XSUB_RE
9822 - regnext - dig the "next" pointer out of a node
9825 Perl_regnext(pTHX_ register regnode *p)
9828 register I32 offset;
9833 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9842 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9845 STRLEN l1 = strlen(pat1);
9846 STRLEN l2 = strlen(pat2);
9849 const char *message;
9851 PERL_ARGS_ASSERT_RE_CROAK2;
9857 Copy(pat1, buf, l1 , char);
9858 Copy(pat2, buf + l1, l2 , char);
9859 buf[l1 + l2] = '\n';
9860 buf[l1 + l2 + 1] = '\0';
9862 /* ANSI variant takes additional second argument */
9863 va_start(args, pat2);
9867 msv = vmess(buf, &args);
9869 message = SvPV_const(msv,l1);
9872 Copy(message, buf, l1 , char);
9873 buf[l1-1] = '\0'; /* Overwrite \n */
9874 Perl_croak(aTHX_ "%s", buf);
9877 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9879 #ifndef PERL_IN_XSUB_RE
9881 Perl_save_re_context(pTHX)
9885 struct re_save_state *state;
9887 SAVEVPTR(PL_curcop);
9888 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9890 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9891 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9892 SSPUSHINT(SAVEt_RE_STATE);
9894 Copy(&PL_reg_state, state, 1, struct re_save_state);
9896 PL_reg_start_tmp = 0;
9897 PL_reg_start_tmpl = 0;
9898 PL_reg_oldsaved = NULL;
9899 PL_reg_oldsavedlen = 0;
9901 PL_reg_leftiter = 0;
9902 PL_reg_poscache = NULL;
9903 PL_reg_poscache_size = 0;
9904 #ifdef PERL_OLD_COPY_ON_WRITE
9908 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9910 const REGEXP * const rx = PM_GETRE(PL_curpm);
9913 for (i = 1; i <= RX_NPARENS(rx); i++) {
9914 char digits[TYPE_CHARS(long)];
9915 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9916 GV *const *const gvp
9917 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9920 GV * const gv = *gvp;
9921 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9931 clear_re(pTHX_ void *r)
9934 ReREFCNT_dec((REGEXP *)r);
9940 S_put_byte(pTHX_ SV *sv, int c)
9942 PERL_ARGS_ASSERT_PUT_BYTE;
9944 /* Our definition of isPRINT() ignores locales, so only bytes that are
9945 not part of UTF-8 are considered printable. I assume that the same
9946 holds for UTF-EBCDIC.
9947 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9948 which Wikipedia says:
9950 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9951 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9952 identical, to the ASCII delete (DEL) or rubout control character.
9953 ) So the old condition can be simplified to !isPRINT(c) */
9955 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9957 const char string = c;
9958 if (c == '-' || c == ']' || c == '\\' || c == '^')
9959 sv_catpvs(sv, "\\");
9960 sv_catpvn(sv, &string, 1);
9965 #define CLEAR_OPTSTART \
9966 if (optstart) STMT_START { \
9967 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9971 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9973 STATIC const regnode *
9974 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9975 const regnode *last, const regnode *plast,
9976 SV* sv, I32 indent, U32 depth)
9979 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9980 register const regnode *next;
9981 const regnode *optstart= NULL;
9984 GET_RE_DEBUG_FLAGS_DECL;
9986 PERL_ARGS_ASSERT_DUMPUNTIL;
9988 #ifdef DEBUG_DUMPUNTIL
9989 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9990 last ? last-start : 0,plast ? plast-start : 0);
9993 if (plast && plast < last)
9996 while (PL_regkind[op] != END && (!last || node < last)) {
9997 /* While that wasn't END last time... */
10000 if (op == CLOSE || op == WHILEM)
10002 next = regnext((regnode *)node);
10005 if (OP(node) == OPTIMIZED) {
10006 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10013 regprop(r, sv, node);
10014 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10015 (int)(2*indent + 1), "", SvPVX_const(sv));
10017 if (OP(node) != OPTIMIZED) {
10018 if (next == NULL) /* Next ptr. */
10019 PerlIO_printf(Perl_debug_log, " (0)");
10020 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10021 PerlIO_printf(Perl_debug_log, " (FAIL)");
10023 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10024 (void)PerlIO_putc(Perl_debug_log, '\n');
10028 if (PL_regkind[(U8)op] == BRANCHJ) {
10031 register const regnode *nnode = (OP(next) == LONGJMP
10032 ? regnext((regnode *)next)
10034 if (last && nnode > last)
10036 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10039 else if (PL_regkind[(U8)op] == BRANCH) {
10041 DUMPUNTIL(NEXTOPER(node), next);
10043 else if ( PL_regkind[(U8)op] == TRIE ) {
10044 const regnode *this_trie = node;
10045 const char op = OP(node);
10046 const U32 n = ARG(node);
10047 const reg_ac_data * const ac = op>=AHOCORASICK ?
10048 (reg_ac_data *)ri->data->data[n] :
10050 const reg_trie_data * const trie =
10051 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10053 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10055 const regnode *nextbranch= NULL;
10058 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10059 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10061 PerlIO_printf(Perl_debug_log, "%*s%s ",
10062 (int)(2*(indent+3)), "",
10063 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10064 PL_colors[0], PL_colors[1],
10065 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10066 PERL_PV_PRETTY_ELLIPSES |
10067 PERL_PV_PRETTY_LTGT
10072 U16 dist= trie->jump[word_idx+1];
10073 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10074 (UV)((dist ? this_trie + dist : next) - start));
10077 nextbranch= this_trie + trie->jump[0];
10078 DUMPUNTIL(this_trie + dist, nextbranch);
10080 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10081 nextbranch= regnext((regnode *)nextbranch);
10083 PerlIO_printf(Perl_debug_log, "\n");
10086 if (last && next > last)
10091 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10092 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10093 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10095 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10097 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10099 else if ( op == PLUS || op == STAR) {
10100 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10102 else if (op == ANYOF) {
10103 /* arglen 1 + class block */
10104 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10105 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10106 node = NEXTOPER(node);
10108 else if (PL_regkind[(U8)op] == EXACT) {
10109 /* Literal string, where present. */
10110 node += NODE_SZ_STR(node) - 1;
10111 node = NEXTOPER(node);
10114 node = NEXTOPER(node);
10115 node += regarglen[(U8)op];
10117 if (op == CURLYX || op == OPEN)
10121 #ifdef DEBUG_DUMPUNTIL
10122 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10127 #endif /* DEBUGGING */
10131 * c-indentation-style: bsd
10132 * c-basic-offset: 4
10133 * indent-tabs-mode: t
10136 * ex: set ts=8 sts=4 sw=4 noet: