5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
93 # if defined(BUGGY_MSC6)
94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 # pragma optimize("a",off)
96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 # pragma optimize("w",on )
98 # endif /* BUGGY_MSC6 */
102 #define STATIC static
105 typedef struct RExC_state_t {
106 U32 flags; /* are we folding, multilining? */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
111 char *start; /* Start of input for compile */
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
115 regnode *emit_start; /* Start of emitted-code area */
116 regnode *emit_bound; /* First regnode outside of the allocated space */
117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
121 I32 size; /* Code size. */
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
124 I32 nestroot; /* root parens we are in - used by accept */
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
135 HV *paren_names; /* Paren names */
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
144 const char *lastparse;
146 AV *paren_name_list; /* idx -> name */
147 #define RExC_lastparse (pRExC_state->lastparse)
148 #define RExC_lastnum (pRExC_state->lastnum)
149 #define RExC_paren_name_list (pRExC_state->paren_name_list)
153 #define RExC_flags (pRExC_state->flags)
154 #define RExC_precomp (pRExC_state->precomp)
155 #define RExC_rx_sv (pRExC_state->rx_sv)
156 #define RExC_rx (pRExC_state->rx)
157 #define RExC_rxi (pRExC_state->rxi)
158 #define RExC_start (pRExC_state->start)
159 #define RExC_end (pRExC_state->end)
160 #define RExC_parse (pRExC_state->parse)
161 #define RExC_whilem_seen (pRExC_state->whilem_seen)
162 #ifdef RE_TRACK_PATTERN_OFFSETS
163 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165 #define RExC_emit (pRExC_state->emit)
166 #define RExC_emit_start (pRExC_state->emit_start)
167 #define RExC_emit_bound (pRExC_state->emit_bound)
168 #define RExC_naughty (pRExC_state->naughty)
169 #define RExC_sawback (pRExC_state->sawback)
170 #define RExC_seen (pRExC_state->seen)
171 #define RExC_size (pRExC_state->size)
172 #define RExC_npar (pRExC_state->npar)
173 #define RExC_nestroot (pRExC_state->nestroot)
174 #define RExC_extralen (pRExC_state->extralen)
175 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176 #define RExC_seen_evals (pRExC_state->seen_evals)
177 #define RExC_utf8 (pRExC_state->utf8)
178 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
179 #define RExC_open_parens (pRExC_state->open_parens)
180 #define RExC_close_parens (pRExC_state->close_parens)
181 #define RExC_opend (pRExC_state->opend)
182 #define RExC_paren_names (pRExC_state->paren_names)
183 #define RExC_recurse (pRExC_state->recurse)
184 #define RExC_recurse_count (pRExC_state->recurse_count)
187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
192 #undef SPSTART /* dratted cpp namespace... */
195 * Flags to be passed up and down.
197 #define WORST 0 /* Worst case. */
198 #define HASWIDTH 0x01 /* Known to match non-null strings. */
199 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200 #define SPSTART 0x04 /* Starts with * or +. */
201 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
202 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
206 /* whether trie related optimizations are enabled */
207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208 #define TRIE_STUDY_OPT
209 #define FULL_TRIE_STUDY
215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216 #define PBITVAL(paren) (1 << ((paren) & 7))
217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222 /* About scan_data_t.
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
238 The strings can be composites, for instance
242 will result in a composite fixed substring 'foo'.
244 For each string some basic information is maintained:
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
297 typedef struct scan_data_t {
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
303 I32 last_end; /* min value, <0 unless valid. */
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
319 struct regnode_charclass_class *start_class;
323 * Forward declarations for pregcomp()'s friends.
326 static const scan_data_t zero_scan_data =
327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
329 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330 #define SF_BEFORE_SEOL 0x0001
331 #define SF_BEFORE_MEOL 0x0002
332 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336 # define SF_FIX_SHIFT_EOL (0+2)
337 # define SF_FL_SHIFT_EOL (0+4)
339 # define SF_FIX_SHIFT_EOL (+2)
340 # define SF_FL_SHIFT_EOL (+4)
343 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
346 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
348 #define SF_IS_INF 0x0040
349 #define SF_HAS_PAR 0x0080
350 #define SF_IN_PAR 0x0100
351 #define SF_HAS_EVAL 0x0200
352 #define SCF_DO_SUBSTR 0x0400
353 #define SCF_DO_STCLASS_AND 0x0800
354 #define SCF_DO_STCLASS_OR 0x1000
355 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356 #define SCF_WHILEM_VISITED_POS 0x2000
358 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
359 #define SCF_SEEN_ACCEPT 0x8000
361 #define UTF (RExC_utf8 != 0)
362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
365 #define OOB_UNICODE 12345678
366 #define OOB_NAMEDCLASS -1
368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372 /* length of regex to show in messages that don't mark a position within */
373 #define RegexLengthToShowInErrorMessages 127
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
380 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
381 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 #define _FAIL(code) STMT_START { \
391 const char *ellipses = ""; \
392 IV len = RExC_end - RExC_precomp; \
395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
404 #define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
408 #define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
415 #define Simple_vFAIL(m) STMT_START { \
416 const IV offset = RExC_parse - RExC_precomp; \
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
424 #define vFAIL(m) STMT_START { \
426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
431 * Like Simple_vFAIL(), but accepts two arguments.
433 #define Simple_vFAIL2(m,a1) STMT_START { \
434 const IV offset = RExC_parse - RExC_precomp; \
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
442 #define vFAIL2(m,a1) STMT_START { \
444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
445 Simple_vFAIL2(m, a1); \
450 * Like Simple_vFAIL(), but accepts three arguments.
452 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
453 const IV offset = RExC_parse - RExC_precomp; \
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
461 #define vFAIL3(m,a1,a2) STMT_START { \
463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
464 Simple_vFAIL3(m, a1, a2); \
468 * Like Simple_vFAIL(), but accepts four arguments.
470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
476 #define ckWARNreg(loc,m) STMT_START { \
477 const IV offset = loc - RExC_precomp; \
478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 (int)offset, RExC_precomp, RExC_precomp + offset); \
482 #define ckWARNregdep(loc,m) STMT_START { \
483 const IV offset = loc - RExC_precomp; \
484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
486 (int)offset, RExC_precomp, RExC_precomp + offset); \
489 #define ckWARN2reg(loc, m, a1) STMT_START { \
490 const IV offset = loc - RExC_precomp; \
491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
495 #define vWARN3(loc, m, a1, a2) STMT_START { \
496 const IV offset = loc - RExC_precomp; \
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
501 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
507 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
508 const IV offset = loc - RExC_precomp; \
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
513 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
519 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
520 const IV offset = loc - RExC_precomp; \
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
526 /* Allow for side effects in s */
527 #define REGC(c,s) STMT_START { \
528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
531 /* Macros for recording node offsets. 20001227 mjd@plover.com
532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
533 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
534 * Element 0 holds the number n.
535 * Position is 1 indexed.
537 #ifndef RE_TRACK_PATTERN_OFFSETS
538 #define Set_Node_Offset_To_R(node,byte)
539 #define Set_Node_Offset(node,byte)
540 #define Set_Cur_Node_Offset
541 #define Set_Node_Length_To_R(node,len)
542 #define Set_Node_Length(node,len)
543 #define Set_Node_Cur_Length(node)
544 #define Node_Offset(n)
545 #define Node_Length(n)
546 #define Set_Node_Offset_Length(node,offset,len)
547 #define ProgLen(ri) ri->u.proglen
548 #define SetProgLen(ri,x) ri->u.proglen = x
550 #define ProgLen(ri) ri->u.offsets[0]
551 #define SetProgLen(ri,x) ri->u.offsets[0] = x
552 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
555 __LINE__, (int)(node), (int)(byte))); \
557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
559 RExC_offsets[2*(node)-1] = (byte); \
564 #define Set_Node_Offset(node,byte) \
565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
568 #define Set_Node_Length_To_R(node,len) STMT_START { \
570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
571 __LINE__, (int)(node), (int)(len))); \
573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
575 RExC_offsets[2*(node)] = (len); \
580 #define Set_Node_Length(node,len) \
581 Set_Node_Length_To_R((node)-RExC_emit_start, len)
582 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583 #define Set_Node_Cur_Length(node) \
584 Set_Node_Length(node, RExC_parse - parse_start)
586 /* Get offsets and lengths */
587 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
590 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
596 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597 #define EXPERIMENTAL_INPLACESCAN
598 #endif /*RE_TRACK_PATTERN_OFFSETS*/
600 #define DEBUG_STUDYDATA(str,data,depth) \
601 DEBUG_OPTIMISE_MORE_r(if(data){ \
602 PerlIO_printf(Perl_debug_log, \
603 "%*s" str "Pos:%"IVdf"/%"IVdf \
604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
605 (int)(depth)*2, "", \
606 (IV)((data)->pos_min), \
607 (IV)((data)->pos_delta), \
608 (UV)((data)->flags), \
609 (IV)((data)->whilem_c), \
610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
611 is_inf ? "INF " : "" \
613 if ((data)->last_found) \
614 PerlIO_printf(Perl_debug_log, \
615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
617 SvPVX_const((data)->last_found), \
618 (IV)((data)->last_end), \
619 (IV)((data)->last_start_min), \
620 (IV)((data)->last_start_max), \
621 ((data)->longest && \
622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
623 SvPVX_const((data)->longest_fixed), \
624 (IV)((data)->offset_fixed), \
625 ((data)->longest && \
626 (data)->longest==&((data)->longest_float)) ? "*" : "", \
627 SvPVX_const((data)->longest_float), \
628 (IV)((data)->offset_float_min), \
629 (IV)((data)->offset_float_max) \
631 PerlIO_printf(Perl_debug_log,"\n"); \
634 static void clear_re(pTHX_ void *r);
636 /* Mark that we cannot extend a found fixed substring at this point.
637 Update the longest found anchored substring and the longest found
638 floating substrings if needed. */
641 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
643 const STRLEN l = CHR_SVLEN(data->last_found);
644 const STRLEN old_l = CHR_SVLEN(*data->longest);
645 GET_RE_DEBUG_FLAGS_DECL;
647 PERL_ARGS_ASSERT_SCAN_COMMIT;
649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
650 SvSetMagicSV(*data->longest, data->last_found);
651 if (*data->longest == data->longest_fixed) {
652 data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 if (data->flags & SF_BEFORE_EOL)
655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
657 data->flags &= ~SF_FIX_BEFORE_EOL;
658 data->minlen_fixed=minlenp;
659 data->lookbehind_fixed=0;
661 else { /* *data->longest == data->longest_float */
662 data->offset_float_min = l ? data->last_start_min : data->pos_min;
663 data->offset_float_max = (l
664 ? data->last_start_max
665 : data->pos_min + data->pos_delta);
666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
667 data->offset_float_max = I32_MAX;
668 if (data->flags & SF_BEFORE_EOL)
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
672 data->flags &= ~SF_FL_BEFORE_EOL;
673 data->minlen_float=minlenp;
674 data->lookbehind_float=0;
677 SvCUR_set(data->last_found, 0);
679 SV * const sv = data->last_found;
680 if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
687 data->flags &= ~SF_BEFORE_EOL;
688 DEBUG_STUDYDATA("commit: ",data,0);
691 /* Can match anything (initialization) */
693 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
695 PERL_ARGS_ASSERT_CL_ANYTHING;
697 ANYOF_CLASS_ZERO(cl);
698 ANYOF_BITMAP_SETALL(cl);
699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
701 cl->flags |= ANYOF_LOCALE;
704 /* Can match anything (initialization) */
706 S_cl_is_anything(const struct regnode_charclass_class *cl)
710 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
712 for (value = 0; value <= ANYOF_MAX; value += 2)
713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
715 if (!(cl->flags & ANYOF_UNICODE_ALL))
717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
722 /* Can match anything (initialization) */
724 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
726 PERL_ARGS_ASSERT_CL_INIT;
728 Zero(cl, 1, struct regnode_charclass_class);
730 cl_anything(pRExC_state, cl);
734 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
736 PERL_ARGS_ASSERT_CL_INIT_ZERO;
738 Zero(cl, 1, struct regnode_charclass_class);
740 cl_anything(pRExC_state, cl);
742 cl->flags |= ANYOF_LOCALE;
745 /* 'And' a given class with another one. Can create false positives */
746 /* We assume that cl is not inverted */
748 S_cl_and(struct regnode_charclass_class *cl,
749 const struct regnode_charclass_class *and_with)
751 PERL_ARGS_ASSERT_CL_AND;
753 assert(and_with->type == ANYOF);
754 if (!(and_with->flags & ANYOF_CLASS)
755 && !(cl->flags & ANYOF_CLASS)
756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 && !(and_with->flags & ANYOF_FOLD)
758 && !(cl->flags & ANYOF_FOLD)) {
761 if (and_with->flags & ANYOF_INVERT)
762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 cl->bitmap[i] &= ~and_with->bitmap[i];
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= and_with->bitmap[i];
767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768 if (!(and_with->flags & ANYOF_EOS))
769 cl->flags &= ~ANYOF_EOS;
771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 !(and_with->flags & ANYOF_INVERT)) {
773 cl->flags &= ~ANYOF_UNICODE_ALL;
774 cl->flags |= ANYOF_UNICODE;
775 ARG_SET(cl, ARG(and_with));
777 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 !(and_with->flags & ANYOF_INVERT))
779 cl->flags &= ~ANYOF_UNICODE_ALL;
780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 !(and_with->flags & ANYOF_INVERT))
782 cl->flags &= ~ANYOF_UNICODE;
785 /* 'OR' a given class with another one. Can create false positives */
786 /* We assume that cl is not inverted */
788 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
790 PERL_ARGS_ASSERT_CL_OR;
792 if (or_with->flags & ANYOF_INVERT) {
794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 * <= (B1 | !B2) | (CL1 | !CL2)
796 * which is wasteful if CL2 is small, but we ignore CL2:
797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 * XXXX Can we handle case-fold? Unclear:
799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 && !(or_with->flags & ANYOF_FOLD)
804 && !(cl->flags & ANYOF_FOLD) ) {
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] |= ~or_with->bitmap[i];
809 } /* XXXX: logic is complicated otherwise */
811 cl_anything(pRExC_state, cl);
814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
816 && (!(or_with->flags & ANYOF_FOLD)
817 || (cl->flags & ANYOF_FOLD)) ) {
820 /* OR char bitmap and class bitmap separately */
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] |= or_with->bitmap[i];
823 if (or_with->flags & ANYOF_CLASS) {
824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 cl->classflags[i] |= or_with->classflags[i];
826 cl->flags |= ANYOF_CLASS;
829 else { /* XXXX: logic is complicated, leave it along for a moment. */
830 cl_anything(pRExC_state, cl);
833 if (or_with->flags & ANYOF_EOS)
834 cl->flags |= ANYOF_EOS;
836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 ARG(cl) != ARG(or_with)) {
838 cl->flags |= ANYOF_UNICODE_ALL;
839 cl->flags &= ~ANYOF_UNICODE;
841 if (or_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags |= ANYOF_UNICODE_ALL;
843 cl->flags &= ~ANYOF_UNICODE;
847 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
849 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
855 dump_trie(trie,widecharmap,revcharmap)
856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
859 These routines dump out a trie in a somewhat readable format.
860 The _interim_ variants are used for debugging the interim
861 tables that are used to generate the final compressed
862 representation which is what dump_trie expects.
864 Part of the reason for their existance is to provide a form
865 of documentation as to how the different representations function.
870 Dumps the final compressed table form of the trie to Perl_debug_log.
871 Used for debugging make_trie().
875 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 AV *revcharmap, U32 depth)
879 SV *sv=sv_newmortal();
880 int colwidth= widecharmap ? 6 : 4;
881 GET_RE_DEBUG_FLAGS_DECL;
883 PERL_ARGS_ASSERT_DUMP_TRIE;
885 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
886 (int)depth * 2 + 2,"",
887 "Match","Base","Ofs" );
889 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
890 SV ** const tmp = av_fetch( revcharmap, state, 0);
892 PerlIO_printf( Perl_debug_log, "%*s",
894 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
895 PL_colors[0], PL_colors[1],
896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
897 PERL_PV_ESCAPE_FIRSTCHAR
902 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
903 (int)depth * 2 + 2,"");
905 for( state = 0 ; state < trie->uniquecharcount ; state++ )
906 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
907 PerlIO_printf( Perl_debug_log, "\n");
909 for( state = 1 ; state < trie->statecount ; state++ ) {
910 const U32 base = trie->states[ state ].trans.base;
912 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
914 if ( trie->states[ state ].wordnum ) {
915 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
917 PerlIO_printf( Perl_debug_log, "%6s", "" );
920 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
925 while( ( base + ofs < trie->uniquecharcount ) ||
926 ( base + ofs - trie->uniquecharcount < trie->lasttrans
927 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
930 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
932 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
933 if ( ( base + ofs >= trie->uniquecharcount ) &&
934 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
935 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
937 PerlIO_printf( Perl_debug_log, "%*"UVXf,
939 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
941 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
945 PerlIO_printf( Perl_debug_log, "]");
948 PerlIO_printf( Perl_debug_log, "\n" );
952 Dumps a fully constructed but uncompressed trie in list form.
953 List tries normally only are used for construction when the number of
954 possible chars (trie->uniquecharcount) is very high.
955 Used for debugging make_trie().
958 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
959 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 SV *sv=sv_newmortal();
964 int colwidth= widecharmap ? 6 : 4;
965 GET_RE_DEBUG_FLAGS_DECL;
967 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
969 /* print out the table precompression. */
970 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
971 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
972 "------:-----+-----------------\n" );
974 for( state=1 ; state < next_alloc ; state ++ ) {
977 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
978 (int)depth * 2 + 2,"", (UV)state );
979 if ( ! trie->states[ state ].wordnum ) {
980 PerlIO_printf( Perl_debug_log, "%5s| ","");
982 PerlIO_printf( Perl_debug_log, "W%4x| ",
983 trie->states[ state ].wordnum
986 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
987 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
989 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
991 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
992 PL_colors[0], PL_colors[1],
993 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
994 PERL_PV_ESCAPE_FIRSTCHAR
996 TRIE_LIST_ITEM(state,charid).forid,
997 (UV)TRIE_LIST_ITEM(state,charid).newstate
1000 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1001 (int)((depth * 2) + 14), "");
1004 PerlIO_printf( Perl_debug_log, "\n");
1009 Dumps a fully constructed but uncompressed trie in table form.
1010 This is the normal DFA style state transition table, with a few
1011 twists to facilitate compression later.
1012 Used for debugging make_trie().
1015 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1016 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1021 SV *sv=sv_newmortal();
1022 int colwidth= widecharmap ? 6 : 4;
1023 GET_RE_DEBUG_FLAGS_DECL;
1025 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1028 print out the table precompression so that we can do a visual check
1029 that they are identical.
1032 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1034 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1035 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1037 PerlIO_printf( Perl_debug_log, "%*s",
1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1048 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1050 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1051 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1054 PerlIO_printf( Perl_debug_log, "\n" );
1056 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1058 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1059 (int)depth * 2 + 2,"",
1060 (UV)TRIE_NODENUM( state ) );
1062 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1063 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1065 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1067 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1069 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1070 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1073 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1080 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1081 startbranch: the first branch in the whole branch sequence
1082 first : start branch of sequence of branch-exact nodes.
1083 May be the same as startbranch
1084 last : Thing following the last branch.
1085 May be the same as tail.
1086 tail : item following the branch sequence
1087 count : words in the sequence
1088 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1089 depth : indent depth
1091 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1093 A trie is an N'ary tree where the branches are determined by digital
1094 decomposition of the key. IE, at the root node you look up the 1st character and
1095 follow that branch repeat until you find the end of the branches. Nodes can be
1096 marked as "accepting" meaning they represent a complete word. Eg:
1100 would convert into the following structure. Numbers represent states, letters
1101 following numbers represent valid transitions on the letter from that state, if
1102 the number is in square brackets it represents an accepting state, otherwise it
1103 will be in parenthesis.
1105 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1109 (1) +-i->(6)-+-s->[7]
1111 +-s->(3)-+-h->(4)-+-e->[5]
1113 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1115 This shows that when matching against the string 'hers' we will begin at state 1
1116 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1117 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1118 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1119 single traverse. We store a mapping from accepting to state to which word was
1120 matched, and then when we have multiple possibilities we try to complete the
1121 rest of the regex in the order in which they occured in the alternation.
1123 The only prior NFA like behaviour that would be changed by the TRIE support is
1124 the silent ignoring of duplicate alternations which are of the form:
1126 / (DUPE|DUPE) X? (?{ ... }) Y /x
1128 Thus EVAL blocks follwing a trie may be called a different number of times with
1129 and without the optimisation. With the optimisations dupes will be silently
1130 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1131 the following demonstrates:
1133 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1135 which prints out 'word' three times, but
1137 'words'=~/(word|word|word)(?{ print $1 })S/
1139 which doesnt print it out at all. This is due to other optimisations kicking in.
1141 Example of what happens on a structural level:
1143 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1145 1: CURLYM[1] {1,32767}(18)
1156 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1157 and should turn into:
1159 1: CURLYM[1] {1,32767}(18)
1161 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1169 Cases where tail != last would be like /(?foo|bar)baz/:
1179 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1180 and would end up looking like:
1183 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1190 d = uvuni_to_utf8_flags(d, uv, 0);
1192 is the recommended Unicode-aware way of saying
1197 #define TRIE_STORE_REVCHAR \
1200 SV *zlopp = newSV(2); \
1201 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1202 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1203 SvCUR_set(zlopp, kapow - flrbbbbb); \
1206 av_push(revcharmap, zlopp); \
1208 char ooooff = (char)uvc; \
1209 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1213 #define TRIE_READ_CHAR STMT_START { \
1217 if ( foldlen > 0 ) { \
1218 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1223 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1224 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1225 foldlen -= UNISKIP( uvc ); \
1226 scan = foldbuf + UNISKIP( uvc ); \
1229 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1239 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1240 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1241 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1242 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1244 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1245 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1246 TRIE_LIST_CUR( state )++; \
1249 #define TRIE_LIST_NEW(state) STMT_START { \
1250 Newxz( trie->states[ state ].trans.list, \
1251 4, reg_trie_trans_le ); \
1252 TRIE_LIST_CUR( state ) = 1; \
1253 TRIE_LIST_LEN( state ) = 4; \
1256 #define TRIE_HANDLE_WORD(state) STMT_START { \
1257 U16 dupe= trie->states[ state ].wordnum; \
1258 regnode * const noper_next = regnext( noper ); \
1260 if (trie->wordlen) \
1261 trie->wordlen[ curword ] = wordlen; \
1263 /* store the word for dumping */ \
1265 if (OP(noper) != NOTHING) \
1266 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1268 tmp = newSVpvn_utf8( "", 0, UTF ); \
1269 av_push( trie_words, tmp ); \
1274 if ( noper_next < tail ) { \
1276 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1277 trie->jump[curword] = (U16)(noper_next - convert); \
1279 jumper = noper_next; \
1281 nextbranch= regnext(cur); \
1285 /* So it's a dupe. This means we need to maintain a */\
1286 /* linked-list from the first to the next. */\
1287 /* we only allocate the nextword buffer when there */\
1288 /* a dupe, so first time we have to do the allocation */\
1289 if (!trie->nextword) \
1290 trie->nextword = (U16 *) \
1291 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1292 while ( trie->nextword[dupe] ) \
1293 dupe= trie->nextword[dupe]; \
1294 trie->nextword[dupe]= curword; \
1296 /* we haven't inserted this word yet. */ \
1297 trie->states[ state ].wordnum = curword; \
1302 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1303 ( ( base + charid >= ucharcount \
1304 && base + charid < ubound \
1305 && state == trie->trans[ base - ucharcount + charid ].check \
1306 && trie->trans[ base - ucharcount + charid ].next ) \
1307 ? trie->trans[ base - ucharcount + charid ].next \
1308 : ( state==1 ? special : 0 ) \
1312 #define MADE_JUMP_TRIE 2
1313 #define MADE_EXACT_TRIE 4
1316 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1319 /* first pass, loop through and scan words */
1320 reg_trie_data *trie;
1321 HV *widecharmap = NULL;
1322 AV *revcharmap = newAV();
1324 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1329 regnode *jumper = NULL;
1330 regnode *nextbranch = NULL;
1331 regnode *convert = NULL;
1332 /* we just use folder as a flag in utf8 */
1333 const U8 * const folder = ( flags == EXACTF
1335 : ( flags == EXACTFL
1342 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1343 AV *trie_words = NULL;
1344 /* along with revcharmap, this only used during construction but both are
1345 * useful during debugging so we store them in the struct when debugging.
1348 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1349 STRLEN trie_charcount=0;
1351 SV *re_trie_maxbuff;
1352 GET_RE_DEBUG_FLAGS_DECL;
1354 PERL_ARGS_ASSERT_MAKE_TRIE;
1356 PERL_UNUSED_ARG(depth);
1359 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1361 trie->startstate = 1;
1362 trie->wordcount = word_count;
1363 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1364 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1365 if (!(UTF && folder))
1366 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1368 trie_words = newAV();
1371 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1372 if (!SvIOK(re_trie_maxbuff)) {
1373 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1376 PerlIO_printf( Perl_debug_log,
1377 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1378 (int)depth * 2 + 2, "",
1379 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1380 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1384 /* Find the node we are going to overwrite */
1385 if ( first == startbranch && OP( last ) != BRANCH ) {
1386 /* whole branch chain */
1389 /* branch sub-chain */
1390 convert = NEXTOPER( first );
1393 /* -- First loop and Setup --
1395 We first traverse the branches and scan each word to determine if it
1396 contains widechars, and how many unique chars there are, this is
1397 important as we have to build a table with at least as many columns as we
1400 We use an array of integers to represent the character codes 0..255
1401 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1402 native representation of the character value as the key and IV's for the
1405 *TODO* If we keep track of how many times each character is used we can
1406 remap the columns so that the table compression later on is more
1407 efficient in terms of memory by ensuring most common value is in the
1408 middle and the least common are on the outside. IMO this would be better
1409 than a most to least common mapping as theres a decent chance the most
1410 common letter will share a node with the least common, meaning the node
1411 will not be compressable. With a middle is most common approach the worst
1412 case is when we have the least common nodes twice.
1416 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1417 regnode * const noper = NEXTOPER( cur );
1418 const U8 *uc = (U8*)STRING( noper );
1419 const U8 * const e = uc + STR_LEN( noper );
1421 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1422 const U8 *scan = (U8*)NULL;
1423 U32 wordlen = 0; /* required init */
1425 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1427 if (OP(noper) == NOTHING) {
1431 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1432 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1433 regardless of encoding */
1435 for ( ; uc < e ; uc += len ) {
1436 TRIE_CHARCOUNT(trie)++;
1440 if ( !trie->charmap[ uvc ] ) {
1441 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1443 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1447 /* store the codepoint in the bitmap, and if its ascii
1448 also store its folded equivelent. */
1449 TRIE_BITMAP_SET(trie,uvc);
1451 /* store the folded codepoint */
1452 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1455 /* store first byte of utf8 representation of
1456 codepoints in the 127 < uvc < 256 range */
1457 if (127 < uvc && uvc < 192) {
1458 TRIE_BITMAP_SET(trie,194);
1459 } else if (191 < uvc ) {
1460 TRIE_BITMAP_SET(trie,195);
1461 /* && uvc < 256 -- we know uvc is < 256 already */
1464 set_bit = 0; /* We've done our bit :-) */
1469 widecharmap = newHV();
1471 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1474 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1476 if ( !SvTRUE( *svpp ) ) {
1477 sv_setiv( *svpp, ++trie->uniquecharcount );
1482 if( cur == first ) {
1485 } else if (chars < trie->minlen) {
1487 } else if (chars > trie->maxlen) {
1491 } /* end first pass */
1492 DEBUG_TRIE_COMPILE_r(
1493 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1494 (int)depth * 2 + 2,"",
1495 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1496 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1497 (int)trie->minlen, (int)trie->maxlen )
1499 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1502 We now know what we are dealing with in terms of unique chars and
1503 string sizes so we can calculate how much memory a naive
1504 representation using a flat table will take. If it's over a reasonable
1505 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1506 conservative but potentially much slower representation using an array
1509 At the end we convert both representations into the same compressed
1510 form that will be used in regexec.c for matching with. The latter
1511 is a form that cannot be used to construct with but has memory
1512 properties similar to the list form and access properties similar
1513 to the table form making it both suitable for fast searches and
1514 small enough that its feasable to store for the duration of a program.
1516 See the comment in the code where the compressed table is produced
1517 inplace from the flat tabe representation for an explanation of how
1518 the compression works.
1523 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1525 Second Pass -- Array Of Lists Representation
1527 Each state will be represented by a list of charid:state records
1528 (reg_trie_trans_le) the first such element holds the CUR and LEN
1529 points of the allocated array. (See defines above).
1531 We build the initial structure using the lists, and then convert
1532 it into the compressed table form which allows faster lookups
1533 (but cant be modified once converted).
1536 STRLEN transcount = 1;
1538 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1539 "%*sCompiling trie using list compiler\n",
1540 (int)depth * 2 + 2, ""));
1542 trie->states = (reg_trie_state *)
1543 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1544 sizeof(reg_trie_state) );
1548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1550 regnode * const noper = NEXTOPER( cur );
1551 U8 *uc = (U8*)STRING( noper );
1552 const U8 * const e = uc + STR_LEN( noper );
1553 U32 state = 1; /* required init */
1554 U16 charid = 0; /* sanity init */
1555 U8 *scan = (U8*)NULL; /* sanity init */
1556 STRLEN foldlen = 0; /* required init */
1557 U32 wordlen = 0; /* required init */
1558 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1560 if (OP(noper) != NOTHING) {
1561 for ( ; uc < e ; uc += len ) {
1566 charid = trie->charmap[ uvc ];
1568 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1572 charid=(U16)SvIV( *svpp );
1575 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1582 if ( !trie->states[ state ].trans.list ) {
1583 TRIE_LIST_NEW( state );
1585 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1586 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1587 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1592 newstate = next_alloc++;
1593 TRIE_LIST_PUSH( state, charid, newstate );
1598 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1602 TRIE_HANDLE_WORD(state);
1604 } /* end second pass */
1606 /* next alloc is the NEXT state to be allocated */
1607 trie->statecount = next_alloc;
1608 trie->states = (reg_trie_state *)
1609 PerlMemShared_realloc( trie->states,
1611 * sizeof(reg_trie_state) );
1613 /* and now dump it out before we compress it */
1614 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1615 revcharmap, next_alloc,
1619 trie->trans = (reg_trie_trans *)
1620 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1627 for( state=1 ; state < next_alloc ; state ++ ) {
1631 DEBUG_TRIE_COMPILE_MORE_r(
1632 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1636 if (trie->states[state].trans.list) {
1637 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1641 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1642 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1643 if ( forid < minid ) {
1645 } else if ( forid > maxid ) {
1649 if ( transcount < tp + maxid - minid + 1) {
1651 trie->trans = (reg_trie_trans *)
1652 PerlMemShared_realloc( trie->trans,
1654 * sizeof(reg_trie_trans) );
1655 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1657 base = trie->uniquecharcount + tp - minid;
1658 if ( maxid == minid ) {
1660 for ( ; zp < tp ; zp++ ) {
1661 if ( ! trie->trans[ zp ].next ) {
1662 base = trie->uniquecharcount + zp - minid;
1663 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1664 trie->trans[ zp ].check = state;
1670 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1671 trie->trans[ tp ].check = state;
1676 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1677 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1678 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1679 trie->trans[ tid ].check = state;
1681 tp += ( maxid - minid + 1 );
1683 Safefree(trie->states[ state ].trans.list);
1686 DEBUG_TRIE_COMPILE_MORE_r(
1687 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1690 trie->states[ state ].trans.base=base;
1692 trie->lasttrans = tp + 1;
1696 Second Pass -- Flat Table Representation.
1698 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1699 We know that we will need Charcount+1 trans at most to store the data
1700 (one row per char at worst case) So we preallocate both structures
1701 assuming worst case.
1703 We then construct the trie using only the .next slots of the entry
1706 We use the .check field of the first entry of the node temporarily to
1707 make compression both faster and easier by keeping track of how many non
1708 zero fields are in the node.
1710 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1713 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1714 number representing the first entry of the node, and state as a
1715 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1716 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1717 are 2 entrys per node. eg:
1725 The table is internally in the right hand, idx form. However as we also
1726 have to deal with the states array which is indexed by nodenum we have to
1727 use TRIE_NODENUM() to convert.
1730 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1731 "%*sCompiling trie using table compiler\n",
1732 (int)depth * 2 + 2, ""));
1734 trie->trans = (reg_trie_trans *)
1735 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1736 * trie->uniquecharcount + 1,
1737 sizeof(reg_trie_trans) );
1738 trie->states = (reg_trie_state *)
1739 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1740 sizeof(reg_trie_state) );
1741 next_alloc = trie->uniquecharcount + 1;
1744 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1746 regnode * const noper = NEXTOPER( cur );
1747 const U8 *uc = (U8*)STRING( noper );
1748 const U8 * const e = uc + STR_LEN( noper );
1750 U32 state = 1; /* required init */
1752 U16 charid = 0; /* sanity init */
1753 U32 accept_state = 0; /* sanity init */
1754 U8 *scan = (U8*)NULL; /* sanity init */
1756 STRLEN foldlen = 0; /* required init */
1757 U32 wordlen = 0; /* required init */
1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1760 if ( OP(noper) != NOTHING ) {
1761 for ( ; uc < e ; uc += len ) {
1766 charid = trie->charmap[ uvc ];
1768 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1769 charid = svpp ? (U16)SvIV(*svpp) : 0;
1773 if ( !trie->trans[ state + charid ].next ) {
1774 trie->trans[ state + charid ].next = next_alloc;
1775 trie->trans[ state ].check++;
1776 next_alloc += trie->uniquecharcount;
1778 state = trie->trans[ state + charid ].next;
1780 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1782 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1785 accept_state = TRIE_NODENUM( state );
1786 TRIE_HANDLE_WORD(accept_state);
1788 } /* end second pass */
1790 /* and now dump it out before we compress it */
1791 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1793 next_alloc, depth+1));
1797 * Inplace compress the table.*
1799 For sparse data sets the table constructed by the trie algorithm will
1800 be mostly 0/FAIL transitions or to put it another way mostly empty.
1801 (Note that leaf nodes will not contain any transitions.)
1803 This algorithm compresses the tables by eliminating most such
1804 transitions, at the cost of a modest bit of extra work during lookup:
1806 - Each states[] entry contains a .base field which indicates the
1807 index in the state[] array wheres its transition data is stored.
1809 - If .base is 0 there are no valid transitions from that node.
1811 - If .base is nonzero then charid is added to it to find an entry in
1814 -If trans[states[state].base+charid].check!=state then the
1815 transition is taken to be a 0/Fail transition. Thus if there are fail
1816 transitions at the front of the node then the .base offset will point
1817 somewhere inside the previous nodes data (or maybe even into a node
1818 even earlier), but the .check field determines if the transition is
1822 The following process inplace converts the table to the compressed
1823 table: We first do not compress the root node 1,and mark its all its
1824 .check pointers as 1 and set its .base pointer as 1 as well. This
1825 allows to do a DFA construction from the compressed table later, and
1826 ensures that any .base pointers we calculate later are greater than
1829 - We set 'pos' to indicate the first entry of the second node.
1831 - We then iterate over the columns of the node, finding the first and
1832 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1833 and set the .check pointers accordingly, and advance pos
1834 appropriately and repreat for the next node. Note that when we copy
1835 the next pointers we have to convert them from the original
1836 NODEIDX form to NODENUM form as the former is not valid post
1839 - If a node has no transitions used we mark its base as 0 and do not
1840 advance the pos pointer.
1842 - If a node only has one transition we use a second pointer into the
1843 structure to fill in allocated fail transitions from other states.
1844 This pointer is independent of the main pointer and scans forward
1845 looking for null transitions that are allocated to a state. When it
1846 finds one it writes the single transition into the "hole". If the
1847 pointer doesnt find one the single transition is appended as normal.
1849 - Once compressed we can Renew/realloc the structures to release the
1852 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1853 specifically Fig 3.47 and the associated pseudocode.
1857 const U32 laststate = TRIE_NODENUM( next_alloc );
1860 trie->statecount = laststate;
1862 for ( state = 1 ; state < laststate ; state++ ) {
1864 const U32 stateidx = TRIE_NODEIDX( state );
1865 const U32 o_used = trie->trans[ stateidx ].check;
1866 U32 used = trie->trans[ stateidx ].check;
1867 trie->trans[ stateidx ].check = 0;
1869 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1870 if ( flag || trie->trans[ stateidx + charid ].next ) {
1871 if ( trie->trans[ stateidx + charid ].next ) {
1873 for ( ; zp < pos ; zp++ ) {
1874 if ( ! trie->trans[ zp ].next ) {
1878 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1879 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ zp ].check = state;
1881 if ( ++zp > pos ) pos = zp;
1888 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1890 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1891 trie->trans[ pos ].check = state;
1896 trie->lasttrans = pos + 1;
1897 trie->states = (reg_trie_state *)
1898 PerlMemShared_realloc( trie->states, laststate
1899 * sizeof(reg_trie_state) );
1900 DEBUG_TRIE_COMPILE_MORE_r(
1901 PerlIO_printf( Perl_debug_log,
1902 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1903 (int)depth * 2 + 2,"",
1904 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1907 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1910 } /* end table compress */
1912 DEBUG_TRIE_COMPILE_MORE_r(
1913 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1914 (int)depth * 2 + 2, "",
1915 (UV)trie->statecount,
1916 (UV)trie->lasttrans)
1918 /* resize the trans array to remove unused space */
1919 trie->trans = (reg_trie_trans *)
1920 PerlMemShared_realloc( trie->trans, trie->lasttrans
1921 * sizeof(reg_trie_trans) );
1923 /* and now dump out the compressed format */
1924 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1926 { /* Modify the program and insert the new TRIE node*/
1927 U8 nodetype =(U8)(flags & 0xFF);
1931 regnode *optimize = NULL;
1932 #ifdef RE_TRACK_PATTERN_OFFSETS
1935 U32 mjd_nodelen = 0;
1936 #endif /* RE_TRACK_PATTERN_OFFSETS */
1937 #endif /* DEBUGGING */
1939 This means we convert either the first branch or the first Exact,
1940 depending on whether the thing following (in 'last') is a branch
1941 or not and whther first is the startbranch (ie is it a sub part of
1942 the alternation or is it the whole thing.)
1943 Assuming its a sub part we conver the EXACT otherwise we convert
1944 the whole branch sequence, including the first.
1946 /* Find the node we are going to overwrite */
1947 if ( first != startbranch || OP( last ) == BRANCH ) {
1948 /* branch sub-chain */
1949 NEXT_OFF( first ) = (U16)(last - first);
1950 #ifdef RE_TRACK_PATTERN_OFFSETS
1952 mjd_offset= Node_Offset((convert));
1953 mjd_nodelen= Node_Length((convert));
1956 /* whole branch chain */
1958 #ifdef RE_TRACK_PATTERN_OFFSETS
1961 const regnode *nop = NEXTOPER( convert );
1962 mjd_offset= Node_Offset((nop));
1963 mjd_nodelen= Node_Length((nop));
1967 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1968 (int)depth * 2 + 2, "",
1969 (UV)mjd_offset, (UV)mjd_nodelen)
1972 /* But first we check to see if there is a common prefix we can
1973 split out as an EXACT and put in front of the TRIE node. */
1974 trie->startstate= 1;
1975 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1977 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1981 const U32 base = trie->states[ state ].trans.base;
1983 if ( trie->states[state].wordnum )
1986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1987 if ( ( base + ofs >= trie->uniquecharcount ) &&
1988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1991 if ( ++count > 1 ) {
1992 SV **tmp = av_fetch( revcharmap, ofs, 0);
1993 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1994 if ( state == 1 ) break;
1996 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1998 PerlIO_printf(Perl_debug_log,
1999 "%*sNew Start State=%"UVuf" Class: [",
2000 (int)depth * 2 + 2, "",
2003 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2004 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2006 TRIE_BITMAP_SET(trie,*ch);
2008 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2010 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2014 TRIE_BITMAP_SET(trie,*ch);
2016 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2023 SV **tmp = av_fetch( revcharmap, idx, 0);
2025 char *ch = SvPV( *tmp, len );
2027 SV *sv=sv_newmortal();
2028 PerlIO_printf( Perl_debug_log,
2029 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2030 (int)depth * 2 + 2, "",
2032 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2033 PL_colors[0], PL_colors[1],
2034 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2035 PERL_PV_ESCAPE_FIRSTCHAR
2040 OP( convert ) = nodetype;
2041 str=STRING(convert);
2044 STR_LEN(convert) += len;
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2056 regnode *n = convert+NODE_SZ_STR(convert);
2057 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2058 trie->startstate = state;
2059 trie->minlen -= (state - 1);
2060 trie->maxlen -= (state - 1);
2062 /* At least the UNICOS C compiler choked on this
2063 * being argument to DEBUG_r(), so let's just have
2066 #ifdef PERL_EXT_RE_BUILD
2072 regnode *fix = convert;
2073 U32 word = trie->wordcount;
2075 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2076 while( ++fix < n ) {
2077 Set_Node_Offset_Length(fix, 0, 0);
2080 SV ** const tmp = av_fetch( trie_words, word, 0 );
2082 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2083 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2093 NEXT_OFF(convert) = (U16)(tail - convert);
2094 DEBUG_r(optimize= n);
2100 if ( trie->maxlen ) {
2101 NEXT_OFF( convert ) = (U16)(tail - convert);
2102 ARG_SET( convert, data_slot );
2103 /* Store the offset to the first unabsorbed branch in
2104 jump[0], which is otherwise unused by the jump logic.
2105 We use this when dumping a trie and during optimisation. */
2107 trie->jump[0] = (U16)(nextbranch - convert);
2110 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2111 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2113 OP( convert ) = TRIEC;
2114 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2115 PerlMemShared_free(trie->bitmap);
2118 OP( convert ) = TRIE;
2120 /* store the type in the flags */
2121 convert->flags = nodetype;
2125 + regarglen[ OP( convert ) ];
2127 /* XXX We really should free up the resource in trie now,
2128 as we won't use them - (which resources?) dmq */
2130 /* needed for dumping*/
2131 DEBUG_r(if (optimize) {
2132 regnode *opt = convert;
2134 while ( ++opt < optimize) {
2135 Set_Node_Offset_Length(opt,0,0);
2138 Try to clean up some of the debris left after the
2141 while( optimize < jumper ) {
2142 mjd_nodelen += Node_Length((optimize));
2143 OP( optimize ) = OPTIMIZED;
2144 Set_Node_Offset_Length(optimize,0,0);
2147 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2149 } /* end node insert */
2150 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2152 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2153 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2155 SvREFCNT_dec(revcharmap);
2159 : trie->startstate>1
2165 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2167 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2169 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2170 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2173 We find the fail state for each state in the trie, this state is the longest proper
2174 suffix of the current states 'word' that is also a proper prefix of another word in our
2175 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2176 the DFA not to have to restart after its tried and failed a word at a given point, it
2177 simply continues as though it had been matching the other word in the first place.
2179 'abcdgu'=~/abcdefg|cdgu/
2180 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2181 fail, which would bring use to the state representing 'd' in the second word where we would
2182 try 'g' and succeed, prodceding to match 'cdgu'.
2184 /* add a fail transition */
2185 const U32 trie_offset = ARG(source);
2186 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2188 const U32 ucharcount = trie->uniquecharcount;
2189 const U32 numstates = trie->statecount;
2190 const U32 ubound = trie->lasttrans + ucharcount;
2194 U32 base = trie->states[ 1 ].trans.base;
2197 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2198 GET_RE_DEBUG_FLAGS_DECL;
2200 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2202 PERL_UNUSED_ARG(depth);
2206 ARG_SET( stclass, data_slot );
2207 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2208 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2209 aho->trie=trie_offset;
2210 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2211 Copy( trie->states, aho->states, numstates, reg_trie_state );
2212 Newxz( q, numstates, U32);
2213 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2216 /* initialize fail[0..1] to be 1 so that we always have
2217 a valid final fail state */
2218 fail[ 0 ] = fail[ 1 ] = 1;
2220 for ( charid = 0; charid < ucharcount ; charid++ ) {
2221 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2223 q[ q_write ] = newstate;
2224 /* set to point at the root */
2225 fail[ q[ q_write++ ] ]=1;
2228 while ( q_read < q_write) {
2229 const U32 cur = q[ q_read++ % numstates ];
2230 base = trie->states[ cur ].trans.base;
2232 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2233 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2235 U32 fail_state = cur;
2238 fail_state = fail[ fail_state ];
2239 fail_base = aho->states[ fail_state ].trans.base;
2240 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2242 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2243 fail[ ch_state ] = fail_state;
2244 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2246 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2248 q[ q_write++ % numstates] = ch_state;
2252 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2253 when we fail in state 1, this allows us to use the
2254 charclass scan to find a valid start char. This is based on the principle
2255 that theres a good chance the string being searched contains lots of stuff
2256 that cant be a start char.
2258 fail[ 0 ] = fail[ 1 ] = 0;
2259 DEBUG_TRIE_COMPILE_r({
2260 PerlIO_printf(Perl_debug_log,
2261 "%*sStclass Failtable (%"UVuf" states): 0",
2262 (int)(depth * 2), "", (UV)numstates
2264 for( q_read=1; q_read<numstates; q_read++ ) {
2265 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2267 PerlIO_printf(Perl_debug_log, "\n");
2270 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2275 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2276 * These need to be revisited when a newer toolchain becomes available.
2278 #if defined(__sparc64__) && defined(__GNUC__)
2279 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2280 # undef SPARC64_GCC_WORKAROUND
2281 # define SPARC64_GCC_WORKAROUND 1
2285 #define DEBUG_PEEP(str,scan,depth) \
2286 DEBUG_OPTIMISE_r({if (scan){ \
2287 SV * const mysv=sv_newmortal(); \
2288 regnode *Next = regnext(scan); \
2289 regprop(RExC_rx, mysv, scan); \
2290 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2291 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2292 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2299 #define JOIN_EXACT(scan,min,flags) \
2300 if (PL_regkind[OP(scan)] == EXACT) \
2301 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2304 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2305 /* Merge several consecutive EXACTish nodes into one. */
2306 regnode *n = regnext(scan);
2308 regnode *next = scan + NODE_SZ_STR(scan);
2312 regnode *stop = scan;
2313 GET_RE_DEBUG_FLAGS_DECL;
2315 PERL_UNUSED_ARG(depth);
2318 PERL_ARGS_ASSERT_JOIN_EXACT;
2319 #ifndef EXPERIMENTAL_INPLACESCAN
2320 PERL_UNUSED_ARG(flags);
2321 PERL_UNUSED_ARG(val);
2323 DEBUG_PEEP("join",scan,depth);
2325 /* Skip NOTHING, merge EXACT*. */
2327 ( PL_regkind[OP(n)] == NOTHING ||
2328 (stringok && (OP(n) == OP(scan))))
2330 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2332 if (OP(n) == TAIL || n > next)
2334 if (PL_regkind[OP(n)] == NOTHING) {
2335 DEBUG_PEEP("skip:",n,depth);
2336 NEXT_OFF(scan) += NEXT_OFF(n);
2337 next = n + NODE_STEP_REGNODE;
2344 else if (stringok) {
2345 const unsigned int oldl = STR_LEN(scan);
2346 regnode * const nnext = regnext(n);
2348 DEBUG_PEEP("merg",n,depth);
2351 if (oldl + STR_LEN(n) > U8_MAX)
2353 NEXT_OFF(scan) += NEXT_OFF(n);
2354 STR_LEN(scan) += STR_LEN(n);
2355 next = n + NODE_SZ_STR(n);
2356 /* Now we can overwrite *n : */
2357 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2365 #ifdef EXPERIMENTAL_INPLACESCAN
2366 if (flags && !NEXT_OFF(n)) {
2367 DEBUG_PEEP("atch", val, depth);
2368 if (reg_off_by_arg[OP(n)]) {
2369 ARG_SET(n, val - n);
2372 NEXT_OFF(n) = val - n;
2379 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2381 Two problematic code points in Unicode casefolding of EXACT nodes:
2383 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2384 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2390 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2391 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2393 This means that in case-insensitive matching (or "loose matching",
2394 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2395 length of the above casefolded versions) can match a target string
2396 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2397 This would rather mess up the minimum length computation.
2399 What we'll do is to look for the tail four bytes, and then peek
2400 at the preceding two bytes to see whether we need to decrease
2401 the minimum length by four (six minus two).
2403 Thanks to the design of UTF-8, there cannot be false matches:
2404 A sequence of valid UTF-8 bytes cannot be a subsequence of
2405 another valid sequence of UTF-8 bytes.
2408 char * const s0 = STRING(scan), *s, *t;
2409 char * const s1 = s0 + STR_LEN(scan) - 1;
2410 char * const s2 = s1 - 4;
2411 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2412 const char t0[] = "\xaf\x49\xaf\x42";
2414 const char t0[] = "\xcc\x88\xcc\x81";
2416 const char * const t1 = t0 + 3;
2419 s < s2 && (t = ninstr(s, s1, t0, t1));
2422 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2423 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2425 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2426 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2434 n = scan + NODE_SZ_STR(scan);
2436 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2443 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2447 /* REx optimizer. Converts nodes into quickier variants "in place".
2448 Finds fixed substrings. */
2450 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2451 to the position after last scanned or to NULL. */
2453 #define INIT_AND_WITHP \
2454 assert(!and_withp); \
2455 Newx(and_withp,1,struct regnode_charclass_class); \
2456 SAVEFREEPV(and_withp)
2458 /* this is a chain of data about sub patterns we are processing that
2459 need to be handled seperately/specially in study_chunk. Its so
2460 we can simulate recursion without losing state. */
2462 typedef struct scan_frame {
2463 regnode *last; /* last node to process in this frame */
2464 regnode *next; /* next node to process when last is reached */
2465 struct scan_frame *prev; /*previous frame*/
2466 I32 stop; /* what stopparen do we use */
2470 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2472 #define CASE_SYNST_FNC(nAmE) \
2474 if (flags & SCF_DO_STCLASS_AND) { \
2475 for (value = 0; value < 256; value++) \
2476 if (!is_ ## nAmE ## _cp(value)) \
2477 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2480 for (value = 0; value < 256; value++) \
2481 if (is_ ## nAmE ## _cp(value)) \
2482 ANYOF_BITMAP_SET(data->start_class, value); \
2486 if (flags & SCF_DO_STCLASS_AND) { \
2487 for (value = 0; value < 256; value++) \
2488 if (is_ ## nAmE ## _cp(value)) \
2489 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2492 for (value = 0; value < 256; value++) \
2493 if (!is_ ## nAmE ## _cp(value)) \
2494 ANYOF_BITMAP_SET(data->start_class, value); \
2501 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2502 I32 *minlenp, I32 *deltap,
2507 struct regnode_charclass_class *and_withp,
2508 U32 flags, U32 depth)
2509 /* scanp: Start here (read-write). */
2510 /* deltap: Write maxlen-minlen here. */
2511 /* last: Stop before this one. */
2512 /* data: string data about the pattern */
2513 /* stopparen: treat close N as END */
2514 /* recursed: which subroutines have we recursed into */
2515 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2518 I32 min = 0, pars = 0, code;
2519 regnode *scan = *scanp, *next;
2521 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2522 int is_inf_internal = 0; /* The studied chunk is infinite */
2523 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2524 scan_data_t data_fake;
2525 SV *re_trie_maxbuff = NULL;
2526 regnode *first_non_open = scan;
2527 I32 stopmin = I32_MAX;
2528 scan_frame *frame = NULL;
2529 GET_RE_DEBUG_FLAGS_DECL;
2531 PERL_ARGS_ASSERT_STUDY_CHUNK;
2534 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2538 while (first_non_open && OP(first_non_open) == OPEN)
2539 first_non_open=regnext(first_non_open);
2544 while ( scan && OP(scan) != END && scan < last ){
2545 /* Peephole optimizer: */
2546 DEBUG_STUDYDATA("Peep:", data,depth);
2547 DEBUG_PEEP("Peep",scan,depth);
2548 JOIN_EXACT(scan,&min,0);
2550 /* Follow the next-chain of the current node and optimize
2551 away all the NOTHINGs from it. */
2552 if (OP(scan) != CURLYX) {
2553 const int max = (reg_off_by_arg[OP(scan)]
2555 /* I32 may be smaller than U16 on CRAYs! */
2556 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2557 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2561 /* Skip NOTHING and LONGJMP. */
2562 while ((n = regnext(n))
2563 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2564 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2565 && off + noff < max)
2567 if (reg_off_by_arg[OP(scan)])
2570 NEXT_OFF(scan) = off;
2575 /* The principal pseudo-switch. Cannot be a switch, since we
2576 look into several different things. */
2577 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2578 || OP(scan) == IFTHEN) {
2579 next = regnext(scan);
2581 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2583 if (OP(next) == code || code == IFTHEN) {
2584 /* NOTE - There is similar code to this block below for handling
2585 TRIE nodes on a re-study. If you change stuff here check there
2587 I32 max1 = 0, min1 = I32_MAX, num = 0;
2588 struct regnode_charclass_class accum;
2589 regnode * const startbranch=scan;
2591 if (flags & SCF_DO_SUBSTR)
2592 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2593 if (flags & SCF_DO_STCLASS)
2594 cl_init_zero(pRExC_state, &accum);
2596 while (OP(scan) == code) {
2597 I32 deltanext, minnext, f = 0, fake;
2598 struct regnode_charclass_class this_class;
2601 data_fake.flags = 0;
2603 data_fake.whilem_c = data->whilem_c;
2604 data_fake.last_closep = data->last_closep;
2607 data_fake.last_closep = &fake;
2609 data_fake.pos_delta = delta;
2610 next = regnext(scan);
2611 scan = NEXTOPER(scan);
2613 scan = NEXTOPER(scan);
2614 if (flags & SCF_DO_STCLASS) {
2615 cl_init(pRExC_state, &this_class);
2616 data_fake.start_class = &this_class;
2617 f = SCF_DO_STCLASS_AND;
2619 if (flags & SCF_WHILEM_VISITED_POS)
2620 f |= SCF_WHILEM_VISITED_POS;
2622 /* we suppose the run is continuous, last=next...*/
2623 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2625 stopparen, recursed, NULL, f,depth+1);
2628 if (max1 < minnext + deltanext)
2629 max1 = minnext + deltanext;
2630 if (deltanext == I32_MAX)
2631 is_inf = is_inf_internal = 1;
2633 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2635 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2636 if ( stopmin > minnext)
2637 stopmin = min + min1;
2638 flags &= ~SCF_DO_SUBSTR;
2640 data->flags |= SCF_SEEN_ACCEPT;
2643 if (data_fake.flags & SF_HAS_EVAL)
2644 data->flags |= SF_HAS_EVAL;
2645 data->whilem_c = data_fake.whilem_c;
2647 if (flags & SCF_DO_STCLASS)
2648 cl_or(pRExC_state, &accum, &this_class);
2650 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2652 if (flags & SCF_DO_SUBSTR) {
2653 data->pos_min += min1;
2654 data->pos_delta += max1 - min1;
2655 if (max1 != min1 || is_inf)
2656 data->longest = &(data->longest_float);
2659 delta += max1 - min1;
2660 if (flags & SCF_DO_STCLASS_OR) {
2661 cl_or(pRExC_state, data->start_class, &accum);
2663 cl_and(data->start_class, and_withp);
2664 flags &= ~SCF_DO_STCLASS;
2667 else if (flags & SCF_DO_STCLASS_AND) {
2669 cl_and(data->start_class, &accum);
2670 flags &= ~SCF_DO_STCLASS;
2673 /* Switch to OR mode: cache the old value of
2674 * data->start_class */
2676 StructCopy(data->start_class, and_withp,
2677 struct regnode_charclass_class);
2678 flags &= ~SCF_DO_STCLASS_AND;
2679 StructCopy(&accum, data->start_class,
2680 struct regnode_charclass_class);
2681 flags |= SCF_DO_STCLASS_OR;
2682 data->start_class->flags |= ANYOF_EOS;
2686 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2689 Assuming this was/is a branch we are dealing with: 'scan' now
2690 points at the item that follows the branch sequence, whatever
2691 it is. We now start at the beginning of the sequence and look
2698 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2700 If we can find such a subseqence we need to turn the first
2701 element into a trie and then add the subsequent branch exact
2702 strings to the trie.
2706 1. patterns where the whole set of branch can be converted.
2708 2. patterns where only a subset can be converted.
2710 In case 1 we can replace the whole set with a single regop
2711 for the trie. In case 2 we need to keep the start and end
2714 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2715 becomes BRANCH TRIE; BRANCH X;
2717 There is an additional case, that being where there is a
2718 common prefix, which gets split out into an EXACT like node
2719 preceding the TRIE node.
2721 If x(1..n)==tail then we can do a simple trie, if not we make
2722 a "jump" trie, such that when we match the appropriate word
2723 we "jump" to the appopriate tail node. Essentailly we turn
2724 a nested if into a case structure of sorts.
2729 if (!re_trie_maxbuff) {
2730 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2731 if (!SvIOK(re_trie_maxbuff))
2732 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2734 if ( SvIV(re_trie_maxbuff)>=0 ) {
2736 regnode *first = (regnode *)NULL;
2737 regnode *last = (regnode *)NULL;
2738 regnode *tail = scan;
2743 SV * const mysv = sv_newmortal(); /* for dumping */
2745 /* var tail is used because there may be a TAIL
2746 regop in the way. Ie, the exacts will point to the
2747 thing following the TAIL, but the last branch will
2748 point at the TAIL. So we advance tail. If we
2749 have nested (?:) we may have to move through several
2753 while ( OP( tail ) == TAIL ) {
2754 /* this is the TAIL generated by (?:) */
2755 tail = regnext( tail );
2760 regprop(RExC_rx, mysv, tail );
2761 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2762 (int)depth * 2 + 2, "",
2763 "Looking for TRIE'able sequences. Tail node is: ",
2764 SvPV_nolen_const( mysv )
2770 step through the branches, cur represents each
2771 branch, noper is the first thing to be matched
2772 as part of that branch and noper_next is the
2773 regnext() of that node. if noper is an EXACT
2774 and noper_next is the same as scan (our current
2775 position in the regex) then the EXACT branch is
2776 a possible optimization target. Once we have
2777 two or more consequetive such branches we can
2778 create a trie of the EXACT's contents and stich
2779 it in place. If the sequence represents all of
2780 the branches we eliminate the whole thing and
2781 replace it with a single TRIE. If it is a
2782 subsequence then we need to stitch it in. This
2783 means the first branch has to remain, and needs
2784 to be repointed at the item on the branch chain
2785 following the last branch optimized. This could
2786 be either a BRANCH, in which case the
2787 subsequence is internal, or it could be the
2788 item following the branch sequence in which
2789 case the subsequence is at the end.
2793 /* dont use tail as the end marker for this traverse */
2794 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2795 regnode * const noper = NEXTOPER( cur );
2796 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2797 regnode * const noper_next = regnext( noper );
2801 regprop(RExC_rx, mysv, cur);
2802 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2803 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2805 regprop(RExC_rx, mysv, noper);
2806 PerlIO_printf( Perl_debug_log, " -> %s",
2807 SvPV_nolen_const(mysv));
2810 regprop(RExC_rx, mysv, noper_next );
2811 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2812 SvPV_nolen_const(mysv));
2814 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2815 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2817 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2818 : PL_regkind[ OP( noper ) ] == EXACT )
2819 || OP(noper) == NOTHING )
2821 && noper_next == tail
2826 if ( !first || optype == NOTHING ) {
2827 if (!first) first = cur;
2828 optype = OP( noper );
2834 Currently we do not believe that the trie logic can
2835 handle case insensitive matching properly when the
2836 pattern is not unicode (thus forcing unicode semantics).
2838 If/when this is fixed the following define can be swapped
2839 in below to fully enable trie logic.
2841 #define TRIE_TYPE_IS_SAFE 1
2844 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2846 if ( last && TRIE_TYPE_IS_SAFE ) {
2847 make_trie( pRExC_state,
2848 startbranch, first, cur, tail, count,
2851 if ( PL_regkind[ OP( noper ) ] == EXACT
2853 && noper_next == tail
2858 optype = OP( noper );
2868 regprop(RExC_rx, mysv, cur);
2869 PerlIO_printf( Perl_debug_log,
2870 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2871 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2875 if ( last && TRIE_TYPE_IS_SAFE ) {
2876 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2877 #ifdef TRIE_STUDY_OPT
2878 if ( ((made == MADE_EXACT_TRIE &&
2879 startbranch == first)
2880 || ( first_non_open == first )) &&
2882 flags |= SCF_TRIE_RESTUDY;
2883 if ( startbranch == first
2886 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2896 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2897 scan = NEXTOPER(NEXTOPER(scan));
2898 } else /* single branch is optimized. */
2899 scan = NEXTOPER(scan);
2901 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2902 scan_frame *newframe = NULL;
2907 if (OP(scan) != SUSPEND) {
2908 /* set the pointer */
2909 if (OP(scan) == GOSUB) {
2911 RExC_recurse[ARG2L(scan)] = scan;
2912 start = RExC_open_parens[paren-1];
2913 end = RExC_close_parens[paren-1];
2916 start = RExC_rxi->program + 1;
2920 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2921 SAVEFREEPV(recursed);
2923 if (!PAREN_TEST(recursed,paren+1)) {
2924 PAREN_SET(recursed,paren+1);
2925 Newx(newframe,1,scan_frame);
2927 if (flags & SCF_DO_SUBSTR) {
2928 SCAN_COMMIT(pRExC_state,data,minlenp);
2929 data->longest = &(data->longest_float);
2931 is_inf = is_inf_internal = 1;
2932 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2933 cl_anything(pRExC_state, data->start_class);
2934 flags &= ~SCF_DO_STCLASS;
2937 Newx(newframe,1,scan_frame);
2940 end = regnext(scan);
2945 SAVEFREEPV(newframe);
2946 newframe->next = regnext(scan);
2947 newframe->last = last;
2948 newframe->stop = stopparen;
2949 newframe->prev = frame;
2959 else if (OP(scan) == EXACT) {
2960 I32 l = STR_LEN(scan);
2963 const U8 * const s = (U8*)STRING(scan);
2964 l = utf8_length(s, s + l);
2965 uc = utf8_to_uvchr(s, NULL);
2967 uc = *((U8*)STRING(scan));
2970 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2971 /* The code below prefers earlier match for fixed
2972 offset, later match for variable offset. */
2973 if (data->last_end == -1) { /* Update the start info. */
2974 data->last_start_min = data->pos_min;
2975 data->last_start_max = is_inf
2976 ? I32_MAX : data->pos_min + data->pos_delta;
2978 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2980 SvUTF8_on(data->last_found);
2982 SV * const sv = data->last_found;
2983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2985 if (mg && mg->mg_len >= 0)
2986 mg->mg_len += utf8_length((U8*)STRING(scan),
2987 (U8*)STRING(scan)+STR_LEN(scan));
2989 data->last_end = data->pos_min + l;
2990 data->pos_min += l; /* As in the first entry. */
2991 data->flags &= ~SF_BEFORE_EOL;
2993 if (flags & SCF_DO_STCLASS_AND) {
2994 /* Check whether it is compatible with what we know already! */
2998 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2999 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3000 && (!(data->start_class->flags & ANYOF_FOLD)
3001 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3004 ANYOF_CLASS_ZERO(data->start_class);
3005 ANYOF_BITMAP_ZERO(data->start_class);
3007 ANYOF_BITMAP_SET(data->start_class, uc);
3008 data->start_class->flags &= ~ANYOF_EOS;
3010 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3012 else if (flags & SCF_DO_STCLASS_OR) {
3013 /* false positive possible if the class is case-folded */
3015 ANYOF_BITMAP_SET(data->start_class, uc);
3017 data->start_class->flags |= ANYOF_UNICODE_ALL;
3018 data->start_class->flags &= ~ANYOF_EOS;
3019 cl_and(data->start_class, and_withp);
3021 flags &= ~SCF_DO_STCLASS;
3023 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3024 I32 l = STR_LEN(scan);
3025 UV uc = *((U8*)STRING(scan));
3027 /* Search for fixed substrings supports EXACT only. */
3028 if (flags & SCF_DO_SUBSTR) {
3030 SCAN_COMMIT(pRExC_state, data, minlenp);
3033 const U8 * const s = (U8 *)STRING(scan);
3034 l = utf8_length(s, s + l);
3035 uc = utf8_to_uvchr(s, NULL);
3038 if (flags & SCF_DO_SUBSTR)
3040 if (flags & SCF_DO_STCLASS_AND) {
3041 /* Check whether it is compatible with what we know already! */
3045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3049 ANYOF_CLASS_ZERO(data->start_class);
3050 ANYOF_BITMAP_ZERO(data->start_class);
3052 ANYOF_BITMAP_SET(data->start_class, uc);
3053 data->start_class->flags &= ~ANYOF_EOS;
3054 data->start_class->flags |= ANYOF_FOLD;
3055 if (OP(scan) == EXACTFL)
3056 data->start_class->flags |= ANYOF_LOCALE;
3059 else if (flags & SCF_DO_STCLASS_OR) {
3060 if (data->start_class->flags & ANYOF_FOLD) {
3061 /* false positive possible if the class is case-folded.
3062 Assume that the locale settings are the same... */
3064 ANYOF_BITMAP_SET(data->start_class, uc);
3065 data->start_class->flags &= ~ANYOF_EOS;
3067 cl_and(data->start_class, and_withp);
3069 flags &= ~SCF_DO_STCLASS;
3071 else if (strchr((const char*)PL_varies,OP(scan))) {
3072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3073 I32 f = flags, pos_before = 0;
3074 regnode * const oscan = scan;
3075 struct regnode_charclass_class this_class;
3076 struct regnode_charclass_class *oclass = NULL;
3077 I32 next_is_eval = 0;
3079 switch (PL_regkind[OP(scan)]) {
3080 case WHILEM: /* End of (?:...)* . */
3081 scan = NEXTOPER(scan);
3084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3085 next = NEXTOPER(scan);
3086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3088 maxcount = REG_INFTY;
3089 next = regnext(scan);
3090 scan = NEXTOPER(scan);
3094 if (flags & SCF_DO_SUBSTR)
3099 if (flags & SCF_DO_STCLASS) {
3101 maxcount = REG_INFTY;
3102 next = regnext(scan);
3103 scan = NEXTOPER(scan);
3106 is_inf = is_inf_internal = 1;
3107 scan = regnext(scan);
3108 if (flags & SCF_DO_SUBSTR) {
3109 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3110 data->longest = &(data->longest_float);
3112 goto optimize_curly_tail;
3114 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3115 && (scan->flags == stopparen))
3120 mincount = ARG1(scan);
3121 maxcount = ARG2(scan);
3123 next = regnext(scan);
3124 if (OP(scan) == CURLYX) {
3125 I32 lp = (data ? *(data->last_closep) : 0);
3126 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3128 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3129 next_is_eval = (OP(scan) == EVAL);
3131 if (flags & SCF_DO_SUBSTR) {
3132 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3133 pos_before = data->pos_min;
3137 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3139 data->flags |= SF_IS_INF;
3141 if (flags & SCF_DO_STCLASS) {
3142 cl_init(pRExC_state, &this_class);
3143 oclass = data->start_class;
3144 data->start_class = &this_class;
3145 f |= SCF_DO_STCLASS_AND;
3146 f &= ~SCF_DO_STCLASS_OR;
3148 /* These are the cases when once a subexpression
3149 fails at a particular position, it cannot succeed
3150 even after backtracking at the enclosing scope.
3152 XXXX what if minimal match and we are at the
3153 initial run of {n,m}? */
3154 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3155 f &= ~SCF_WHILEM_VISITED_POS;
3157 /* This will finish on WHILEM, setting scan, or on NULL: */
3158 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3159 last, data, stopparen, recursed, NULL,
3161 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3163 if (flags & SCF_DO_STCLASS)
3164 data->start_class = oclass;
3165 if (mincount == 0 || minnext == 0) {
3166 if (flags & SCF_DO_STCLASS_OR) {
3167 cl_or(pRExC_state, data->start_class, &this_class);
3169 else if (flags & SCF_DO_STCLASS_AND) {
3170 /* Switch to OR mode: cache the old value of
3171 * data->start_class */
3173 StructCopy(data->start_class, and_withp,
3174 struct regnode_charclass_class);
3175 flags &= ~SCF_DO_STCLASS_AND;
3176 StructCopy(&this_class, data->start_class,
3177 struct regnode_charclass_class);
3178 flags |= SCF_DO_STCLASS_OR;
3179 data->start_class->flags |= ANYOF_EOS;
3181 } else { /* Non-zero len */
3182 if (flags & SCF_DO_STCLASS_OR) {
3183 cl_or(pRExC_state, data->start_class, &this_class);
3184 cl_and(data->start_class, and_withp);
3186 else if (flags & SCF_DO_STCLASS_AND)
3187 cl_and(data->start_class, &this_class);
3188 flags &= ~SCF_DO_STCLASS;
3190 if (!scan) /* It was not CURLYX, but CURLY. */
3192 if ( /* ? quantifier ok, except for (?{ ... }) */
3193 (next_is_eval || !(mincount == 0 && maxcount == 1))
3194 && (minnext == 0) && (deltanext == 0)
3195 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3196 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3198 ckWARNreg(RExC_parse,
3199 "Quantifier unexpected on zero-length expression");
3202 min += minnext * mincount;
3203 is_inf_internal |= ((maxcount == REG_INFTY
3204 && (minnext + deltanext) > 0)
3205 || deltanext == I32_MAX);
3206 is_inf |= is_inf_internal;
3207 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3209 /* Try powerful optimization CURLYX => CURLYN. */
3210 if ( OP(oscan) == CURLYX && data
3211 && data->flags & SF_IN_PAR
3212 && !(data->flags & SF_HAS_EVAL)
3213 && !deltanext && minnext == 1 ) {
3214 /* Try to optimize to CURLYN. */
3215 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3216 regnode * const nxt1 = nxt;
3223 if (!strchr((const char*)PL_simple,OP(nxt))
3224 && !(PL_regkind[OP(nxt)] == EXACT
3225 && STR_LEN(nxt) == 1))
3231 if (OP(nxt) != CLOSE)
3233 if (RExC_open_parens) {
3234 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3235 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3237 /* Now we know that nxt2 is the only contents: */
3238 oscan->flags = (U8)ARG(nxt);
3240 OP(nxt1) = NOTHING; /* was OPEN. */
3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3245 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3246 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3247 OP(nxt + 1) = OPTIMIZED; /* was count. */
3248 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3253 /* Try optimization CURLYX => CURLYM. */
3254 if ( OP(oscan) == CURLYX && data
3255 && !(data->flags & SF_HAS_PAR)
3256 && !(data->flags & SF_HAS_EVAL)
3257 && !deltanext /* atom is fixed width */
3258 && minnext != 0 /* CURLYM can't handle zero width */
3260 /* XXXX How to optimize if data == 0? */
3261 /* Optimize to a simpler form. */
3262 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3266 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3267 && (OP(nxt2) != WHILEM))
3269 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3270 /* Need to optimize away parenths. */
3271 if (data->flags & SF_IN_PAR) {
3272 /* Set the parenth number. */
3273 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3275 if (OP(nxt) != CLOSE)
3276 FAIL("Panic opt close");
3277 oscan->flags = (U8)ARG(nxt);
3278 if (RExC_open_parens) {
3279 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3280 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3282 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3283 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3286 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3287 OP(nxt + 1) = OPTIMIZED; /* was count. */
3288 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3289 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3292 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3293 regnode *nnxt = regnext(nxt1);
3296 if (reg_off_by_arg[OP(nxt1)])
3297 ARG_SET(nxt1, nxt2 - nxt1);
3298 else if (nxt2 - nxt1 < U16_MAX)
3299 NEXT_OFF(nxt1) = nxt2 - nxt1;
3301 OP(nxt) = NOTHING; /* Cannot beautify */
3306 /* Optimize again: */
3307 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3308 NULL, stopparen, recursed, NULL, 0,depth+1);
3313 else if ((OP(oscan) == CURLYX)
3314 && (flags & SCF_WHILEM_VISITED_POS)
3315 /* See the comment on a similar expression above.
3316 However, this time it not a subexpression
3317 we care about, but the expression itself. */
3318 && (maxcount == REG_INFTY)
3319 && data && ++data->whilem_c < 16) {
3320 /* This stays as CURLYX, we can put the count/of pair. */
3321 /* Find WHILEM (as in regexec.c) */
3322 regnode *nxt = oscan + NEXT_OFF(oscan);
3324 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3326 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3327 | (RExC_whilem_seen << 4)); /* On WHILEM */
3329 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3331 if (flags & SCF_DO_SUBSTR) {
3332 SV *last_str = NULL;
3333 int counted = mincount != 0;
3335 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3336 #if defined(SPARC64_GCC_WORKAROUND)
3339 const char *s = NULL;
3342 if (pos_before >= data->last_start_min)
3345 b = data->last_start_min;
3348 s = SvPV_const(data->last_found, l);
3349 old = b - data->last_start_min;
3352 I32 b = pos_before >= data->last_start_min
3353 ? pos_before : data->last_start_min;
3355 const char * const s = SvPV_const(data->last_found, l);
3356 I32 old = b - data->last_start_min;
3360 old = utf8_hop((U8*)s, old) - (U8*)s;
3363 /* Get the added string: */
3364 last_str = newSVpvn_utf8(s + old, l, UTF);
3365 if (deltanext == 0 && pos_before == b) {
3366 /* What was added is a constant string */
3368 SvGROW(last_str, (mincount * l) + 1);
3369 repeatcpy(SvPVX(last_str) + l,
3370 SvPVX_const(last_str), l, mincount - 1);
3371 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3372 /* Add additional parts. */
3373 SvCUR_set(data->last_found,
3374 SvCUR(data->last_found) - l);
3375 sv_catsv(data->last_found, last_str);
3377 SV * sv = data->last_found;
3379 SvUTF8(sv) && SvMAGICAL(sv) ?
3380 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3381 if (mg && mg->mg_len >= 0)
3382 mg->mg_len += CHR_SVLEN(last_str) - l;
3384 data->last_end += l * (mincount - 1);
3387 /* start offset must point into the last copy */
3388 data->last_start_min += minnext * (mincount - 1);
3389 data->last_start_max += is_inf ? I32_MAX
3390 : (maxcount - 1) * (minnext + data->pos_delta);
3393 /* It is counted once already... */
3394 data->pos_min += minnext * (mincount - counted);
3395 data->pos_delta += - counted * deltanext +
3396 (minnext + deltanext) * maxcount - minnext * mincount;
3397 if (mincount != maxcount) {
3398 /* Cannot extend fixed substrings found inside
3400 SCAN_COMMIT(pRExC_state,data,minlenp);
3401 if (mincount && last_str) {
3402 SV * const sv = data->last_found;
3403 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3404 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3408 sv_setsv(sv, last_str);
3409 data->last_end = data->pos_min;
3410 data->last_start_min =
3411 data->pos_min - CHR_SVLEN(last_str);
3412 data->last_start_max = is_inf
3414 : data->pos_min + data->pos_delta
3415 - CHR_SVLEN(last_str);
3417 data->longest = &(data->longest_float);
3419 SvREFCNT_dec(last_str);
3421 if (data && (fl & SF_HAS_EVAL))
3422 data->flags |= SF_HAS_EVAL;
3423 optimize_curly_tail:
3424 if (OP(oscan) != CURLYX) {
3425 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3427 NEXT_OFF(oscan) += NEXT_OFF(next);
3430 default: /* REF and CLUMP only? */
3431 if (flags & SCF_DO_SUBSTR) {
3432 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3433 data->longest = &(data->longest_float);
3435 is_inf = is_inf_internal = 1;
3436 if (flags & SCF_DO_STCLASS_OR)
3437 cl_anything(pRExC_state, data->start_class);
3438 flags &= ~SCF_DO_STCLASS;
3442 else if (OP(scan) == LNBREAK) {
3443 if (flags & SCF_DO_STCLASS) {
3445 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3446 if (flags & SCF_DO_STCLASS_AND) {
3447 for (value = 0; value < 256; value++)
3448 if (!is_VERTWS_cp(value))
3449 ANYOF_BITMAP_CLEAR(data->start_class, value);
3452 for (value = 0; value < 256; value++)
3453 if (is_VERTWS_cp(value))
3454 ANYOF_BITMAP_SET(data->start_class, value);
3456 if (flags & SCF_DO_STCLASS_OR)
3457 cl_and(data->start_class, and_withp);
3458 flags &= ~SCF_DO_STCLASS;
3462 if (flags & SCF_DO_SUBSTR) {
3463 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3465 data->pos_delta += 1;
3466 data->longest = &(data->longest_float);
3470 else if (OP(scan) == FOLDCHAR) {
3471 int d = ARG(scan)==0xDF ? 1 : 2;
3472 flags &= ~SCF_DO_STCLASS;
3475 if (flags & SCF_DO_SUBSTR) {
3476 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3478 data->pos_delta += d;
3479 data->longest = &(data->longest_float);
3482 else if (strchr((const char*)PL_simple,OP(scan))) {
3485 if (flags & SCF_DO_SUBSTR) {
3486 SCAN_COMMIT(pRExC_state,data,minlenp);
3490 if (flags & SCF_DO_STCLASS) {
3491 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3493 /* Some of the logic below assumes that switching
3494 locale on will only add false positives. */
3495 switch (PL_regkind[OP(scan)]) {
3499 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3500 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3501 cl_anything(pRExC_state, data->start_class);
3504 if (OP(scan) == SANY)
3506 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3507 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3508 || (data->start_class->flags & ANYOF_CLASS));
3509 cl_anything(pRExC_state, data->start_class);
3511 if (flags & SCF_DO_STCLASS_AND || !value)
3512 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3515 if (flags & SCF_DO_STCLASS_AND)
3516 cl_and(data->start_class,
3517 (struct regnode_charclass_class*)scan);
3519 cl_or(pRExC_state, data->start_class,
3520 (struct regnode_charclass_class*)scan);
3523 if (flags & SCF_DO_STCLASS_AND) {
3524 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3525 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3526 for (value = 0; value < 256; value++)
3527 if (!isALNUM(value))
3528 ANYOF_BITMAP_CLEAR(data->start_class, value);
3532 if (data->start_class->flags & ANYOF_LOCALE)
3533 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3535 for (value = 0; value < 256; value++)
3537 ANYOF_BITMAP_SET(data->start_class, value);
3542 if (flags & SCF_DO_STCLASS_AND) {
3543 if (data->start_class->flags & ANYOF_LOCALE)
3544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3547 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3548 data->start_class->flags |= ANYOF_LOCALE;
3552 if (flags & SCF_DO_STCLASS_AND) {
3553 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3554 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3555 for (value = 0; value < 256; value++)
3557 ANYOF_BITMAP_CLEAR(data->start_class, value);
3561 if (data->start_class->flags & ANYOF_LOCALE)
3562 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3564 for (value = 0; value < 256; value++)
3565 if (!isALNUM(value))
3566 ANYOF_BITMAP_SET(data->start_class, value);
3571 if (flags & SCF_DO_STCLASS_AND) {
3572 if (data->start_class->flags & ANYOF_LOCALE)
3573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3576 data->start_class->flags |= ANYOF_LOCALE;
3577 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3581 if (flags & SCF_DO_STCLASS_AND) {
3582 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3583 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3584 for (value = 0; value < 256; value++)
3585 if (!isSPACE(value))
3586 ANYOF_BITMAP_CLEAR(data->start_class, value);
3590 if (data->start_class->flags & ANYOF_LOCALE)
3591 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3593 for (value = 0; value < 256; value++)
3595 ANYOF_BITMAP_SET(data->start_class, value);
3600 if (flags & SCF_DO_STCLASS_AND) {
3601 if (data->start_class->flags & ANYOF_LOCALE)
3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3605 data->start_class->flags |= ANYOF_LOCALE;
3606 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3610 if (flags & SCF_DO_STCLASS_AND) {
3611 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3612 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3613 for (value = 0; value < 256; value++)
3615 ANYOF_BITMAP_CLEAR(data->start_class, value);
3619 if (data->start_class->flags & ANYOF_LOCALE)
3620 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3622 for (value = 0; value < 256; value++)
3623 if (!isSPACE(value))
3624 ANYOF_BITMAP_SET(data->start_class, value);
3629 if (flags & SCF_DO_STCLASS_AND) {
3630 if (data->start_class->flags & ANYOF_LOCALE) {
3631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3632 for (value = 0; value < 256; value++)
3633 if (!isSPACE(value))
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
3638 data->start_class->flags |= ANYOF_LOCALE;
3639 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3643 if (flags & SCF_DO_STCLASS_AND) {
3644 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3645 for (value = 0; value < 256; value++)
3646 if (!isDIGIT(value))
3647 ANYOF_BITMAP_CLEAR(data->start_class, value);
3650 if (data->start_class->flags & ANYOF_LOCALE)
3651 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3653 for (value = 0; value < 256; value++)
3655 ANYOF_BITMAP_SET(data->start_class, value);
3660 if (flags & SCF_DO_STCLASS_AND) {
3661 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3662 for (value = 0; value < 256; value++)
3664 ANYOF_BITMAP_CLEAR(data->start_class, value);
3667 if (data->start_class->flags & ANYOF_LOCALE)
3668 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3670 for (value = 0; value < 256; value++)
3671 if (!isDIGIT(value))
3672 ANYOF_BITMAP_SET(data->start_class, value);
3676 CASE_SYNST_FNC(VERTWS);
3677 CASE_SYNST_FNC(HORIZWS);
3680 if (flags & SCF_DO_STCLASS_OR)
3681 cl_and(data->start_class, and_withp);
3682 flags &= ~SCF_DO_STCLASS;
3685 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3686 data->flags |= (OP(scan) == MEOL
3690 else if ( PL_regkind[OP(scan)] == BRANCHJ
3691 /* Lookbehind, or need to calculate parens/evals/stclass: */
3692 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3693 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3694 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3695 || OP(scan) == UNLESSM )
3697 /* Negative Lookahead/lookbehind
3698 In this case we can't do fixed string optimisation.
3701 I32 deltanext, minnext, fake = 0;
3703 struct regnode_charclass_class intrnl;
3706 data_fake.flags = 0;
3708 data_fake.whilem_c = data->whilem_c;
3709 data_fake.last_closep = data->last_closep;
3712 data_fake.last_closep = &fake;
3713 data_fake.pos_delta = delta;
3714 if ( flags & SCF_DO_STCLASS && !scan->flags
3715 && OP(scan) == IFMATCH ) { /* Lookahead */
3716 cl_init(pRExC_state, &intrnl);
3717 data_fake.start_class = &intrnl;
3718 f |= SCF_DO_STCLASS_AND;
3720 if (flags & SCF_WHILEM_VISITED_POS)
3721 f |= SCF_WHILEM_VISITED_POS;
3722 next = regnext(scan);
3723 nscan = NEXTOPER(NEXTOPER(scan));
3724 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3725 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3728 FAIL("Variable length lookbehind not implemented");
3730 else if (minnext > (I32)U8_MAX) {
3731 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3733 scan->flags = (U8)minnext;
3736 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3738 if (data_fake.flags & SF_HAS_EVAL)
3739 data->flags |= SF_HAS_EVAL;
3740 data->whilem_c = data_fake.whilem_c;
3742 if (f & SCF_DO_STCLASS_AND) {
3743 if (flags & SCF_DO_STCLASS_OR) {
3744 /* OR before, AND after: ideally we would recurse with
3745 * data_fake to get the AND applied by study of the
3746 * remainder of the pattern, and then derecurse;
3747 * *** HACK *** for now just treat as "no information".
3748 * See [perl #56690].
3750 cl_init(pRExC_state, data->start_class);
3752 /* AND before and after: combine and continue */
3753 const int was = (data->start_class->flags & ANYOF_EOS);
3755 cl_and(data->start_class, &intrnl);
3757 data->start_class->flags |= ANYOF_EOS;
3761 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3763 /* Positive Lookahead/lookbehind
3764 In this case we can do fixed string optimisation,
3765 but we must be careful about it. Note in the case of
3766 lookbehind the positions will be offset by the minimum
3767 length of the pattern, something we won't know about
3768 until after the recurse.
3770 I32 deltanext, fake = 0;
3772 struct regnode_charclass_class intrnl;
3774 /* We use SAVEFREEPV so that when the full compile
3775 is finished perl will clean up the allocated
3776 minlens when its all done. This was we don't
3777 have to worry about freeing them when we know
3778 they wont be used, which would be a pain.
3781 Newx( minnextp, 1, I32 );
3782 SAVEFREEPV(minnextp);
3785 StructCopy(data, &data_fake, scan_data_t);
3786 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3789 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3790 data_fake.last_found=newSVsv(data->last_found);
3794 data_fake.last_closep = &fake;
3795 data_fake.flags = 0;
3796 data_fake.pos_delta = delta;
3798 data_fake.flags |= SF_IS_INF;
3799 if ( flags & SCF_DO_STCLASS && !scan->flags
3800 && OP(scan) == IFMATCH ) { /* Lookahead */
3801 cl_init(pRExC_state, &intrnl);
3802 data_fake.start_class = &intrnl;
3803 f |= SCF_DO_STCLASS_AND;
3805 if (flags & SCF_WHILEM_VISITED_POS)
3806 f |= SCF_WHILEM_VISITED_POS;
3807 next = regnext(scan);
3808 nscan = NEXTOPER(NEXTOPER(scan));
3810 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3811 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3814 FAIL("Variable length lookbehind not implemented");
3816 else if (*minnextp > (I32)U8_MAX) {
3817 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3819 scan->flags = (U8)*minnextp;
3824 if (f & SCF_DO_STCLASS_AND) {
3825 const int was = (data->start_class->flags & ANYOF_EOS);
3827 cl_and(data->start_class, &intrnl);
3829 data->start_class->flags |= ANYOF_EOS;
3832 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3834 if (data_fake.flags & SF_HAS_EVAL)
3835 data->flags |= SF_HAS_EVAL;
3836 data->whilem_c = data_fake.whilem_c;
3837 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3838 if (RExC_rx->minlen<*minnextp)
3839 RExC_rx->minlen=*minnextp;
3840 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3841 SvREFCNT_dec(data_fake.last_found);
3843 if ( data_fake.minlen_fixed != minlenp )
3845 data->offset_fixed= data_fake.offset_fixed;
3846 data->minlen_fixed= data_fake.minlen_fixed;
3847 data->lookbehind_fixed+= scan->flags;
3849 if ( data_fake.minlen_float != minlenp )
3851 data->minlen_float= data_fake.minlen_float;
3852 data->offset_float_min=data_fake.offset_float_min;
3853 data->offset_float_max=data_fake.offset_float_max;
3854 data->lookbehind_float+= scan->flags;
3863 else if (OP(scan) == OPEN) {
3864 if (stopparen != (I32)ARG(scan))
3867 else if (OP(scan) == CLOSE) {
3868 if (stopparen == (I32)ARG(scan)) {
3871 if ((I32)ARG(scan) == is_par) {
3872 next = regnext(scan);
3874 if ( next && (OP(next) != WHILEM) && next < last)
3875 is_par = 0; /* Disable optimization */
3878 *(data->last_closep) = ARG(scan);
3880 else if (OP(scan) == EVAL) {
3882 data->flags |= SF_HAS_EVAL;
3884 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3885 if (flags & SCF_DO_SUBSTR) {
3886 SCAN_COMMIT(pRExC_state,data,minlenp);
3887 flags &= ~SCF_DO_SUBSTR;
3889 if (data && OP(scan)==ACCEPT) {
3890 data->flags |= SCF_SEEN_ACCEPT;
3895 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3897 if (flags & SCF_DO_SUBSTR) {
3898 SCAN_COMMIT(pRExC_state,data,minlenp);
3899 data->longest = &(data->longest_float);
3901 is_inf = is_inf_internal = 1;
3902 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3903 cl_anything(pRExC_state, data->start_class);
3904 flags &= ~SCF_DO_STCLASS;
3906 else if (OP(scan) == GPOS) {
3907 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3908 !(delta || is_inf || (data && data->pos_delta)))
3910 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3911 RExC_rx->extflags |= RXf_ANCH_GPOS;
3912 if (RExC_rx->gofs < (U32)min)
3913 RExC_rx->gofs = min;
3915 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3919 #ifdef TRIE_STUDY_OPT
3920 #ifdef FULL_TRIE_STUDY
3921 else if (PL_regkind[OP(scan)] == TRIE) {
3922 /* NOTE - There is similar code to this block above for handling
3923 BRANCH nodes on the initial study. If you change stuff here
3925 regnode *trie_node= scan;
3926 regnode *tail= regnext(scan);
3927 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3928 I32 max1 = 0, min1 = I32_MAX;
3929 struct regnode_charclass_class accum;
3931 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3932 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3933 if (flags & SCF_DO_STCLASS)
3934 cl_init_zero(pRExC_state, &accum);
3940 const regnode *nextbranch= NULL;
3943 for ( word=1 ; word <= trie->wordcount ; word++)
3945 I32 deltanext=0, minnext=0, f = 0, fake;
3946 struct regnode_charclass_class this_class;
3948 data_fake.flags = 0;
3950 data_fake.whilem_c = data->whilem_c;
3951 data_fake.last_closep = data->last_closep;
3954 data_fake.last_closep = &fake;
3955 data_fake.pos_delta = delta;
3956 if (flags & SCF_DO_STCLASS) {
3957 cl_init(pRExC_state, &this_class);
3958 data_fake.start_class = &this_class;
3959 f = SCF_DO_STCLASS_AND;
3961 if (flags & SCF_WHILEM_VISITED_POS)
3962 f |= SCF_WHILEM_VISITED_POS;
3964 if (trie->jump[word]) {
3966 nextbranch = trie_node + trie->jump[0];
3967 scan= trie_node + trie->jump[word];
3968 /* We go from the jump point to the branch that follows
3969 it. Note this means we need the vestigal unused branches
3970 even though they arent otherwise used.
3972 minnext = study_chunk(pRExC_state, &scan, minlenp,
3973 &deltanext, (regnode *)nextbranch, &data_fake,
3974 stopparen, recursed, NULL, f,depth+1);
3976 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3977 nextbranch= regnext((regnode*)nextbranch);
3979 if (min1 > (I32)(minnext + trie->minlen))
3980 min1 = minnext + trie->minlen;
3981 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3982 max1 = minnext + deltanext + trie->maxlen;
3983 if (deltanext == I32_MAX)
3984 is_inf = is_inf_internal = 1;
3986 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3988 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3989 if ( stopmin > min + min1)
3990 stopmin = min + min1;
3991 flags &= ~SCF_DO_SUBSTR;
3993 data->flags |= SCF_SEEN_ACCEPT;
3996 if (data_fake.flags & SF_HAS_EVAL)
3997 data->flags |= SF_HAS_EVAL;
3998 data->whilem_c = data_fake.whilem_c;
4000 if (flags & SCF_DO_STCLASS)
4001 cl_or(pRExC_state, &accum, &this_class);
4004 if (flags & SCF_DO_SUBSTR) {
4005 data->pos_min += min1;
4006 data->pos_delta += max1 - min1;
4007 if (max1 != min1 || is_inf)
4008 data->longest = &(data->longest_float);
4011 delta += max1 - min1;
4012 if (flags & SCF_DO_STCLASS_OR) {
4013 cl_or(pRExC_state, data->start_class, &accum);
4015 cl_and(data->start_class, and_withp);
4016 flags &= ~SCF_DO_STCLASS;
4019 else if (flags & SCF_DO_STCLASS_AND) {
4021 cl_and(data->start_class, &accum);
4022 flags &= ~SCF_DO_STCLASS;
4025 /* Switch to OR mode: cache the old value of
4026 * data->start_class */
4028 StructCopy(data->start_class, and_withp,
4029 struct regnode_charclass_class);
4030 flags &= ~SCF_DO_STCLASS_AND;
4031 StructCopy(&accum, data->start_class,
4032 struct regnode_charclass_class);
4033 flags |= SCF_DO_STCLASS_OR;
4034 data->start_class->flags |= ANYOF_EOS;
4041 else if (PL_regkind[OP(scan)] == TRIE) {
4042 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4045 min += trie->minlen;
4046 delta += (trie->maxlen - trie->minlen);
4047 flags &= ~SCF_DO_STCLASS; /* xxx */
4048 if (flags & SCF_DO_SUBSTR) {
4049 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4050 data->pos_min += trie->minlen;
4051 data->pos_delta += (trie->maxlen - trie->minlen);
4052 if (trie->maxlen != trie->minlen)
4053 data->longest = &(data->longest_float);
4055 if (trie->jump) /* no more substrings -- for now /grr*/
4056 flags &= ~SCF_DO_SUBSTR;
4058 #endif /* old or new */
4059 #endif /* TRIE_STUDY_OPT */
4061 /* Else: zero-length, ignore. */
4062 scan = regnext(scan);
4067 stopparen = frame->stop;
4068 frame = frame->prev;
4069 goto fake_study_recurse;
4074 DEBUG_STUDYDATA("pre-fin:",data,depth);
4077 *deltap = is_inf_internal ? I32_MAX : delta;
4078 if (flags & SCF_DO_SUBSTR && is_inf)
4079 data->pos_delta = I32_MAX - data->pos_min;
4080 if (is_par > (I32)U8_MAX)
4082 if (is_par && pars==1 && data) {
4083 data->flags |= SF_IN_PAR;
4084 data->flags &= ~SF_HAS_PAR;
4086 else if (pars && data) {
4087 data->flags |= SF_HAS_PAR;
4088 data->flags &= ~SF_IN_PAR;
4090 if (flags & SCF_DO_STCLASS_OR)
4091 cl_and(data->start_class, and_withp);
4092 if (flags & SCF_TRIE_RESTUDY)
4093 data->flags |= SCF_TRIE_RESTUDY;
4095 DEBUG_STUDYDATA("post-fin:",data,depth);
4097 return min < stopmin ? min : stopmin;
4101 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4103 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4105 PERL_ARGS_ASSERT_ADD_DATA;
4107 Renewc(RExC_rxi->data,
4108 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4109 char, struct reg_data);
4111 Renew(RExC_rxi->data->what, count + n, U8);
4113 Newx(RExC_rxi->data->what, n, U8);
4114 RExC_rxi->data->count = count + n;
4115 Copy(s, RExC_rxi->data->what + count, n, U8);
4119 /*XXX: todo make this not included in a non debugging perl */
4120 #ifndef PERL_IN_XSUB_RE
4122 Perl_reginitcolors(pTHX)
4125 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4127 char *t = savepv(s);
4131 t = strchr(t, '\t');
4137 PL_colors[i] = t = (char *)"";
4142 PL_colors[i++] = (char *)"";
4149 #ifdef TRIE_STUDY_OPT
4150 #define CHECK_RESTUDY_GOTO \
4152 (data.flags & SCF_TRIE_RESTUDY) \
4156 #define CHECK_RESTUDY_GOTO
4160 - pregcomp - compile a regular expression into internal code
4162 * We can't allocate space until we know how big the compiled form will be,
4163 * but we can't compile it (and thus know how big it is) until we've got a
4164 * place to put the code. So we cheat: we compile it twice, once with code
4165 * generation turned off and size counting turned on, and once "for real".
4166 * This also means that we don't allocate space until we are sure that the
4167 * thing really will compile successfully, and we never have to move the
4168 * code and thus invalidate pointers into it. (Note that it has to be in
4169 * one piece because free() must be able to free it all.) [NB: not true in perl]
4171 * Beware that the optimization-preparation code in here knows about some
4172 * of the structure of the compiled regexp. [I'll say.]
4177 #ifndef PERL_IN_XSUB_RE
4178 #define RE_ENGINE_PTR &PL_core_reg_engine
4180 extern const struct regexp_engine my_reg_engine;
4181 #define RE_ENGINE_PTR &my_reg_engine
4184 #ifndef PERL_IN_XSUB_RE
4186 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4189 HV * const table = GvHV(PL_hintgv);
4191 PERL_ARGS_ASSERT_PREGCOMP;
4193 /* Dispatch a request to compile a regexp to correct
4196 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4197 GET_RE_DEBUG_FLAGS_DECL;
4198 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4199 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4201 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4204 return CALLREGCOMP_ENG(eng, pattern, flags);
4207 return Perl_re_compile(aTHX_ pattern, flags);
4212 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4217 register regexp_internal *ri;
4219 char *exp = SvPV(pattern, plen);
4220 char* xend = exp + plen;
4227 RExC_state_t RExC_state;
4228 RExC_state_t * const pRExC_state = &RExC_state;
4229 #ifdef TRIE_STUDY_OPT
4231 RExC_state_t copyRExC_state;
4233 GET_RE_DEBUG_FLAGS_DECL;
4235 PERL_ARGS_ASSERT_RE_COMPILE;
4237 DEBUG_r(if (!PL_colorset) reginitcolors());
4239 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4242 SV *dsv= sv_newmortal();
4243 RE_PV_QUOTED_DECL(s, RExC_utf8,
4244 dsv, exp, plen, 60);
4245 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4246 PL_colors[4],PL_colors[5],s);
4251 RExC_flags = pm_flags;
4255 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4256 RExC_seen_evals = 0;
4259 /* First pass: determine size, legality. */
4267 RExC_emit = &PL_regdummy;
4268 RExC_whilem_seen = 0;
4269 RExC_open_parens = NULL;
4270 RExC_close_parens = NULL;
4272 RExC_paren_names = NULL;
4274 RExC_paren_name_list = NULL;
4276 RExC_recurse = NULL;
4277 RExC_recurse_count = 0;
4279 #if 0 /* REGC() is (currently) a NOP at the first pass.
4280 * Clever compilers notice this and complain. --jhi */
4281 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4283 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4284 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4285 RExC_precomp = NULL;
4288 if (RExC_utf8 && !RExC_orig_utf8) {
4289 /* It's possible to write a regexp in ascii that represents Unicode
4290 codepoints outside of the byte range, such as via \x{100}. If we
4291 detect such a sequence we have to convert the entire pattern to utf8
4292 and then recompile, as our sizing calculation will have been based
4293 on 1 byte == 1 character, but we will need to use utf8 to encode
4294 at least some part of the pattern, and therefore must convert the whole
4296 XXX: somehow figure out how to make this less expensive...
4299 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4300 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4301 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4303 RExC_orig_utf8 = RExC_utf8;
4305 goto redo_first_pass;
4308 PerlIO_printf(Perl_debug_log,
4309 "Required size %"IVdf" nodes\n"
4310 "Starting second pass (creation)\n",
4313 RExC_lastparse=NULL;
4315 /* Small enough for pointer-storage convention?
4316 If extralen==0, this means that we will not need long jumps. */
4317 if (RExC_size >= 0x10000L && RExC_extralen)
4318 RExC_size += RExC_extralen;
4321 if (RExC_whilem_seen > 15)
4322 RExC_whilem_seen = 15;
4324 /* Allocate space and zero-initialize. Note, the two step process
4325 of zeroing when in debug mode, thus anything assigned has to
4326 happen after that */
4327 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4328 r = (struct regexp*)SvANY(rx);
4329 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4330 char, regexp_internal);
4331 if ( r == NULL || ri == NULL )
4332 FAIL("Regexp out of space");
4334 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4335 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4337 /* bulk initialize base fields with 0. */
4338 Zero(ri, sizeof(regexp_internal), char);
4341 /* non-zero initialization begins here */
4343 r->engine= RE_ENGINE_PTR;
4344 r->extflags = pm_flags;
4346 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4347 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4348 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4349 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4350 >> RXf_PMf_STD_PMMOD_SHIFT);
4351 const char *fptr = STD_PAT_MODS; /*"msix"*/
4353 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4354 + (sizeof(STD_PAT_MODS) - 1)
4355 + (sizeof("(?:)") - 1);
4357 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4358 SvCUR_set(rx, wraplen);
4360 SvFLAGS(rx) |= SvUTF8(pattern);
4363 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4365 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4366 char *colon = r + 1;
4369 while((ch = *fptr++)) {
4383 Copy(RExC_precomp, p, plen, char);
4384 assert ((RX_WRAPPED(rx) - p) < 16);
4385 r->pre_prefix = p - RX_WRAPPED(rx);
4394 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4396 if (RExC_seen & REG_SEEN_RECURSE) {
4397 Newxz(RExC_open_parens, RExC_npar,regnode *);
4398 SAVEFREEPV(RExC_open_parens);
4399 Newxz(RExC_close_parens,RExC_npar,regnode *);
4400 SAVEFREEPV(RExC_close_parens);
4403 /* Useful during FAIL. */
4404 #ifdef RE_TRACK_PATTERN_OFFSETS
4405 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4406 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4407 "%s %"UVuf" bytes for offset annotations.\n",
4408 ri->u.offsets ? "Got" : "Couldn't get",
4409 (UV)((2*RExC_size+1) * sizeof(U32))));
4411 SetProgLen(ri,RExC_size);
4416 /* Second pass: emit code. */
4417 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4422 RExC_emit_start = ri->program;
4423 RExC_emit = ri->program;
4424 RExC_emit_bound = ri->program + RExC_size + 1;
4426 /* Store the count of eval-groups for security checks: */
4427 RExC_rx->seen_evals = RExC_seen_evals;
4428 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4429 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4433 /* XXXX To minimize changes to RE engine we always allocate
4434 3-units-long substrs field. */
4435 Newx(r->substrs, 1, struct reg_substr_data);
4436 if (RExC_recurse_count) {
4437 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4438 SAVEFREEPV(RExC_recurse);
4442 r->minlen = minlen = sawplus = sawopen = 0;
4443 Zero(r->substrs, 1, struct reg_substr_data);
4445 #ifdef TRIE_STUDY_OPT
4447 StructCopy(&zero_scan_data, &data, scan_data_t);
4448 copyRExC_state = RExC_state;
4451 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4453 RExC_state = copyRExC_state;
4454 if (seen & REG_TOP_LEVEL_BRANCHES)
4455 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4457 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4458 if (data.last_found) {
4459 SvREFCNT_dec(data.longest_fixed);
4460 SvREFCNT_dec(data.longest_float);
4461 SvREFCNT_dec(data.last_found);
4463 StructCopy(&zero_scan_data, &data, scan_data_t);
4466 StructCopy(&zero_scan_data, &data, scan_data_t);
4469 /* Dig out information for optimizations. */
4470 r->extflags = RExC_flags; /* was pm_op */
4471 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4474 SvUTF8_on(rx); /* Unicode in it? */
4475 ri->regstclass = NULL;
4476 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4477 r->intflags |= PREGf_NAUGHTY;
4478 scan = ri->program + 1; /* First BRANCH. */
4480 /* testing for BRANCH here tells us whether there is "must appear"
4481 data in the pattern. If there is then we can use it for optimisations */
4482 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4484 STRLEN longest_float_length, longest_fixed_length;
4485 struct regnode_charclass_class ch_class; /* pointed to by data */
4487 I32 last_close = 0; /* pointed to by data */
4488 regnode *first= scan;
4489 regnode *first_next= regnext(first);
4492 * Skip introductions and multiplicators >= 1
4493 * so that we can extract the 'meat' of the pattern that must
4494 * match in the large if() sequence following.
4495 * NOTE that EXACT is NOT covered here, as it is normally
4496 * picked up by the optimiser separately.
4498 * This is unfortunate as the optimiser isnt handling lookahead
4499 * properly currently.
4502 while ((OP(first) == OPEN && (sawopen = 1)) ||
4503 /* An OR of *one* alternative - should not happen now. */
4504 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4505 /* for now we can't handle lookbehind IFMATCH*/
4506 (OP(first) == IFMATCH && !first->flags) ||
4507 (OP(first) == PLUS) ||
4508 (OP(first) == MINMOD) ||
4509 /* An {n,m} with n>0 */
4510 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4511 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4514 * the only op that could be a regnode is PLUS, all the rest
4515 * will be regnode_1 or regnode_2.
4518 if (OP(first) == PLUS)
4521 first += regarglen[OP(first)];
4523 first = NEXTOPER(first);
4524 first_next= regnext(first);
4527 /* Starting-point info. */
4529 DEBUG_PEEP("first:",first,0);
4530 /* Ignore EXACT as we deal with it later. */
4531 if (PL_regkind[OP(first)] == EXACT) {
4532 if (OP(first) == EXACT)
4533 NOOP; /* Empty, get anchored substr later. */
4534 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4535 ri->regstclass = first;
4538 else if (PL_regkind[OP(first)] == TRIE &&
4539 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4542 /* this can happen only on restudy */
4543 if ( OP(first) == TRIE ) {
4544 struct regnode_1 *trieop = (struct regnode_1 *)
4545 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4546 StructCopy(first,trieop,struct regnode_1);
4547 trie_op=(regnode *)trieop;
4549 struct regnode_charclass *trieop = (struct regnode_charclass *)
4550 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4551 StructCopy(first,trieop,struct regnode_charclass);
4552 trie_op=(regnode *)trieop;
4555 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4556 ri->regstclass = trie_op;
4559 else if (strchr((const char*)PL_simple,OP(first)))
4560 ri->regstclass = first;
4561 else if (PL_regkind[OP(first)] == BOUND ||
4562 PL_regkind[OP(first)] == NBOUND)
4563 ri->regstclass = first;
4564 else if (PL_regkind[OP(first)] == BOL) {
4565 r->extflags |= (OP(first) == MBOL
4567 : (OP(first) == SBOL
4570 first = NEXTOPER(first);
4573 else if (OP(first) == GPOS) {
4574 r->extflags |= RXf_ANCH_GPOS;
4575 first = NEXTOPER(first);
4578 else if ((!sawopen || !RExC_sawback) &&
4579 (OP(first) == STAR &&
4580 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4581 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4583 /* turn .* into ^.* with an implied $*=1 */
4585 (OP(NEXTOPER(first)) == REG_ANY)
4588 r->extflags |= type;
4589 r->intflags |= PREGf_IMPLICIT;
4590 first = NEXTOPER(first);
4593 if (sawplus && (!sawopen || !RExC_sawback)
4594 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4595 /* x+ must match at the 1st pos of run of x's */
4596 r->intflags |= PREGf_SKIP;
4598 /* Scan is after the zeroth branch, first is atomic matcher. */
4599 #ifdef TRIE_STUDY_OPT
4602 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4603 (IV)(first - scan + 1))
4607 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4608 (IV)(first - scan + 1))
4614 * If there's something expensive in the r.e., find the
4615 * longest literal string that must appear and make it the
4616 * regmust. Resolve ties in favor of later strings, since
4617 * the regstart check works with the beginning of the r.e.
4618 * and avoiding duplication strengthens checking. Not a
4619 * strong reason, but sufficient in the absence of others.
4620 * [Now we resolve ties in favor of the earlier string if
4621 * it happens that c_offset_min has been invalidated, since the
4622 * earlier string may buy us something the later one won't.]
4625 data.longest_fixed = newSVpvs("");
4626 data.longest_float = newSVpvs("");
4627 data.last_found = newSVpvs("");
4628 data.longest = &(data.longest_fixed);
4630 if (!ri->regstclass) {
4631 cl_init(pRExC_state, &ch_class);
4632 data.start_class = &ch_class;
4633 stclass_flag = SCF_DO_STCLASS_AND;
4634 } else /* XXXX Check for BOUND? */
4636 data.last_closep = &last_close;
4638 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4639 &data, -1, NULL, NULL,
4640 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4646 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4647 && data.last_start_min == 0 && data.last_end > 0
4648 && !RExC_seen_zerolen
4649 && !(RExC_seen & REG_SEEN_VERBARG)
4650 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4651 r->extflags |= RXf_CHECK_ALL;
4652 scan_commit(pRExC_state, &data,&minlen,0);
4653 SvREFCNT_dec(data.last_found);
4655 /* Note that code very similar to this but for anchored string
4656 follows immediately below, changes may need to be made to both.
4659 longest_float_length = CHR_SVLEN(data.longest_float);
4660 if (longest_float_length
4661 || (data.flags & SF_FL_BEFORE_EOL
4662 && (!(data.flags & SF_FL_BEFORE_MEOL)
4663 || (RExC_flags & RXf_PMf_MULTILINE))))
4667 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4668 && data.offset_fixed == data.offset_float_min
4669 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4670 goto remove_float; /* As in (a)+. */
4672 /* copy the information about the longest float from the reg_scan_data
4673 over to the program. */
4674 if (SvUTF8(data.longest_float)) {
4675 r->float_utf8 = data.longest_float;
4676 r->float_substr = NULL;
4678 r->float_substr = data.longest_float;
4679 r->float_utf8 = NULL;
4681 /* float_end_shift is how many chars that must be matched that
4682 follow this item. We calculate it ahead of time as once the
4683 lookbehind offset is added in we lose the ability to correctly
4685 ml = data.minlen_float ? *(data.minlen_float)
4686 : (I32)longest_float_length;
4687 r->float_end_shift = ml - data.offset_float_min
4688 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4689 + data.lookbehind_float;
4690 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4691 r->float_max_offset = data.offset_float_max;
4692 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4693 r->float_max_offset -= data.lookbehind_float;
4695 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4696 && (!(data.flags & SF_FL_BEFORE_MEOL)
4697 || (RExC_flags & RXf_PMf_MULTILINE)));
4698 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4702 r->float_substr = r->float_utf8 = NULL;
4703 SvREFCNT_dec(data.longest_float);
4704 longest_float_length = 0;
4707 /* Note that code very similar to this but for floating string
4708 is immediately above, changes may need to be made to both.
4711 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4712 if (longest_fixed_length
4713 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4714 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4715 || (RExC_flags & RXf_PMf_MULTILINE))))
4719 /* copy the information about the longest fixed
4720 from the reg_scan_data over to the program. */
4721 if (SvUTF8(data.longest_fixed)) {
4722 r->anchored_utf8 = data.longest_fixed;
4723 r->anchored_substr = NULL;
4725 r->anchored_substr = data.longest_fixed;
4726 r->anchored_utf8 = NULL;
4728 /* fixed_end_shift is how many chars that must be matched that
4729 follow this item. We calculate it ahead of time as once the
4730 lookbehind offset is added in we lose the ability to correctly
4732 ml = data.minlen_fixed ? *(data.minlen_fixed)
4733 : (I32)longest_fixed_length;
4734 r->anchored_end_shift = ml - data.offset_fixed
4735 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4736 + data.lookbehind_fixed;
4737 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4739 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4740 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4741 || (RExC_flags & RXf_PMf_MULTILINE)));
4742 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4745 r->anchored_substr = r->anchored_utf8 = NULL;
4746 SvREFCNT_dec(data.longest_fixed);
4747 longest_fixed_length = 0;
4750 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4751 ri->regstclass = NULL;
4752 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4754 && !(data.start_class->flags & ANYOF_EOS)
4755 && !cl_is_anything(data.start_class))
4757 const U32 n = add_data(pRExC_state, 1, "f");
4759 Newx(RExC_rxi->data->data[n], 1,
4760 struct regnode_charclass_class);
4761 StructCopy(data.start_class,
4762 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4763 struct regnode_charclass_class);
4764 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4765 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4766 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4767 regprop(r, sv, (regnode*)data.start_class);
4768 PerlIO_printf(Perl_debug_log,
4769 "synthetic stclass \"%s\".\n",
4770 SvPVX_const(sv));});
4773 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4774 if (longest_fixed_length > longest_float_length) {
4775 r->check_end_shift = r->anchored_end_shift;
4776 r->check_substr = r->anchored_substr;
4777 r->check_utf8 = r->anchored_utf8;
4778 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4779 if (r->extflags & RXf_ANCH_SINGLE)
4780 r->extflags |= RXf_NOSCAN;
4783 r->check_end_shift = r->float_end_shift;
4784 r->check_substr = r->float_substr;
4785 r->check_utf8 = r->float_utf8;
4786 r->check_offset_min = r->float_min_offset;
4787 r->check_offset_max = r->float_max_offset;
4789 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4790 This should be changed ASAP! */
4791 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4792 r->extflags |= RXf_USE_INTUIT;
4793 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4794 r->extflags |= RXf_INTUIT_TAIL;
4796 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4797 if ( (STRLEN)minlen < longest_float_length )
4798 minlen= longest_float_length;
4799 if ( (STRLEN)minlen < longest_fixed_length )
4800 minlen= longest_fixed_length;
4804 /* Several toplevels. Best we can is to set minlen. */
4806 struct regnode_charclass_class ch_class;
4809 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4811 scan = ri->program + 1;
4812 cl_init(pRExC_state, &ch_class);
4813 data.start_class = &ch_class;
4814 data.last_closep = &last_close;
4817 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4818 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4822 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4823 = r->float_substr = r->float_utf8 = NULL;
4824 if (!(data.start_class->flags & ANYOF_EOS)
4825 && !cl_is_anything(data.start_class))
4827 const U32 n = add_data(pRExC_state, 1, "f");
4829 Newx(RExC_rxi->data->data[n], 1,
4830 struct regnode_charclass_class);
4831 StructCopy(data.start_class,
4832 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4833 struct regnode_charclass_class);
4834 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4835 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4836 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4837 regprop(r, sv, (regnode*)data.start_class);
4838 PerlIO_printf(Perl_debug_log,
4839 "synthetic stclass \"%s\".\n",
4840 SvPVX_const(sv));});
4844 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4845 the "real" pattern. */
4847 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4848 (IV)minlen, (IV)r->minlen);
4850 r->minlenret = minlen;
4851 if (r->minlen < minlen)
4854 if (RExC_seen & REG_SEEN_GPOS)
4855 r->extflags |= RXf_GPOS_SEEN;
4856 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4857 r->extflags |= RXf_LOOKBEHIND_SEEN;
4858 if (RExC_seen & REG_SEEN_EVAL)
4859 r->extflags |= RXf_EVAL_SEEN;
4860 if (RExC_seen & REG_SEEN_CANY)
4861 r->extflags |= RXf_CANY_SEEN;
4862 if (RExC_seen & REG_SEEN_VERBARG)
4863 r->intflags |= PREGf_VERBARG_SEEN;
4864 if (RExC_seen & REG_SEEN_CUTGROUP)
4865 r->intflags |= PREGf_CUTGROUP_SEEN;
4866 if (RExC_paren_names)
4867 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4869 RXp_PAREN_NAMES(r) = NULL;
4871 #ifdef STUPID_PATTERN_CHECKS
4872 if (RX_PRELEN(rx) == 0)
4873 r->extflags |= RXf_NULL;
4874 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4875 /* XXX: this should happen BEFORE we compile */
4876 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4877 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4878 r->extflags |= RXf_WHITE;
4879 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4880 r->extflags |= RXf_START_ONLY;
4882 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4883 /* XXX: this should happen BEFORE we compile */
4884 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4886 regnode *first = ri->program + 1;
4888 U8 nop = OP(NEXTOPER(first));
4890 if (PL_regkind[fop] == NOTHING && nop == END)
4891 r->extflags |= RXf_NULL;
4892 else if (PL_regkind[fop] == BOL && nop == END)
4893 r->extflags |= RXf_START_ONLY;
4894 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4895 r->extflags |= RXf_WHITE;
4899 if (RExC_paren_names) {
4900 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4901 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4904 ri->name_list_idx = 0;
4906 if (RExC_recurse_count) {
4907 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4908 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4909 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4912 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4913 /* assume we don't need to swap parens around before we match */
4916 PerlIO_printf(Perl_debug_log,"Final program:\n");
4919 #ifdef RE_TRACK_PATTERN_OFFSETS
4920 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4921 const U32 len = ri->u.offsets[0];
4923 GET_RE_DEBUG_FLAGS_DECL;
4924 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4925 for (i = 1; i <= len; i++) {
4926 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4927 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4928 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4930 PerlIO_printf(Perl_debug_log, "\n");
4936 #undef RE_ENGINE_PTR
4940 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4943 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4945 PERL_UNUSED_ARG(value);
4947 if (flags & RXapif_FETCH) {
4948 return reg_named_buff_fetch(rx, key, flags);
4949 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4950 Perl_croak(aTHX_ "%s", PL_no_modify);
4952 } else if (flags & RXapif_EXISTS) {
4953 return reg_named_buff_exists(rx, key, flags)
4956 } else if (flags & RXapif_REGNAMES) {
4957 return reg_named_buff_all(rx, flags);
4958 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4959 return reg_named_buff_scalar(rx, flags);
4961 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4967 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4970 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4971 PERL_UNUSED_ARG(lastkey);
4973 if (flags & RXapif_FIRSTKEY)
4974 return reg_named_buff_firstkey(rx, flags);
4975 else if (flags & RXapif_NEXTKEY)
4976 return reg_named_buff_nextkey(rx, flags);
4978 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4984 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4987 AV *retarray = NULL;
4989 struct regexp *const rx = (struct regexp *)SvANY(r);
4991 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4993 if (flags & RXapif_ALL)
4996 if (rx && RXp_PAREN_NAMES(rx)) {
4997 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5000 SV* sv_dat=HeVAL(he_str);
5001 I32 *nums=(I32*)SvPVX(sv_dat);
5002 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5003 if ((I32)(rx->nparens) >= nums[i]
5004 && rx->offs[nums[i]].start != -1
5005 && rx->offs[nums[i]].end != -1)
5008 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5012 ret = newSVsv(&PL_sv_undef);
5015 av_push(retarray, ret);
5018 return newRV_noinc(MUTABLE_SV(retarray));
5025 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5028 struct regexp *const rx = (struct regexp *)SvANY(r);
5030 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5032 if (rx && RXp_PAREN_NAMES(rx)) {
5033 if (flags & RXapif_ALL) {
5034 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5036 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5050 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5052 struct regexp *const rx = (struct regexp *)SvANY(r);
5054 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5056 if ( rx && RXp_PAREN_NAMES(rx) ) {
5057 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5059 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5066 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5068 struct regexp *const rx = (struct regexp *)SvANY(r);
5069 GET_RE_DEBUG_FLAGS_DECL;
5071 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5073 if (rx && RXp_PAREN_NAMES(rx)) {
5074 HV *hv = RXp_PAREN_NAMES(rx);
5076 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5079 SV* sv_dat = HeVAL(temphe);
5080 I32 *nums = (I32*)SvPVX(sv_dat);
5081 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5082 if ((I32)(rx->lastparen) >= nums[i] &&
5083 rx->offs[nums[i]].start != -1 &&
5084 rx->offs[nums[i]].end != -1)
5090 if (parno || flags & RXapif_ALL) {
5091 return newSVhek(HeKEY_hek(temphe));
5099 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5104 struct regexp *const rx = (struct regexp *)SvANY(r);
5106 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5108 if (rx && RXp_PAREN_NAMES(rx)) {
5109 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5110 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5111 } else if (flags & RXapif_ONE) {
5112 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5113 av = MUTABLE_AV(SvRV(ret));
5114 length = av_len(av);
5116 return newSViv(length + 1);
5118 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5122 return &PL_sv_undef;
5126 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5128 struct regexp *const rx = (struct regexp *)SvANY(r);
5131 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5133 if (rx && RXp_PAREN_NAMES(rx)) {
5134 HV *hv= RXp_PAREN_NAMES(rx);
5136 (void)hv_iterinit(hv);
5137 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5140 SV* sv_dat = HeVAL(temphe);
5141 I32 *nums = (I32*)SvPVX(sv_dat);
5142 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5143 if ((I32)(rx->lastparen) >= nums[i] &&
5144 rx->offs[nums[i]].start != -1 &&
5145 rx->offs[nums[i]].end != -1)
5151 if (parno || flags & RXapif_ALL) {
5152 av_push(av, newSVhek(HeKEY_hek(temphe)));
5157 return newRV_noinc(MUTABLE_SV(av));
5161 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5164 struct regexp *const rx = (struct regexp *)SvANY(r);
5169 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5172 sv_setsv(sv,&PL_sv_undef);
5176 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5178 i = rx->offs[0].start;
5182 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5184 s = rx->subbeg + rx->offs[0].end;
5185 i = rx->sublen - rx->offs[0].end;
5188 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5189 (s1 = rx->offs[paren].start) != -1 &&
5190 (t1 = rx->offs[paren].end) != -1)
5194 s = rx->subbeg + s1;
5196 sv_setsv(sv,&PL_sv_undef);
5199 assert(rx->sublen >= (s - rx->subbeg) + i );
5201 const int oldtainted = PL_tainted;
5203 sv_setpvn(sv, s, i);
5204 PL_tainted = oldtainted;
5205 if ( (rx->extflags & RXf_CANY_SEEN)
5206 ? (RXp_MATCH_UTF8(rx)
5207 && (!i || is_utf8_string((U8*)s, i)))
5208 : (RXp_MATCH_UTF8(rx)) )
5215 if (RXp_MATCH_TAINTED(rx)) {
5216 if (SvTYPE(sv) >= SVt_PVMG) {
5217 MAGIC* const mg = SvMAGIC(sv);
5220 SvMAGIC_set(sv, mg->mg_moremagic);
5222 if ((mgt = SvMAGIC(sv))) {
5223 mg->mg_moremagic = mgt;
5224 SvMAGIC_set(sv, mg);
5234 sv_setsv(sv,&PL_sv_undef);
5240 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5241 SV const * const value)
5243 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5245 PERL_UNUSED_ARG(rx);
5246 PERL_UNUSED_ARG(paren);
5247 PERL_UNUSED_ARG(value);
5250 Perl_croak(aTHX_ "%s", PL_no_modify);
5254 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5257 struct regexp *const rx = (struct regexp *)SvANY(r);
5261 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5263 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5265 /* $` / ${^PREMATCH} */
5266 case RX_BUFF_IDX_PREMATCH:
5267 if (rx->offs[0].start != -1) {
5268 i = rx->offs[0].start;
5276 /* $' / ${^POSTMATCH} */
5277 case RX_BUFF_IDX_POSTMATCH:
5278 if (rx->offs[0].end != -1) {
5279 i = rx->sublen - rx->offs[0].end;
5281 s1 = rx->offs[0].end;
5287 /* $& / ${^MATCH}, $1, $2, ... */
5289 if (paren <= (I32)rx->nparens &&
5290 (s1 = rx->offs[paren].start) != -1 &&
5291 (t1 = rx->offs[paren].end) != -1)
5296 if (ckWARN(WARN_UNINITIALIZED))
5297 report_uninit((const SV *)sv);
5302 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5303 const char * const s = rx->subbeg + s1;
5308 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5315 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5317 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5318 PERL_UNUSED_ARG(rx);
5322 return newSVpvs("Regexp");
5325 /* Scans the name of a named buffer from the pattern.
5326 * If flags is REG_RSN_RETURN_NULL returns null.
5327 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5328 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5329 * to the parsed name as looked up in the RExC_paren_names hash.
5330 * If there is an error throws a vFAIL().. type exception.
5333 #define REG_RSN_RETURN_NULL 0
5334 #define REG_RSN_RETURN_NAME 1
5335 #define REG_RSN_RETURN_DATA 2
5338 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5340 char *name_start = RExC_parse;
5342 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5344 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5345 /* skip IDFIRST by using do...while */
5348 RExC_parse += UTF8SKIP(RExC_parse);
5349 } while (isALNUM_utf8((U8*)RExC_parse));
5353 } while (isALNUM(*RExC_parse));
5358 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5359 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5360 if ( flags == REG_RSN_RETURN_NAME)
5362 else if (flags==REG_RSN_RETURN_DATA) {
5365 if ( ! sv_name ) /* should not happen*/
5366 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5367 if (RExC_paren_names)
5368 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5370 sv_dat = HeVAL(he_str);
5372 vFAIL("Reference to nonexistent named group");
5376 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5383 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5384 int rem=(int)(RExC_end - RExC_parse); \
5393 if (RExC_lastparse!=RExC_parse) \
5394 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5397 iscut ? "..." : "<" \
5400 PerlIO_printf(Perl_debug_log,"%16s",""); \
5403 num = RExC_size + 1; \
5405 num=REG_NODE_NUM(RExC_emit); \
5406 if (RExC_lastnum!=num) \
5407 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5409 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5410 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5411 (int)((depth*2)), "", \
5415 RExC_lastparse=RExC_parse; \
5420 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5421 DEBUG_PARSE_MSG((funcname)); \
5422 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5424 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5425 DEBUG_PARSE_MSG((funcname)); \
5426 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5429 - reg - regular expression, i.e. main body or parenthesized thing
5431 * Caller must absorb opening parenthesis.
5433 * Combining parenthesis handling with the base level of regular expression
5434 * is a trifle forced, but the need to tie the tails of the branches to what
5435 * follows makes it hard to avoid.
5437 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5439 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5441 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5445 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5446 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5449 register regnode *ret; /* Will be the head of the group. */
5450 register regnode *br;
5451 register regnode *lastbr;
5452 register regnode *ender = NULL;
5453 register I32 parno = 0;
5455 U32 oregflags = RExC_flags;
5456 bool have_branch = 0;
5458 I32 freeze_paren = 0;
5459 I32 after_freeze = 0;
5461 /* for (?g), (?gc), and (?o) warnings; warning
5462 about (?c) will warn about (?g) -- japhy */
5464 #define WASTED_O 0x01
5465 #define WASTED_G 0x02
5466 #define WASTED_C 0x04
5467 #define WASTED_GC (0x02|0x04)
5468 I32 wastedflags = 0x00;
5470 char * parse_start = RExC_parse; /* MJD */
5471 char * const oregcomp_parse = RExC_parse;
5473 GET_RE_DEBUG_FLAGS_DECL;
5475 PERL_ARGS_ASSERT_REG;
5476 DEBUG_PARSE("reg ");
5478 *flagp = 0; /* Tentatively. */
5481 /* Make an OPEN node, if parenthesized. */
5483 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5484 char *start_verb = RExC_parse;
5485 STRLEN verb_len = 0;
5486 char *start_arg = NULL;
5487 unsigned char op = 0;
5489 int internal_argval = 0; /* internal_argval is only useful if !argok */
5490 while ( *RExC_parse && *RExC_parse != ')' ) {
5491 if ( *RExC_parse == ':' ) {
5492 start_arg = RExC_parse + 1;
5498 verb_len = RExC_parse - start_verb;
5501 while ( *RExC_parse && *RExC_parse != ')' )
5503 if ( *RExC_parse != ')' )
5504 vFAIL("Unterminated verb pattern argument");
5505 if ( RExC_parse == start_arg )
5508 if ( *RExC_parse != ')' )
5509 vFAIL("Unterminated verb pattern");
5512 switch ( *start_verb ) {
5513 case 'A': /* (*ACCEPT) */
5514 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5516 internal_argval = RExC_nestroot;
5519 case 'C': /* (*COMMIT) */
5520 if ( memEQs(start_verb,verb_len,"COMMIT") )
5523 case 'F': /* (*FAIL) */
5524 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5529 case ':': /* (*:NAME) */
5530 case 'M': /* (*MARK:NAME) */
5531 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5536 case 'P': /* (*PRUNE) */
5537 if ( memEQs(start_verb,verb_len,"PRUNE") )
5540 case 'S': /* (*SKIP) */
5541 if ( memEQs(start_verb,verb_len,"SKIP") )
5544 case 'T': /* (*THEN) */
5545 /* [19:06] <TimToady> :: is then */
5546 if ( memEQs(start_verb,verb_len,"THEN") ) {
5548 RExC_seen |= REG_SEEN_CUTGROUP;
5554 vFAIL3("Unknown verb pattern '%.*s'",
5555 verb_len, start_verb);
5558 if ( start_arg && internal_argval ) {
5559 vFAIL3("Verb pattern '%.*s' may not have an argument",
5560 verb_len, start_verb);
5561 } else if ( argok < 0 && !start_arg ) {
5562 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5563 verb_len, start_verb);
5565 ret = reganode(pRExC_state, op, internal_argval);
5566 if ( ! internal_argval && ! SIZE_ONLY ) {
5568 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5569 ARG(ret) = add_data( pRExC_state, 1, "S" );
5570 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5577 if (!internal_argval)
5578 RExC_seen |= REG_SEEN_VERBARG;
5579 } else if ( start_arg ) {
5580 vFAIL3("Verb pattern '%.*s' may not have an argument",
5581 verb_len, start_verb);
5583 ret = reg_node(pRExC_state, op);
5585 nextchar(pRExC_state);
5588 if (*RExC_parse == '?') { /* (?...) */
5589 bool is_logical = 0;
5590 const char * const seqstart = RExC_parse;
5593 paren = *RExC_parse++;
5594 ret = NULL; /* For look-ahead/behind. */
5597 case 'P': /* (?P...) variants for those used to PCRE/Python */
5598 paren = *RExC_parse++;
5599 if ( paren == '<') /* (?P<...>) named capture */
5601 else if (paren == '>') { /* (?P>name) named recursion */
5602 goto named_recursion;
5604 else if (paren == '=') { /* (?P=...) named backref */
5605 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5606 you change this make sure you change that */
5607 char* name_start = RExC_parse;
5609 SV *sv_dat = reg_scan_name(pRExC_state,
5610 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5611 if (RExC_parse == name_start || *RExC_parse != ')')
5612 vFAIL2("Sequence %.3s... not terminated",parse_start);
5615 num = add_data( pRExC_state, 1, "S" );
5616 RExC_rxi->data->data[num]=(void*)sv_dat;
5617 SvREFCNT_inc_simple_void(sv_dat);
5620 ret = reganode(pRExC_state,
5621 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5625 Set_Node_Offset(ret, parse_start+1);
5626 Set_Node_Cur_Length(ret); /* MJD */
5628 nextchar(pRExC_state);
5632 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5634 case '<': /* (?<...) */
5635 if (*RExC_parse == '!')
5637 else if (*RExC_parse != '=')
5643 case '\'': /* (?'...') */
5644 name_start= RExC_parse;
5645 svname = reg_scan_name(pRExC_state,
5646 SIZE_ONLY ? /* reverse test from the others */
5647 REG_RSN_RETURN_NAME :
5648 REG_RSN_RETURN_NULL);
5649 if (RExC_parse == name_start) {
5651 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5654 if (*RExC_parse != paren)
5655 vFAIL2("Sequence (?%c... not terminated",
5656 paren=='>' ? '<' : paren);
5660 if (!svname) /* shouldnt happen */
5662 "panic: reg_scan_name returned NULL");
5663 if (!RExC_paren_names) {
5664 RExC_paren_names= newHV();
5665 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5667 RExC_paren_name_list= newAV();
5668 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5671 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5673 sv_dat = HeVAL(he_str);
5675 /* croak baby croak */
5677 "panic: paren_name hash element allocation failed");
5678 } else if ( SvPOK(sv_dat) ) {
5679 /* (?|...) can mean we have dupes so scan to check
5680 its already been stored. Maybe a flag indicating
5681 we are inside such a construct would be useful,
5682 but the arrays are likely to be quite small, so
5683 for now we punt -- dmq */
5684 IV count = SvIV(sv_dat);
5685 I32 *pv = (I32*)SvPVX(sv_dat);
5687 for ( i = 0 ; i < count ; i++ ) {
5688 if ( pv[i] == RExC_npar ) {
5694 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5695 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5696 pv[count] = RExC_npar;
5697 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5700 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5701 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5703 SvIV_set(sv_dat, 1);
5706 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5707 SvREFCNT_dec(svname);
5710 /*sv_dump(sv_dat);*/
5712 nextchar(pRExC_state);
5714 goto capturing_parens;
5716 RExC_seen |= REG_SEEN_LOOKBEHIND;
5718 case '=': /* (?=...) */
5719 RExC_seen_zerolen++;
5721 case '!': /* (?!...) */
5722 RExC_seen_zerolen++;
5723 if (*RExC_parse == ')') {
5724 ret=reg_node(pRExC_state, OPFAIL);
5725 nextchar(pRExC_state);
5729 case '|': /* (?|...) */
5730 /* branch reset, behave like a (?:...) except that
5731 buffers in alternations share the same numbers */
5733 after_freeze = freeze_paren = RExC_npar;
5735 case ':': /* (?:...) */
5736 case '>': /* (?>...) */
5738 case '$': /* (?$...) */
5739 case '@': /* (?@...) */
5740 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5742 case '#': /* (?#...) */
5743 while (*RExC_parse && *RExC_parse != ')')
5745 if (*RExC_parse != ')')
5746 FAIL("Sequence (?#... not terminated");
5747 nextchar(pRExC_state);
5750 case '0' : /* (?0) */
5751 case 'R' : /* (?R) */
5752 if (*RExC_parse != ')')
5753 FAIL("Sequence (?R) not terminated");
5754 ret = reg_node(pRExC_state, GOSTART);
5755 *flagp |= POSTPONED;
5756 nextchar(pRExC_state);
5759 { /* named and numeric backreferences */
5761 case '&': /* (?&NAME) */
5762 parse_start = RExC_parse - 1;
5765 SV *sv_dat = reg_scan_name(pRExC_state,
5766 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5767 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5769 goto gen_recurse_regop;
5772 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5774 vFAIL("Illegal pattern");
5776 goto parse_recursion;
5778 case '-': /* (?-1) */
5779 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5780 RExC_parse--; /* rewind to let it be handled later */
5784 case '1': case '2': case '3': case '4': /* (?1) */
5785 case '5': case '6': case '7': case '8': case '9':
5788 num = atoi(RExC_parse);
5789 parse_start = RExC_parse - 1; /* MJD */
5790 if (*RExC_parse == '-')
5792 while (isDIGIT(*RExC_parse))
5794 if (*RExC_parse!=')')
5795 vFAIL("Expecting close bracket");
5798 if ( paren == '-' ) {
5800 Diagram of capture buffer numbering.
5801 Top line is the normal capture buffer numbers
5802 Botton line is the negative indexing as from
5806 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5810 num = RExC_npar + num;
5813 vFAIL("Reference to nonexistent group");
5815 } else if ( paren == '+' ) {
5816 num = RExC_npar + num - 1;
5819 ret = reganode(pRExC_state, GOSUB, num);
5821 if (num > (I32)RExC_rx->nparens) {
5823 vFAIL("Reference to nonexistent group");
5825 ARG2L_SET( ret, RExC_recurse_count++);
5827 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5828 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5832 RExC_seen |= REG_SEEN_RECURSE;
5833 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5834 Set_Node_Offset(ret, parse_start); /* MJD */
5836 *flagp |= POSTPONED;
5837 nextchar(pRExC_state);
5839 } /* named and numeric backreferences */
5842 case '?': /* (??...) */
5844 if (*RExC_parse != '{') {
5846 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5849 *flagp |= POSTPONED;
5850 paren = *RExC_parse++;
5852 case '{': /* (?{...}) */
5857 char *s = RExC_parse;
5859 RExC_seen_zerolen++;
5860 RExC_seen |= REG_SEEN_EVAL;
5861 while (count && (c = *RExC_parse)) {
5872 if (*RExC_parse != ')') {
5874 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5878 OP_4tree *sop, *rop;
5879 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5882 Perl_save_re_context(aTHX);
5883 rop = sv_compile_2op(sv, &sop, "re", &pad);
5884 sop->op_private |= OPpREFCOUNTED;
5885 /* re_dup will OpREFCNT_inc */
5886 OpREFCNT_set(sop, 1);
5889 n = add_data(pRExC_state, 3, "nop");
5890 RExC_rxi->data->data[n] = (void*)rop;
5891 RExC_rxi->data->data[n+1] = (void*)sop;
5892 RExC_rxi->data->data[n+2] = (void*)pad;
5895 else { /* First pass */
5896 if (PL_reginterp_cnt < ++RExC_seen_evals
5898 /* No compiled RE interpolated, has runtime
5899 components ===> unsafe. */
5900 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5901 if (PL_tainting && PL_tainted)
5902 FAIL("Eval-group in insecure regular expression");
5903 #if PERL_VERSION > 8
5904 if (IN_PERL_COMPILETIME)
5909 nextchar(pRExC_state);
5911 ret = reg_node(pRExC_state, LOGICAL);
5914 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5915 /* deal with the length of this later - MJD */
5918 ret = reganode(pRExC_state, EVAL, n);
5919 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5920 Set_Node_Offset(ret, parse_start);
5923 case '(': /* (?(?{...})...) and (?(?=...)...) */
5926 if (RExC_parse[0] == '?') { /* (?(?...)) */
5927 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5928 || RExC_parse[1] == '<'
5929 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5932 ret = reg_node(pRExC_state, LOGICAL);
5935 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5939 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5940 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5942 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5943 char *name_start= RExC_parse++;
5945 SV *sv_dat=reg_scan_name(pRExC_state,
5946 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5947 if (RExC_parse == name_start || *RExC_parse != ch)
5948 vFAIL2("Sequence (?(%c... not terminated",
5949 (ch == '>' ? '<' : ch));
5952 num = add_data( pRExC_state, 1, "S" );
5953 RExC_rxi->data->data[num]=(void*)sv_dat;
5954 SvREFCNT_inc_simple_void(sv_dat);
5956 ret = reganode(pRExC_state,NGROUPP,num);
5957 goto insert_if_check_paren;
5959 else if (RExC_parse[0] == 'D' &&
5960 RExC_parse[1] == 'E' &&
5961 RExC_parse[2] == 'F' &&
5962 RExC_parse[3] == 'I' &&
5963 RExC_parse[4] == 'N' &&
5964 RExC_parse[5] == 'E')
5966 ret = reganode(pRExC_state,DEFINEP,0);
5969 goto insert_if_check_paren;
5971 else if (RExC_parse[0] == 'R') {
5974 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5975 parno = atoi(RExC_parse++);
5976 while (isDIGIT(*RExC_parse))
5978 } else if (RExC_parse[0] == '&') {
5981 sv_dat = reg_scan_name(pRExC_state,
5982 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5983 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5985 ret = reganode(pRExC_state,INSUBP,parno);
5986 goto insert_if_check_paren;
5988 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5991 parno = atoi(RExC_parse++);
5993 while (isDIGIT(*RExC_parse))
5995 ret = reganode(pRExC_state, GROUPP, parno);
5997 insert_if_check_paren:
5998 if ((c = *nextchar(pRExC_state)) != ')')
5999 vFAIL("Switch condition not recognized");
6001 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6002 br = regbranch(pRExC_state, &flags, 1,depth+1);
6004 br = reganode(pRExC_state, LONGJMP, 0);
6006 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6007 c = *nextchar(pRExC_state);
6012 vFAIL("(?(DEFINE)....) does not allow branches");
6013 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6014 regbranch(pRExC_state, &flags, 1,depth+1);
6015 REGTAIL(pRExC_state, ret, lastbr);
6018 c = *nextchar(pRExC_state);
6023 vFAIL("Switch (?(condition)... contains too many branches");
6024 ender = reg_node(pRExC_state, TAIL);
6025 REGTAIL(pRExC_state, br, ender);
6027 REGTAIL(pRExC_state, lastbr, ender);
6028 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6031 REGTAIL(pRExC_state, ret, ender);
6032 RExC_size++; /* XXX WHY do we need this?!!
6033 For large programs it seems to be required
6034 but I can't figure out why. -- dmq*/
6038 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6042 RExC_parse--; /* for vFAIL to print correctly */
6043 vFAIL("Sequence (? incomplete");
6047 parse_flags: /* (?i) */
6049 U32 posflags = 0, negflags = 0;
6050 U32 *flagsp = &posflags;
6052 while (*RExC_parse) {
6053 /* && strchr("iogcmsx", *RExC_parse) */
6054 /* (?g), (?gc) and (?o) are useless here
6055 and must be globally applied -- japhy */
6056 switch (*RExC_parse) {
6057 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6058 case ONCE_PAT_MOD: /* 'o' */
6059 case GLOBAL_PAT_MOD: /* 'g' */
6060 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6061 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6062 if (! (wastedflags & wflagbit) ) {
6063 wastedflags |= wflagbit;
6066 "Useless (%s%c) - %suse /%c modifier",
6067 flagsp == &negflags ? "?-" : "?",
6069 flagsp == &negflags ? "don't " : "",
6076 case CONTINUE_PAT_MOD: /* 'c' */
6077 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6078 if (! (wastedflags & WASTED_C) ) {
6079 wastedflags |= WASTED_GC;
6082 "Useless (%sc) - %suse /gc modifier",
6083 flagsp == &negflags ? "?-" : "?",
6084 flagsp == &negflags ? "don't " : ""
6089 case KEEPCOPY_PAT_MOD: /* 'p' */
6090 if (flagsp == &negflags) {
6092 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6094 *flagsp |= RXf_PMf_KEEPCOPY;
6098 if (flagsp == &negflags) {
6100 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6104 wastedflags = 0; /* reset so (?g-c) warns twice */
6110 RExC_flags |= posflags;
6111 RExC_flags &= ~negflags;
6113 oregflags |= posflags;
6114 oregflags &= ~negflags;
6116 nextchar(pRExC_state);
6127 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6132 }} /* one for the default block, one for the switch */
6139 ret = reganode(pRExC_state, OPEN, parno);
6142 RExC_nestroot = parno;
6143 if (RExC_seen & REG_SEEN_RECURSE
6144 && !RExC_open_parens[parno-1])
6146 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6147 "Setting open paren #%"IVdf" to %d\n",
6148 (IV)parno, REG_NODE_NUM(ret)));
6149 RExC_open_parens[parno-1]= ret;
6152 Set_Node_Length(ret, 1); /* MJD */
6153 Set_Node_Offset(ret, RExC_parse); /* MJD */
6161 /* Pick up the branches, linking them together. */
6162 parse_start = RExC_parse; /* MJD */
6163 br = regbranch(pRExC_state, &flags, 1,depth+1);
6166 if (RExC_npar > after_freeze)
6167 after_freeze = RExC_npar;
6168 RExC_npar = freeze_paren;
6171 /* branch_len = (paren != 0); */
6175 if (*RExC_parse == '|') {
6176 if (!SIZE_ONLY && RExC_extralen) {
6177 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6180 reginsert(pRExC_state, BRANCH, br, depth+1);
6181 Set_Node_Length(br, paren != 0);
6182 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6186 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6188 else if (paren == ':') {
6189 *flagp |= flags&SIMPLE;
6191 if (is_open) { /* Starts with OPEN. */
6192 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6194 else if (paren != '?') /* Not Conditional */
6196 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6198 while (*RExC_parse == '|') {
6199 if (!SIZE_ONLY && RExC_extralen) {
6200 ender = reganode(pRExC_state, LONGJMP,0);
6201 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6204 RExC_extralen += 2; /* Account for LONGJMP. */
6205 nextchar(pRExC_state);
6207 if (RExC_npar > after_freeze)
6208 after_freeze = RExC_npar;
6209 RExC_npar = freeze_paren;
6211 br = regbranch(pRExC_state, &flags, 0, depth+1);
6215 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6217 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6220 if (have_branch || paren != ':') {
6221 /* Make a closing node, and hook it on the end. */
6224 ender = reg_node(pRExC_state, TAIL);
6227 ender = reganode(pRExC_state, CLOSE, parno);
6228 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6229 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6230 "Setting close paren #%"IVdf" to %d\n",
6231 (IV)parno, REG_NODE_NUM(ender)));
6232 RExC_close_parens[parno-1]= ender;
6233 if (RExC_nestroot == parno)
6236 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6237 Set_Node_Length(ender,1); /* MJD */
6243 *flagp &= ~HASWIDTH;
6246 ender = reg_node(pRExC_state, SUCCEED);
6249 ender = reg_node(pRExC_state, END);
6251 assert(!RExC_opend); /* there can only be one! */
6256 REGTAIL(pRExC_state, lastbr, ender);
6258 if (have_branch && !SIZE_ONLY) {
6260 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6262 /* Hook the tails of the branches to the closing node. */
6263 for (br = ret; br; br = regnext(br)) {
6264 const U8 op = PL_regkind[OP(br)];
6266 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6268 else if (op == BRANCHJ) {
6269 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6277 static const char parens[] = "=!<,>";
6279 if (paren && (p = strchr(parens, paren))) {
6280 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6281 int flag = (p - parens) > 1;
6284 node = SUSPEND, flag = 0;
6285 reginsert(pRExC_state, node,ret, depth+1);
6286 Set_Node_Cur_Length(ret);
6287 Set_Node_Offset(ret, parse_start + 1);
6289 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6293 /* Check for proper termination. */
6295 RExC_flags = oregflags;
6296 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6297 RExC_parse = oregcomp_parse;
6298 vFAIL("Unmatched (");
6301 else if (!paren && RExC_parse < RExC_end) {
6302 if (*RExC_parse == ')') {
6304 vFAIL("Unmatched )");
6307 FAIL("Junk on end of regexp"); /* "Can't happen". */
6311 RExC_npar = after_freeze;
6316 - regbranch - one alternative of an | operator
6318 * Implements the concatenation operator.
6321 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6324 register regnode *ret;
6325 register regnode *chain = NULL;
6326 register regnode *latest;
6327 I32 flags = 0, c = 0;
6328 GET_RE_DEBUG_FLAGS_DECL;
6330 PERL_ARGS_ASSERT_REGBRANCH;
6332 DEBUG_PARSE("brnc");
6337 if (!SIZE_ONLY && RExC_extralen)
6338 ret = reganode(pRExC_state, BRANCHJ,0);
6340 ret = reg_node(pRExC_state, BRANCH);
6341 Set_Node_Length(ret, 1);
6345 if (!first && SIZE_ONLY)
6346 RExC_extralen += 1; /* BRANCHJ */
6348 *flagp = WORST; /* Tentatively. */
6351 nextchar(pRExC_state);
6352 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6354 latest = regpiece(pRExC_state, &flags,depth+1);
6355 if (latest == NULL) {
6356 if (flags & TRYAGAIN)
6360 else if (ret == NULL)
6362 *flagp |= flags&(HASWIDTH|POSTPONED);
6363 if (chain == NULL) /* First piece. */
6364 *flagp |= flags&SPSTART;
6367 REGTAIL(pRExC_state, chain, latest);
6372 if (chain == NULL) { /* Loop ran zero times. */
6373 chain = reg_node(pRExC_state, NOTHING);
6378 *flagp |= flags&SIMPLE;
6385 - regpiece - something followed by possible [*+?]
6387 * Note that the branching code sequences used for ? and the general cases
6388 * of * and + are somewhat optimized: they use the same NOTHING node as
6389 * both the endmarker for their branch list and the body of the last branch.
6390 * It might seem that this node could be dispensed with entirely, but the
6391 * endmarker role is not redundant.
6394 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6397 register regnode *ret;
6399 register char *next;
6401 const char * const origparse = RExC_parse;
6403 I32 max = REG_INFTY;
6405 const char *maxpos = NULL;
6406 GET_RE_DEBUG_FLAGS_DECL;
6408 PERL_ARGS_ASSERT_REGPIECE;
6410 DEBUG_PARSE("piec");
6412 ret = regatom(pRExC_state, &flags,depth+1);
6414 if (flags & TRYAGAIN)
6421 if (op == '{' && regcurly(RExC_parse)) {
6423 parse_start = RExC_parse; /* MJD */
6424 next = RExC_parse + 1;
6425 while (isDIGIT(*next) || *next == ',') {
6434 if (*next == '}') { /* got one */
6438 min = atoi(RExC_parse);
6442 maxpos = RExC_parse;
6444 if (!max && *maxpos != '0')
6445 max = REG_INFTY; /* meaning "infinity" */
6446 else if (max >= REG_INFTY)
6447 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6449 nextchar(pRExC_state);
6452 if ((flags&SIMPLE)) {
6453 RExC_naughty += 2 + RExC_naughty / 2;
6454 reginsert(pRExC_state, CURLY, ret, depth+1);
6455 Set_Node_Offset(ret, parse_start+1); /* MJD */
6456 Set_Node_Cur_Length(ret);
6459 regnode * const w = reg_node(pRExC_state, WHILEM);
6462 REGTAIL(pRExC_state, ret, w);
6463 if (!SIZE_ONLY && RExC_extralen) {
6464 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6465 reginsert(pRExC_state, NOTHING,ret, depth+1);
6466 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6468 reginsert(pRExC_state, CURLYX,ret, depth+1);
6470 Set_Node_Offset(ret, parse_start+1);
6471 Set_Node_Length(ret,
6472 op == '{' ? (RExC_parse - parse_start) : 1);
6474 if (!SIZE_ONLY && RExC_extralen)
6475 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6476 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6478 RExC_whilem_seen++, RExC_extralen += 3;
6479 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6488 vFAIL("Can't do {n,m} with n > m");
6490 ARG1_SET(ret, (U16)min);
6491 ARG2_SET(ret, (U16)max);
6503 #if 0 /* Now runtime fix should be reliable. */
6505 /* if this is reinstated, don't forget to put this back into perldiag:
6507 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6509 (F) The part of the regexp subject to either the * or + quantifier
6510 could match an empty string. The {#} shows in the regular
6511 expression about where the problem was discovered.
6515 if (!(flags&HASWIDTH) && op != '?')
6516 vFAIL("Regexp *+ operand could be empty");
6519 parse_start = RExC_parse;
6520 nextchar(pRExC_state);
6522 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6524 if (op == '*' && (flags&SIMPLE)) {
6525 reginsert(pRExC_state, STAR, ret, depth+1);
6529 else if (op == '*') {
6533 else if (op == '+' && (flags&SIMPLE)) {
6534 reginsert(pRExC_state, PLUS, ret, depth+1);
6538 else if (op == '+') {
6542 else if (op == '?') {
6547 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6548 ckWARN3reg(RExC_parse,
6549 "%.*s matches null string many times",
6550 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6554 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6555 nextchar(pRExC_state);
6556 reginsert(pRExC_state, MINMOD, ret, depth+1);
6557 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6559 #ifndef REG_ALLOW_MINMOD_SUSPEND
6562 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6564 nextchar(pRExC_state);
6565 ender = reg_node(pRExC_state, SUCCEED);
6566 REGTAIL(pRExC_state, ret, ender);
6567 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6569 ender = reg_node(pRExC_state, TAIL);
6570 REGTAIL(pRExC_state, ret, ender);
6574 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6576 vFAIL("Nested quantifiers");
6583 /* reg_namedseq(pRExC_state,UVp)
6585 This is expected to be called by a parser routine that has
6586 recognized '\N' and needs to handle the rest. RExC_parse is
6587 expected to point at the first char following the N at the time
6590 The \N may be inside (indicated by valuep not being NULL) or outside a
6593 \N may begin either a named sequence, or if outside a character class, mean
6594 to match a non-newline. For non single-quoted regexes, the tokenizer has
6595 attempted to decide which, and in the case of a named sequence converted it
6596 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6597 where c1... are the characters in the sequence. For single-quoted regexes,
6598 the tokenizer passes the \N sequence through unchanged; this code will not
6599 attempt to determine this nor expand those. The net effect is that if the
6600 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6601 signals that this \N occurrence means to match a non-newline.
6603 Only the \N{U+...} form should occur in a character class, for the same
6604 reason that '.' inside a character class means to just match a period: it
6605 just doesn't make sense.
6607 If valuep is non-null then it is assumed that we are parsing inside
6608 of a charclass definition and the first codepoint in the resolved
6609 string is returned via *valuep and the routine will return NULL.
6610 In this mode if a multichar string is returned from the charnames
6611 handler, a warning will be issued, and only the first char in the
6612 sequence will be examined. If the string returned is zero length
6613 then the value of *valuep is undefined and NON-NULL will
6614 be returned to indicate failure. (This will NOT be a valid pointer
6617 If valuep is null then it is assumed that we are parsing normal text and a
6618 new EXACT node is inserted into the program containing the resolved string,
6619 and a pointer to the new node is returned. But if the string is zero length
6620 a NOTHING node is emitted instead.
6622 On success RExC_parse is set to the char following the endbrace.
6623 Parsing failures will generate a fatal error via vFAIL(...)
6626 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6628 char * endbrace; /* '}' following the name */
6629 regnode *ret = NULL;
6631 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6635 GET_RE_DEBUG_FLAGS_DECL;
6637 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6641 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6642 * modifier. The other meaning does not */
6643 p = (RExC_flags & RXf_PMf_EXTENDED)
6644 ? regwhite( pRExC_state, RExC_parse )
6647 /* Disambiguate between \N meaning a named character versus \N meaning
6648 * [^\n]. The former is assumed when it can't be the latter. */
6649 if (*p != '{' || regcurly(p)) {
6652 /* no bare \N in a charclass */
6653 vFAIL("\\N in a character class must be a named character: \\N{...}");
6655 nextchar(pRExC_state);
6656 ret = reg_node(pRExC_state, REG_ANY);
6657 *flagp |= HASWIDTH|SIMPLE;
6660 Set_Node_Length(ret, 1); /* MJD */
6664 /* Here, we have decided it should be a named sequence */
6666 /* The test above made sure that the next real character is a '{', but
6667 * under the /x modifier, it could be separated by space (or a comment and
6668 * \n) and this is not allowed (for consistency with \x{...} and the
6669 * tokenizer handling of \N{NAME}). */
6670 if (*RExC_parse != '{') {
6671 vFAIL("Missing braces on \\N{}");
6674 RExC_parse++; /* Skip past the '{' */
6676 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6677 || ! (endbrace == RExC_parse /* nothing between the {} */
6678 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6679 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6681 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6682 vFAIL("\\N{NAME} must be resolved by the lexer");
6685 if (endbrace == RExC_parse) { /* empty: \N{} */
6687 RExC_parse = endbrace + 1;
6688 return reg_node(pRExC_state,NOTHING);
6692 ckWARNreg(RExC_parse,
6693 "Ignoring zero length \\N{} in character class"
6695 RExC_parse = endbrace + 1;
6698 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6701 RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6702 RExC_parse += 2; /* Skip past the 'U+' */
6704 if (valuep) { /* In a bracketed char class */
6705 /* We only pay attention to the first char of
6706 multichar strings being returned. I kinda wonder
6707 if this makes sense as it does change the behaviour
6708 from earlier versions, OTOH that behaviour was broken
6709 as well. XXX Solution is to recharacterize as
6710 [rest-of-class]|multi1|multi2... */
6712 STRLEN length_of_hex;
6713 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6714 | PERL_SCAN_DISALLOW_PREFIX
6715 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6717 char * endchar = strchr(RExC_parse, '.');
6719 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6721 else endchar = endbrace;
6723 length_of_hex = (STRLEN)(endchar - RExC_parse);
6724 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6726 /* The tokenizer should have guaranteed validity, but it's possible to
6727 * bypass it by using single quoting, so check */
6728 if (length_of_hex == 0
6729 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6731 RExC_parse += length_of_hex; /* Includes all the valid */
6732 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6733 ? UTF8SKIP(RExC_parse)
6735 /* Guard against malformed utf8 */
6736 if (RExC_parse >= endchar) RExC_parse = endchar;
6737 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6740 RExC_parse = endbrace + 1;
6741 if (endchar == endbrace) return NULL;
6743 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6745 else { /* Not a char class */
6746 char *s; /* String to put in generated EXACT node */
6747 STRLEN len = 0; /* Its current length */
6748 char *endchar; /* Points to '.' or '}' ending cur char in the input
6751 ret = reg_node(pRExC_state,
6752 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6755 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6756 * the input which is of the form now 'c1.c2.c3...}' until find the
6757 * ending brace or exeed length 255. The characters that exceed this
6758 * limit are dropped. The limit could be relaxed should it become
6759 * desirable by reparsing this as (?:\N{NAME}), so could generate
6760 * multiple EXACT nodes, as is done for just regular input. But this
6761 * is primarily a named character, and not intended to be a huge long
6762 * string, so 255 bytes should be good enough */
6764 STRLEN length_of_hex;
6765 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6766 | PERL_SCAN_DISALLOW_PREFIX
6767 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6768 UV cp; /* Ord of current character */
6770 /* Code points are separated by dots. If none, there is only one
6771 * code point, and is terminated by the brace */
6772 endchar = strchr(RExC_parse, '.');
6773 if (! endchar) endchar = endbrace;
6775 /* The values are Unicode even on EBCDIC machines */
6776 length_of_hex = (STRLEN)(endchar - RExC_parse);
6777 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6778 if ( length_of_hex == 0
6779 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6781 RExC_parse += length_of_hex; /* Includes all the valid */
6782 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6783 ? UTF8SKIP(RExC_parse)
6785 /* Guard against malformed utf8 */
6786 if (RExC_parse >= endchar) RExC_parse = endchar;
6787 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6790 if (! FOLD) { /* Not folding, just append to the string */
6793 /* Quit before adding this character if would exceed limit */
6794 if (len + UNISKIP(cp) > U8_MAX) break;
6796 unilen = reguni(pRExC_state, cp, s);
6801 } else { /* Folding, output the folded equivalent */
6802 STRLEN foldlen,numlen;
6803 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6804 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6806 /* Quit before exceeding size limit */
6807 if (len + foldlen > U8_MAX) break;
6809 for (foldbuf = tmpbuf;
6813 cp = utf8_to_uvchr(foldbuf, &numlen);
6815 const STRLEN unilen = reguni(pRExC_state, cp, s);
6818 /* In EBCDIC the numlen and unilen can differ. */
6820 if (numlen >= foldlen)
6824 break; /* "Can't happen." */
6828 /* Point to the beginning of the next character in the sequence. */
6829 RExC_parse = endchar + 1;
6831 /* Quit if no more characters */
6832 if (RExC_parse >= endbrace) break;
6837 if (RExC_parse < endbrace) {
6838 ckWARNreg(RExC_parse - 1,
6839 "Using just the first characters returned by \\N{}");
6842 RExC_size += STR_SZ(len);
6845 RExC_emit += STR_SZ(len);
6848 RExC_parse = endbrace + 1;
6850 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6851 with malformed in t/re/pat_advanced.t */
6853 Set_Node_Cur_Length(ret); /* MJD */
6854 nextchar(pRExC_state);
6864 * It returns the code point in utf8 for the value in *encp.
6865 * value: a code value in the source encoding
6866 * encp: a pointer to an Encode object
6868 * If the result from Encode is not a single character,
6869 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6872 S_reg_recode(pTHX_ const char value, SV **encp)
6875 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6876 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6877 const STRLEN newlen = SvCUR(sv);
6878 UV uv = UNICODE_REPLACEMENT;
6880 PERL_ARGS_ASSERT_REG_RECODE;
6884 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6887 if (!newlen || numlen != newlen) {
6888 uv = UNICODE_REPLACEMENT;
6896 - regatom - the lowest level
6898 Try to identify anything special at the start of the pattern. If there
6899 is, then handle it as required. This may involve generating a single regop,
6900 such as for an assertion; or it may involve recursing, such as to
6901 handle a () structure.
6903 If the string doesn't start with something special then we gobble up
6904 as much literal text as we can.
6906 Once we have been able to handle whatever type of thing started the
6907 sequence, we return.
6909 Note: we have to be careful with escapes, as they can be both literal
6910 and special, and in the case of \10 and friends can either, depending
6911 on context. Specifically there are two seperate switches for handling
6912 escape sequences, with the one for handling literal escapes requiring
6913 a dummy entry for all of the special escapes that are actually handled
6918 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6921 register regnode *ret = NULL;
6923 char *parse_start = RExC_parse;
6924 GET_RE_DEBUG_FLAGS_DECL;
6925 DEBUG_PARSE("atom");
6926 *flagp = WORST; /* Tentatively. */
6928 PERL_ARGS_ASSERT_REGATOM;
6931 switch ((U8)*RExC_parse) {
6933 RExC_seen_zerolen++;
6934 nextchar(pRExC_state);
6935 if (RExC_flags & RXf_PMf_MULTILINE)
6936 ret = reg_node(pRExC_state, MBOL);
6937 else if (RExC_flags & RXf_PMf_SINGLELINE)
6938 ret = reg_node(pRExC_state, SBOL);
6940 ret = reg_node(pRExC_state, BOL);
6941 Set_Node_Length(ret, 1); /* MJD */
6944 nextchar(pRExC_state);
6946 RExC_seen_zerolen++;
6947 if (RExC_flags & RXf_PMf_MULTILINE)
6948 ret = reg_node(pRExC_state, MEOL);
6949 else if (RExC_flags & RXf_PMf_SINGLELINE)
6950 ret = reg_node(pRExC_state, SEOL);
6952 ret = reg_node(pRExC_state, EOL);
6953 Set_Node_Length(ret, 1); /* MJD */
6956 nextchar(pRExC_state);
6957 if (RExC_flags & RXf_PMf_SINGLELINE)
6958 ret = reg_node(pRExC_state, SANY);
6960 ret = reg_node(pRExC_state, REG_ANY);
6961 *flagp |= HASWIDTH|SIMPLE;
6963 Set_Node_Length(ret, 1); /* MJD */
6967 char * const oregcomp_parse = ++RExC_parse;
6968 ret = regclass(pRExC_state,depth+1);
6969 if (*RExC_parse != ']') {
6970 RExC_parse = oregcomp_parse;
6971 vFAIL("Unmatched [");
6973 nextchar(pRExC_state);
6974 *flagp |= HASWIDTH|SIMPLE;
6975 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6979 nextchar(pRExC_state);
6980 ret = reg(pRExC_state, 1, &flags,depth+1);
6982 if (flags & TRYAGAIN) {
6983 if (RExC_parse == RExC_end) {
6984 /* Make parent create an empty node if needed. */
6992 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6996 if (flags & TRYAGAIN) {
7000 vFAIL("Internal urp");
7001 /* Supposed to be caught earlier. */
7004 if (!regcurly(RExC_parse)) {
7013 vFAIL("Quantifier follows nothing");
7021 len=0; /* silence a spurious compiler warning */
7022 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7023 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7024 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7025 ret = reganode(pRExC_state, FOLDCHAR, cp);
7026 Set_Node_Length(ret, 1); /* MJD */
7027 nextchar(pRExC_state); /* kill whitespace under /x */
7035 This switch handles escape sequences that resolve to some kind
7036 of special regop and not to literal text. Escape sequnces that
7037 resolve to literal text are handled below in the switch marked
7040 Every entry in this switch *must* have a corresponding entry
7041 in the literal escape switch. However, the opposite is not
7042 required, as the default for this switch is to jump to the
7043 literal text handling code.
7045 switch ((U8)*++RExC_parse) {
7050 /* Special Escapes */
7052 RExC_seen_zerolen++;
7053 ret = reg_node(pRExC_state, SBOL);
7055 goto finish_meta_pat;
7057 ret = reg_node(pRExC_state, GPOS);
7058 RExC_seen |= REG_SEEN_GPOS;
7060 goto finish_meta_pat;
7062 RExC_seen_zerolen++;
7063 ret = reg_node(pRExC_state, KEEPS);
7065 /* XXX:dmq : disabling in-place substitution seems to
7066 * be necessary here to avoid cases of memory corruption, as
7067 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7069 RExC_seen |= REG_SEEN_LOOKBEHIND;
7070 goto finish_meta_pat;
7072 ret = reg_node(pRExC_state, SEOL);
7074 RExC_seen_zerolen++; /* Do not optimize RE away */
7075 goto finish_meta_pat;
7077 ret = reg_node(pRExC_state, EOS);
7079 RExC_seen_zerolen++; /* Do not optimize RE away */
7080 goto finish_meta_pat;
7082 ret = reg_node(pRExC_state, CANY);
7083 RExC_seen |= REG_SEEN_CANY;
7084 *flagp |= HASWIDTH|SIMPLE;
7085 goto finish_meta_pat;
7087 ret = reg_node(pRExC_state, CLUMP);
7089 goto finish_meta_pat;
7091 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7092 *flagp |= HASWIDTH|SIMPLE;
7093 goto finish_meta_pat;
7095 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7096 *flagp |= HASWIDTH|SIMPLE;
7097 goto finish_meta_pat;
7099 RExC_seen_zerolen++;
7100 RExC_seen |= REG_SEEN_LOOKBEHIND;
7101 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7103 goto finish_meta_pat;
7105 RExC_seen_zerolen++;
7106 RExC_seen |= REG_SEEN_LOOKBEHIND;
7107 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7109 goto finish_meta_pat;
7111 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7112 *flagp |= HASWIDTH|SIMPLE;
7113 goto finish_meta_pat;
7115 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7116 *flagp |= HASWIDTH|SIMPLE;
7117 goto finish_meta_pat;
7119 ret = reg_node(pRExC_state, DIGIT);
7120 *flagp |= HASWIDTH|SIMPLE;
7121 goto finish_meta_pat;
7123 ret = reg_node(pRExC_state, NDIGIT);
7124 *flagp |= HASWIDTH|SIMPLE;
7125 goto finish_meta_pat;
7127 ret = reg_node(pRExC_state, LNBREAK);
7128 *flagp |= HASWIDTH|SIMPLE;
7129 goto finish_meta_pat;
7131 ret = reg_node(pRExC_state, HORIZWS);
7132 *flagp |= HASWIDTH|SIMPLE;
7133 goto finish_meta_pat;
7135 ret = reg_node(pRExC_state, NHORIZWS);
7136 *flagp |= HASWIDTH|SIMPLE;
7137 goto finish_meta_pat;
7139 ret = reg_node(pRExC_state, VERTWS);
7140 *flagp |= HASWIDTH|SIMPLE;
7141 goto finish_meta_pat;
7143 ret = reg_node(pRExC_state, NVERTWS);
7144 *flagp |= HASWIDTH|SIMPLE;
7146 nextchar(pRExC_state);
7147 Set_Node_Length(ret, 2); /* MJD */
7152 char* const oldregxend = RExC_end;
7154 char* parse_start = RExC_parse - 2;
7157 if (RExC_parse[1] == '{') {
7158 /* a lovely hack--pretend we saw [\pX] instead */
7159 RExC_end = strchr(RExC_parse, '}');
7161 const U8 c = (U8)*RExC_parse;
7163 RExC_end = oldregxend;
7164 vFAIL2("Missing right brace on \\%c{}", c);
7169 RExC_end = RExC_parse + 2;
7170 if (RExC_end > oldregxend)
7171 RExC_end = oldregxend;
7175 ret = regclass(pRExC_state,depth+1);
7177 RExC_end = oldregxend;
7180 Set_Node_Offset(ret, parse_start + 2);
7181 Set_Node_Cur_Length(ret);
7182 nextchar(pRExC_state);
7183 *flagp |= HASWIDTH|SIMPLE;
7187 /* Handle \N and \N{NAME} here and not below because it can be
7188 multicharacter. join_exact() will join them up later on.
7189 Also this makes sure that things like /\N{BLAH}+/ and
7190 \N{BLAH} being multi char Just Happen. dmq*/
7192 ret= reg_namedseq(pRExC_state, NULL, flagp);
7194 case 'k': /* Handle \k<NAME> and \k'NAME' */
7197 char ch= RExC_parse[1];
7198 if (ch != '<' && ch != '\'' && ch != '{') {
7200 vFAIL2("Sequence %.2s... not terminated",parse_start);
7202 /* this pretty much dupes the code for (?P=...) in reg(), if
7203 you change this make sure you change that */
7204 char* name_start = (RExC_parse += 2);
7206 SV *sv_dat = reg_scan_name(pRExC_state,
7207 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7208 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7209 if (RExC_parse == name_start || *RExC_parse != ch)
7210 vFAIL2("Sequence %.3s... not terminated",parse_start);
7213 num = add_data( pRExC_state, 1, "S" );
7214 RExC_rxi->data->data[num]=(void*)sv_dat;
7215 SvREFCNT_inc_simple_void(sv_dat);
7219 ret = reganode(pRExC_state,
7220 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7224 /* override incorrect value set in reganode MJD */
7225 Set_Node_Offset(ret, parse_start+1);
7226 Set_Node_Cur_Length(ret); /* MJD */
7227 nextchar(pRExC_state);
7233 case '1': case '2': case '3': case '4':
7234 case '5': case '6': case '7': case '8': case '9':
7237 bool isg = *RExC_parse == 'g';
7242 if (*RExC_parse == '{') {
7246 if (*RExC_parse == '-') {
7250 if (hasbrace && !isDIGIT(*RExC_parse)) {
7251 if (isrel) RExC_parse--;
7253 goto parse_named_seq;
7255 num = atoi(RExC_parse);
7256 if (isg && num == 0)
7257 vFAIL("Reference to invalid group 0");
7259 num = RExC_npar - num;
7261 vFAIL("Reference to nonexistent or unclosed group");
7263 if (!isg && num > 9 && num >= RExC_npar)
7266 char * const parse_start = RExC_parse - 1; /* MJD */
7267 while (isDIGIT(*RExC_parse))
7269 if (parse_start == RExC_parse - 1)
7270 vFAIL("Unterminated \\g... pattern");
7272 if (*RExC_parse != '}')
7273 vFAIL("Unterminated \\g{...} pattern");
7277 if (num > (I32)RExC_rx->nparens)
7278 vFAIL("Reference to nonexistent group");
7281 ret = reganode(pRExC_state,
7282 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7286 /* override incorrect value set in reganode MJD */
7287 Set_Node_Offset(ret, parse_start+1);
7288 Set_Node_Cur_Length(ret); /* MJD */
7290 nextchar(pRExC_state);
7295 if (RExC_parse >= RExC_end)
7296 FAIL("Trailing \\");
7299 /* Do not generate "unrecognized" warnings here, we fall
7300 back into the quick-grab loop below */
7307 if (RExC_flags & RXf_PMf_EXTENDED) {
7308 if ( reg_skipcomment( pRExC_state ) )
7315 register STRLEN len;
7320 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7322 parse_start = RExC_parse - 1;
7328 ret = reg_node(pRExC_state,
7329 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7331 for (len = 0, p = RExC_parse - 1;
7332 len < 127 && p < RExC_end;
7335 char * const oldp = p;
7337 if (RExC_flags & RXf_PMf_EXTENDED)
7338 p = regwhite( pRExC_state, p );
7343 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7344 goto normal_default;
7354 /* Literal Escapes Switch
7356 This switch is meant to handle escape sequences that
7357 resolve to a literal character.
7359 Every escape sequence that represents something
7360 else, like an assertion or a char class, is handled
7361 in the switch marked 'Special Escapes' above in this
7362 routine, but also has an entry here as anything that
7363 isn't explicitly mentioned here will be treated as
7364 an unescaped equivalent literal.
7368 /* These are all the special escapes. */
7372 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7373 goto normal_default;
7374 case 'A': /* Start assertion */
7375 case 'b': case 'B': /* Word-boundary assertion*/
7376 case 'C': /* Single char !DANGEROUS! */
7377 case 'd': case 'D': /* digit class */
7378 case 'g': case 'G': /* generic-backref, pos assertion */
7379 case 'h': case 'H': /* HORIZWS */
7380 case 'k': case 'K': /* named backref, keep marker */
7381 case 'N': /* named char sequence */
7382 case 'p': case 'P': /* Unicode property */
7383 case 'R': /* LNBREAK */
7384 case 's': case 'S': /* space class */
7385 case 'v': case 'V': /* VERTWS */
7386 case 'w': case 'W': /* word class */
7387 case 'X': /* eXtended Unicode "combining character sequence" */
7388 case 'z': case 'Z': /* End of line/string assertion */
7392 /* Anything after here is an escape that resolves to a
7393 literal. (Except digits, which may or may not)
7412 ender = ASCII_TO_NATIVE('\033');
7416 ender = ASCII_TO_NATIVE('\007');
7421 char* const e = strchr(p, '}');
7425 vFAIL("Missing right brace on \\x{}");
7428 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7429 | PERL_SCAN_DISALLOW_PREFIX;
7430 STRLEN numlen = e - p - 1;
7431 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7438 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7440 ender = grok_hex(p, &numlen, &flags, NULL);
7443 if (PL_encoding && ender < 0x100)
7444 goto recode_encoding;
7448 ender = UCHARAT(p++);
7449 ender = toCTRL(ender);
7451 case '0': case '1': case '2': case '3':case '4':
7452 case '5': case '6': case '7': case '8':case '9':
7454 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7457 ender = grok_oct(p, &numlen, &flags, NULL);
7459 /* An octal above 0xff is interpreted differently
7460 * depending on if the re is in utf8 or not. If it
7461 * is in utf8, the value will be itself, otherwise
7462 * it is interpreted as modulo 0x100. It has been
7463 * decided to discourage the use of octal above the
7464 * single-byte range. For now, warn only when
7465 * it ends up modulo */
7466 if (SIZE_ONLY && ender >= 0x100
7467 && ! UTF && ! PL_encoding) {
7468 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7476 if (PL_encoding && ender < 0x100)
7477 goto recode_encoding;
7481 SV* enc = PL_encoding;
7482 ender = reg_recode((const char)(U8)ender, &enc);
7483 if (!enc && SIZE_ONLY)
7484 ckWARNreg(p, "Invalid escape in the specified encoding");
7490 FAIL("Trailing \\");
7493 if (!SIZE_ONLY&& isALPHA(*p))
7494 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7495 goto normal_default;
7500 if (UTF8_IS_START(*p) && UTF) {
7502 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7503 &numlen, UTF8_ALLOW_DEFAULT);
7510 if ( RExC_flags & RXf_PMf_EXTENDED)
7511 p = regwhite( pRExC_state, p );
7513 /* Prime the casefolded buffer. */
7514 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7516 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7521 /* Emit all the Unicode characters. */
7523 for (foldbuf = tmpbuf;
7525 foldlen -= numlen) {
7526 ender = utf8_to_uvchr(foldbuf, &numlen);
7528 const STRLEN unilen = reguni(pRExC_state, ender, s);
7531 /* In EBCDIC the numlen
7532 * and unilen can differ. */
7534 if (numlen >= foldlen)
7538 break; /* "Can't happen." */
7542 const STRLEN unilen = reguni(pRExC_state, ender, s);
7551 REGC((char)ender, s++);
7557 /* Emit all the Unicode characters. */
7559 for (foldbuf = tmpbuf;
7561 foldlen -= numlen) {
7562 ender = utf8_to_uvchr(foldbuf, &numlen);
7564 const STRLEN unilen = reguni(pRExC_state, ender, s);
7567 /* In EBCDIC the numlen
7568 * and unilen can differ. */
7570 if (numlen >= foldlen)
7578 const STRLEN unilen = reguni(pRExC_state, ender, s);
7587 REGC((char)ender, s++);
7591 Set_Node_Cur_Length(ret); /* MJD */
7592 nextchar(pRExC_state);
7594 /* len is STRLEN which is unsigned, need to copy to signed */
7597 vFAIL("Internal disaster");
7601 if (len == 1 && UNI_IS_INVARIANT(ender))
7605 RExC_size += STR_SZ(len);
7608 RExC_emit += STR_SZ(len);
7618 S_regwhite( RExC_state_t *pRExC_state, char *p )
7620 const char *e = RExC_end;
7622 PERL_ARGS_ASSERT_REGWHITE;
7627 else if (*p == '#') {
7636 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7644 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7645 Character classes ([:foo:]) can also be negated ([:^foo:]).
7646 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7647 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7648 but trigger failures because they are currently unimplemented. */
7650 #define POSIXCC_DONE(c) ((c) == ':')
7651 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7652 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7655 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7658 I32 namedclass = OOB_NAMEDCLASS;
7660 PERL_ARGS_ASSERT_REGPPOSIXCC;
7662 if (value == '[' && RExC_parse + 1 < RExC_end &&
7663 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7664 POSIXCC(UCHARAT(RExC_parse))) {
7665 const char c = UCHARAT(RExC_parse);
7666 char* const s = RExC_parse++;
7668 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7670 if (RExC_parse == RExC_end)
7671 /* Grandfather lone [:, [=, [. */
7674 const char* const t = RExC_parse++; /* skip over the c */
7677 if (UCHARAT(RExC_parse) == ']') {
7678 const char *posixcc = s + 1;
7679 RExC_parse++; /* skip over the ending ] */
7682 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7683 const I32 skip = t - posixcc;
7685 /* Initially switch on the length of the name. */
7688 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7689 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7692 /* Names all of length 5. */
7693 /* alnum alpha ascii blank cntrl digit graph lower
7694 print punct space upper */
7695 /* Offset 4 gives the best switch position. */
7696 switch (posixcc[4]) {
7698 if (memEQ(posixcc, "alph", 4)) /* alpha */
7699 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7702 if (memEQ(posixcc, "spac", 4)) /* space */
7703 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7706 if (memEQ(posixcc, "grap", 4)) /* graph */
7707 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7710 if (memEQ(posixcc, "asci", 4)) /* ascii */
7711 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7714 if (memEQ(posixcc, "blan", 4)) /* blank */
7715 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7718 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7719 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7722 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7723 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7726 if (memEQ(posixcc, "lowe", 4)) /* lower */
7727 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7728 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7729 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7732 if (memEQ(posixcc, "digi", 4)) /* digit */
7733 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7734 else if (memEQ(posixcc, "prin", 4)) /* print */
7735 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7736 else if (memEQ(posixcc, "punc", 4)) /* punct */
7737 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7742 if (memEQ(posixcc, "xdigit", 6))
7743 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7747 if (namedclass == OOB_NAMEDCLASS)
7748 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7750 assert (posixcc[skip] == ':');
7751 assert (posixcc[skip+1] == ']');
7752 } else if (!SIZE_ONLY) {
7753 /* [[=foo=]] and [[.foo.]] are still future. */
7755 /* adjust RExC_parse so the warning shows after
7757 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7759 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7762 /* Maternal grandfather:
7763 * "[:" ending in ":" but not in ":]" */
7773 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7777 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7779 if (POSIXCC(UCHARAT(RExC_parse))) {
7780 const char *s = RExC_parse;
7781 const char c = *s++;
7785 if (*s && c == *s && s[1] == ']') {
7787 "POSIX syntax [%c %c] belongs inside character classes",
7790 /* [[=foo=]] and [[.foo.]] are still future. */
7791 if (POSIXCC_NOTYET(c)) {
7792 /* adjust RExC_parse so the error shows after
7794 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7796 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7803 #define _C_C_T_(NAME,TEST,WORD) \
7806 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7808 for (value = 0; value < 256; value++) \
7810 ANYOF_BITMAP_SET(ret, value); \
7815 case ANYOF_N##NAME: \
7817 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7819 for (value = 0; value < 256; value++) \
7821 ANYOF_BITMAP_SET(ret, value); \
7827 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7829 for (value = 0; value < 256; value++) \
7831 ANYOF_BITMAP_SET(ret, value); \
7835 case ANYOF_N##NAME: \
7836 for (value = 0; value < 256; value++) \
7838 ANYOF_BITMAP_SET(ret, value); \
7844 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7845 so that it is possible to override the option here without having to
7846 rebuild the entire core. as we are required to do if we change regcomp.h
7847 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7849 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7850 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7853 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7854 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7856 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7860 parse a class specification and produce either an ANYOF node that
7861 matches the pattern or if the pattern matches a single char only and
7862 that char is < 256 and we are case insensitive then we produce an
7867 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7870 register UV nextvalue;
7871 register IV prevvalue = OOB_UNICODE;
7872 register IV range = 0;
7873 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7874 register regnode *ret;
7877 char *rangebegin = NULL;
7878 bool need_class = 0;
7881 bool optimize_invert = TRUE;
7882 AV* unicode_alternate = NULL;
7884 UV literal_endpoint = 0;
7886 UV stored = 0; /* number of chars stored in the class */
7888 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7889 case we need to change the emitted regop to an EXACT. */
7890 const char * orig_parse = RExC_parse;
7891 GET_RE_DEBUG_FLAGS_DECL;
7893 PERL_ARGS_ASSERT_REGCLASS;
7895 PERL_UNUSED_ARG(depth);
7898 DEBUG_PARSE("clas");
7900 /* Assume we are going to generate an ANYOF node. */
7901 ret = reganode(pRExC_state, ANYOF, 0);
7904 ANYOF_FLAGS(ret) = 0;
7906 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7910 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7914 RExC_size += ANYOF_SKIP;
7915 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7918 RExC_emit += ANYOF_SKIP;
7920 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7922 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7923 ANYOF_BITMAP_ZERO(ret);
7924 listsv = newSVpvs("# comment\n");
7927 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7929 if (!SIZE_ONLY && POSIXCC(nextvalue))
7930 checkposixcc(pRExC_state);
7932 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7933 if (UCHARAT(RExC_parse) == ']')
7937 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7941 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7944 rangebegin = RExC_parse;
7946 value = utf8n_to_uvchr((U8*)RExC_parse,
7947 RExC_end - RExC_parse,
7948 &numlen, UTF8_ALLOW_DEFAULT);
7949 RExC_parse += numlen;
7952 value = UCHARAT(RExC_parse++);
7954 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7955 if (value == '[' && POSIXCC(nextvalue))
7956 namedclass = regpposixcc(pRExC_state, value);
7957 else if (value == '\\') {
7959 value = utf8n_to_uvchr((U8*)RExC_parse,
7960 RExC_end - RExC_parse,
7961 &numlen, UTF8_ALLOW_DEFAULT);
7962 RExC_parse += numlen;
7965 value = UCHARAT(RExC_parse++);
7966 /* Some compilers cannot handle switching on 64-bit integer
7967 * values, therefore value cannot be an UV. Yes, this will
7968 * be a problem later if we want switch on Unicode.
7969 * A similar issue a little bit later when switching on
7970 * namedclass. --jhi */
7971 switch ((I32)value) {
7972 case 'w': namedclass = ANYOF_ALNUM; break;
7973 case 'W': namedclass = ANYOF_NALNUM; break;
7974 case 's': namedclass = ANYOF_SPACE; break;
7975 case 'S': namedclass = ANYOF_NSPACE; break;
7976 case 'd': namedclass = ANYOF_DIGIT; break;
7977 case 'D': namedclass = ANYOF_NDIGIT; break;
7978 case 'v': namedclass = ANYOF_VERTWS; break;
7979 case 'V': namedclass = ANYOF_NVERTWS; break;
7980 case 'h': namedclass = ANYOF_HORIZWS; break;
7981 case 'H': namedclass = ANYOF_NHORIZWS; break;
7982 case 'N': /* Handle \N{NAME} in class */
7984 /* We only pay attention to the first char of
7985 multichar strings being returned. I kinda wonder
7986 if this makes sense as it does change the behaviour
7987 from earlier versions, OTOH that behaviour was broken
7989 UV v; /* value is register so we cant & it /grrr */
7990 if (reg_namedseq(pRExC_state, &v, NULL)) {
8000 if (RExC_parse >= RExC_end)
8001 vFAIL2("Empty \\%c{}", (U8)value);
8002 if (*RExC_parse == '{') {
8003 const U8 c = (U8)value;
8004 e = strchr(RExC_parse++, '}');
8006 vFAIL2("Missing right brace on \\%c{}", c);
8007 while (isSPACE(UCHARAT(RExC_parse)))
8009 if (e == RExC_parse)
8010 vFAIL2("Empty \\%c{}", c);
8012 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8020 if (UCHARAT(RExC_parse) == '^') {
8023 value = value == 'p' ? 'P' : 'p'; /* toggle */
8024 while (isSPACE(UCHARAT(RExC_parse))) {
8029 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8030 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8033 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8034 namedclass = ANYOF_MAX; /* no official name, but it's named */
8037 case 'n': value = '\n'; break;
8038 case 'r': value = '\r'; break;
8039 case 't': value = '\t'; break;
8040 case 'f': value = '\f'; break;
8041 case 'b': value = '\b'; break;
8042 case 'e': value = ASCII_TO_NATIVE('\033');break;
8043 case 'a': value = ASCII_TO_NATIVE('\007');break;
8045 if (*RExC_parse == '{') {
8046 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8047 | PERL_SCAN_DISALLOW_PREFIX;
8048 char * const e = strchr(RExC_parse++, '}');
8050 vFAIL("Missing right brace on \\x{}");
8052 numlen = e - RExC_parse;
8053 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8057 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8059 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8060 RExC_parse += numlen;
8062 if (PL_encoding && value < 0x100)
8063 goto recode_encoding;
8066 value = UCHARAT(RExC_parse++);
8067 value = toCTRL(value);
8069 case '0': case '1': case '2': case '3': case '4':
8070 case '5': case '6': case '7': case '8': case '9':
8074 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8075 RExC_parse += numlen;
8076 if (PL_encoding && value < 0x100)
8077 goto recode_encoding;
8082 SV* enc = PL_encoding;
8083 value = reg_recode((const char)(U8)value, &enc);
8084 if (!enc && SIZE_ONLY)
8085 ckWARNreg(RExC_parse,
8086 "Invalid escape in the specified encoding");
8090 if (!SIZE_ONLY && isALPHA(value))
8091 ckWARN2reg(RExC_parse,
8092 "Unrecognized escape \\%c in character class passed through",
8096 } /* end of \blah */
8102 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8104 if (!SIZE_ONLY && !need_class)
8105 ANYOF_CLASS_ZERO(ret);
8109 /* a bad range like a-\d, a-[:digit:] ? */
8113 RExC_parse >= rangebegin ?
8114 RExC_parse - rangebegin : 0;
8115 ckWARN4reg(RExC_parse,
8116 "False [] range \"%*.*s\"",
8119 if (prevvalue < 256) {
8120 ANYOF_BITMAP_SET(ret, prevvalue);
8121 ANYOF_BITMAP_SET(ret, '-');
8124 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8125 Perl_sv_catpvf(aTHX_ listsv,
8126 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8130 range = 0; /* this was not a true range */
8136 const char *what = NULL;
8139 if (namedclass > OOB_NAMEDCLASS)
8140 optimize_invert = FALSE;
8141 /* Possible truncation here but in some 64-bit environments
8142 * the compiler gets heartburn about switch on 64-bit values.
8143 * A similar issue a little earlier when switching on value.
8145 switch ((I32)namedclass) {
8147 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8148 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8149 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8150 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8151 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8152 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8153 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8154 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8155 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8156 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8157 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8158 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8159 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8161 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8162 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8164 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8165 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8166 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8169 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8172 for (value = 0; value < 128; value++)
8173 ANYOF_BITMAP_SET(ret, value);
8175 for (value = 0; value < 256; value++) {
8177 ANYOF_BITMAP_SET(ret, value);
8186 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8189 for (value = 128; value < 256; value++)
8190 ANYOF_BITMAP_SET(ret, value);
8192 for (value = 0; value < 256; value++) {
8193 if (!isASCII(value))
8194 ANYOF_BITMAP_SET(ret, value);
8203 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8205 /* consecutive digits assumed */
8206 for (value = '0'; value <= '9'; value++)
8207 ANYOF_BITMAP_SET(ret, value);
8210 what = POSIX_CC_UNI_NAME("Digit");
8214 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8216 /* consecutive digits assumed */
8217 for (value = 0; value < '0'; value++)
8218 ANYOF_BITMAP_SET(ret, value);
8219 for (value = '9' + 1; value < 256; value++)
8220 ANYOF_BITMAP_SET(ret, value);
8223 what = POSIX_CC_UNI_NAME("Digit");
8226 /* this is to handle \p and \P */
8229 vFAIL("Invalid [::] class");
8233 /* Strings such as "+utf8::isWord\n" */
8234 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8237 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8240 } /* end of namedclass \blah */
8243 if (prevvalue > (IV)value) /* b-a */ {
8244 const int w = RExC_parse - rangebegin;
8245 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8246 range = 0; /* not a valid range */
8250 prevvalue = value; /* save the beginning of the range */
8251 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8252 RExC_parse[1] != ']') {
8255 /* a bad range like \w-, [:word:]- ? */
8256 if (namedclass > OOB_NAMEDCLASS) {
8257 if (ckWARN(WARN_REGEXP)) {
8259 RExC_parse >= rangebegin ?
8260 RExC_parse - rangebegin : 0;
8262 "False [] range \"%*.*s\"",
8266 ANYOF_BITMAP_SET(ret, '-');
8268 range = 1; /* yeah, it's a range! */
8269 continue; /* but do it the next time */
8273 /* now is the next time */
8274 /*stored += (value - prevvalue + 1);*/
8276 if (prevvalue < 256) {
8277 const IV ceilvalue = value < 256 ? value : 255;
8280 /* In EBCDIC [\x89-\x91] should include
8281 * the \x8e but [i-j] should not. */
8282 if (literal_endpoint == 2 &&
8283 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8284 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8286 if (isLOWER(prevvalue)) {
8287 for (i = prevvalue; i <= ceilvalue; i++)
8288 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8290 ANYOF_BITMAP_SET(ret, i);
8293 for (i = prevvalue; i <= ceilvalue; i++)
8294 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8296 ANYOF_BITMAP_SET(ret, i);
8302 for (i = prevvalue; i <= ceilvalue; i++) {
8303 if (!ANYOF_BITMAP_TEST(ret,i)) {
8305 ANYOF_BITMAP_SET(ret, i);
8309 if (value > 255 || UTF) {
8310 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8311 const UV natvalue = NATIVE_TO_UNI(value);
8312 stored+=2; /* can't optimize this class */
8313 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8314 if (prevnatvalue < natvalue) { /* what about > ? */
8315 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8316 prevnatvalue, natvalue);
8318 else if (prevnatvalue == natvalue) {
8319 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8321 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8323 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8325 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8326 if (RExC_precomp[0] == ':' &&
8327 RExC_precomp[1] == '[' &&
8328 (f == 0xDF || f == 0x92)) {
8329 f = NATIVE_TO_UNI(f);
8332 /* If folding and foldable and a single
8333 * character, insert also the folded version
8334 * to the charclass. */
8336 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8337 if ((RExC_precomp[0] == ':' &&
8338 RExC_precomp[1] == '[' &&
8340 (value == 0xFB05 || value == 0xFB06))) ?
8341 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8342 foldlen == (STRLEN)UNISKIP(f) )
8344 if (foldlen == (STRLEN)UNISKIP(f))
8346 Perl_sv_catpvf(aTHX_ listsv,
8349 /* Any multicharacter foldings
8350 * require the following transform:
8351 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8352 * where E folds into "pq" and F folds
8353 * into "rst", all other characters
8354 * fold to single characters. We save
8355 * away these multicharacter foldings,
8356 * to be later saved as part of the
8357 * additional "s" data. */
8360 if (!unicode_alternate)
8361 unicode_alternate = newAV();
8362 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8364 av_push(unicode_alternate, sv);
8368 /* If folding and the value is one of the Greek
8369 * sigmas insert a few more sigmas to make the
8370 * folding rules of the sigmas to work right.
8371 * Note that not all the possible combinations
8372 * are handled here: some of them are handled
8373 * by the standard folding rules, and some of
8374 * them (literal or EXACTF cases) are handled
8375 * during runtime in regexec.c:S_find_byclass(). */
8376 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8377 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8378 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8379 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8380 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8382 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8383 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8384 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8389 literal_endpoint = 0;
8393 range = 0; /* this range (if it was one) is done now */
8397 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8399 RExC_size += ANYOF_CLASS_ADD_SKIP;
8401 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8407 /****** !SIZE_ONLY AFTER HERE *********/
8409 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8410 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8412 /* optimize single char class to an EXACT node
8413 but *only* when its not a UTF/high char */
8414 const char * cur_parse= RExC_parse;
8415 RExC_emit = (regnode *)orig_emit;
8416 RExC_parse = (char *)orig_parse;
8417 ret = reg_node(pRExC_state,
8418 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8419 RExC_parse = (char *)cur_parse;
8420 *STRING(ret)= (char)value;
8422 RExC_emit += STR_SZ(1);
8423 SvREFCNT_dec(listsv);
8426 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8427 if ( /* If the only flag is folding (plus possibly inversion). */
8428 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8430 for (value = 0; value < 256; ++value) {
8431 if (ANYOF_BITMAP_TEST(ret, value)) {
8432 UV fold = PL_fold[value];
8435 ANYOF_BITMAP_SET(ret, fold);
8438 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8441 /* optimize inverted simple patterns (e.g. [^a-z]) */
8442 if (optimize_invert &&
8443 /* If the only flag is inversion. */
8444 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8445 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8446 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8447 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8450 AV * const av = newAV();
8452 /* The 0th element stores the character class description
8453 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8454 * to initialize the appropriate swash (which gets stored in
8455 * the 1st element), and also useful for dumping the regnode.
8456 * The 2nd element stores the multicharacter foldings,
8457 * used later (regexec.c:S_reginclass()). */
8458 av_store(av, 0, listsv);
8459 av_store(av, 1, NULL);
8460 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8461 rv = newRV_noinc(MUTABLE_SV(av));
8462 n = add_data(pRExC_state, 1, "s");
8463 RExC_rxi->data->data[n] = (void*)rv;
8471 /* reg_skipcomment()
8473 Absorbs an /x style # comments from the input stream.
8474 Returns true if there is more text remaining in the stream.
8475 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8476 terminates the pattern without including a newline.
8478 Note its the callers responsibility to ensure that we are
8484 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8488 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8490 while (RExC_parse < RExC_end)
8491 if (*RExC_parse++ == '\n') {
8496 /* we ran off the end of the pattern without ending
8497 the comment, so we have to add an \n when wrapping */
8498 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8506 Advance that parse position, and optionall absorbs
8507 "whitespace" from the inputstream.
8509 Without /x "whitespace" means (?#...) style comments only,
8510 with /x this means (?#...) and # comments and whitespace proper.
8512 Returns the RExC_parse point from BEFORE the scan occurs.
8514 This is the /x friendly way of saying RExC_parse++.
8518 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8520 char* const retval = RExC_parse++;
8522 PERL_ARGS_ASSERT_NEXTCHAR;
8525 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8526 RExC_parse[2] == '#') {
8527 while (*RExC_parse != ')') {
8528 if (RExC_parse == RExC_end)
8529 FAIL("Sequence (?#... not terminated");
8535 if (RExC_flags & RXf_PMf_EXTENDED) {
8536 if (isSPACE(*RExC_parse)) {
8540 else if (*RExC_parse == '#') {
8541 if ( reg_skipcomment( pRExC_state ) )
8550 - reg_node - emit a node
8552 STATIC regnode * /* Location. */
8553 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8556 register regnode *ptr;
8557 regnode * const ret = RExC_emit;
8558 GET_RE_DEBUG_FLAGS_DECL;
8560 PERL_ARGS_ASSERT_REG_NODE;
8563 SIZE_ALIGN(RExC_size);
8567 if (RExC_emit >= RExC_emit_bound)
8568 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8570 NODE_ALIGN_FILL(ret);
8572 FILL_ADVANCE_NODE(ptr, op);
8573 #ifdef RE_TRACK_PATTERN_OFFSETS
8574 if (RExC_offsets) { /* MJD */
8575 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8576 "reg_node", __LINE__,
8578 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8579 ? "Overwriting end of array!\n" : "OK",
8580 (UV)(RExC_emit - RExC_emit_start),
8581 (UV)(RExC_parse - RExC_start),
8582 (UV)RExC_offsets[0]));
8583 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8591 - reganode - emit a node with an argument
8593 STATIC regnode * /* Location. */
8594 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8597 register regnode *ptr;
8598 regnode * const ret = RExC_emit;
8599 GET_RE_DEBUG_FLAGS_DECL;
8601 PERL_ARGS_ASSERT_REGANODE;
8604 SIZE_ALIGN(RExC_size);
8609 assert(2==regarglen[op]+1);
8611 Anything larger than this has to allocate the extra amount.
8612 If we changed this to be:
8614 RExC_size += (1 + regarglen[op]);
8616 then it wouldn't matter. Its not clear what side effect
8617 might come from that so its not done so far.
8622 if (RExC_emit >= RExC_emit_bound)
8623 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8625 NODE_ALIGN_FILL(ret);
8627 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8628 #ifdef RE_TRACK_PATTERN_OFFSETS
8629 if (RExC_offsets) { /* MJD */
8630 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8634 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8635 "Overwriting end of array!\n" : "OK",
8636 (UV)(RExC_emit - RExC_emit_start),
8637 (UV)(RExC_parse - RExC_start),
8638 (UV)RExC_offsets[0]));
8639 Set_Cur_Node_Offset;
8647 - reguni - emit (if appropriate) a Unicode character
8650 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8654 PERL_ARGS_ASSERT_REGUNI;
8656 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8660 - reginsert - insert an operator in front of already-emitted operand
8662 * Means relocating the operand.
8665 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8668 register regnode *src;
8669 register regnode *dst;
8670 register regnode *place;
8671 const int offset = regarglen[(U8)op];
8672 const int size = NODE_STEP_REGNODE + offset;
8673 GET_RE_DEBUG_FLAGS_DECL;
8675 PERL_ARGS_ASSERT_REGINSERT;
8676 PERL_UNUSED_ARG(depth);
8677 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8678 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8687 if (RExC_open_parens) {
8689 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8690 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8691 if ( RExC_open_parens[paren] >= opnd ) {
8692 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8693 RExC_open_parens[paren] += size;
8695 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8697 if ( RExC_close_parens[paren] >= opnd ) {
8698 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8699 RExC_close_parens[paren] += size;
8701 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8706 while (src > opnd) {
8707 StructCopy(--src, --dst, regnode);
8708 #ifdef RE_TRACK_PATTERN_OFFSETS
8709 if (RExC_offsets) { /* MJD 20010112 */
8710 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8714 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8715 ? "Overwriting end of array!\n" : "OK",
8716 (UV)(src - RExC_emit_start),
8717 (UV)(dst - RExC_emit_start),
8718 (UV)RExC_offsets[0]));
8719 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8720 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8726 place = opnd; /* Op node, where operand used to be. */
8727 #ifdef RE_TRACK_PATTERN_OFFSETS
8728 if (RExC_offsets) { /* MJD */
8729 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8733 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8734 ? "Overwriting end of array!\n" : "OK",
8735 (UV)(place - RExC_emit_start),
8736 (UV)(RExC_parse - RExC_start),
8737 (UV)RExC_offsets[0]));
8738 Set_Node_Offset(place, RExC_parse);
8739 Set_Node_Length(place, 1);
8742 src = NEXTOPER(place);
8743 FILL_ADVANCE_NODE(place, op);
8744 Zero(src, offset, regnode);
8748 - regtail - set the next-pointer at the end of a node chain of p to val.
8749 - SEE ALSO: regtail_study
8751 /* TODO: All three parms should be const */
8753 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8756 register regnode *scan;
8757 GET_RE_DEBUG_FLAGS_DECL;
8759 PERL_ARGS_ASSERT_REGTAIL;
8761 PERL_UNUSED_ARG(depth);
8767 /* Find last node. */
8770 regnode * const temp = regnext(scan);
8772 SV * const mysv=sv_newmortal();
8773 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8774 regprop(RExC_rx, mysv, scan);
8775 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8776 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8777 (temp == NULL ? "->" : ""),
8778 (temp == NULL ? PL_reg_name[OP(val)] : "")
8786 if (reg_off_by_arg[OP(scan)]) {
8787 ARG_SET(scan, val - scan);
8790 NEXT_OFF(scan) = val - scan;
8796 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8797 - Look for optimizable sequences at the same time.
8798 - currently only looks for EXACT chains.
8800 This is expermental code. The idea is to use this routine to perform
8801 in place optimizations on branches and groups as they are constructed,
8802 with the long term intention of removing optimization from study_chunk so
8803 that it is purely analytical.
8805 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8806 to control which is which.
8809 /* TODO: All four parms should be const */
8812 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8815 register regnode *scan;
8817 #ifdef EXPERIMENTAL_INPLACESCAN
8820 GET_RE_DEBUG_FLAGS_DECL;
8822 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8828 /* Find last node. */
8832 regnode * const temp = regnext(scan);
8833 #ifdef EXPERIMENTAL_INPLACESCAN
8834 if (PL_regkind[OP(scan)] == EXACT)
8835 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8843 if( exact == PSEUDO )
8845 else if ( exact != OP(scan) )
8854 SV * const mysv=sv_newmortal();
8855 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8856 regprop(RExC_rx, mysv, scan);
8857 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8858 SvPV_nolen_const(mysv),
8860 PL_reg_name[exact]);
8867 SV * const mysv_val=sv_newmortal();
8868 DEBUG_PARSE_MSG("");
8869 regprop(RExC_rx, mysv_val, val);
8870 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8871 SvPV_nolen_const(mysv_val),
8872 (IV)REG_NODE_NUM(val),
8876 if (reg_off_by_arg[OP(scan)]) {
8877 ARG_SET(scan, val - scan);
8880 NEXT_OFF(scan) = val - scan;
8888 - regcurly - a little FSA that accepts {\d+,?\d*}
8891 Perl_regcurly(register const char *s)
8893 PERL_ARGS_ASSERT_REGCURLY;
8912 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8916 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8921 for (bit=0; bit<32; bit++) {
8922 if (flags & (1<<bit)) {
8924 PerlIO_printf(Perl_debug_log, "%s",lead);
8925 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8930 PerlIO_printf(Perl_debug_log, "\n");
8932 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8938 Perl_regdump(pTHX_ const regexp *r)
8942 SV * const sv = sv_newmortal();
8943 SV *dsv= sv_newmortal();
8945 GET_RE_DEBUG_FLAGS_DECL;
8947 PERL_ARGS_ASSERT_REGDUMP;
8949 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8951 /* Header fields of interest. */
8952 if (r->anchored_substr) {
8953 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8954 RE_SV_DUMPLEN(r->anchored_substr), 30);
8955 PerlIO_printf(Perl_debug_log,
8956 "anchored %s%s at %"IVdf" ",
8957 s, RE_SV_TAIL(r->anchored_substr),
8958 (IV)r->anchored_offset);
8959 } else if (r->anchored_utf8) {
8960 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8961 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8962 PerlIO_printf(Perl_debug_log,
8963 "anchored utf8 %s%s at %"IVdf" ",
8964 s, RE_SV_TAIL(r->anchored_utf8),
8965 (IV)r->anchored_offset);
8967 if (r->float_substr) {
8968 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8969 RE_SV_DUMPLEN(r->float_substr), 30);
8970 PerlIO_printf(Perl_debug_log,
8971 "floating %s%s at %"IVdf"..%"UVuf" ",
8972 s, RE_SV_TAIL(r->float_substr),
8973 (IV)r->float_min_offset, (UV)r->float_max_offset);
8974 } else if (r->float_utf8) {
8975 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8976 RE_SV_DUMPLEN(r->float_utf8), 30);
8977 PerlIO_printf(Perl_debug_log,
8978 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8979 s, RE_SV_TAIL(r->float_utf8),
8980 (IV)r->float_min_offset, (UV)r->float_max_offset);
8982 if (r->check_substr || r->check_utf8)
8983 PerlIO_printf(Perl_debug_log,
8985 (r->check_substr == r->float_substr
8986 && r->check_utf8 == r->float_utf8
8987 ? "(checking floating" : "(checking anchored"));
8988 if (r->extflags & RXf_NOSCAN)
8989 PerlIO_printf(Perl_debug_log, " noscan");
8990 if (r->extflags & RXf_CHECK_ALL)
8991 PerlIO_printf(Perl_debug_log, " isall");
8992 if (r->check_substr || r->check_utf8)
8993 PerlIO_printf(Perl_debug_log, ") ");
8995 if (ri->regstclass) {
8996 regprop(r, sv, ri->regstclass);
8997 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8999 if (r->extflags & RXf_ANCH) {
9000 PerlIO_printf(Perl_debug_log, "anchored");
9001 if (r->extflags & RXf_ANCH_BOL)
9002 PerlIO_printf(Perl_debug_log, "(BOL)");
9003 if (r->extflags & RXf_ANCH_MBOL)
9004 PerlIO_printf(Perl_debug_log, "(MBOL)");
9005 if (r->extflags & RXf_ANCH_SBOL)
9006 PerlIO_printf(Perl_debug_log, "(SBOL)");
9007 if (r->extflags & RXf_ANCH_GPOS)
9008 PerlIO_printf(Perl_debug_log, "(GPOS)");
9009 PerlIO_putc(Perl_debug_log, ' ');
9011 if (r->extflags & RXf_GPOS_SEEN)
9012 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9013 if (r->intflags & PREGf_SKIP)
9014 PerlIO_printf(Perl_debug_log, "plus ");
9015 if (r->intflags & PREGf_IMPLICIT)
9016 PerlIO_printf(Perl_debug_log, "implicit ");
9017 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9018 if (r->extflags & RXf_EVAL_SEEN)
9019 PerlIO_printf(Perl_debug_log, "with eval ");
9020 PerlIO_printf(Perl_debug_log, "\n");
9021 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9023 PERL_ARGS_ASSERT_REGDUMP;
9024 PERL_UNUSED_CONTEXT;
9026 #endif /* DEBUGGING */
9030 - regprop - printable representation of opcode
9032 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9035 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9036 if (flags & ANYOF_INVERT) \
9037 /*make sure the invert info is in each */ \
9038 sv_catpvs(sv, "^"); \
9044 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9049 RXi_GET_DECL(prog,progi);
9050 GET_RE_DEBUG_FLAGS_DECL;
9052 PERL_ARGS_ASSERT_REGPROP;
9056 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9057 /* It would be nice to FAIL() here, but this may be called from
9058 regexec.c, and it would be hard to supply pRExC_state. */
9059 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9060 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9062 k = PL_regkind[OP(o)];
9066 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9067 * is a crude hack but it may be the best for now since
9068 * we have no flag "this EXACTish node was UTF-8"
9070 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9071 PERL_PV_ESCAPE_UNI_DETECT |
9072 PERL_PV_PRETTY_ELLIPSES |
9073 PERL_PV_PRETTY_LTGT |
9074 PERL_PV_PRETTY_NOCLEAR
9076 } else if (k == TRIE) {
9077 /* print the details of the trie in dumpuntil instead, as
9078 * progi->data isn't available here */
9079 const char op = OP(o);
9080 const U32 n = ARG(o);
9081 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9082 (reg_ac_data *)progi->data->data[n] :
9084 const reg_trie_data * const trie
9085 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9087 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9088 DEBUG_TRIE_COMPILE_r(
9089 Perl_sv_catpvf(aTHX_ sv,
9090 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9091 (UV)trie->startstate,
9092 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9093 (UV)trie->wordcount,
9096 (UV)TRIE_CHARCOUNT(trie),
9097 (UV)trie->uniquecharcount
9100 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9102 int rangestart = -1;
9103 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9105 for (i = 0; i <= 256; i++) {
9106 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9107 if (rangestart == -1)
9109 } else if (rangestart != -1) {
9110 if (i <= rangestart + 3)
9111 for (; rangestart < i; rangestart++)
9112 put_byte(sv, rangestart);
9114 put_byte(sv, rangestart);
9116 put_byte(sv, i - 1);
9124 } else if (k == CURLY) {
9125 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9126 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9127 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9129 else if (k == WHILEM && o->flags) /* Ordinal/of */
9130 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9131 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9132 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9133 if ( RXp_PAREN_NAMES(prog) ) {
9134 if ( k != REF || OP(o) < NREF) {
9135 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9136 SV **name= av_fetch(list, ARG(o), 0 );
9138 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9141 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9142 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9143 I32 *nums=(I32*)SvPVX(sv_dat);
9144 SV **name= av_fetch(list, nums[0], 0 );
9147 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9148 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9149 (n ? "," : ""), (IV)nums[n]);
9151 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9155 } else if (k == GOSUB)
9156 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9157 else if (k == VERB) {
9159 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9160 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9161 } else if (k == LOGICAL)
9162 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9163 else if (k == FOLDCHAR)
9164 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9165 else if (k == ANYOF) {
9166 int i, rangestart = -1;
9167 const U8 flags = ANYOF_FLAGS(o);
9170 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9171 static const char * const anyofs[] = {
9204 if (flags & ANYOF_LOCALE)
9205 sv_catpvs(sv, "{loc}");
9206 if (flags & ANYOF_FOLD)
9207 sv_catpvs(sv, "{i}");
9208 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9209 if (flags & ANYOF_INVERT)
9212 /* output what the standard cp 0-255 bitmap matches */
9213 for (i = 0; i <= 256; i++) {
9214 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9215 if (rangestart == -1)
9217 } else if (rangestart != -1) {
9218 if (i <= rangestart + 3)
9219 for (; rangestart < i; rangestart++)
9220 put_byte(sv, rangestart);
9222 put_byte(sv, rangestart);
9224 put_byte(sv, i - 1);
9231 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9232 /* output any special charclass tests (used mostly under use locale) */
9233 if (o->flags & ANYOF_CLASS)
9234 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9235 if (ANYOF_CLASS_TEST(o,i)) {
9236 sv_catpv(sv, anyofs[i]);
9240 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9242 /* output information about the unicode matching */
9243 if (flags & ANYOF_UNICODE)
9244 sv_catpvs(sv, "{unicode}");
9245 else if (flags & ANYOF_UNICODE_ALL)
9246 sv_catpvs(sv, "{unicode_all}");
9250 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9254 U8 s[UTF8_MAXBYTES_CASE+1];
9256 for (i = 0; i <= 256; i++) { /* just the first 256 */
9257 uvchr_to_utf8(s, i);
9259 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9260 if (rangestart == -1)
9262 } else if (rangestart != -1) {
9263 if (i <= rangestart + 3)
9264 for (; rangestart < i; rangestart++) {
9265 const U8 * const e = uvchr_to_utf8(s,rangestart);
9267 for(p = s; p < e; p++)
9271 const U8 *e = uvchr_to_utf8(s,rangestart);
9273 for (p = s; p < e; p++)
9276 e = uvchr_to_utf8(s, i-1);
9277 for (p = s; p < e; p++)
9284 sv_catpvs(sv, "..."); /* et cetera */
9288 char *s = savesvpv(lv);
9289 char * const origs = s;
9291 while (*s && *s != '\n')
9295 const char * const t = ++s;
9313 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9315 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9316 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9318 PERL_UNUSED_CONTEXT;
9319 PERL_UNUSED_ARG(sv);
9321 PERL_UNUSED_ARG(prog);
9322 #endif /* DEBUGGING */
9326 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9327 { /* Assume that RE_INTUIT is set */
9329 struct regexp *const prog = (struct regexp *)SvANY(r);
9330 GET_RE_DEBUG_FLAGS_DECL;
9332 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9333 PERL_UNUSED_CONTEXT;
9337 const char * const s = SvPV_nolen_const(prog->check_substr
9338 ? prog->check_substr : prog->check_utf8);
9340 if (!PL_colorset) reginitcolors();
9341 PerlIO_printf(Perl_debug_log,
9342 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9344 prog->check_substr ? "" : "utf8 ",
9345 PL_colors[5],PL_colors[0],
9348 (strlen(s) > 60 ? "..." : ""));
9351 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9357 handles refcounting and freeing the perl core regexp structure. When
9358 it is necessary to actually free the structure the first thing it
9359 does is call the 'free' method of the regexp_engine associated to to
9360 the regexp, allowing the handling of the void *pprivate; member
9361 first. (This routine is not overridable by extensions, which is why
9362 the extensions free is called first.)
9364 See regdupe and regdupe_internal if you change anything here.
9366 #ifndef PERL_IN_XSUB_RE
9368 Perl_pregfree(pTHX_ REGEXP *r)
9374 Perl_pregfree2(pTHX_ REGEXP *rx)
9377 struct regexp *const r = (struct regexp *)SvANY(rx);
9378 GET_RE_DEBUG_FLAGS_DECL;
9380 PERL_ARGS_ASSERT_PREGFREE2;
9383 ReREFCNT_dec(r->mother_re);
9385 CALLREGFREE_PVT(rx); /* free the private data */
9386 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9389 SvREFCNT_dec(r->anchored_substr);
9390 SvREFCNT_dec(r->anchored_utf8);
9391 SvREFCNT_dec(r->float_substr);
9392 SvREFCNT_dec(r->float_utf8);
9393 Safefree(r->substrs);
9395 RX_MATCH_COPY_FREE(rx);
9396 #ifdef PERL_OLD_COPY_ON_WRITE
9397 SvREFCNT_dec(r->saved_copy);
9404 This is a hacky workaround to the structural issue of match results
9405 being stored in the regexp structure which is in turn stored in
9406 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9407 could be PL_curpm in multiple contexts, and could require multiple
9408 result sets being associated with the pattern simultaneously, such
9409 as when doing a recursive match with (??{$qr})
9411 The solution is to make a lightweight copy of the regexp structure
9412 when a qr// is returned from the code executed by (??{$qr}) this
9413 lightweight copy doesnt actually own any of its data except for
9414 the starp/end and the actual regexp structure itself.
9420 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9423 struct regexp *const r = (struct regexp *)SvANY(rx);
9424 register const I32 npar = r->nparens+1;
9426 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9429 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9430 ret = (struct regexp *)SvANY(ret_x);
9432 (void)ReREFCNT_inc(rx);
9433 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9434 by pointing directly at the buffer, but flagging that the allocated
9435 space in the copy is zero. As we've just done a struct copy, it's now
9436 a case of zero-ing that, rather than copying the current length. */
9437 SvPV_set(ret_x, RX_WRAPPED(rx));
9438 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9439 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9440 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9441 SvLEN_set(ret_x, 0);
9442 SvSTASH_set(ret_x, NULL);
9443 SvMAGIC_set(ret_x, NULL);
9444 Newx(ret->offs, npar, regexp_paren_pair);
9445 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9447 Newx(ret->substrs, 1, struct reg_substr_data);
9448 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9450 SvREFCNT_inc_void(ret->anchored_substr);
9451 SvREFCNT_inc_void(ret->anchored_utf8);
9452 SvREFCNT_inc_void(ret->float_substr);
9453 SvREFCNT_inc_void(ret->float_utf8);
9455 /* check_substr and check_utf8, if non-NULL, point to either their
9456 anchored or float namesakes, and don't hold a second reference. */
9458 RX_MATCH_COPIED_off(ret_x);
9459 #ifdef PERL_OLD_COPY_ON_WRITE
9460 ret->saved_copy = NULL;
9462 ret->mother_re = rx;
9468 /* regfree_internal()
9470 Free the private data in a regexp. This is overloadable by
9471 extensions. Perl takes care of the regexp structure in pregfree(),
9472 this covers the *pprivate pointer which technically perldoesnt
9473 know about, however of course we have to handle the
9474 regexp_internal structure when no extension is in use.
9476 Note this is called before freeing anything in the regexp
9481 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9484 struct regexp *const r = (struct regexp *)SvANY(rx);
9486 GET_RE_DEBUG_FLAGS_DECL;
9488 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9494 SV *dsv= sv_newmortal();
9495 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9496 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9497 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9498 PL_colors[4],PL_colors[5],s);
9501 #ifdef RE_TRACK_PATTERN_OFFSETS
9503 Safefree(ri->u.offsets); /* 20010421 MJD */
9506 int n = ri->data->count;
9507 PAD* new_comppad = NULL;
9512 /* If you add a ->what type here, update the comment in regcomp.h */
9513 switch (ri->data->what[n]) {
9517 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9520 Safefree(ri->data->data[n]);
9523 new_comppad = MUTABLE_AV(ri->data->data[n]);
9526 if (new_comppad == NULL)
9527 Perl_croak(aTHX_ "panic: pregfree comppad");
9528 PAD_SAVE_LOCAL(old_comppad,
9529 /* Watch out for global destruction's random ordering. */
9530 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9533 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9536 op_free((OP_4tree*)ri->data->data[n]);
9538 PAD_RESTORE_LOCAL(old_comppad);
9539 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9545 { /* Aho Corasick add-on structure for a trie node.
9546 Used in stclass optimization only */
9548 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9550 refcount = --aho->refcount;
9553 PerlMemShared_free(aho->states);
9554 PerlMemShared_free(aho->fail);
9555 /* do this last!!!! */
9556 PerlMemShared_free(ri->data->data[n]);
9557 PerlMemShared_free(ri->regstclass);
9563 /* trie structure. */
9565 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9567 refcount = --trie->refcount;
9570 PerlMemShared_free(trie->charmap);
9571 PerlMemShared_free(trie->states);
9572 PerlMemShared_free(trie->trans);
9574 PerlMemShared_free(trie->bitmap);
9576 PerlMemShared_free(trie->wordlen);
9578 PerlMemShared_free(trie->jump);
9580 PerlMemShared_free(trie->nextword);
9581 /* do this last!!!! */
9582 PerlMemShared_free(ri->data->data[n]);
9587 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9590 Safefree(ri->data->what);
9597 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9598 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9599 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9600 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9603 re_dup - duplicate a regexp.
9605 This routine is expected to clone a given regexp structure. It is only
9606 compiled under USE_ITHREADS.
9608 After all of the core data stored in struct regexp is duplicated
9609 the regexp_engine.dupe method is used to copy any private data
9610 stored in the *pprivate pointer. This allows extensions to handle
9611 any duplication it needs to do.
9613 See pregfree() and regfree_internal() if you change anything here.
9615 #if defined(USE_ITHREADS)
9616 #ifndef PERL_IN_XSUB_RE
9618 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9622 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9623 struct regexp *ret = (struct regexp *)SvANY(dstr);
9625 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9627 npar = r->nparens+1;
9628 Newx(ret->offs, npar, regexp_paren_pair);
9629 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9631 /* no need to copy these */
9632 Newx(ret->swap, npar, regexp_paren_pair);
9636 /* Do it this way to avoid reading from *r after the StructCopy().
9637 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9638 cache, it doesn't matter. */
9639 const bool anchored = r->check_substr
9640 ? r->check_substr == r->anchored_substr
9641 : r->check_utf8 == r->anchored_utf8;
9642 Newx(ret->substrs, 1, struct reg_substr_data);
9643 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9645 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9646 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9647 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9648 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9650 /* check_substr and check_utf8, if non-NULL, point to either their
9651 anchored or float namesakes, and don't hold a second reference. */
9653 if (ret->check_substr) {
9655 assert(r->check_utf8 == r->anchored_utf8);
9656 ret->check_substr = ret->anchored_substr;
9657 ret->check_utf8 = ret->anchored_utf8;
9659 assert(r->check_substr == r->float_substr);
9660 assert(r->check_utf8 == r->float_utf8);
9661 ret->check_substr = ret->float_substr;
9662 ret->check_utf8 = ret->float_utf8;
9664 } else if (ret->check_utf8) {
9666 ret->check_utf8 = ret->anchored_utf8;
9668 ret->check_utf8 = ret->float_utf8;
9673 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9676 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9678 if (RX_MATCH_COPIED(dstr))
9679 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9682 #ifdef PERL_OLD_COPY_ON_WRITE
9683 ret->saved_copy = NULL;
9686 if (ret->mother_re) {
9687 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9688 /* Our storage points directly to our mother regexp, but that's
9689 1: a buffer in a different thread
9690 2: something we no longer hold a reference on
9691 so we need to copy it locally. */
9692 /* Note we need to sue SvCUR() on our mother_re, because it, in
9693 turn, may well be pointing to its own mother_re. */
9694 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9695 SvCUR(ret->mother_re)+1));
9696 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9698 ret->mother_re = NULL;
9702 #endif /* PERL_IN_XSUB_RE */
9707 This is the internal complement to regdupe() which is used to copy
9708 the structure pointed to by the *pprivate pointer in the regexp.
9709 This is the core version of the extension overridable cloning hook.
9710 The regexp structure being duplicated will be copied by perl prior
9711 to this and will be provided as the regexp *r argument, however
9712 with the /old/ structures pprivate pointer value. Thus this routine
9713 may override any copying normally done by perl.
9715 It returns a pointer to the new regexp_internal structure.
9719 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9722 struct regexp *const r = (struct regexp *)SvANY(rx);
9723 regexp_internal *reti;
9727 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9729 npar = r->nparens+1;
9732 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9733 Copy(ri->program, reti->program, len+1, regnode);
9736 reti->regstclass = NULL;
9740 const int count = ri->data->count;
9743 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9744 char, struct reg_data);
9745 Newx(d->what, count, U8);
9748 for (i = 0; i < count; i++) {
9749 d->what[i] = ri->data->what[i];
9750 switch (d->what[i]) {
9751 /* legal options are one of: sSfpontTu
9752 see also regcomp.h and pregfree() */
9755 case 'p': /* actually an AV, but the dup function is identical. */
9756 case 'u': /* actually an HV, but the dup function is identical. */
9757 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9760 /* This is cheating. */
9761 Newx(d->data[i], 1, struct regnode_charclass_class);
9762 StructCopy(ri->data->data[i], d->data[i],
9763 struct regnode_charclass_class);
9764 reti->regstclass = (regnode*)d->data[i];
9767 /* Compiled op trees are readonly and in shared memory,
9768 and can thus be shared without duplication. */
9770 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9774 /* Trie stclasses are readonly and can thus be shared
9775 * without duplication. We free the stclass in pregfree
9776 * when the corresponding reg_ac_data struct is freed.
9778 reti->regstclass= ri->regstclass;
9782 ((reg_trie_data*)ri->data->data[i])->refcount++;
9786 d->data[i] = ri->data->data[i];
9789 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9798 reti->name_list_idx = ri->name_list_idx;
9800 #ifdef RE_TRACK_PATTERN_OFFSETS
9801 if (ri->u.offsets) {
9802 Newx(reti->u.offsets, 2*len+1, U32);
9803 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9806 SetProgLen(reti,len);
9812 #endif /* USE_ITHREADS */
9814 #ifndef PERL_IN_XSUB_RE
9817 - regnext - dig the "next" pointer out of a node
9820 Perl_regnext(pTHX_ register regnode *p)
9823 register I32 offset;
9828 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9837 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9840 STRLEN l1 = strlen(pat1);
9841 STRLEN l2 = strlen(pat2);
9844 const char *message;
9846 PERL_ARGS_ASSERT_RE_CROAK2;
9852 Copy(pat1, buf, l1 , char);
9853 Copy(pat2, buf + l1, l2 , char);
9854 buf[l1 + l2] = '\n';
9855 buf[l1 + l2 + 1] = '\0';
9857 /* ANSI variant takes additional second argument */
9858 va_start(args, pat2);
9862 msv = vmess(buf, &args);
9864 message = SvPV_const(msv,l1);
9867 Copy(message, buf, l1 , char);
9868 buf[l1-1] = '\0'; /* Overwrite \n */
9869 Perl_croak(aTHX_ "%s", buf);
9872 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9874 #ifndef PERL_IN_XSUB_RE
9876 Perl_save_re_context(pTHX)
9880 struct re_save_state *state;
9882 SAVEVPTR(PL_curcop);
9883 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9885 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9886 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9887 SSPUSHINT(SAVEt_RE_STATE);
9889 Copy(&PL_reg_state, state, 1, struct re_save_state);
9891 PL_reg_start_tmp = 0;
9892 PL_reg_start_tmpl = 0;
9893 PL_reg_oldsaved = NULL;
9894 PL_reg_oldsavedlen = 0;
9896 PL_reg_leftiter = 0;
9897 PL_reg_poscache = NULL;
9898 PL_reg_poscache_size = 0;
9899 #ifdef PERL_OLD_COPY_ON_WRITE
9903 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9905 const REGEXP * const rx = PM_GETRE(PL_curpm);
9908 for (i = 1; i <= RX_NPARENS(rx); i++) {
9909 char digits[TYPE_CHARS(long)];
9910 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9911 GV *const *const gvp
9912 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9915 GV * const gv = *gvp;
9916 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9926 clear_re(pTHX_ void *r)
9929 ReREFCNT_dec((REGEXP *)r);
9935 S_put_byte(pTHX_ SV *sv, int c)
9937 PERL_ARGS_ASSERT_PUT_BYTE;
9939 /* Our definition of isPRINT() ignores locales, so only bytes that are
9940 not part of UTF-8 are considered printable. I assume that the same
9941 holds for UTF-EBCDIC.
9942 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9943 which Wikipedia says:
9945 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9946 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9947 identical, to the ASCII delete (DEL) or rubout control character.
9948 ) So the old condition can be simplified to !isPRINT(c) */
9950 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9952 const char string = c;
9953 if (c == '-' || c == ']' || c == '\\' || c == '^')
9954 sv_catpvs(sv, "\\");
9955 sv_catpvn(sv, &string, 1);
9960 #define CLEAR_OPTSTART \
9961 if (optstart) STMT_START { \
9962 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9966 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9968 STATIC const regnode *
9969 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9970 const regnode *last, const regnode *plast,
9971 SV* sv, I32 indent, U32 depth)
9974 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9975 register const regnode *next;
9976 const regnode *optstart= NULL;
9979 GET_RE_DEBUG_FLAGS_DECL;
9981 PERL_ARGS_ASSERT_DUMPUNTIL;
9983 #ifdef DEBUG_DUMPUNTIL
9984 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9985 last ? last-start : 0,plast ? plast-start : 0);
9988 if (plast && plast < last)
9991 while (PL_regkind[op] != END && (!last || node < last)) {
9992 /* While that wasn't END last time... */
9995 if (op == CLOSE || op == WHILEM)
9997 next = regnext((regnode *)node);
10000 if (OP(node) == OPTIMIZED) {
10001 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10008 regprop(r, sv, node);
10009 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10010 (int)(2*indent + 1), "", SvPVX_const(sv));
10012 if (OP(node) != OPTIMIZED) {
10013 if (next == NULL) /* Next ptr. */
10014 PerlIO_printf(Perl_debug_log, " (0)");
10015 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10016 PerlIO_printf(Perl_debug_log, " (FAIL)");
10018 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10019 (void)PerlIO_putc(Perl_debug_log, '\n');
10023 if (PL_regkind[(U8)op] == BRANCHJ) {
10026 register const regnode *nnode = (OP(next) == LONGJMP
10027 ? regnext((regnode *)next)
10029 if (last && nnode > last)
10031 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10034 else if (PL_regkind[(U8)op] == BRANCH) {
10036 DUMPUNTIL(NEXTOPER(node), next);
10038 else if ( PL_regkind[(U8)op] == TRIE ) {
10039 const regnode *this_trie = node;
10040 const char op = OP(node);
10041 const U32 n = ARG(node);
10042 const reg_ac_data * const ac = op>=AHOCORASICK ?
10043 (reg_ac_data *)ri->data->data[n] :
10045 const reg_trie_data * const trie =
10046 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10048 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10050 const regnode *nextbranch= NULL;
10053 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10054 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10056 PerlIO_printf(Perl_debug_log, "%*s%s ",
10057 (int)(2*(indent+3)), "",
10058 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10059 PL_colors[0], PL_colors[1],
10060 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10061 PERL_PV_PRETTY_ELLIPSES |
10062 PERL_PV_PRETTY_LTGT
10067 U16 dist= trie->jump[word_idx+1];
10068 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10069 (UV)((dist ? this_trie + dist : next) - start));
10072 nextbranch= this_trie + trie->jump[0];
10073 DUMPUNTIL(this_trie + dist, nextbranch);
10075 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10076 nextbranch= regnext((regnode *)nextbranch);
10078 PerlIO_printf(Perl_debug_log, "\n");
10081 if (last && next > last)
10086 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10087 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10088 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10090 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10092 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10094 else if ( op == PLUS || op == STAR) {
10095 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10097 else if (op == ANYOF) {
10098 /* arglen 1 + class block */
10099 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10100 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10101 node = NEXTOPER(node);
10103 else if (PL_regkind[(U8)op] == EXACT) {
10104 /* Literal string, where present. */
10105 node += NODE_SZ_STR(node) - 1;
10106 node = NEXTOPER(node);
10109 node = NEXTOPER(node);
10110 node += regarglen[(U8)op];
10112 if (op == CURLYX || op == OPEN)
10116 #ifdef DEBUG_DUMPUNTIL
10117 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10122 #endif /* DEBUGGING */
10126 * c-indentation-style: bsd
10127 * c-basic-offset: 4
10128 * indent-tabs-mode: t
10131 * ex: set ts=8 sts=4 sw=4 noet: