5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
75 #undef PERL_IN_XSUB_RE
76 #define PERL_IN_XSUB_RE 1
78 #undef PERL_IN_XSUB_RE
80 #ifndef PERL_IN_XSUB_RE
85 #ifdef PERL_IN_XSUB_RE
96 # if defined(BUGGY_MSC6)
97 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
98 # pragma optimize("a",off)
99 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
100 # pragma optimize("w",on )
101 # endif /* BUGGY_MSC6 */
105 #define STATIC static
108 typedef struct RExC_state_t {
109 U32 flags; /* are we folding, multilining? */
110 char *precomp; /* uncompiled string. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *charnames; /* cache of named sequences */
138 HV *paren_names; /* Paren names */
140 regnode **recurse; /* Recurse regops */
141 I32 recurse_count; /* Number of recurse regops */
143 char *starttry; /* -Dr: where regtry was called. */
144 #define RExC_starttry (pRExC_state->starttry)
147 const char *lastparse;
149 AV *paren_name_list; /* idx -> name */
150 #define RExC_lastparse (pRExC_state->lastparse)
151 #define RExC_lastnum (pRExC_state->lastnum)
152 #define RExC_paren_name_list (pRExC_state->paren_name_list)
156 #define RExC_flags (pRExC_state->flags)
157 #define RExC_precomp (pRExC_state->precomp)
158 #define RExC_rx (pRExC_state->rx)
159 #define RExC_rxi (pRExC_state->rxi)
160 #define RExC_start (pRExC_state->start)
161 #define RExC_end (pRExC_state->end)
162 #define RExC_parse (pRExC_state->parse)
163 #define RExC_whilem_seen (pRExC_state->whilem_seen)
164 #ifdef RE_TRACK_PATTERN_OFFSETS
165 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
167 #define RExC_emit (pRExC_state->emit)
168 #define RExC_emit_start (pRExC_state->emit_start)
169 #define RExC_emit_bound (pRExC_state->emit_bound)
170 #define RExC_naughty (pRExC_state->naughty)
171 #define RExC_sawback (pRExC_state->sawback)
172 #define RExC_seen (pRExC_state->seen)
173 #define RExC_size (pRExC_state->size)
174 #define RExC_npar (pRExC_state->npar)
175 #define RExC_nestroot (pRExC_state->nestroot)
176 #define RExC_extralen (pRExC_state->extralen)
177 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178 #define RExC_seen_evals (pRExC_state->seen_evals)
179 #define RExC_utf8 (pRExC_state->utf8)
180 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
181 #define RExC_charnames (pRExC_state->charnames)
182 #define RExC_open_parens (pRExC_state->open_parens)
183 #define RExC_close_parens (pRExC_state->close_parens)
184 #define RExC_opend (pRExC_state->opend)
185 #define RExC_paren_names (pRExC_state->paren_names)
186 #define RExC_recurse (pRExC_state->recurse)
187 #define RExC_recurse_count (pRExC_state->recurse_count)
190 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
191 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
192 ((*s) == '{' && regcurly(s)))
195 #undef SPSTART /* dratted cpp namespace... */
198 * Flags to be passed up and down.
200 #define WORST 0 /* Worst case. */
201 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
203 #define SPSTART 0x04 /* Starts with * or +. */
204 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
205 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
207 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
209 /* whether trie related optimizations are enabled */
210 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
211 #define TRIE_STUDY_OPT
212 #define FULL_TRIE_STUDY
218 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
219 #define PBITVAL(paren) (1 << ((paren) & 7))
220 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
221 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
222 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225 /* About scan_data_t.
227 During optimisation we recurse through the regexp program performing
228 various inplace (keyhole style) optimisations. In addition study_chunk
229 and scan_commit populate this data structure with information about
230 what strings MUST appear in the pattern. We look for the longest
231 string that must appear for at a fixed location, and we look for the
232 longest string that may appear at a floating location. So for instance
237 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
238 strings (because they follow a .* construct). study_chunk will identify
239 both FOO and BAR as being the longest fixed and floating strings respectively.
241 The strings can be composites, for instance
245 will result in a composite fixed substring 'foo'.
247 For each string some basic information is maintained:
249 - offset or min_offset
250 This is the position the string must appear at, or not before.
251 It also implicitly (when combined with minlenp) tells us how many
252 character must match before the string we are searching.
253 Likewise when combined with minlenp and the length of the string
254 tells us how many characters must appear after the string we have
258 Only used for floating strings. This is the rightmost point that
259 the string can appear at. Ifset to I32 max it indicates that the
260 string can occur infinitely far to the right.
263 A pointer to the minimum length of the pattern that the string
264 was found inside. This is important as in the case of positive
265 lookahead or positive lookbehind we can have multiple patterns
270 The minimum length of the pattern overall is 3, the minimum length
271 of the lookahead part is 3, but the minimum length of the part that
272 will actually match is 1. So 'FOO's minimum length is 3, but the
273 minimum length for the F is 1. This is important as the minimum length
274 is used to determine offsets in front of and behind the string being
275 looked for. Since strings can be composites this is the length of the
276 pattern at the time it was commited with a scan_commit. Note that
277 the length is calculated by study_chunk, so that the minimum lengths
278 are not known until the full pattern has been compiled, thus the
279 pointer to the value.
283 In the case of lookbehind the string being searched for can be
284 offset past the start point of the final matching string.
285 If this value was just blithely removed from the min_offset it would
286 invalidate some of the calculations for how many chars must match
287 before or after (as they are derived from min_offset and minlen and
288 the length of the string being searched for).
289 When the final pattern is compiled and the data is moved from the
290 scan_data_t structure into the regexp structure the information
291 about lookbehind is factored in, with the information that would
292 have been lost precalculated in the end_shift field for the
295 The fields pos_min and pos_delta are used to store the minimum offset
296 and the delta to the maximum offset at the current point in the pattern.
300 typedef struct scan_data_t {
301 /*I32 len_min; unused */
302 /*I32 len_delta; unused */
306 I32 last_end; /* min value, <0 unless valid. */
309 SV **longest; /* Either &l_fixed, or &l_float. */
310 SV *longest_fixed; /* longest fixed string found in pattern */
311 I32 offset_fixed; /* offset where it starts */
312 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
313 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
314 SV *longest_float; /* longest floating string found in pattern */
315 I32 offset_float_min; /* earliest point in string it can appear */
316 I32 offset_float_max; /* latest point in string it can appear */
317 I32 *minlen_float; /* pointer to the minlen relevent to the string */
318 I32 lookbehind_float; /* is the position of the string modified by LB */
322 struct regnode_charclass_class *start_class;
326 * Forward declarations for pregcomp()'s friends.
329 static const scan_data_t zero_scan_data =
330 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
332 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
333 #define SF_BEFORE_SEOL 0x0001
334 #define SF_BEFORE_MEOL 0x0002
335 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
336 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
339 # define SF_FIX_SHIFT_EOL (0+2)
340 # define SF_FL_SHIFT_EOL (0+4)
342 # define SF_FIX_SHIFT_EOL (+2)
343 # define SF_FL_SHIFT_EOL (+4)
346 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
347 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
349 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
350 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
351 #define SF_IS_INF 0x0040
352 #define SF_HAS_PAR 0x0080
353 #define SF_IN_PAR 0x0100
354 #define SF_HAS_EVAL 0x0200
355 #define SCF_DO_SUBSTR 0x0400
356 #define SCF_DO_STCLASS_AND 0x0800
357 #define SCF_DO_STCLASS_OR 0x1000
358 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
359 #define SCF_WHILEM_VISITED_POS 0x2000
361 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
362 #define SCF_SEEN_ACCEPT 0x8000
364 #define UTF (RExC_utf8 != 0)
365 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
366 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
368 #define OOB_UNICODE 12345678
369 #define OOB_NAMEDCLASS -1
371 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
372 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
375 /* length of regex to show in messages that don't mark a position within */
376 #define RegexLengthToShowInErrorMessages 127
379 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
380 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
381 * op/pragma/warn/regcomp.
383 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
384 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
386 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
389 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
390 * arg. Show regex, up to a maximum length. If it's too long, chop and add
393 #define _FAIL(code) STMT_START { \
394 const char *ellipses = ""; \
395 IV len = RExC_end - RExC_precomp; \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
399 if (len > RegexLengthToShowInErrorMessages) { \
400 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
401 len = RegexLengthToShowInErrorMessages - 10; \
407 #define FAIL(msg) _FAIL( \
408 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
409 msg, (int)len, RExC_precomp, ellipses))
411 #define FAIL2(msg,arg) _FAIL( \
412 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
413 arg, (int)len, RExC_precomp, ellipses))
416 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
418 #define Simple_vFAIL(m) STMT_START { \
419 const IV offset = RExC_parse - RExC_precomp; \
420 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
421 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
427 #define vFAIL(m) STMT_START { \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
434 * Like Simple_vFAIL(), but accepts two arguments.
436 #define Simple_vFAIL2(m,a1) STMT_START { \
437 const IV offset = RExC_parse - RExC_precomp; \
438 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
439 (int)offset, RExC_precomp, RExC_precomp + offset); \
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
445 #define vFAIL2(m,a1) STMT_START { \
447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
448 Simple_vFAIL2(m, a1); \
453 * Like Simple_vFAIL(), but accepts three arguments.
455 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
456 const IV offset = RExC_parse - RExC_precomp; \
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
462 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
464 #define vFAIL3(m,a1,a2) STMT_START { \
466 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
467 Simple_vFAIL3(m, a1, a2); \
471 * Like Simple_vFAIL(), but accepts four arguments.
473 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
474 const IV offset = RExC_parse - RExC_precomp; \
475 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
476 (int)offset, RExC_precomp, RExC_precomp + offset); \
479 #define vWARN(loc,m) STMT_START { \
480 const IV offset = loc - RExC_precomp; \
481 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
482 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
485 #define vWARNdep(loc,m) STMT_START { \
486 const IV offset = loc - RExC_precomp; \
487 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
488 "%s" REPORT_LOCATION, \
489 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define vWARN2(loc, m, a1) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
499 #define vWARN3(loc, m, a1, a2) STMT_START { \
500 const IV offset = loc - RExC_precomp; \
501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
505 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
506 const IV offset = loc - RExC_precomp; \
507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
508 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
511 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
512 const IV offset = loc - RExC_precomp; \
513 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
514 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
518 /* Allow for side effects in s */
519 #define REGC(c,s) STMT_START { \
520 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
523 /* Macros for recording node offsets. 20001227 mjd@plover.com
524 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
525 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
526 * Element 0 holds the number n.
527 * Position is 1 indexed.
529 #ifndef RE_TRACK_PATTERN_OFFSETS
530 #define Set_Node_Offset_To_R(node,byte)
531 #define Set_Node_Offset(node,byte)
532 #define Set_Cur_Node_Offset
533 #define Set_Node_Length_To_R(node,len)
534 #define Set_Node_Length(node,len)
535 #define Set_Node_Cur_Length(node)
536 #define Node_Offset(n)
537 #define Node_Length(n)
538 #define Set_Node_Offset_Length(node,offset,len)
539 #define ProgLen(ri) ri->u.proglen
540 #define SetProgLen(ri,x) ri->u.proglen = x
542 #define ProgLen(ri) ri->u.offsets[0]
543 #define SetProgLen(ri,x) ri->u.offsets[0] = x
544 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
546 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
547 __LINE__, (int)(node), (int)(byte))); \
549 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
551 RExC_offsets[2*(node)-1] = (byte); \
556 #define Set_Node_Offset(node,byte) \
557 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
558 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
560 #define Set_Node_Length_To_R(node,len) STMT_START { \
562 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
563 __LINE__, (int)(node), (int)(len))); \
565 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
567 RExC_offsets[2*(node)] = (len); \
572 #define Set_Node_Length(node,len) \
573 Set_Node_Length_To_R((node)-RExC_emit_start, len)
574 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
575 #define Set_Node_Cur_Length(node) \
576 Set_Node_Length(node, RExC_parse - parse_start)
578 /* Get offsets and lengths */
579 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
580 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
582 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
583 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
584 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
588 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
589 #define EXPERIMENTAL_INPLACESCAN
590 #endif /*RE_TRACK_PATTERN_OFFSETS*/
592 #define DEBUG_STUDYDATA(str,data,depth) \
593 DEBUG_OPTIMISE_MORE_r(if(data){ \
594 PerlIO_printf(Perl_debug_log, \
595 "%*s" str "Pos:%"IVdf"/%"IVdf \
596 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
597 (int)(depth)*2, "", \
598 (IV)((data)->pos_min), \
599 (IV)((data)->pos_delta), \
600 (UV)((data)->flags), \
601 (IV)((data)->whilem_c), \
602 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
603 is_inf ? "INF " : "" \
605 if ((data)->last_found) \
606 PerlIO_printf(Perl_debug_log, \
607 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
608 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
609 SvPVX_const((data)->last_found), \
610 (IV)((data)->last_end), \
611 (IV)((data)->last_start_min), \
612 (IV)((data)->last_start_max), \
613 ((data)->longest && \
614 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
615 SvPVX_const((data)->longest_fixed), \
616 (IV)((data)->offset_fixed), \
617 ((data)->longest && \
618 (data)->longest==&((data)->longest_float)) ? "*" : "", \
619 SvPVX_const((data)->longest_float), \
620 (IV)((data)->offset_float_min), \
621 (IV)((data)->offset_float_max) \
623 PerlIO_printf(Perl_debug_log,"\n"); \
626 static void clear_re(pTHX_ void *r);
628 /* Mark that we cannot extend a found fixed substring at this point.
629 Update the longest found anchored substring and the longest found
630 floating substrings if needed. */
633 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
635 const STRLEN l = CHR_SVLEN(data->last_found);
636 const STRLEN old_l = CHR_SVLEN(*data->longest);
637 GET_RE_DEBUG_FLAGS_DECL;
639 PERL_ARGS_ASSERT_SCAN_COMMIT;
641 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
642 SvSetMagicSV(*data->longest, data->last_found);
643 if (*data->longest == data->longest_fixed) {
644 data->offset_fixed = l ? data->last_start_min : data->pos_min;
645 if (data->flags & SF_BEFORE_EOL)
647 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
649 data->flags &= ~SF_FIX_BEFORE_EOL;
650 data->minlen_fixed=minlenp;
651 data->lookbehind_fixed=0;
653 else { /* *data->longest == data->longest_float */
654 data->offset_float_min = l ? data->last_start_min : data->pos_min;
655 data->offset_float_max = (l
656 ? data->last_start_max
657 : data->pos_min + data->pos_delta);
658 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
659 data->offset_float_max = I32_MAX;
660 if (data->flags & SF_BEFORE_EOL)
662 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
664 data->flags &= ~SF_FL_BEFORE_EOL;
665 data->minlen_float=minlenp;
666 data->lookbehind_float=0;
669 SvCUR_set(data->last_found, 0);
671 SV * const sv = data->last_found;
672 if (SvUTF8(sv) && SvMAGICAL(sv)) {
673 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
679 data->flags &= ~SF_BEFORE_EOL;
680 DEBUG_STUDYDATA("commit: ",data,0);
683 /* Can match anything (initialization) */
685 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
687 PERL_ARGS_ASSERT_CL_ANYTHING;
689 ANYOF_CLASS_ZERO(cl);
690 ANYOF_BITMAP_SETALL(cl);
691 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
693 cl->flags |= ANYOF_LOCALE;
696 /* Can match anything (initialization) */
698 S_cl_is_anything(const struct regnode_charclass_class *cl)
702 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
704 for (value = 0; value <= ANYOF_MAX; value += 2)
705 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
707 if (!(cl->flags & ANYOF_UNICODE_ALL))
709 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
714 /* Can match anything (initialization) */
716 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
718 PERL_ARGS_ASSERT_CL_INIT;
720 Zero(cl, 1, struct regnode_charclass_class);
722 cl_anything(pRExC_state, cl);
726 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
728 PERL_ARGS_ASSERT_CL_INIT_ZERO;
730 Zero(cl, 1, struct regnode_charclass_class);
732 cl_anything(pRExC_state, cl);
734 cl->flags |= ANYOF_LOCALE;
737 /* 'And' a given class with another one. Can create false positives */
738 /* We assume that cl is not inverted */
740 S_cl_and(struct regnode_charclass_class *cl,
741 const struct regnode_charclass_class *and_with)
743 PERL_ARGS_ASSERT_CL_AND;
745 assert(and_with->type == ANYOF);
746 if (!(and_with->flags & ANYOF_CLASS)
747 && !(cl->flags & ANYOF_CLASS)
748 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(and_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD)) {
753 if (and_with->flags & ANYOF_INVERT)
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] &= ~and_with->bitmap[i];
757 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
758 cl->bitmap[i] &= and_with->bitmap[i];
759 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
760 if (!(and_with->flags & ANYOF_EOS))
761 cl->flags &= ~ANYOF_EOS;
763 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
764 !(and_with->flags & ANYOF_INVERT)) {
765 cl->flags &= ~ANYOF_UNICODE_ALL;
766 cl->flags |= ANYOF_UNICODE;
767 ARG_SET(cl, ARG(and_with));
769 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
770 !(and_with->flags & ANYOF_INVERT))
771 cl->flags &= ~ANYOF_UNICODE_ALL;
772 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
773 !(and_with->flags & ANYOF_INVERT))
774 cl->flags &= ~ANYOF_UNICODE;
777 /* 'OR' a given class with another one. Can create false positives */
778 /* We assume that cl is not inverted */
780 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
782 PERL_ARGS_ASSERT_CL_OR;
784 if (or_with->flags & ANYOF_INVERT) {
786 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
787 * <= (B1 | !B2) | (CL1 | !CL2)
788 * which is wasteful if CL2 is small, but we ignore CL2:
789 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
790 * XXXX Can we handle case-fold? Unclear:
791 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
792 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
794 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
795 && !(or_with->flags & ANYOF_FOLD)
796 && !(cl->flags & ANYOF_FOLD) ) {
799 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
800 cl->bitmap[i] |= ~or_with->bitmap[i];
801 } /* XXXX: logic is complicated otherwise */
803 cl_anything(pRExC_state, cl);
806 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
807 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
808 && (!(or_with->flags & ANYOF_FOLD)
809 || (cl->flags & ANYOF_FOLD)) ) {
812 /* OR char bitmap and class bitmap separately */
813 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
814 cl->bitmap[i] |= or_with->bitmap[i];
815 if (or_with->flags & ANYOF_CLASS) {
816 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
817 cl->classflags[i] |= or_with->classflags[i];
818 cl->flags |= ANYOF_CLASS;
821 else { /* XXXX: logic is complicated, leave it along for a moment. */
822 cl_anything(pRExC_state, cl);
825 if (or_with->flags & ANYOF_EOS)
826 cl->flags |= ANYOF_EOS;
828 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
829 ARG(cl) != ARG(or_with)) {
830 cl->flags |= ANYOF_UNICODE_ALL;
831 cl->flags &= ~ANYOF_UNICODE;
833 if (or_with->flags & ANYOF_UNICODE_ALL) {
834 cl->flags |= ANYOF_UNICODE_ALL;
835 cl->flags &= ~ANYOF_UNICODE;
839 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
840 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
841 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
842 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
847 dump_trie(trie,widecharmap,revcharmap)
848 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
849 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
851 These routines dump out a trie in a somewhat readable format.
852 The _interim_ variants are used for debugging the interim
853 tables that are used to generate the final compressed
854 representation which is what dump_trie expects.
856 Part of the reason for their existance is to provide a form
857 of documentation as to how the different representations function.
862 Dumps the final compressed table form of the trie to Perl_debug_log.
863 Used for debugging make_trie().
867 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
868 AV *revcharmap, U32 depth)
871 SV *sv=sv_newmortal();
872 int colwidth= widecharmap ? 6 : 4;
873 GET_RE_DEBUG_FLAGS_DECL;
875 PERL_ARGS_ASSERT_DUMP_TRIE;
877 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
878 (int)depth * 2 + 2,"",
879 "Match","Base","Ofs" );
881 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
882 SV ** const tmp = av_fetch( revcharmap, state, 0);
884 PerlIO_printf( Perl_debug_log, "%*s",
886 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
887 PL_colors[0], PL_colors[1],
888 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
889 PERL_PV_ESCAPE_FIRSTCHAR
894 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
895 (int)depth * 2 + 2,"");
897 for( state = 0 ; state < trie->uniquecharcount ; state++ )
898 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
899 PerlIO_printf( Perl_debug_log, "\n");
901 for( state = 1 ; state < trie->statecount ; state++ ) {
902 const U32 base = trie->states[ state ].trans.base;
904 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
906 if ( trie->states[ state ].wordnum ) {
907 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
909 PerlIO_printf( Perl_debug_log, "%6s", "" );
912 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
917 while( ( base + ofs < trie->uniquecharcount ) ||
918 ( base + ofs - trie->uniquecharcount < trie->lasttrans
919 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
922 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
924 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
925 if ( ( base + ofs >= trie->uniquecharcount ) &&
926 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
927 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
929 PerlIO_printf( Perl_debug_log, "%*"UVXf,
931 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
933 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
937 PerlIO_printf( Perl_debug_log, "]");
940 PerlIO_printf( Perl_debug_log, "\n" );
944 Dumps a fully constructed but uncompressed trie in list form.
945 List tries normally only are used for construction when the number of
946 possible chars (trie->uniquecharcount) is very high.
947 Used for debugging make_trie().
950 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
951 HV *widecharmap, AV *revcharmap, U32 next_alloc,
955 SV *sv=sv_newmortal();
956 int colwidth= widecharmap ? 6 : 4;
957 GET_RE_DEBUG_FLAGS_DECL;
959 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
961 /* print out the table precompression. */
962 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
963 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
964 "------:-----+-----------------\n" );
966 for( state=1 ; state < next_alloc ; state ++ ) {
969 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
970 (int)depth * 2 + 2,"", (UV)state );
971 if ( ! trie->states[ state ].wordnum ) {
972 PerlIO_printf( Perl_debug_log, "%5s| ","");
974 PerlIO_printf( Perl_debug_log, "W%4x| ",
975 trie->states[ state ].wordnum
978 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
979 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
981 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
983 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
984 PL_colors[0], PL_colors[1],
985 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
986 PERL_PV_ESCAPE_FIRSTCHAR
988 TRIE_LIST_ITEM(state,charid).forid,
989 (UV)TRIE_LIST_ITEM(state,charid).newstate
992 PerlIO_printf(Perl_debug_log, "\n%*s| ",
993 (int)((depth * 2) + 14), "");
996 PerlIO_printf( Perl_debug_log, "\n");
1001 Dumps a fully constructed but uncompressed trie in table form.
1002 This is the normal DFA style state transition table, with a few
1003 twists to facilitate compression later.
1004 Used for debugging make_trie().
1007 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1008 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1013 SV *sv=sv_newmortal();
1014 int colwidth= widecharmap ? 6 : 4;
1015 GET_RE_DEBUG_FLAGS_DECL;
1017 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1020 print out the table precompression so that we can do a visual check
1021 that they are identical.
1024 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1026 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1027 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1029 PerlIO_printf( Perl_debug_log, "%*s",
1031 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1032 PL_colors[0], PL_colors[1],
1033 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1034 PERL_PV_ESCAPE_FIRSTCHAR
1040 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1042 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1043 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1046 PerlIO_printf( Perl_debug_log, "\n" );
1048 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1050 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1051 (int)depth * 2 + 2,"",
1052 (UV)TRIE_NODENUM( state ) );
1054 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1055 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1057 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1059 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1061 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1062 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1064 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1065 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1072 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1073 startbranch: the first branch in the whole branch sequence
1074 first : start branch of sequence of branch-exact nodes.
1075 May be the same as startbranch
1076 last : Thing following the last branch.
1077 May be the same as tail.
1078 tail : item following the branch sequence
1079 count : words in the sequence
1080 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1081 depth : indent depth
1083 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1085 A trie is an N'ary tree where the branches are determined by digital
1086 decomposition of the key. IE, at the root node you look up the 1st character and
1087 follow that branch repeat until you find the end of the branches. Nodes can be
1088 marked as "accepting" meaning they represent a complete word. Eg:
1092 would convert into the following structure. Numbers represent states, letters
1093 following numbers represent valid transitions on the letter from that state, if
1094 the number is in square brackets it represents an accepting state, otherwise it
1095 will be in parenthesis.
1097 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1101 (1) +-i->(6)-+-s->[7]
1103 +-s->(3)-+-h->(4)-+-e->[5]
1105 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1107 This shows that when matching against the string 'hers' we will begin at state 1
1108 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1109 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1110 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1111 single traverse. We store a mapping from accepting to state to which word was
1112 matched, and then when we have multiple possibilities we try to complete the
1113 rest of the regex in the order in which they occured in the alternation.
1115 The only prior NFA like behaviour that would be changed by the TRIE support is
1116 the silent ignoring of duplicate alternations which are of the form:
1118 / (DUPE|DUPE) X? (?{ ... }) Y /x
1120 Thus EVAL blocks follwing a trie may be called a different number of times with
1121 and without the optimisation. With the optimisations dupes will be silently
1122 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1123 the following demonstrates:
1125 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1127 which prints out 'word' three times, but
1129 'words'=~/(word|word|word)(?{ print $1 })S/
1131 which doesnt print it out at all. This is due to other optimisations kicking in.
1133 Example of what happens on a structural level:
1135 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1137 1: CURLYM[1] {1,32767}(18)
1148 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1149 and should turn into:
1151 1: CURLYM[1] {1,32767}(18)
1153 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1161 Cases where tail != last would be like /(?foo|bar)baz/:
1171 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1172 and would end up looking like:
1175 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1182 d = uvuni_to_utf8_flags(d, uv, 0);
1184 is the recommended Unicode-aware way of saying
1189 #define TRIE_STORE_REVCHAR \
1192 SV *zlopp = newSV(2); \
1193 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1194 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1195 SvCUR_set(zlopp, kapow - flrbbbbb); \
1198 av_push(revcharmap, zlopp); \
1200 char ooooff = (char)uvc; \
1201 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1205 #define TRIE_READ_CHAR STMT_START { \
1209 if ( foldlen > 0 ) { \
1210 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1215 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1216 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1217 foldlen -= UNISKIP( uvc ); \
1218 scan = foldbuf + UNISKIP( uvc ); \
1221 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1231 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1232 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1233 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1234 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1236 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1237 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1238 TRIE_LIST_CUR( state )++; \
1241 #define TRIE_LIST_NEW(state) STMT_START { \
1242 Newxz( trie->states[ state ].trans.list, \
1243 4, reg_trie_trans_le ); \
1244 TRIE_LIST_CUR( state ) = 1; \
1245 TRIE_LIST_LEN( state ) = 4; \
1248 #define TRIE_HANDLE_WORD(state) STMT_START { \
1249 U16 dupe= trie->states[ state ].wordnum; \
1250 regnode * const noper_next = regnext( noper ); \
1252 if (trie->wordlen) \
1253 trie->wordlen[ curword ] = wordlen; \
1255 /* store the word for dumping */ \
1257 if (OP(noper) != NOTHING) \
1258 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1260 tmp = newSVpvn_utf8( "", 0, UTF ); \
1261 av_push( trie_words, tmp ); \
1266 if ( noper_next < tail ) { \
1268 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1269 trie->jump[curword] = (U16)(noper_next - convert); \
1271 jumper = noper_next; \
1273 nextbranch= regnext(cur); \
1277 /* So it's a dupe. This means we need to maintain a */\
1278 /* linked-list from the first to the next. */\
1279 /* we only allocate the nextword buffer when there */\
1280 /* a dupe, so first time we have to do the allocation */\
1281 if (!trie->nextword) \
1282 trie->nextword = (U16 *) \
1283 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1284 while ( trie->nextword[dupe] ) \
1285 dupe= trie->nextword[dupe]; \
1286 trie->nextword[dupe]= curword; \
1288 /* we haven't inserted this word yet. */ \
1289 trie->states[ state ].wordnum = curword; \
1294 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1295 ( ( base + charid >= ucharcount \
1296 && base + charid < ubound \
1297 && state == trie->trans[ base - ucharcount + charid ].check \
1298 && trie->trans[ base - ucharcount + charid ].next ) \
1299 ? trie->trans[ base - ucharcount + charid ].next \
1300 : ( state==1 ? special : 0 ) \
1304 #define MADE_JUMP_TRIE 2
1305 #define MADE_EXACT_TRIE 4
1308 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1311 /* first pass, loop through and scan words */
1312 reg_trie_data *trie;
1313 HV *widecharmap = NULL;
1314 AV *revcharmap = newAV();
1316 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1321 regnode *jumper = NULL;
1322 regnode *nextbranch = NULL;
1323 regnode *convert = NULL;
1324 /* we just use folder as a flag in utf8 */
1325 const U8 * const folder = ( flags == EXACTF
1327 : ( flags == EXACTFL
1334 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1335 AV *trie_words = NULL;
1336 /* along with revcharmap, this only used during construction but both are
1337 * useful during debugging so we store them in the struct when debugging.
1340 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1341 STRLEN trie_charcount=0;
1343 SV *re_trie_maxbuff;
1344 GET_RE_DEBUG_FLAGS_DECL;
1346 PERL_ARGS_ASSERT_MAKE_TRIE;
1348 PERL_UNUSED_ARG(depth);
1351 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1353 trie->startstate = 1;
1354 trie->wordcount = word_count;
1355 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1356 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1357 if (!(UTF && folder))
1358 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1360 trie_words = newAV();
1363 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1364 if (!SvIOK(re_trie_maxbuff)) {
1365 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1368 PerlIO_printf( Perl_debug_log,
1369 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1370 (int)depth * 2 + 2, "",
1371 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1372 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1376 /* Find the node we are going to overwrite */
1377 if ( first == startbranch && OP( last ) != BRANCH ) {
1378 /* whole branch chain */
1381 /* branch sub-chain */
1382 convert = NEXTOPER( first );
1385 /* -- First loop and Setup --
1387 We first traverse the branches and scan each word to determine if it
1388 contains widechars, and how many unique chars there are, this is
1389 important as we have to build a table with at least as many columns as we
1392 We use an array of integers to represent the character codes 0..255
1393 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1394 native representation of the character value as the key and IV's for the
1397 *TODO* If we keep track of how many times each character is used we can
1398 remap the columns so that the table compression later on is more
1399 efficient in terms of memory by ensuring most common value is in the
1400 middle and the least common are on the outside. IMO this would be better
1401 than a most to least common mapping as theres a decent chance the most
1402 common letter will share a node with the least common, meaning the node
1403 will not be compressable. With a middle is most common approach the worst
1404 case is when we have the least common nodes twice.
1408 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1409 regnode * const noper = NEXTOPER( cur );
1410 const U8 *uc = (U8*)STRING( noper );
1411 const U8 * const e = uc + STR_LEN( noper );
1413 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1414 const U8 *scan = (U8*)NULL;
1415 U32 wordlen = 0; /* required init */
1417 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1419 if (OP(noper) == NOTHING) {
1423 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1424 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1425 regardless of encoding */
1427 for ( ; uc < e ; uc += len ) {
1428 TRIE_CHARCOUNT(trie)++;
1432 if ( !trie->charmap[ uvc ] ) {
1433 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1435 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1439 /* store the codepoint in the bitmap, and if its ascii
1440 also store its folded equivelent. */
1441 TRIE_BITMAP_SET(trie,uvc);
1443 /* store the folded codepoint */
1444 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1447 /* store first byte of utf8 representation of
1448 codepoints in the 127 < uvc < 256 range */
1449 if (127 < uvc && uvc < 192) {
1450 TRIE_BITMAP_SET(trie,194);
1451 } else if (191 < uvc ) {
1452 TRIE_BITMAP_SET(trie,195);
1453 /* && uvc < 256 -- we know uvc is < 256 already */
1456 set_bit = 0; /* We've done our bit :-) */
1461 widecharmap = newHV();
1463 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1466 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1468 if ( !SvTRUE( *svpp ) ) {
1469 sv_setiv( *svpp, ++trie->uniquecharcount );
1474 if( cur == first ) {
1477 } else if (chars < trie->minlen) {
1479 } else if (chars > trie->maxlen) {
1483 } /* end first pass */
1484 DEBUG_TRIE_COMPILE_r(
1485 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1486 (int)depth * 2 + 2,"",
1487 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1488 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1489 (int)trie->minlen, (int)trie->maxlen )
1491 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1494 We now know what we are dealing with in terms of unique chars and
1495 string sizes so we can calculate how much memory a naive
1496 representation using a flat table will take. If it's over a reasonable
1497 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1498 conservative but potentially much slower representation using an array
1501 At the end we convert both representations into the same compressed
1502 form that will be used in regexec.c for matching with. The latter
1503 is a form that cannot be used to construct with but has memory
1504 properties similar to the list form and access properties similar
1505 to the table form making it both suitable for fast searches and
1506 small enough that its feasable to store for the duration of a program.
1508 See the comment in the code where the compressed table is produced
1509 inplace from the flat tabe representation for an explanation of how
1510 the compression works.
1515 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1517 Second Pass -- Array Of Lists Representation
1519 Each state will be represented by a list of charid:state records
1520 (reg_trie_trans_le) the first such element holds the CUR and LEN
1521 points of the allocated array. (See defines above).
1523 We build the initial structure using the lists, and then convert
1524 it into the compressed table form which allows faster lookups
1525 (but cant be modified once converted).
1528 STRLEN transcount = 1;
1530 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1531 "%*sCompiling trie using list compiler\n",
1532 (int)depth * 2 + 2, ""));
1534 trie->states = (reg_trie_state *)
1535 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1536 sizeof(reg_trie_state) );
1540 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1542 regnode * const noper = NEXTOPER( cur );
1543 U8 *uc = (U8*)STRING( noper );
1544 const U8 * const e = uc + STR_LEN( noper );
1545 U32 state = 1; /* required init */
1546 U16 charid = 0; /* sanity init */
1547 U8 *scan = (U8*)NULL; /* sanity init */
1548 STRLEN foldlen = 0; /* required init */
1549 U32 wordlen = 0; /* required init */
1550 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1552 if (OP(noper) != NOTHING) {
1553 for ( ; uc < e ; uc += len ) {
1558 charid = trie->charmap[ uvc ];
1560 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1564 charid=(U16)SvIV( *svpp );
1567 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1574 if ( !trie->states[ state ].trans.list ) {
1575 TRIE_LIST_NEW( state );
1577 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1578 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1579 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1584 newstate = next_alloc++;
1585 TRIE_LIST_PUSH( state, charid, newstate );
1590 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1594 TRIE_HANDLE_WORD(state);
1596 } /* end second pass */
1598 /* next alloc is the NEXT state to be allocated */
1599 trie->statecount = next_alloc;
1600 trie->states = (reg_trie_state *)
1601 PerlMemShared_realloc( trie->states,
1603 * sizeof(reg_trie_state) );
1605 /* and now dump it out before we compress it */
1606 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1607 revcharmap, next_alloc,
1611 trie->trans = (reg_trie_trans *)
1612 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1619 for( state=1 ; state < next_alloc ; state ++ ) {
1623 DEBUG_TRIE_COMPILE_MORE_r(
1624 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1628 if (trie->states[state].trans.list) {
1629 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1633 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1634 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1635 if ( forid < minid ) {
1637 } else if ( forid > maxid ) {
1641 if ( transcount < tp + maxid - minid + 1) {
1643 trie->trans = (reg_trie_trans *)
1644 PerlMemShared_realloc( trie->trans,
1646 * sizeof(reg_trie_trans) );
1647 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1649 base = trie->uniquecharcount + tp - minid;
1650 if ( maxid == minid ) {
1652 for ( ; zp < tp ; zp++ ) {
1653 if ( ! trie->trans[ zp ].next ) {
1654 base = trie->uniquecharcount + zp - minid;
1655 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1656 trie->trans[ zp ].check = state;
1662 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1663 trie->trans[ tp ].check = state;
1668 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1669 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1670 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1671 trie->trans[ tid ].check = state;
1673 tp += ( maxid - minid + 1 );
1675 Safefree(trie->states[ state ].trans.list);
1678 DEBUG_TRIE_COMPILE_MORE_r(
1679 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1682 trie->states[ state ].trans.base=base;
1684 trie->lasttrans = tp + 1;
1688 Second Pass -- Flat Table Representation.
1690 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1691 We know that we will need Charcount+1 trans at most to store the data
1692 (one row per char at worst case) So we preallocate both structures
1693 assuming worst case.
1695 We then construct the trie using only the .next slots of the entry
1698 We use the .check field of the first entry of the node temporarily to
1699 make compression both faster and easier by keeping track of how many non
1700 zero fields are in the node.
1702 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1705 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1706 number representing the first entry of the node, and state as a
1707 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1708 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1709 are 2 entrys per node. eg:
1717 The table is internally in the right hand, idx form. However as we also
1718 have to deal with the states array which is indexed by nodenum we have to
1719 use TRIE_NODENUM() to convert.
1722 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1723 "%*sCompiling trie using table compiler\n",
1724 (int)depth * 2 + 2, ""));
1726 trie->trans = (reg_trie_trans *)
1727 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1728 * trie->uniquecharcount + 1,
1729 sizeof(reg_trie_trans) );
1730 trie->states = (reg_trie_state *)
1731 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1732 sizeof(reg_trie_state) );
1733 next_alloc = trie->uniquecharcount + 1;
1736 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1738 regnode * const noper = NEXTOPER( cur );
1739 const U8 *uc = (U8*)STRING( noper );
1740 const U8 * const e = uc + STR_LEN( noper );
1742 U32 state = 1; /* required init */
1744 U16 charid = 0; /* sanity init */
1745 U32 accept_state = 0; /* sanity init */
1746 U8 *scan = (U8*)NULL; /* sanity init */
1748 STRLEN foldlen = 0; /* required init */
1749 U32 wordlen = 0; /* required init */
1750 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1752 if ( OP(noper) != NOTHING ) {
1753 for ( ; uc < e ; uc += len ) {
1758 charid = trie->charmap[ uvc ];
1760 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1761 charid = svpp ? (U16)SvIV(*svpp) : 0;
1765 if ( !trie->trans[ state + charid ].next ) {
1766 trie->trans[ state + charid ].next = next_alloc;
1767 trie->trans[ state ].check++;
1768 next_alloc += trie->uniquecharcount;
1770 state = trie->trans[ state + charid ].next;
1772 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1774 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1777 accept_state = TRIE_NODENUM( state );
1778 TRIE_HANDLE_WORD(accept_state);
1780 } /* end second pass */
1782 /* and now dump it out before we compress it */
1783 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1785 next_alloc, depth+1));
1789 * Inplace compress the table.*
1791 For sparse data sets the table constructed by the trie algorithm will
1792 be mostly 0/FAIL transitions or to put it another way mostly empty.
1793 (Note that leaf nodes will not contain any transitions.)
1795 This algorithm compresses the tables by eliminating most such
1796 transitions, at the cost of a modest bit of extra work during lookup:
1798 - Each states[] entry contains a .base field which indicates the
1799 index in the state[] array wheres its transition data is stored.
1801 - If .base is 0 there are no valid transitions from that node.
1803 - If .base is nonzero then charid is added to it to find an entry in
1806 -If trans[states[state].base+charid].check!=state then the
1807 transition is taken to be a 0/Fail transition. Thus if there are fail
1808 transitions at the front of the node then the .base offset will point
1809 somewhere inside the previous nodes data (or maybe even into a node
1810 even earlier), but the .check field determines if the transition is
1814 The following process inplace converts the table to the compressed
1815 table: We first do not compress the root node 1,and mark its all its
1816 .check pointers as 1 and set its .base pointer as 1 as well. This
1817 allows to do a DFA construction from the compressed table later, and
1818 ensures that any .base pointers we calculate later are greater than
1821 - We set 'pos' to indicate the first entry of the second node.
1823 - We then iterate over the columns of the node, finding the first and
1824 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1825 and set the .check pointers accordingly, and advance pos
1826 appropriately and repreat for the next node. Note that when we copy
1827 the next pointers we have to convert them from the original
1828 NODEIDX form to NODENUM form as the former is not valid post
1831 - If a node has no transitions used we mark its base as 0 and do not
1832 advance the pos pointer.
1834 - If a node only has one transition we use a second pointer into the
1835 structure to fill in allocated fail transitions from other states.
1836 This pointer is independent of the main pointer and scans forward
1837 looking for null transitions that are allocated to a state. When it
1838 finds one it writes the single transition into the "hole". If the
1839 pointer doesnt find one the single transition is appended as normal.
1841 - Once compressed we can Renew/realloc the structures to release the
1844 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1845 specifically Fig 3.47 and the associated pseudocode.
1849 const U32 laststate = TRIE_NODENUM( next_alloc );
1852 trie->statecount = laststate;
1854 for ( state = 1 ; state < laststate ; state++ ) {
1856 const U32 stateidx = TRIE_NODEIDX( state );
1857 const U32 o_used = trie->trans[ stateidx ].check;
1858 U32 used = trie->trans[ stateidx ].check;
1859 trie->trans[ stateidx ].check = 0;
1861 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1862 if ( flag || trie->trans[ stateidx + charid ].next ) {
1863 if ( trie->trans[ stateidx + charid ].next ) {
1865 for ( ; zp < pos ; zp++ ) {
1866 if ( ! trie->trans[ zp ].next ) {
1870 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1871 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1872 trie->trans[ zp ].check = state;
1873 if ( ++zp > pos ) pos = zp;
1880 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1882 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1883 trie->trans[ pos ].check = state;
1888 trie->lasttrans = pos + 1;
1889 trie->states = (reg_trie_state *)
1890 PerlMemShared_realloc( trie->states, laststate
1891 * sizeof(reg_trie_state) );
1892 DEBUG_TRIE_COMPILE_MORE_r(
1893 PerlIO_printf( Perl_debug_log,
1894 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1895 (int)depth * 2 + 2,"",
1896 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1899 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1902 } /* end table compress */
1904 DEBUG_TRIE_COMPILE_MORE_r(
1905 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1906 (int)depth * 2 + 2, "",
1907 (UV)trie->statecount,
1908 (UV)trie->lasttrans)
1910 /* resize the trans array to remove unused space */
1911 trie->trans = (reg_trie_trans *)
1912 PerlMemShared_realloc( trie->trans, trie->lasttrans
1913 * sizeof(reg_trie_trans) );
1915 /* and now dump out the compressed format */
1916 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1918 { /* Modify the program and insert the new TRIE node*/
1919 U8 nodetype =(U8)(flags & 0xFF);
1923 regnode *optimize = NULL;
1924 #ifdef RE_TRACK_PATTERN_OFFSETS
1927 U32 mjd_nodelen = 0;
1928 #endif /* RE_TRACK_PATTERN_OFFSETS */
1929 #endif /* DEBUGGING */
1931 This means we convert either the first branch or the first Exact,
1932 depending on whether the thing following (in 'last') is a branch
1933 or not and whther first is the startbranch (ie is it a sub part of
1934 the alternation or is it the whole thing.)
1935 Assuming its a sub part we conver the EXACT otherwise we convert
1936 the whole branch sequence, including the first.
1938 /* Find the node we are going to overwrite */
1939 if ( first != startbranch || OP( last ) == BRANCH ) {
1940 /* branch sub-chain */
1941 NEXT_OFF( first ) = (U16)(last - first);
1942 #ifdef RE_TRACK_PATTERN_OFFSETS
1944 mjd_offset= Node_Offset((convert));
1945 mjd_nodelen= Node_Length((convert));
1948 /* whole branch chain */
1950 #ifdef RE_TRACK_PATTERN_OFFSETS
1953 const regnode *nop = NEXTOPER( convert );
1954 mjd_offset= Node_Offset((nop));
1955 mjd_nodelen= Node_Length((nop));
1959 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1960 (int)depth * 2 + 2, "",
1961 (UV)mjd_offset, (UV)mjd_nodelen)
1964 /* But first we check to see if there is a common prefix we can
1965 split out as an EXACT and put in front of the TRIE node. */
1966 trie->startstate= 1;
1967 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1969 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1973 const U32 base = trie->states[ state ].trans.base;
1975 if ( trie->states[state].wordnum )
1978 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1979 if ( ( base + ofs >= trie->uniquecharcount ) &&
1980 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1981 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1983 if ( ++count > 1 ) {
1984 SV **tmp = av_fetch( revcharmap, ofs, 0);
1985 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1986 if ( state == 1 ) break;
1988 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1990 PerlIO_printf(Perl_debug_log,
1991 "%*sNew Start State=%"UVuf" Class: [",
1992 (int)depth * 2 + 2, "",
1995 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1996 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1998 TRIE_BITMAP_SET(trie,*ch);
2000 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2002 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2006 TRIE_BITMAP_SET(trie,*ch);
2008 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2009 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2015 SV **tmp = av_fetch( revcharmap, idx, 0);
2017 char *ch = SvPV( *tmp, len );
2019 SV *sv=sv_newmortal();
2020 PerlIO_printf( Perl_debug_log,
2021 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2022 (int)depth * 2 + 2, "",
2024 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2025 PL_colors[0], PL_colors[1],
2026 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2027 PERL_PV_ESCAPE_FIRSTCHAR
2032 OP( convert ) = nodetype;
2033 str=STRING(convert);
2036 STR_LEN(convert) += len;
2042 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2048 regnode *n = convert+NODE_SZ_STR(convert);
2049 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2050 trie->startstate = state;
2051 trie->minlen -= (state - 1);
2052 trie->maxlen -= (state - 1);
2054 /* At least the UNICOS C compiler choked on this
2055 * being argument to DEBUG_r(), so let's just have
2058 #ifdef PERL_EXT_RE_BUILD
2064 regnode *fix = convert;
2065 U32 word = trie->wordcount;
2067 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2068 while( ++fix < n ) {
2069 Set_Node_Offset_Length(fix, 0, 0);
2072 SV ** const tmp = av_fetch( trie_words, word, 0 );
2074 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2075 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2077 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2085 NEXT_OFF(convert) = (U16)(tail - convert);
2086 DEBUG_r(optimize= n);
2092 if ( trie->maxlen ) {
2093 NEXT_OFF( convert ) = (U16)(tail - convert);
2094 ARG_SET( convert, data_slot );
2095 /* Store the offset to the first unabsorbed branch in
2096 jump[0], which is otherwise unused by the jump logic.
2097 We use this when dumping a trie and during optimisation. */
2099 trie->jump[0] = (U16)(nextbranch - convert);
2102 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2103 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2105 OP( convert ) = TRIEC;
2106 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2107 PerlMemShared_free(trie->bitmap);
2110 OP( convert ) = TRIE;
2112 /* store the type in the flags */
2113 convert->flags = nodetype;
2117 + regarglen[ OP( convert ) ];
2119 /* XXX We really should free up the resource in trie now,
2120 as we won't use them - (which resources?) dmq */
2122 /* needed for dumping*/
2123 DEBUG_r(if (optimize) {
2124 regnode *opt = convert;
2126 while ( ++opt < optimize) {
2127 Set_Node_Offset_Length(opt,0,0);
2130 Try to clean up some of the debris left after the
2133 while( optimize < jumper ) {
2134 mjd_nodelen += Node_Length((optimize));
2135 OP( optimize ) = OPTIMIZED;
2136 Set_Node_Offset_Length(optimize,0,0);
2139 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2141 } /* end node insert */
2142 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2143 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2145 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2146 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2148 SvREFCNT_dec(revcharmap);
2152 : trie->startstate>1
2158 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2160 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2162 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2163 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2166 We find the fail state for each state in the trie, this state is the longest proper
2167 suffix of the current states 'word' that is also a proper prefix of another word in our
2168 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2169 the DFA not to have to restart after its tried and failed a word at a given point, it
2170 simply continues as though it had been matching the other word in the first place.
2172 'abcdgu'=~/abcdefg|cdgu/
2173 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2174 fail, which would bring use to the state representing 'd' in the second word where we would
2175 try 'g' and succeed, prodceding to match 'cdgu'.
2177 /* add a fail transition */
2178 const U32 trie_offset = ARG(source);
2179 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2181 const U32 ucharcount = trie->uniquecharcount;
2182 const U32 numstates = trie->statecount;
2183 const U32 ubound = trie->lasttrans + ucharcount;
2187 U32 base = trie->states[ 1 ].trans.base;
2190 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2191 GET_RE_DEBUG_FLAGS_DECL;
2193 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2195 PERL_UNUSED_ARG(depth);
2199 ARG_SET( stclass, data_slot );
2200 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2201 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2202 aho->trie=trie_offset;
2203 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2204 Copy( trie->states, aho->states, numstates, reg_trie_state );
2205 Newxz( q, numstates, U32);
2206 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2209 /* initialize fail[0..1] to be 1 so that we always have
2210 a valid final fail state */
2211 fail[ 0 ] = fail[ 1 ] = 1;
2213 for ( charid = 0; charid < ucharcount ; charid++ ) {
2214 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2216 q[ q_write ] = newstate;
2217 /* set to point at the root */
2218 fail[ q[ q_write++ ] ]=1;
2221 while ( q_read < q_write) {
2222 const U32 cur = q[ q_read++ % numstates ];
2223 base = trie->states[ cur ].trans.base;
2225 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2226 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2228 U32 fail_state = cur;
2231 fail_state = fail[ fail_state ];
2232 fail_base = aho->states[ fail_state ].trans.base;
2233 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2235 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2236 fail[ ch_state ] = fail_state;
2237 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2239 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2241 q[ q_write++ % numstates] = ch_state;
2245 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2246 when we fail in state 1, this allows us to use the
2247 charclass scan to find a valid start char. This is based on the principle
2248 that theres a good chance the string being searched contains lots of stuff
2249 that cant be a start char.
2251 fail[ 0 ] = fail[ 1 ] = 0;
2252 DEBUG_TRIE_COMPILE_r({
2253 PerlIO_printf(Perl_debug_log,
2254 "%*sStclass Failtable (%"UVuf" states): 0",
2255 (int)(depth * 2), "", (UV)numstates
2257 for( q_read=1; q_read<numstates; q_read++ ) {
2258 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2260 PerlIO_printf(Perl_debug_log, "\n");
2263 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2268 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2269 * These need to be revisited when a newer toolchain becomes available.
2271 #if defined(__sparc64__) && defined(__GNUC__)
2272 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2273 # undef SPARC64_GCC_WORKAROUND
2274 # define SPARC64_GCC_WORKAROUND 1
2278 #define DEBUG_PEEP(str,scan,depth) \
2279 DEBUG_OPTIMISE_r({if (scan){ \
2280 SV * const mysv=sv_newmortal(); \
2281 regnode *Next = regnext(scan); \
2282 regprop(RExC_rx, mysv, scan); \
2283 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2284 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2285 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2292 #define JOIN_EXACT(scan,min,flags) \
2293 if (PL_regkind[OP(scan)] == EXACT) \
2294 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2297 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2298 /* Merge several consecutive EXACTish nodes into one. */
2299 regnode *n = regnext(scan);
2301 regnode *next = scan + NODE_SZ_STR(scan);
2305 regnode *stop = scan;
2306 GET_RE_DEBUG_FLAGS_DECL;
2308 PERL_UNUSED_ARG(depth);
2311 PERL_ARGS_ASSERT_JOIN_EXACT;
2312 #ifndef EXPERIMENTAL_INPLACESCAN
2313 PERL_UNUSED_ARG(flags);
2314 PERL_UNUSED_ARG(val);
2316 DEBUG_PEEP("join",scan,depth);
2318 /* Skip NOTHING, merge EXACT*. */
2320 ( PL_regkind[OP(n)] == NOTHING ||
2321 (stringok && (OP(n) == OP(scan))))
2323 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2325 if (OP(n) == TAIL || n > next)
2327 if (PL_regkind[OP(n)] == NOTHING) {
2328 DEBUG_PEEP("skip:",n,depth);
2329 NEXT_OFF(scan) += NEXT_OFF(n);
2330 next = n + NODE_STEP_REGNODE;
2337 else if (stringok) {
2338 const unsigned int oldl = STR_LEN(scan);
2339 regnode * const nnext = regnext(n);
2341 DEBUG_PEEP("merg",n,depth);
2344 if (oldl + STR_LEN(n) > U8_MAX)
2346 NEXT_OFF(scan) += NEXT_OFF(n);
2347 STR_LEN(scan) += STR_LEN(n);
2348 next = n + NODE_SZ_STR(n);
2349 /* Now we can overwrite *n : */
2350 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2358 #ifdef EXPERIMENTAL_INPLACESCAN
2359 if (flags && !NEXT_OFF(n)) {
2360 DEBUG_PEEP("atch", val, depth);
2361 if (reg_off_by_arg[OP(n)]) {
2362 ARG_SET(n, val - n);
2365 NEXT_OFF(n) = val - n;
2372 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2374 Two problematic code points in Unicode casefolding of EXACT nodes:
2376 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2377 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2383 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2384 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2386 This means that in case-insensitive matching (or "loose matching",
2387 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2388 length of the above casefolded versions) can match a target string
2389 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2390 This would rather mess up the minimum length computation.
2392 What we'll do is to look for the tail four bytes, and then peek
2393 at the preceding two bytes to see whether we need to decrease
2394 the minimum length by four (six minus two).
2396 Thanks to the design of UTF-8, there cannot be false matches:
2397 A sequence of valid UTF-8 bytes cannot be a subsequence of
2398 another valid sequence of UTF-8 bytes.
2401 char * const s0 = STRING(scan), *s, *t;
2402 char * const s1 = s0 + STR_LEN(scan) - 1;
2403 char * const s2 = s1 - 4;
2404 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2405 const char t0[] = "\xaf\x49\xaf\x42";
2407 const char t0[] = "\xcc\x88\xcc\x81";
2409 const char * const t1 = t0 + 3;
2412 s < s2 && (t = ninstr(s, s1, t0, t1));
2415 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2416 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2418 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2419 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2427 n = scan + NODE_SZ_STR(scan);
2429 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2436 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2440 /* REx optimizer. Converts nodes into quickier variants "in place".
2441 Finds fixed substrings. */
2443 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2444 to the position after last scanned or to NULL. */
2446 #define INIT_AND_WITHP \
2447 assert(!and_withp); \
2448 Newx(and_withp,1,struct regnode_charclass_class); \
2449 SAVEFREEPV(and_withp)
2451 /* this is a chain of data about sub patterns we are processing that
2452 need to be handled seperately/specially in study_chunk. Its so
2453 we can simulate recursion without losing state. */
2455 typedef struct scan_frame {
2456 regnode *last; /* last node to process in this frame */
2457 regnode *next; /* next node to process when last is reached */
2458 struct scan_frame *prev; /*previous frame*/
2459 I32 stop; /* what stopparen do we use */
2463 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2465 #define CASE_SYNST_FNC(nAmE) \
2467 if (flags & SCF_DO_STCLASS_AND) { \
2468 for (value = 0; value < 256; value++) \
2469 if (!is_ ## nAmE ## _cp(value)) \
2470 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2473 for (value = 0; value < 256; value++) \
2474 if (is_ ## nAmE ## _cp(value)) \
2475 ANYOF_BITMAP_SET(data->start_class, value); \
2479 if (flags & SCF_DO_STCLASS_AND) { \
2480 for (value = 0; value < 256; value++) \
2481 if (is_ ## nAmE ## _cp(value)) \
2482 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2485 for (value = 0; value < 256; value++) \
2486 if (!is_ ## nAmE ## _cp(value)) \
2487 ANYOF_BITMAP_SET(data->start_class, value); \
2494 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2495 I32 *minlenp, I32 *deltap,
2500 struct regnode_charclass_class *and_withp,
2501 U32 flags, U32 depth)
2502 /* scanp: Start here (read-write). */
2503 /* deltap: Write maxlen-minlen here. */
2504 /* last: Stop before this one. */
2505 /* data: string data about the pattern */
2506 /* stopparen: treat close N as END */
2507 /* recursed: which subroutines have we recursed into */
2508 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2511 I32 min = 0, pars = 0, code;
2512 regnode *scan = *scanp, *next;
2514 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2515 int is_inf_internal = 0; /* The studied chunk is infinite */
2516 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2517 scan_data_t data_fake;
2518 SV *re_trie_maxbuff = NULL;
2519 regnode *first_non_open = scan;
2520 I32 stopmin = I32_MAX;
2521 scan_frame *frame = NULL;
2522 GET_RE_DEBUG_FLAGS_DECL;
2524 PERL_ARGS_ASSERT_STUDY_CHUNK;
2527 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2531 while (first_non_open && OP(first_non_open) == OPEN)
2532 first_non_open=regnext(first_non_open);
2537 while ( scan && OP(scan) != END && scan < last ){
2538 /* Peephole optimizer: */
2539 DEBUG_STUDYDATA("Peep:", data,depth);
2540 DEBUG_PEEP("Peep",scan,depth);
2541 JOIN_EXACT(scan,&min,0);
2543 /* Follow the next-chain of the current node and optimize
2544 away all the NOTHINGs from it. */
2545 if (OP(scan) != CURLYX) {
2546 const int max = (reg_off_by_arg[OP(scan)]
2548 /* I32 may be smaller than U16 on CRAYs! */
2549 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2550 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2554 /* Skip NOTHING and LONGJMP. */
2555 while ((n = regnext(n))
2556 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2557 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2558 && off + noff < max)
2560 if (reg_off_by_arg[OP(scan)])
2563 NEXT_OFF(scan) = off;
2568 /* The principal pseudo-switch. Cannot be a switch, since we
2569 look into several different things. */
2570 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2571 || OP(scan) == IFTHEN) {
2572 next = regnext(scan);
2574 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2576 if (OP(next) == code || code == IFTHEN) {
2577 /* NOTE - There is similar code to this block below for handling
2578 TRIE nodes on a re-study. If you change stuff here check there
2580 I32 max1 = 0, min1 = I32_MAX, num = 0;
2581 struct regnode_charclass_class accum;
2582 regnode * const startbranch=scan;
2584 if (flags & SCF_DO_SUBSTR)
2585 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2586 if (flags & SCF_DO_STCLASS)
2587 cl_init_zero(pRExC_state, &accum);
2589 while (OP(scan) == code) {
2590 I32 deltanext, minnext, f = 0, fake;
2591 struct regnode_charclass_class this_class;
2594 data_fake.flags = 0;
2596 data_fake.whilem_c = data->whilem_c;
2597 data_fake.last_closep = data->last_closep;
2600 data_fake.last_closep = &fake;
2602 data_fake.pos_delta = delta;
2603 next = regnext(scan);
2604 scan = NEXTOPER(scan);
2606 scan = NEXTOPER(scan);
2607 if (flags & SCF_DO_STCLASS) {
2608 cl_init(pRExC_state, &this_class);
2609 data_fake.start_class = &this_class;
2610 f = SCF_DO_STCLASS_AND;
2612 if (flags & SCF_WHILEM_VISITED_POS)
2613 f |= SCF_WHILEM_VISITED_POS;
2615 /* we suppose the run is continuous, last=next...*/
2616 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2618 stopparen, recursed, NULL, f,depth+1);
2621 if (max1 < minnext + deltanext)
2622 max1 = minnext + deltanext;
2623 if (deltanext == I32_MAX)
2624 is_inf = is_inf_internal = 1;
2626 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2628 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2629 if ( stopmin > minnext)
2630 stopmin = min + min1;
2631 flags &= ~SCF_DO_SUBSTR;
2633 data->flags |= SCF_SEEN_ACCEPT;
2636 if (data_fake.flags & SF_HAS_EVAL)
2637 data->flags |= SF_HAS_EVAL;
2638 data->whilem_c = data_fake.whilem_c;
2640 if (flags & SCF_DO_STCLASS)
2641 cl_or(pRExC_state, &accum, &this_class);
2643 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2645 if (flags & SCF_DO_SUBSTR) {
2646 data->pos_min += min1;
2647 data->pos_delta += max1 - min1;
2648 if (max1 != min1 || is_inf)
2649 data->longest = &(data->longest_float);
2652 delta += max1 - min1;
2653 if (flags & SCF_DO_STCLASS_OR) {
2654 cl_or(pRExC_state, data->start_class, &accum);
2656 cl_and(data->start_class, and_withp);
2657 flags &= ~SCF_DO_STCLASS;
2660 else if (flags & SCF_DO_STCLASS_AND) {
2662 cl_and(data->start_class, &accum);
2663 flags &= ~SCF_DO_STCLASS;
2666 /* Switch to OR mode: cache the old value of
2667 * data->start_class */
2669 StructCopy(data->start_class, and_withp,
2670 struct regnode_charclass_class);
2671 flags &= ~SCF_DO_STCLASS_AND;
2672 StructCopy(&accum, data->start_class,
2673 struct regnode_charclass_class);
2674 flags |= SCF_DO_STCLASS_OR;
2675 data->start_class->flags |= ANYOF_EOS;
2679 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2682 Assuming this was/is a branch we are dealing with: 'scan' now
2683 points at the item that follows the branch sequence, whatever
2684 it is. We now start at the beginning of the sequence and look
2691 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2693 If we can find such a subseqence we need to turn the first
2694 element into a trie and then add the subsequent branch exact
2695 strings to the trie.
2699 1. patterns where the whole set of branch can be converted.
2701 2. patterns where only a subset can be converted.
2703 In case 1 we can replace the whole set with a single regop
2704 for the trie. In case 2 we need to keep the start and end
2707 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2708 becomes BRANCH TRIE; BRANCH X;
2710 There is an additional case, that being where there is a
2711 common prefix, which gets split out into an EXACT like node
2712 preceding the TRIE node.
2714 If x(1..n)==tail then we can do a simple trie, if not we make
2715 a "jump" trie, such that when we match the appropriate word
2716 we "jump" to the appopriate tail node. Essentailly we turn
2717 a nested if into a case structure of sorts.
2722 if (!re_trie_maxbuff) {
2723 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2724 if (!SvIOK(re_trie_maxbuff))
2725 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2727 if ( SvIV(re_trie_maxbuff)>=0 ) {
2729 regnode *first = (regnode *)NULL;
2730 regnode *last = (regnode *)NULL;
2731 regnode *tail = scan;
2736 SV * const mysv = sv_newmortal(); /* for dumping */
2738 /* var tail is used because there may be a TAIL
2739 regop in the way. Ie, the exacts will point to the
2740 thing following the TAIL, but the last branch will
2741 point at the TAIL. So we advance tail. If we
2742 have nested (?:) we may have to move through several
2746 while ( OP( tail ) == TAIL ) {
2747 /* this is the TAIL generated by (?:) */
2748 tail = regnext( tail );
2753 regprop(RExC_rx, mysv, tail );
2754 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2755 (int)depth * 2 + 2, "",
2756 "Looking for TRIE'able sequences. Tail node is: ",
2757 SvPV_nolen_const( mysv )
2763 step through the branches, cur represents each
2764 branch, noper is the first thing to be matched
2765 as part of that branch and noper_next is the
2766 regnext() of that node. if noper is an EXACT
2767 and noper_next is the same as scan (our current
2768 position in the regex) then the EXACT branch is
2769 a possible optimization target. Once we have
2770 two or more consequetive such branches we can
2771 create a trie of the EXACT's contents and stich
2772 it in place. If the sequence represents all of
2773 the branches we eliminate the whole thing and
2774 replace it with a single TRIE. If it is a
2775 subsequence then we need to stitch it in. This
2776 means the first branch has to remain, and needs
2777 to be repointed at the item on the branch chain
2778 following the last branch optimized. This could
2779 be either a BRANCH, in which case the
2780 subsequence is internal, or it could be the
2781 item following the branch sequence in which
2782 case the subsequence is at the end.
2786 /* dont use tail as the end marker for this traverse */
2787 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2788 regnode * const noper = NEXTOPER( cur );
2789 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2790 regnode * const noper_next = regnext( noper );
2794 regprop(RExC_rx, mysv, cur);
2795 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2796 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2798 regprop(RExC_rx, mysv, noper);
2799 PerlIO_printf( Perl_debug_log, " -> %s",
2800 SvPV_nolen_const(mysv));
2803 regprop(RExC_rx, mysv, noper_next );
2804 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2805 SvPV_nolen_const(mysv));
2807 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2808 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2810 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2811 : PL_regkind[ OP( noper ) ] == EXACT )
2812 || OP(noper) == NOTHING )
2814 && noper_next == tail
2819 if ( !first || optype == NOTHING ) {
2820 if (!first) first = cur;
2821 optype = OP( noper );
2827 Currently we assume that the trie can handle unicode and ascii
2828 matches fold cased matches. If this proves true then the following
2829 define will prevent tries in this situation.
2831 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2833 #define TRIE_TYPE_IS_SAFE 1
2834 if ( last && TRIE_TYPE_IS_SAFE ) {
2835 make_trie( pRExC_state,
2836 startbranch, first, cur, tail, count,
2839 if ( PL_regkind[ OP( noper ) ] == EXACT
2841 && noper_next == tail
2846 optype = OP( noper );
2856 regprop(RExC_rx, mysv, cur);
2857 PerlIO_printf( Perl_debug_log,
2858 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2859 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2863 if ( last && TRIE_TYPE_IS_SAFE ) {
2864 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2865 #ifdef TRIE_STUDY_OPT
2866 if ( ((made == MADE_EXACT_TRIE &&
2867 startbranch == first)
2868 || ( first_non_open == first )) &&
2870 flags |= SCF_TRIE_RESTUDY;
2871 if ( startbranch == first
2874 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2884 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2885 scan = NEXTOPER(NEXTOPER(scan));
2886 } else /* single branch is optimized. */
2887 scan = NEXTOPER(scan);
2889 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2890 scan_frame *newframe = NULL;
2895 if (OP(scan) != SUSPEND) {
2896 /* set the pointer */
2897 if (OP(scan) == GOSUB) {
2899 RExC_recurse[ARG2L(scan)] = scan;
2900 start = RExC_open_parens[paren-1];
2901 end = RExC_close_parens[paren-1];
2904 start = RExC_rxi->program + 1;
2908 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2909 SAVEFREEPV(recursed);
2911 if (!PAREN_TEST(recursed,paren+1)) {
2912 PAREN_SET(recursed,paren+1);
2913 Newx(newframe,1,scan_frame);
2915 if (flags & SCF_DO_SUBSTR) {
2916 SCAN_COMMIT(pRExC_state,data,minlenp);
2917 data->longest = &(data->longest_float);
2919 is_inf = is_inf_internal = 1;
2920 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2921 cl_anything(pRExC_state, data->start_class);
2922 flags &= ~SCF_DO_STCLASS;
2925 Newx(newframe,1,scan_frame);
2928 end = regnext(scan);
2933 SAVEFREEPV(newframe);
2934 newframe->next = regnext(scan);
2935 newframe->last = last;
2936 newframe->stop = stopparen;
2937 newframe->prev = frame;
2947 else if (OP(scan) == EXACT) {
2948 I32 l = STR_LEN(scan);
2951 const U8 * const s = (U8*)STRING(scan);
2952 l = utf8_length(s, s + l);
2953 uc = utf8_to_uvchr(s, NULL);
2955 uc = *((U8*)STRING(scan));
2958 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2959 /* The code below prefers earlier match for fixed
2960 offset, later match for variable offset. */
2961 if (data->last_end == -1) { /* Update the start info. */
2962 data->last_start_min = data->pos_min;
2963 data->last_start_max = is_inf
2964 ? I32_MAX : data->pos_min + data->pos_delta;
2966 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2968 SvUTF8_on(data->last_found);
2970 SV * const sv = data->last_found;
2971 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2972 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2973 if (mg && mg->mg_len >= 0)
2974 mg->mg_len += utf8_length((U8*)STRING(scan),
2975 (U8*)STRING(scan)+STR_LEN(scan));
2977 data->last_end = data->pos_min + l;
2978 data->pos_min += l; /* As in the first entry. */
2979 data->flags &= ~SF_BEFORE_EOL;
2981 if (flags & SCF_DO_STCLASS_AND) {
2982 /* Check whether it is compatible with what we know already! */
2986 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2987 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2988 && (!(data->start_class->flags & ANYOF_FOLD)
2989 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2992 ANYOF_CLASS_ZERO(data->start_class);
2993 ANYOF_BITMAP_ZERO(data->start_class);
2995 ANYOF_BITMAP_SET(data->start_class, uc);
2996 data->start_class->flags &= ~ANYOF_EOS;
2998 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3000 else if (flags & SCF_DO_STCLASS_OR) {
3001 /* false positive possible if the class is case-folded */
3003 ANYOF_BITMAP_SET(data->start_class, uc);
3005 data->start_class->flags |= ANYOF_UNICODE_ALL;
3006 data->start_class->flags &= ~ANYOF_EOS;
3007 cl_and(data->start_class, and_withp);
3009 flags &= ~SCF_DO_STCLASS;
3011 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3012 I32 l = STR_LEN(scan);
3013 UV uc = *((U8*)STRING(scan));
3015 /* Search for fixed substrings supports EXACT only. */
3016 if (flags & SCF_DO_SUBSTR) {
3018 SCAN_COMMIT(pRExC_state, data, minlenp);
3021 const U8 * const s = (U8 *)STRING(scan);
3022 l = utf8_length(s, s + l);
3023 uc = utf8_to_uvchr(s, NULL);
3026 if (flags & SCF_DO_SUBSTR)
3028 if (flags & SCF_DO_STCLASS_AND) {
3029 /* Check whether it is compatible with what we know already! */
3033 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3034 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3035 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3037 ANYOF_CLASS_ZERO(data->start_class);
3038 ANYOF_BITMAP_ZERO(data->start_class);
3040 ANYOF_BITMAP_SET(data->start_class, uc);
3041 data->start_class->flags &= ~ANYOF_EOS;
3042 data->start_class->flags |= ANYOF_FOLD;
3043 if (OP(scan) == EXACTFL)
3044 data->start_class->flags |= ANYOF_LOCALE;
3047 else if (flags & SCF_DO_STCLASS_OR) {
3048 if (data->start_class->flags & ANYOF_FOLD) {
3049 /* false positive possible if the class is case-folded.
3050 Assume that the locale settings are the same... */
3052 ANYOF_BITMAP_SET(data->start_class, uc);
3053 data->start_class->flags &= ~ANYOF_EOS;
3055 cl_and(data->start_class, and_withp);
3057 flags &= ~SCF_DO_STCLASS;
3059 else if (strchr((const char*)PL_varies,OP(scan))) {
3060 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3061 I32 f = flags, pos_before = 0;
3062 regnode * const oscan = scan;
3063 struct regnode_charclass_class this_class;
3064 struct regnode_charclass_class *oclass = NULL;
3065 I32 next_is_eval = 0;
3067 switch (PL_regkind[OP(scan)]) {
3068 case WHILEM: /* End of (?:...)* . */
3069 scan = NEXTOPER(scan);
3072 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3073 next = NEXTOPER(scan);
3074 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3076 maxcount = REG_INFTY;
3077 next = regnext(scan);
3078 scan = NEXTOPER(scan);
3082 if (flags & SCF_DO_SUBSTR)
3087 if (flags & SCF_DO_STCLASS) {
3089 maxcount = REG_INFTY;
3090 next = regnext(scan);
3091 scan = NEXTOPER(scan);
3094 is_inf = is_inf_internal = 1;
3095 scan = regnext(scan);
3096 if (flags & SCF_DO_SUBSTR) {
3097 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3098 data->longest = &(data->longest_float);
3100 goto optimize_curly_tail;
3102 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3103 && (scan->flags == stopparen))
3108 mincount = ARG1(scan);
3109 maxcount = ARG2(scan);
3111 next = regnext(scan);
3112 if (OP(scan) == CURLYX) {
3113 I32 lp = (data ? *(data->last_closep) : 0);
3114 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3116 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3117 next_is_eval = (OP(scan) == EVAL);
3119 if (flags & SCF_DO_SUBSTR) {
3120 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3121 pos_before = data->pos_min;
3125 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3127 data->flags |= SF_IS_INF;
3129 if (flags & SCF_DO_STCLASS) {
3130 cl_init(pRExC_state, &this_class);
3131 oclass = data->start_class;
3132 data->start_class = &this_class;
3133 f |= SCF_DO_STCLASS_AND;
3134 f &= ~SCF_DO_STCLASS_OR;
3136 /* These are the cases when once a subexpression
3137 fails at a particular position, it cannot succeed
3138 even after backtracking at the enclosing scope.
3140 XXXX what if minimal match and we are at the
3141 initial run of {n,m}? */
3142 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3143 f &= ~SCF_WHILEM_VISITED_POS;
3145 /* This will finish on WHILEM, setting scan, or on NULL: */
3146 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3147 last, data, stopparen, recursed, NULL,
3149 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3151 if (flags & SCF_DO_STCLASS)
3152 data->start_class = oclass;
3153 if (mincount == 0 || minnext == 0) {
3154 if (flags & SCF_DO_STCLASS_OR) {
3155 cl_or(pRExC_state, data->start_class, &this_class);
3157 else if (flags & SCF_DO_STCLASS_AND) {
3158 /* Switch to OR mode: cache the old value of
3159 * data->start_class */
3161 StructCopy(data->start_class, and_withp,
3162 struct regnode_charclass_class);
3163 flags &= ~SCF_DO_STCLASS_AND;
3164 StructCopy(&this_class, data->start_class,
3165 struct regnode_charclass_class);
3166 flags |= SCF_DO_STCLASS_OR;
3167 data->start_class->flags |= ANYOF_EOS;
3169 } else { /* Non-zero len */
3170 if (flags & SCF_DO_STCLASS_OR) {
3171 cl_or(pRExC_state, data->start_class, &this_class);
3172 cl_and(data->start_class, and_withp);
3174 else if (flags & SCF_DO_STCLASS_AND)
3175 cl_and(data->start_class, &this_class);
3176 flags &= ~SCF_DO_STCLASS;
3178 if (!scan) /* It was not CURLYX, but CURLY. */
3180 if ( /* ? quantifier ok, except for (?{ ... }) */
3181 (next_is_eval || !(mincount == 0 && maxcount == 1))
3182 && (minnext == 0) && (deltanext == 0)
3183 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3184 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3185 && ckWARN(WARN_REGEXP))
3188 "Quantifier unexpected on zero-length expression");
3191 min += minnext * mincount;
3192 is_inf_internal |= ((maxcount == REG_INFTY
3193 && (minnext + deltanext) > 0)
3194 || deltanext == I32_MAX);
3195 is_inf |= is_inf_internal;
3196 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3198 /* Try powerful optimization CURLYX => CURLYN. */
3199 if ( OP(oscan) == CURLYX && data
3200 && data->flags & SF_IN_PAR
3201 && !(data->flags & SF_HAS_EVAL)
3202 && !deltanext && minnext == 1 ) {
3203 /* Try to optimize to CURLYN. */
3204 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3205 regnode * const nxt1 = nxt;
3212 if (!strchr((const char*)PL_simple,OP(nxt))
3213 && !(PL_regkind[OP(nxt)] == EXACT
3214 && STR_LEN(nxt) == 1))
3220 if (OP(nxt) != CLOSE)
3222 if (RExC_open_parens) {
3223 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3224 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3226 /* Now we know that nxt2 is the only contents: */
3227 oscan->flags = (U8)ARG(nxt);
3229 OP(nxt1) = NOTHING; /* was OPEN. */
3232 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3233 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3234 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3235 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3236 OP(nxt + 1) = OPTIMIZED; /* was count. */
3237 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3242 /* Try optimization CURLYX => CURLYM. */
3243 if ( OP(oscan) == CURLYX && data
3244 && !(data->flags & SF_HAS_PAR)
3245 && !(data->flags & SF_HAS_EVAL)
3246 && !deltanext /* atom is fixed width */
3247 && minnext != 0 /* CURLYM can't handle zero width */
3249 /* XXXX How to optimize if data == 0? */
3250 /* Optimize to a simpler form. */
3251 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3255 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3256 && (OP(nxt2) != WHILEM))
3258 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3259 /* Need to optimize away parenths. */
3260 if (data->flags & SF_IN_PAR) {
3261 /* Set the parenth number. */
3262 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3264 if (OP(nxt) != CLOSE)
3265 FAIL("Panic opt close");
3266 oscan->flags = (U8)ARG(nxt);
3267 if (RExC_open_parens) {
3268 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3269 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3271 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3272 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3275 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3276 OP(nxt + 1) = OPTIMIZED; /* was count. */
3277 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3278 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3281 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3282 regnode *nnxt = regnext(nxt1);
3285 if (reg_off_by_arg[OP(nxt1)])
3286 ARG_SET(nxt1, nxt2 - nxt1);
3287 else if (nxt2 - nxt1 < U16_MAX)
3288 NEXT_OFF(nxt1) = nxt2 - nxt1;
3290 OP(nxt) = NOTHING; /* Cannot beautify */
3295 /* Optimize again: */
3296 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3297 NULL, stopparen, recursed, NULL, 0,depth+1);
3302 else if ((OP(oscan) == CURLYX)
3303 && (flags & SCF_WHILEM_VISITED_POS)
3304 /* See the comment on a similar expression above.
3305 However, this time it not a subexpression
3306 we care about, but the expression itself. */
3307 && (maxcount == REG_INFTY)
3308 && data && ++data->whilem_c < 16) {
3309 /* This stays as CURLYX, we can put the count/of pair. */
3310 /* Find WHILEM (as in regexec.c) */
3311 regnode *nxt = oscan + NEXT_OFF(oscan);
3313 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3315 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3316 | (RExC_whilem_seen << 4)); /* On WHILEM */
3318 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3320 if (flags & SCF_DO_SUBSTR) {
3321 SV *last_str = NULL;
3322 int counted = mincount != 0;
3324 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3325 #if defined(SPARC64_GCC_WORKAROUND)
3328 const char *s = NULL;
3331 if (pos_before >= data->last_start_min)
3334 b = data->last_start_min;
3337 s = SvPV_const(data->last_found, l);
3338 old = b - data->last_start_min;
3341 I32 b = pos_before >= data->last_start_min
3342 ? pos_before : data->last_start_min;
3344 const char * const s = SvPV_const(data->last_found, l);
3345 I32 old = b - data->last_start_min;
3349 old = utf8_hop((U8*)s, old) - (U8*)s;
3352 /* Get the added string: */
3353 last_str = newSVpvn_utf8(s + old, l, UTF);
3354 if (deltanext == 0 && pos_before == b) {
3355 /* What was added is a constant string */
3357 SvGROW(last_str, (mincount * l) + 1);
3358 repeatcpy(SvPVX(last_str) + l,
3359 SvPVX_const(last_str), l, mincount - 1);
3360 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3361 /* Add additional parts. */
3362 SvCUR_set(data->last_found,
3363 SvCUR(data->last_found) - l);
3364 sv_catsv(data->last_found, last_str);
3366 SV * sv = data->last_found;
3368 SvUTF8(sv) && SvMAGICAL(sv) ?
3369 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3370 if (mg && mg->mg_len >= 0)
3371 mg->mg_len += CHR_SVLEN(last_str) - l;
3373 data->last_end += l * (mincount - 1);
3376 /* start offset must point into the last copy */
3377 data->last_start_min += minnext * (mincount - 1);
3378 data->last_start_max += is_inf ? I32_MAX
3379 : (maxcount - 1) * (minnext + data->pos_delta);
3382 /* It is counted once already... */
3383 data->pos_min += minnext * (mincount - counted);
3384 data->pos_delta += - counted * deltanext +
3385 (minnext + deltanext) * maxcount - minnext * mincount;
3386 if (mincount != maxcount) {
3387 /* Cannot extend fixed substrings found inside
3389 SCAN_COMMIT(pRExC_state,data,minlenp);
3390 if (mincount && last_str) {
3391 SV * const sv = data->last_found;
3392 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3393 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3397 sv_setsv(sv, last_str);
3398 data->last_end = data->pos_min;
3399 data->last_start_min =
3400 data->pos_min - CHR_SVLEN(last_str);
3401 data->last_start_max = is_inf
3403 : data->pos_min + data->pos_delta
3404 - CHR_SVLEN(last_str);
3406 data->longest = &(data->longest_float);
3408 SvREFCNT_dec(last_str);
3410 if (data && (fl & SF_HAS_EVAL))
3411 data->flags |= SF_HAS_EVAL;
3412 optimize_curly_tail:
3413 if (OP(oscan) != CURLYX) {
3414 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3416 NEXT_OFF(oscan) += NEXT_OFF(next);
3419 default: /* REF and CLUMP only? */
3420 if (flags & SCF_DO_SUBSTR) {
3421 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3422 data->longest = &(data->longest_float);
3424 is_inf = is_inf_internal = 1;
3425 if (flags & SCF_DO_STCLASS_OR)
3426 cl_anything(pRExC_state, data->start_class);
3427 flags &= ~SCF_DO_STCLASS;
3431 else if (OP(scan) == LNBREAK) {
3432 if (flags & SCF_DO_STCLASS) {
3434 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3435 if (flags & SCF_DO_STCLASS_AND) {
3436 for (value = 0; value < 256; value++)
3437 if (!is_VERTWS_cp(value))
3438 ANYOF_BITMAP_CLEAR(data->start_class, value);
3441 for (value = 0; value < 256; value++)
3442 if (is_VERTWS_cp(value))
3443 ANYOF_BITMAP_SET(data->start_class, value);
3445 if (flags & SCF_DO_STCLASS_OR)
3446 cl_and(data->start_class, and_withp);
3447 flags &= ~SCF_DO_STCLASS;
3451 if (flags & SCF_DO_SUBSTR) {
3452 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3454 data->pos_delta += 1;
3455 data->longest = &(data->longest_float);
3459 else if (OP(scan) == FOLDCHAR) {
3460 int d = ARG(scan)==0xDF ? 1 : 2;
3461 flags &= ~SCF_DO_STCLASS;
3464 if (flags & SCF_DO_SUBSTR) {
3465 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3467 data->pos_delta += d;
3468 data->longest = &(data->longest_float);
3471 else if (strchr((const char*)PL_simple,OP(scan))) {
3474 if (flags & SCF_DO_SUBSTR) {
3475 SCAN_COMMIT(pRExC_state,data,minlenp);
3479 if (flags & SCF_DO_STCLASS) {
3480 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3482 /* Some of the logic below assumes that switching
3483 locale on will only add false positives. */
3484 switch (PL_regkind[OP(scan)]) {
3488 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3489 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3490 cl_anything(pRExC_state, data->start_class);
3493 if (OP(scan) == SANY)
3495 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3496 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3497 || (data->start_class->flags & ANYOF_CLASS));
3498 cl_anything(pRExC_state, data->start_class);
3500 if (flags & SCF_DO_STCLASS_AND || !value)
3501 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3504 if (flags & SCF_DO_STCLASS_AND)
3505 cl_and(data->start_class,
3506 (struct regnode_charclass_class*)scan);
3508 cl_or(pRExC_state, data->start_class,
3509 (struct regnode_charclass_class*)scan);
3512 if (flags & SCF_DO_STCLASS_AND) {
3513 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3515 for (value = 0; value < 256; value++)
3516 if (!isALNUM(value))
3517 ANYOF_BITMAP_CLEAR(data->start_class, value);
3521 if (data->start_class->flags & ANYOF_LOCALE)
3522 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3524 for (value = 0; value < 256; value++)
3526 ANYOF_BITMAP_SET(data->start_class, value);
3531 if (flags & SCF_DO_STCLASS_AND) {
3532 if (data->start_class->flags & ANYOF_LOCALE)
3533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3536 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3537 data->start_class->flags |= ANYOF_LOCALE;
3541 if (flags & SCF_DO_STCLASS_AND) {
3542 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3543 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3544 for (value = 0; value < 256; value++)
3546 ANYOF_BITMAP_CLEAR(data->start_class, value);
3550 if (data->start_class->flags & ANYOF_LOCALE)
3551 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3553 for (value = 0; value < 256; value++)
3554 if (!isALNUM(value))
3555 ANYOF_BITMAP_SET(data->start_class, value);
3560 if (flags & SCF_DO_STCLASS_AND) {
3561 if (data->start_class->flags & ANYOF_LOCALE)
3562 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3565 data->start_class->flags |= ANYOF_LOCALE;
3566 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3570 if (flags & SCF_DO_STCLASS_AND) {
3571 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3572 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3573 for (value = 0; value < 256; value++)
3574 if (!isSPACE(value))
3575 ANYOF_BITMAP_CLEAR(data->start_class, value);
3579 if (data->start_class->flags & ANYOF_LOCALE)
3580 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3582 for (value = 0; value < 256; value++)
3584 ANYOF_BITMAP_SET(data->start_class, value);
3589 if (flags & SCF_DO_STCLASS_AND) {
3590 if (data->start_class->flags & ANYOF_LOCALE)
3591 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3594 data->start_class->flags |= ANYOF_LOCALE;
3595 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3599 if (flags & SCF_DO_STCLASS_AND) {
3600 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3601 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3602 for (value = 0; value < 256; value++)
3604 ANYOF_BITMAP_CLEAR(data->start_class, value);
3608 if (data->start_class->flags & ANYOF_LOCALE)
3609 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3611 for (value = 0; value < 256; value++)
3612 if (!isSPACE(value))
3613 ANYOF_BITMAP_SET(data->start_class, value);
3618 if (flags & SCF_DO_STCLASS_AND) {
3619 if (data->start_class->flags & ANYOF_LOCALE) {
3620 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3621 for (value = 0; value < 256; value++)
3622 if (!isSPACE(value))
3623 ANYOF_BITMAP_CLEAR(data->start_class, value);
3627 data->start_class->flags |= ANYOF_LOCALE;
3628 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3632 if (flags & SCF_DO_STCLASS_AND) {
3633 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3634 for (value = 0; value < 256; value++)
3635 if (!isDIGIT(value))
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3639 if (data->start_class->flags & ANYOF_LOCALE)
3640 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3642 for (value = 0; value < 256; value++)
3644 ANYOF_BITMAP_SET(data->start_class, value);
3649 if (flags & SCF_DO_STCLASS_AND) {
3650 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3651 for (value = 0; value < 256; value++)
3653 ANYOF_BITMAP_CLEAR(data->start_class, value);
3656 if (data->start_class->flags & ANYOF_LOCALE)
3657 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3659 for (value = 0; value < 256; value++)
3660 if (!isDIGIT(value))
3661 ANYOF_BITMAP_SET(data->start_class, value);
3665 CASE_SYNST_FNC(VERTWS);
3666 CASE_SYNST_FNC(HORIZWS);
3669 if (flags & SCF_DO_STCLASS_OR)
3670 cl_and(data->start_class, and_withp);
3671 flags &= ~SCF_DO_STCLASS;
3674 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3675 data->flags |= (OP(scan) == MEOL
3679 else if ( PL_regkind[OP(scan)] == BRANCHJ
3680 /* Lookbehind, or need to calculate parens/evals/stclass: */
3681 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3682 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3683 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3684 || OP(scan) == UNLESSM )
3686 /* Negative Lookahead/lookbehind
3687 In this case we can't do fixed string optimisation.
3690 I32 deltanext, minnext, fake = 0;
3692 struct regnode_charclass_class intrnl;
3695 data_fake.flags = 0;
3697 data_fake.whilem_c = data->whilem_c;
3698 data_fake.last_closep = data->last_closep;
3701 data_fake.last_closep = &fake;
3702 data_fake.pos_delta = delta;
3703 if ( flags & SCF_DO_STCLASS && !scan->flags
3704 && OP(scan) == IFMATCH ) { /* Lookahead */
3705 cl_init(pRExC_state, &intrnl);
3706 data_fake.start_class = &intrnl;
3707 f |= SCF_DO_STCLASS_AND;
3709 if (flags & SCF_WHILEM_VISITED_POS)
3710 f |= SCF_WHILEM_VISITED_POS;
3711 next = regnext(scan);
3712 nscan = NEXTOPER(NEXTOPER(scan));
3713 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3714 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3717 FAIL("Variable length lookbehind not implemented");
3719 else if (minnext > (I32)U8_MAX) {
3720 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3722 scan->flags = (U8)minnext;
3725 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3727 if (data_fake.flags & SF_HAS_EVAL)
3728 data->flags |= SF_HAS_EVAL;
3729 data->whilem_c = data_fake.whilem_c;
3731 if (f & SCF_DO_STCLASS_AND) {
3732 if (flags & SCF_DO_STCLASS_OR) {
3733 /* OR before, AND after: ideally we would recurse with
3734 * data_fake to get the AND applied by study of the
3735 * remainder of the pattern, and then derecurse;
3736 * *** HACK *** for now just treat as "no information".
3737 * See [perl #56690].
3739 cl_init(pRExC_state, data->start_class);
3741 /* AND before and after: combine and continue */
3742 const int was = (data->start_class->flags & ANYOF_EOS);
3744 cl_and(data->start_class, &intrnl);
3746 data->start_class->flags |= ANYOF_EOS;
3750 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3752 /* Positive Lookahead/lookbehind
3753 In this case we can do fixed string optimisation,
3754 but we must be careful about it. Note in the case of
3755 lookbehind the positions will be offset by the minimum
3756 length of the pattern, something we won't know about
3757 until after the recurse.
3759 I32 deltanext, fake = 0;
3761 struct regnode_charclass_class intrnl;
3763 /* We use SAVEFREEPV so that when the full compile
3764 is finished perl will clean up the allocated
3765 minlens when its all done. This was we don't
3766 have to worry about freeing them when we know
3767 they wont be used, which would be a pain.
3770 Newx( minnextp, 1, I32 );
3771 SAVEFREEPV(minnextp);
3774 StructCopy(data, &data_fake, scan_data_t);
3775 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3778 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3779 data_fake.last_found=newSVsv(data->last_found);
3783 data_fake.last_closep = &fake;
3784 data_fake.flags = 0;
3785 data_fake.pos_delta = delta;
3787 data_fake.flags |= SF_IS_INF;
3788 if ( flags & SCF_DO_STCLASS && !scan->flags
3789 && OP(scan) == IFMATCH ) { /* Lookahead */
3790 cl_init(pRExC_state, &intrnl);
3791 data_fake.start_class = &intrnl;
3792 f |= SCF_DO_STCLASS_AND;
3794 if (flags & SCF_WHILEM_VISITED_POS)
3795 f |= SCF_WHILEM_VISITED_POS;
3796 next = regnext(scan);
3797 nscan = NEXTOPER(NEXTOPER(scan));
3799 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3800 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3803 FAIL("Variable length lookbehind not implemented");
3805 else if (*minnextp > (I32)U8_MAX) {
3806 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3808 scan->flags = (U8)*minnextp;
3813 if (f & SCF_DO_STCLASS_AND) {
3814 const int was = (data->start_class->flags & ANYOF_EOS);
3816 cl_and(data->start_class, &intrnl);
3818 data->start_class->flags |= ANYOF_EOS;
3821 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3823 if (data_fake.flags & SF_HAS_EVAL)
3824 data->flags |= SF_HAS_EVAL;
3825 data->whilem_c = data_fake.whilem_c;
3826 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3827 if (RExC_rx->minlen<*minnextp)
3828 RExC_rx->minlen=*minnextp;
3829 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3830 SvREFCNT_dec(data_fake.last_found);
3832 if ( data_fake.minlen_fixed != minlenp )
3834 data->offset_fixed= data_fake.offset_fixed;
3835 data->minlen_fixed= data_fake.minlen_fixed;
3836 data->lookbehind_fixed+= scan->flags;
3838 if ( data_fake.minlen_float != minlenp )
3840 data->minlen_float= data_fake.minlen_float;
3841 data->offset_float_min=data_fake.offset_float_min;
3842 data->offset_float_max=data_fake.offset_float_max;
3843 data->lookbehind_float+= scan->flags;
3852 else if (OP(scan) == OPEN) {
3853 if (stopparen != (I32)ARG(scan))
3856 else if (OP(scan) == CLOSE) {
3857 if (stopparen == (I32)ARG(scan)) {
3860 if ((I32)ARG(scan) == is_par) {
3861 next = regnext(scan);
3863 if ( next && (OP(next) != WHILEM) && next < last)
3864 is_par = 0; /* Disable optimization */
3867 *(data->last_closep) = ARG(scan);
3869 else if (OP(scan) == EVAL) {
3871 data->flags |= SF_HAS_EVAL;
3873 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3874 if (flags & SCF_DO_SUBSTR) {
3875 SCAN_COMMIT(pRExC_state,data,minlenp);
3876 flags &= ~SCF_DO_SUBSTR;
3878 if (data && OP(scan)==ACCEPT) {
3879 data->flags |= SCF_SEEN_ACCEPT;
3884 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3886 if (flags & SCF_DO_SUBSTR) {
3887 SCAN_COMMIT(pRExC_state,data,minlenp);
3888 data->longest = &(data->longest_float);
3890 is_inf = is_inf_internal = 1;
3891 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3892 cl_anything(pRExC_state, data->start_class);
3893 flags &= ~SCF_DO_STCLASS;
3895 else if (OP(scan) == GPOS) {
3896 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3897 !(delta || is_inf || (data && data->pos_delta)))
3899 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3900 RExC_rx->extflags |= RXf_ANCH_GPOS;
3901 if (RExC_rx->gofs < (U32)min)
3902 RExC_rx->gofs = min;
3904 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3908 #ifdef TRIE_STUDY_OPT
3909 #ifdef FULL_TRIE_STUDY
3910 else if (PL_regkind[OP(scan)] == TRIE) {
3911 /* NOTE - There is similar code to this block above for handling
3912 BRANCH nodes on the initial study. If you change stuff here
3914 regnode *trie_node= scan;
3915 regnode *tail= regnext(scan);
3916 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3917 I32 max1 = 0, min1 = I32_MAX;
3918 struct regnode_charclass_class accum;
3920 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3921 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3922 if (flags & SCF_DO_STCLASS)
3923 cl_init_zero(pRExC_state, &accum);
3929 const regnode *nextbranch= NULL;
3932 for ( word=1 ; word <= trie->wordcount ; word++)
3934 I32 deltanext=0, minnext=0, f = 0, fake;
3935 struct regnode_charclass_class this_class;
3937 data_fake.flags = 0;
3939 data_fake.whilem_c = data->whilem_c;
3940 data_fake.last_closep = data->last_closep;
3943 data_fake.last_closep = &fake;
3944 data_fake.pos_delta = delta;
3945 if (flags & SCF_DO_STCLASS) {
3946 cl_init(pRExC_state, &this_class);
3947 data_fake.start_class = &this_class;
3948 f = SCF_DO_STCLASS_AND;
3950 if (flags & SCF_WHILEM_VISITED_POS)
3951 f |= SCF_WHILEM_VISITED_POS;
3953 if (trie->jump[word]) {
3955 nextbranch = trie_node + trie->jump[0];
3956 scan= trie_node + trie->jump[word];
3957 /* We go from the jump point to the branch that follows
3958 it. Note this means we need the vestigal unused branches
3959 even though they arent otherwise used.
3961 minnext = study_chunk(pRExC_state, &scan, minlenp,
3962 &deltanext, (regnode *)nextbranch, &data_fake,
3963 stopparen, recursed, NULL, f,depth+1);
3965 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3966 nextbranch= regnext((regnode*)nextbranch);
3968 if (min1 > (I32)(minnext + trie->minlen))
3969 min1 = minnext + trie->minlen;
3970 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3971 max1 = minnext + deltanext + trie->maxlen;
3972 if (deltanext == I32_MAX)
3973 is_inf = is_inf_internal = 1;
3975 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3977 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3978 if ( stopmin > min + min1)
3979 stopmin = min + min1;
3980 flags &= ~SCF_DO_SUBSTR;
3982 data->flags |= SCF_SEEN_ACCEPT;
3985 if (data_fake.flags & SF_HAS_EVAL)
3986 data->flags |= SF_HAS_EVAL;
3987 data->whilem_c = data_fake.whilem_c;
3989 if (flags & SCF_DO_STCLASS)
3990 cl_or(pRExC_state, &accum, &this_class);
3993 if (flags & SCF_DO_SUBSTR) {
3994 data->pos_min += min1;
3995 data->pos_delta += max1 - min1;
3996 if (max1 != min1 || is_inf)
3997 data->longest = &(data->longest_float);
4000 delta += max1 - min1;
4001 if (flags & SCF_DO_STCLASS_OR) {
4002 cl_or(pRExC_state, data->start_class, &accum);
4004 cl_and(data->start_class, and_withp);
4005 flags &= ~SCF_DO_STCLASS;
4008 else if (flags & SCF_DO_STCLASS_AND) {
4010 cl_and(data->start_class, &accum);
4011 flags &= ~SCF_DO_STCLASS;
4014 /* Switch to OR mode: cache the old value of
4015 * data->start_class */
4017 StructCopy(data->start_class, and_withp,
4018 struct regnode_charclass_class);
4019 flags &= ~SCF_DO_STCLASS_AND;
4020 StructCopy(&accum, data->start_class,
4021 struct regnode_charclass_class);
4022 flags |= SCF_DO_STCLASS_OR;
4023 data->start_class->flags |= ANYOF_EOS;
4030 else if (PL_regkind[OP(scan)] == TRIE) {
4031 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4034 min += trie->minlen;
4035 delta += (trie->maxlen - trie->minlen);
4036 flags &= ~SCF_DO_STCLASS; /* xxx */
4037 if (flags & SCF_DO_SUBSTR) {
4038 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4039 data->pos_min += trie->minlen;
4040 data->pos_delta += (trie->maxlen - trie->minlen);
4041 if (trie->maxlen != trie->minlen)
4042 data->longest = &(data->longest_float);
4044 if (trie->jump) /* no more substrings -- for now /grr*/
4045 flags &= ~SCF_DO_SUBSTR;
4047 #endif /* old or new */
4048 #endif /* TRIE_STUDY_OPT */
4050 /* Else: zero-length, ignore. */
4051 scan = regnext(scan);
4056 stopparen = frame->stop;
4057 frame = frame->prev;
4058 goto fake_study_recurse;
4063 DEBUG_STUDYDATA("pre-fin:",data,depth);
4066 *deltap = is_inf_internal ? I32_MAX : delta;
4067 if (flags & SCF_DO_SUBSTR && is_inf)
4068 data->pos_delta = I32_MAX - data->pos_min;
4069 if (is_par > (I32)U8_MAX)
4071 if (is_par && pars==1 && data) {
4072 data->flags |= SF_IN_PAR;
4073 data->flags &= ~SF_HAS_PAR;
4075 else if (pars && data) {
4076 data->flags |= SF_HAS_PAR;
4077 data->flags &= ~SF_IN_PAR;
4079 if (flags & SCF_DO_STCLASS_OR)
4080 cl_and(data->start_class, and_withp);
4081 if (flags & SCF_TRIE_RESTUDY)
4082 data->flags |= SCF_TRIE_RESTUDY;
4084 DEBUG_STUDYDATA("post-fin:",data,depth);
4086 return min < stopmin ? min : stopmin;
4090 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4092 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4094 PERL_ARGS_ASSERT_ADD_DATA;
4096 Renewc(RExC_rxi->data,
4097 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4098 char, struct reg_data);
4100 Renew(RExC_rxi->data->what, count + n, U8);
4102 Newx(RExC_rxi->data->what, n, U8);
4103 RExC_rxi->data->count = count + n;
4104 Copy(s, RExC_rxi->data->what + count, n, U8);
4108 /*XXX: todo make this not included in a non debugging perl */
4109 #ifndef PERL_IN_XSUB_RE
4111 Perl_reginitcolors(pTHX)
4114 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4116 char *t = savepv(s);
4120 t = strchr(t, '\t');
4126 PL_colors[i] = t = (char *)"";
4131 PL_colors[i++] = (char *)"";
4138 #ifdef TRIE_STUDY_OPT
4139 #define CHECK_RESTUDY_GOTO \
4141 (data.flags & SCF_TRIE_RESTUDY) \
4145 #define CHECK_RESTUDY_GOTO
4149 - pregcomp - compile a regular expression into internal code
4151 * We can't allocate space until we know how big the compiled form will be,
4152 * but we can't compile it (and thus know how big it is) until we've got a
4153 * place to put the code. So we cheat: we compile it twice, once with code
4154 * generation turned off and size counting turned on, and once "for real".
4155 * This also means that we don't allocate space until we are sure that the
4156 * thing really will compile successfully, and we never have to move the
4157 * code and thus invalidate pointers into it. (Note that it has to be in
4158 * one piece because free() must be able to free it all.) [NB: not true in perl]
4160 * Beware that the optimization-preparation code in here knows about some
4161 * of the structure of the compiled regexp. [I'll say.]
4166 #ifndef PERL_IN_XSUB_RE
4167 #define RE_ENGINE_PTR &reh_regexp_engine
4169 extern const struct regexp_engine my_reg_engine;
4170 #define RE_ENGINE_PTR &my_reg_engine
4173 #ifndef PERL_IN_XSUB_RE
4175 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4178 HV * const table = GvHV(PL_hintgv);
4180 PERL_ARGS_ASSERT_PREGCOMP;
4182 /* Dispatch a request to compile a regexp to correct
4185 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4186 GET_RE_DEBUG_FLAGS_DECL;
4187 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4188 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4190 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4193 return CALLREGCOMP_ENG(eng, pattern, flags);
4196 return Perl_re_compile(aTHX_ pattern, flags);
4201 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4205 register regexp_internal *ri;
4207 char* exp = SvPV((SV*)pattern, plen);
4208 char* xend = exp + plen;
4215 RExC_state_t RExC_state;
4216 RExC_state_t * const pRExC_state = &RExC_state;
4217 #ifdef TRIE_STUDY_OPT
4219 RExC_state_t copyRExC_state;
4221 GET_RE_DEBUG_FLAGS_DECL;
4223 PERL_ARGS_ASSERT_RE_COMPILE;
4225 DEBUG_r(if (!PL_colorset) reginitcolors());
4227 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4230 SV *dsv= sv_newmortal();
4231 RE_PV_QUOTED_DECL(s, RExC_utf8,
4232 dsv, exp, plen, 60);
4233 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4234 PL_colors[4],PL_colors[5],s);
4239 RExC_flags = pm_flags;
4243 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4244 RExC_seen_evals = 0;
4247 /* First pass: determine size, legality. */
4255 RExC_emit = &PL_regdummy;
4256 RExC_whilem_seen = 0;
4257 RExC_charnames = NULL;
4258 RExC_open_parens = NULL;
4259 RExC_close_parens = NULL;
4261 RExC_paren_names = NULL;
4263 RExC_paren_name_list = NULL;
4265 RExC_recurse = NULL;
4266 RExC_recurse_count = 0;
4268 #if 0 /* REGC() is (currently) a NOP at the first pass.
4269 * Clever compilers notice this and complain. --jhi */
4270 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4272 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4273 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4274 RExC_precomp = NULL;
4277 if (RExC_utf8 && !RExC_orig_utf8) {
4278 /* It's possible to write a regexp in ascii that represents Unicode
4279 codepoints outside of the byte range, such as via \x{100}. If we
4280 detect such a sequence we have to convert the entire pattern to utf8
4281 and then recompile, as our sizing calculation will have been based
4282 on 1 byte == 1 character, but we will need to use utf8 to encode
4283 at least some part of the pattern, and therefore must convert the whole
4285 XXX: somehow figure out how to make this less expensive...
4288 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4289 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4290 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4292 RExC_orig_utf8 = RExC_utf8;
4294 goto redo_first_pass;
4297 PerlIO_printf(Perl_debug_log,
4298 "Required size %"IVdf" nodes\n"
4299 "Starting second pass (creation)\n",
4302 RExC_lastparse=NULL;
4304 /* Small enough for pointer-storage convention?
4305 If extralen==0, this means that we will not need long jumps. */
4306 if (RExC_size >= 0x10000L && RExC_extralen)
4307 RExC_size += RExC_extralen;
4310 if (RExC_whilem_seen > 15)
4311 RExC_whilem_seen = 15;
4313 /* Allocate space and zero-initialize. Note, the two step process
4314 of zeroing when in debug mode, thus anything assigned has to
4315 happen after that */
4316 Newxz(r, 1, regexp);
4317 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4318 char, regexp_internal);
4319 if ( r == NULL || ri == NULL )
4320 FAIL("Regexp out of space");
4322 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4323 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4325 /* bulk initialize base fields with 0. */
4326 Zero(ri, sizeof(regexp_internal), char);
4329 /* non-zero initialization begins here */
4331 r->engine= RE_ENGINE_PTR;
4333 RX_PRELEN(r) = plen;
4334 r->extflags = pm_flags;
4336 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4337 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4338 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4339 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4340 >> RXf_PMf_STD_PMMOD_SHIFT);
4341 const char *fptr = STD_PAT_MODS; /*"msix"*/
4343 RX_WRAPLEN(r) = plen + has_minus + has_p + has_runon
4344 + (sizeof(STD_PAT_MODS) - 1)
4345 + (sizeof("(?:)") - 1);
4347 Newx(RX_WRAPPED(r), RX_WRAPLEN(r) + 1, char );
4351 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4353 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4354 char *colon = r + 1;
4357 while((ch = *fptr++)) {
4371 Copy(RExC_precomp, p, plen, char);
4381 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4383 if (RExC_seen & REG_SEEN_RECURSE) {
4384 Newxz(RExC_open_parens, RExC_npar,regnode *);
4385 SAVEFREEPV(RExC_open_parens);
4386 Newxz(RExC_close_parens,RExC_npar,regnode *);
4387 SAVEFREEPV(RExC_close_parens);
4390 /* Useful during FAIL. */
4391 #ifdef RE_TRACK_PATTERN_OFFSETS
4392 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4393 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4394 "%s %"UVuf" bytes for offset annotations.\n",
4395 ri->u.offsets ? "Got" : "Couldn't get",
4396 (UV)((2*RExC_size+1) * sizeof(U32))));
4398 SetProgLen(ri,RExC_size);
4401 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4403 /* Second pass: emit code. */
4404 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4409 RExC_emit_start = ri->program;
4410 RExC_emit = ri->program;
4411 RExC_emit_bound = ri->program + RExC_size + 1;
4413 /* Store the count of eval-groups for security checks: */
4414 RExC_rx->seen_evals = RExC_seen_evals;
4415 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4416 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4420 /* XXXX To minimize changes to RE engine we always allocate
4421 3-units-long substrs field. */
4422 Newx(r->substrs, 1, struct reg_substr_data);
4423 if (RExC_recurse_count) {
4424 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4425 SAVEFREEPV(RExC_recurse);
4429 r->minlen = minlen = sawplus = sawopen = 0;
4430 Zero(r->substrs, 1, struct reg_substr_data);
4432 #ifdef TRIE_STUDY_OPT
4434 StructCopy(&zero_scan_data, &data, scan_data_t);
4435 copyRExC_state = RExC_state;
4438 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4440 RExC_state = copyRExC_state;
4441 if (seen & REG_TOP_LEVEL_BRANCHES)
4442 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4444 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4445 if (data.last_found) {
4446 SvREFCNT_dec(data.longest_fixed);
4447 SvREFCNT_dec(data.longest_float);
4448 SvREFCNT_dec(data.last_found);
4450 StructCopy(&zero_scan_data, &data, scan_data_t);
4453 StructCopy(&zero_scan_data, &data, scan_data_t);
4456 /* Dig out information for optimizations. */
4457 r->extflags = RExC_flags; /* was pm_op */
4458 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4461 r->extflags |= RXf_UTF8; /* Unicode in it? */
4462 ri->regstclass = NULL;
4463 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4464 r->intflags |= PREGf_NAUGHTY;
4465 scan = ri->program + 1; /* First BRANCH. */
4467 /* testing for BRANCH here tells us whether there is "must appear"
4468 data in the pattern. If there is then we can use it for optimisations */
4469 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4471 STRLEN longest_float_length, longest_fixed_length;
4472 struct regnode_charclass_class ch_class; /* pointed to by data */
4474 I32 last_close = 0; /* pointed to by data */
4475 regnode *first= scan;
4476 regnode *first_next= regnext(first);
4479 * Skip introductions and multiplicators >= 1
4480 * so that we can extract the 'meat' of the pattern that must
4481 * match in the large if() sequence following.
4482 * NOTE that EXACT is NOT covered here, as it is normally
4483 * picked up by the optimiser separately.
4485 * This is unfortunate as the optimiser isnt handling lookahead
4486 * properly currently.
4489 while ((OP(first) == OPEN && (sawopen = 1)) ||
4490 /* An OR of *one* alternative - should not happen now. */
4491 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4492 /* for now we can't handle lookbehind IFMATCH*/
4493 (OP(first) == IFMATCH && !first->flags) ||
4494 (OP(first) == PLUS) ||
4495 (OP(first) == MINMOD) ||
4496 /* An {n,m} with n>0 */
4497 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4498 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4501 * the only op that could be a regnode is PLUS, all the rest
4502 * will be regnode_1 or regnode_2.
4505 if (OP(first) == PLUS)
4508 first += regarglen[OP(first)];
4510 first = NEXTOPER(first);
4511 first_next= regnext(first);
4514 /* Starting-point info. */
4516 DEBUG_PEEP("first:",first,0);
4517 /* Ignore EXACT as we deal with it later. */
4518 if (PL_regkind[OP(first)] == EXACT) {
4519 if (OP(first) == EXACT)
4520 NOOP; /* Empty, get anchored substr later. */
4521 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4522 ri->regstclass = first;
4525 else if (PL_regkind[OP(first)] == TRIE &&
4526 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4529 /* this can happen only on restudy */
4530 if ( OP(first) == TRIE ) {
4531 struct regnode_1 *trieop = (struct regnode_1 *)
4532 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4533 StructCopy(first,trieop,struct regnode_1);
4534 trie_op=(regnode *)trieop;
4536 struct regnode_charclass *trieop = (struct regnode_charclass *)
4537 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4538 StructCopy(first,trieop,struct regnode_charclass);
4539 trie_op=(regnode *)trieop;
4542 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4543 ri->regstclass = trie_op;
4546 else if (strchr((const char*)PL_simple,OP(first)))
4547 ri->regstclass = first;
4548 else if (PL_regkind[OP(first)] == BOUND ||
4549 PL_regkind[OP(first)] == NBOUND)
4550 ri->regstclass = first;
4551 else if (PL_regkind[OP(first)] == BOL) {
4552 r->extflags |= (OP(first) == MBOL
4554 : (OP(first) == SBOL
4557 first = NEXTOPER(first);
4560 else if (OP(first) == GPOS) {
4561 r->extflags |= RXf_ANCH_GPOS;
4562 first = NEXTOPER(first);
4565 else if ((!sawopen || !RExC_sawback) &&
4566 (OP(first) == STAR &&
4567 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4568 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4570 /* turn .* into ^.* with an implied $*=1 */
4572 (OP(NEXTOPER(first)) == REG_ANY)
4575 r->extflags |= type;
4576 r->intflags |= PREGf_IMPLICIT;
4577 first = NEXTOPER(first);
4580 if (sawplus && (!sawopen || !RExC_sawback)
4581 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4582 /* x+ must match at the 1st pos of run of x's */
4583 r->intflags |= PREGf_SKIP;
4585 /* Scan is after the zeroth branch, first is atomic matcher. */
4586 #ifdef TRIE_STUDY_OPT
4589 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4590 (IV)(first - scan + 1))
4594 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4595 (IV)(first - scan + 1))
4601 * If there's something expensive in the r.e., find the
4602 * longest literal string that must appear and make it the
4603 * regmust. Resolve ties in favor of later strings, since
4604 * the regstart check works with the beginning of the r.e.
4605 * and avoiding duplication strengthens checking. Not a
4606 * strong reason, but sufficient in the absence of others.
4607 * [Now we resolve ties in favor of the earlier string if
4608 * it happens that c_offset_min has been invalidated, since the
4609 * earlier string may buy us something the later one won't.]
4612 data.longest_fixed = newSVpvs("");
4613 data.longest_float = newSVpvs("");
4614 data.last_found = newSVpvs("");
4615 data.longest = &(data.longest_fixed);
4617 if (!ri->regstclass) {
4618 cl_init(pRExC_state, &ch_class);
4619 data.start_class = &ch_class;
4620 stclass_flag = SCF_DO_STCLASS_AND;
4621 } else /* XXXX Check for BOUND? */
4623 data.last_closep = &last_close;
4625 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4626 &data, -1, NULL, NULL,
4627 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4633 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4634 && data.last_start_min == 0 && data.last_end > 0
4635 && !RExC_seen_zerolen
4636 && !(RExC_seen & REG_SEEN_VERBARG)
4637 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4638 r->extflags |= RXf_CHECK_ALL;
4639 scan_commit(pRExC_state, &data,&minlen,0);
4640 SvREFCNT_dec(data.last_found);
4642 /* Note that code very similar to this but for anchored string
4643 follows immediately below, changes may need to be made to both.
4646 longest_float_length = CHR_SVLEN(data.longest_float);
4647 if (longest_float_length
4648 || (data.flags & SF_FL_BEFORE_EOL
4649 && (!(data.flags & SF_FL_BEFORE_MEOL)
4650 || (RExC_flags & RXf_PMf_MULTILINE))))
4654 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4655 && data.offset_fixed == data.offset_float_min
4656 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4657 goto remove_float; /* As in (a)+. */
4659 /* copy the information about the longest float from the reg_scan_data
4660 over to the program. */
4661 if (SvUTF8(data.longest_float)) {
4662 r->float_utf8 = data.longest_float;
4663 r->float_substr = NULL;
4665 r->float_substr = data.longest_float;
4666 r->float_utf8 = NULL;
4668 /* float_end_shift is how many chars that must be matched that
4669 follow this item. We calculate it ahead of time as once the
4670 lookbehind offset is added in we lose the ability to correctly
4672 ml = data.minlen_float ? *(data.minlen_float)
4673 : (I32)longest_float_length;
4674 r->float_end_shift = ml - data.offset_float_min
4675 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4676 + data.lookbehind_float;
4677 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4678 r->float_max_offset = data.offset_float_max;
4679 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4680 r->float_max_offset -= data.lookbehind_float;
4682 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4683 && (!(data.flags & SF_FL_BEFORE_MEOL)
4684 || (RExC_flags & RXf_PMf_MULTILINE)));
4685 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4689 r->float_substr = r->float_utf8 = NULL;
4690 SvREFCNT_dec(data.longest_float);
4691 longest_float_length = 0;
4694 /* Note that code very similar to this but for floating string
4695 is immediately above, changes may need to be made to both.
4698 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4699 if (longest_fixed_length
4700 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4701 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4702 || (RExC_flags & RXf_PMf_MULTILINE))))
4706 /* copy the information about the longest fixed
4707 from the reg_scan_data over to the program. */
4708 if (SvUTF8(data.longest_fixed)) {
4709 r->anchored_utf8 = data.longest_fixed;
4710 r->anchored_substr = NULL;
4712 r->anchored_substr = data.longest_fixed;
4713 r->anchored_utf8 = NULL;
4715 /* fixed_end_shift is how many chars that must be matched that
4716 follow this item. We calculate it ahead of time as once the
4717 lookbehind offset is added in we lose the ability to correctly
4719 ml = data.minlen_fixed ? *(data.minlen_fixed)
4720 : (I32)longest_fixed_length;
4721 r->anchored_end_shift = ml - data.offset_fixed
4722 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4723 + data.lookbehind_fixed;
4724 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4726 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4727 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4728 || (RExC_flags & RXf_PMf_MULTILINE)));
4729 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4732 r->anchored_substr = r->anchored_utf8 = NULL;
4733 SvREFCNT_dec(data.longest_fixed);
4734 longest_fixed_length = 0;
4737 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4738 ri->regstclass = NULL;
4739 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4741 && !(data.start_class->flags & ANYOF_EOS)
4742 && !cl_is_anything(data.start_class))
4744 const U32 n = add_data(pRExC_state, 1, "f");
4746 Newx(RExC_rxi->data->data[n], 1,
4747 struct regnode_charclass_class);
4748 StructCopy(data.start_class,
4749 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4750 struct regnode_charclass_class);
4751 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4752 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4753 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4754 regprop(r, sv, (regnode*)data.start_class);
4755 PerlIO_printf(Perl_debug_log,
4756 "synthetic stclass \"%s\".\n",
4757 SvPVX_const(sv));});
4760 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4761 if (longest_fixed_length > longest_float_length) {
4762 r->check_end_shift = r->anchored_end_shift;
4763 r->check_substr = r->anchored_substr;
4764 r->check_utf8 = r->anchored_utf8;
4765 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4766 if (r->extflags & RXf_ANCH_SINGLE)
4767 r->extflags |= RXf_NOSCAN;
4770 r->check_end_shift = r->float_end_shift;
4771 r->check_substr = r->float_substr;
4772 r->check_utf8 = r->float_utf8;
4773 r->check_offset_min = r->float_min_offset;
4774 r->check_offset_max = r->float_max_offset;
4776 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4777 This should be changed ASAP! */
4778 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4779 r->extflags |= RXf_USE_INTUIT;
4780 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4781 r->extflags |= RXf_INTUIT_TAIL;
4783 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4784 if ( (STRLEN)minlen < longest_float_length )
4785 minlen= longest_float_length;
4786 if ( (STRLEN)minlen < longest_fixed_length )
4787 minlen= longest_fixed_length;
4791 /* Several toplevels. Best we can is to set minlen. */
4793 struct regnode_charclass_class ch_class;
4796 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4798 scan = ri->program + 1;
4799 cl_init(pRExC_state, &ch_class);
4800 data.start_class = &ch_class;
4801 data.last_closep = &last_close;
4804 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4805 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4809 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4810 = r->float_substr = r->float_utf8 = NULL;
4811 if (!(data.start_class->flags & ANYOF_EOS)
4812 && !cl_is_anything(data.start_class))
4814 const U32 n = add_data(pRExC_state, 1, "f");
4816 Newx(RExC_rxi->data->data[n], 1,
4817 struct regnode_charclass_class);
4818 StructCopy(data.start_class,
4819 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4820 struct regnode_charclass_class);
4821 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4822 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4823 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4824 regprop(r, sv, (regnode*)data.start_class);
4825 PerlIO_printf(Perl_debug_log,
4826 "synthetic stclass \"%s\".\n",
4827 SvPVX_const(sv));});
4831 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4832 the "real" pattern. */
4834 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4835 (IV)minlen, (IV)r->minlen);
4837 r->minlenret = minlen;
4838 if (r->minlen < minlen)
4841 if (RExC_seen & REG_SEEN_GPOS)
4842 r->extflags |= RXf_GPOS_SEEN;
4843 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4844 r->extflags |= RXf_LOOKBEHIND_SEEN;
4845 if (RExC_seen & REG_SEEN_EVAL)
4846 r->extflags |= RXf_EVAL_SEEN;
4847 if (RExC_seen & REG_SEEN_CANY)
4848 r->extflags |= RXf_CANY_SEEN;
4849 if (RExC_seen & REG_SEEN_VERBARG)
4850 r->intflags |= PREGf_VERBARG_SEEN;
4851 if (RExC_seen & REG_SEEN_CUTGROUP)
4852 r->intflags |= PREGf_CUTGROUP_SEEN;
4853 if (RExC_paren_names)
4854 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4856 RXp_PAREN_NAMES(r) = NULL;
4858 #ifdef STUPID_PATTERN_CHECKS
4859 if (RX_PRELEN(r) == 0)
4860 r->extflags |= RXf_NULL;
4861 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4862 /* XXX: this should happen BEFORE we compile */
4863 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4864 else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RX_PRECOMP(r), 3))
4865 r->extflags |= RXf_WHITE;
4866 else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^')
4867 r->extflags |= RXf_START_ONLY;
4869 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4870 /* XXX: this should happen BEFORE we compile */
4871 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4873 regnode *first = ri->program + 1;
4875 U8 nop = OP(NEXTOPER(first));
4877 if (PL_regkind[fop] == NOTHING && nop == END)
4878 r->extflags |= RXf_NULL;
4879 else if (PL_regkind[fop] == BOL && nop == END)
4880 r->extflags |= RXf_START_ONLY;
4881 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4882 r->extflags |= RXf_WHITE;
4886 if (RExC_paren_names) {
4887 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4888 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4891 ri->name_list_idx = 0;
4893 if (RExC_recurse_count) {
4894 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4895 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4896 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4899 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4900 /* assume we don't need to swap parens around before we match */
4903 PerlIO_printf(Perl_debug_log,"Final program:\n");
4906 #ifdef RE_TRACK_PATTERN_OFFSETS
4907 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4908 const U32 len = ri->u.offsets[0];
4910 GET_RE_DEBUG_FLAGS_DECL;
4911 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4912 for (i = 1; i <= len; i++) {
4913 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4914 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4915 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4917 PerlIO_printf(Perl_debug_log, "\n");
4923 #undef RE_ENGINE_PTR
4927 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4930 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4932 PERL_UNUSED_ARG(value);
4934 if (flags & RXapif_FETCH) {
4935 return reg_named_buff_fetch(rx, key, flags);
4936 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4937 Perl_croak(aTHX_ "%s", PL_no_modify);
4939 } else if (flags & RXapif_EXISTS) {
4940 return reg_named_buff_exists(rx, key, flags)
4943 } else if (flags & RXapif_REGNAMES) {
4944 return reg_named_buff_all(rx, flags);
4945 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4946 return reg_named_buff_scalar(rx, flags);
4948 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4954 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4957 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4958 PERL_UNUSED_ARG(lastkey);
4960 if (flags & RXapif_FIRSTKEY)
4961 return reg_named_buff_firstkey(rx, flags);
4962 else if (flags & RXapif_NEXTKEY)
4963 return reg_named_buff_nextkey(rx, flags);
4965 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4971 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4973 AV *retarray = NULL;
4976 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4978 if (flags & RXapif_ALL)
4981 if (rx && RXp_PAREN_NAMES(rx)) {
4982 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4985 SV* sv_dat=HeVAL(he_str);
4986 I32 *nums=(I32*)SvPVX(sv_dat);
4987 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4988 if ((I32)(rx->nparens) >= nums[i]
4989 && rx->offs[nums[i]].start != -1
4990 && rx->offs[nums[i]].end != -1)
4993 CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4997 ret = newSVsv(&PL_sv_undef);
5000 av_push(retarray, ret);
5003 return newRV_noinc(MUTABLE_SV(retarray));
5010 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
5014 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5016 if (rx && RXp_PAREN_NAMES(rx)) {
5017 if (flags & RXapif_ALL) {
5018 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5020 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
5034 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
5037 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5039 if ( rx && RXp_PAREN_NAMES(rx) ) {
5040 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5042 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
5049 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
5051 GET_RE_DEBUG_FLAGS_DECL;
5053 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5055 if (rx && RXp_PAREN_NAMES(rx)) {
5056 HV *hv = RXp_PAREN_NAMES(rx);
5058 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5061 SV* sv_dat = HeVAL(temphe);
5062 I32 *nums = (I32*)SvPVX(sv_dat);
5063 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5064 if ((I32)(rx->lastparen) >= nums[i] &&
5065 rx->offs[nums[i]].start != -1 &&
5066 rx->offs[nums[i]].end != -1)
5072 if (parno || flags & RXapif_ALL) {
5073 return newSVhek(HeKEY_hek(temphe));
5081 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5087 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5089 if (rx && RXp_PAREN_NAMES(rx)) {
5090 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5091 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5092 } else if (flags & RXapif_ONE) {
5093 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5094 av = MUTABLE_AV(SvRV(ret));
5095 length = av_len(av);
5097 return newSViv(length + 1);
5099 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5103 return &PL_sv_undef;
5107 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5111 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5113 if (rx && RXp_PAREN_NAMES(rx)) {
5114 HV *hv= RXp_PAREN_NAMES(rx);
5116 (void)hv_iterinit(hv);
5117 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5120 SV* sv_dat = HeVAL(temphe);
5121 I32 *nums = (I32*)SvPVX(sv_dat);
5122 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5123 if ((I32)(rx->lastparen) >= nums[i] &&
5124 rx->offs[nums[i]].start != -1 &&
5125 rx->offs[nums[i]].end != -1)
5131 if (parno || flags & RXapif_ALL) {
5132 av_push(av, newSVhek(HeKEY_hek(temphe)));
5137 return newRV_noinc(MUTABLE_SV(av));
5141 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5147 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5150 sv_setsv(sv,&PL_sv_undef);
5154 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5156 i = rx->offs[0].start;
5160 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5162 s = rx->subbeg + rx->offs[0].end;
5163 i = rx->sublen - rx->offs[0].end;
5166 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5167 (s1 = rx->offs[paren].start) != -1 &&
5168 (t1 = rx->offs[paren].end) != -1)
5172 s = rx->subbeg + s1;
5174 sv_setsv(sv,&PL_sv_undef);
5177 assert(rx->sublen >= (s - rx->subbeg) + i );
5179 const int oldtainted = PL_tainted;
5181 sv_setpvn(sv, s, i);
5182 PL_tainted = oldtainted;
5183 if ( (rx->extflags & RXf_CANY_SEEN)
5184 ? (RXp_MATCH_UTF8(rx)
5185 && (!i || is_utf8_string((U8*)s, i)))
5186 : (RXp_MATCH_UTF8(rx)) )
5193 if (RXp_MATCH_TAINTED(rx)) {
5194 if (SvTYPE(sv) >= SVt_PVMG) {
5195 MAGIC* const mg = SvMAGIC(sv);
5198 SvMAGIC_set(sv, mg->mg_moremagic);
5200 if ((mgt = SvMAGIC(sv))) {
5201 mg->mg_moremagic = mgt;
5202 SvMAGIC_set(sv, mg);
5212 sv_setsv(sv,&PL_sv_undef);
5218 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5219 SV const * const value)
5221 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5223 PERL_UNUSED_ARG(rx);
5224 PERL_UNUSED_ARG(paren);
5225 PERL_UNUSED_ARG(value);
5228 Perl_croak(aTHX_ "%s", PL_no_modify);
5232 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5238 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5240 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5242 /* $` / ${^PREMATCH} */
5243 case RX_BUFF_IDX_PREMATCH:
5244 if (rx->offs[0].start != -1) {
5245 i = rx->offs[0].start;
5253 /* $' / ${^POSTMATCH} */
5254 case RX_BUFF_IDX_POSTMATCH:
5255 if (rx->offs[0].end != -1) {
5256 i = rx->sublen - rx->offs[0].end;
5258 s1 = rx->offs[0].end;
5264 /* $& / ${^MATCH}, $1, $2, ... */
5266 if (paren <= (I32)rx->nparens &&
5267 (s1 = rx->offs[paren].start) != -1 &&
5268 (t1 = rx->offs[paren].end) != -1)
5273 if (ckWARN(WARN_UNINITIALIZED))
5274 report_uninit((SV *)sv);
5279 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5280 const char * const s = rx->subbeg + s1;
5285 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5292 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5294 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5295 PERL_UNUSED_ARG(rx);
5296 return newSVpvs("Regexp");
5299 /* Scans the name of a named buffer from the pattern.
5300 * If flags is REG_RSN_RETURN_NULL returns null.
5301 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5302 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5303 * to the parsed name as looked up in the RExC_paren_names hash.
5304 * If there is an error throws a vFAIL().. type exception.
5307 #define REG_RSN_RETURN_NULL 0
5308 #define REG_RSN_RETURN_NAME 1
5309 #define REG_RSN_RETURN_DATA 2
5312 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5314 char *name_start = RExC_parse;
5316 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5318 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5319 /* skip IDFIRST by using do...while */
5322 RExC_parse += UTF8SKIP(RExC_parse);
5323 } while (isALNUM_utf8((U8*)RExC_parse));
5327 } while (isALNUM(*RExC_parse));
5332 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5333 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5334 if ( flags == REG_RSN_RETURN_NAME)
5336 else if (flags==REG_RSN_RETURN_DATA) {
5339 if ( ! sv_name ) /* should not happen*/
5340 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5341 if (RExC_paren_names)
5342 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5344 sv_dat = HeVAL(he_str);
5346 vFAIL("Reference to nonexistent named group");
5350 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5357 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5358 int rem=(int)(RExC_end - RExC_parse); \
5367 if (RExC_lastparse!=RExC_parse) \
5368 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5371 iscut ? "..." : "<" \
5374 PerlIO_printf(Perl_debug_log,"%16s",""); \
5377 num = RExC_size + 1; \
5379 num=REG_NODE_NUM(RExC_emit); \
5380 if (RExC_lastnum!=num) \
5381 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5383 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5384 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5385 (int)((depth*2)), "", \
5389 RExC_lastparse=RExC_parse; \
5394 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5395 DEBUG_PARSE_MSG((funcname)); \
5396 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5398 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5399 DEBUG_PARSE_MSG((funcname)); \
5400 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5403 - reg - regular expression, i.e. main body or parenthesized thing
5405 * Caller must absorb opening parenthesis.
5407 * Combining parenthesis handling with the base level of regular expression
5408 * is a trifle forced, but the need to tie the tails of the branches to what
5409 * follows makes it hard to avoid.
5411 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5413 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5415 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5419 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5420 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5423 register regnode *ret; /* Will be the head of the group. */
5424 register regnode *br;
5425 register regnode *lastbr;
5426 register regnode *ender = NULL;
5427 register I32 parno = 0;
5429 U32 oregflags = RExC_flags;
5430 bool have_branch = 0;
5432 I32 freeze_paren = 0;
5433 I32 after_freeze = 0;
5435 /* for (?g), (?gc), and (?o) warnings; warning
5436 about (?c) will warn about (?g) -- japhy */
5438 #define WASTED_O 0x01
5439 #define WASTED_G 0x02
5440 #define WASTED_C 0x04
5441 #define WASTED_GC (0x02|0x04)
5442 I32 wastedflags = 0x00;
5444 char * parse_start = RExC_parse; /* MJD */
5445 char * const oregcomp_parse = RExC_parse;
5447 GET_RE_DEBUG_FLAGS_DECL;
5449 PERL_ARGS_ASSERT_REG;
5450 DEBUG_PARSE("reg ");
5452 *flagp = 0; /* Tentatively. */
5455 /* Make an OPEN node, if parenthesized. */
5457 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5458 char *start_verb = RExC_parse;
5459 STRLEN verb_len = 0;
5460 char *start_arg = NULL;
5461 unsigned char op = 0;
5463 int internal_argval = 0; /* internal_argval is only useful if !argok */
5464 while ( *RExC_parse && *RExC_parse != ')' ) {
5465 if ( *RExC_parse == ':' ) {
5466 start_arg = RExC_parse + 1;
5472 verb_len = RExC_parse - start_verb;
5475 while ( *RExC_parse && *RExC_parse != ')' )
5477 if ( *RExC_parse != ')' )
5478 vFAIL("Unterminated verb pattern argument");
5479 if ( RExC_parse == start_arg )
5482 if ( *RExC_parse != ')' )
5483 vFAIL("Unterminated verb pattern");
5486 switch ( *start_verb ) {
5487 case 'A': /* (*ACCEPT) */
5488 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5490 internal_argval = RExC_nestroot;
5493 case 'C': /* (*COMMIT) */
5494 if ( memEQs(start_verb,verb_len,"COMMIT") )
5497 case 'F': /* (*FAIL) */
5498 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5503 case ':': /* (*:NAME) */
5504 case 'M': /* (*MARK:NAME) */
5505 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5510 case 'P': /* (*PRUNE) */
5511 if ( memEQs(start_verb,verb_len,"PRUNE") )
5514 case 'S': /* (*SKIP) */
5515 if ( memEQs(start_verb,verb_len,"SKIP") )
5518 case 'T': /* (*THEN) */
5519 /* [19:06] <TimToady> :: is then */
5520 if ( memEQs(start_verb,verb_len,"THEN") ) {
5522 RExC_seen |= REG_SEEN_CUTGROUP;
5528 vFAIL3("Unknown verb pattern '%.*s'",
5529 verb_len, start_verb);
5532 if ( start_arg && internal_argval ) {
5533 vFAIL3("Verb pattern '%.*s' may not have an argument",
5534 verb_len, start_verb);
5535 } else if ( argok < 0 && !start_arg ) {
5536 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5537 verb_len, start_verb);
5539 ret = reganode(pRExC_state, op, internal_argval);
5540 if ( ! internal_argval && ! SIZE_ONLY ) {
5542 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5543 ARG(ret) = add_data( pRExC_state, 1, "S" );
5544 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5551 if (!internal_argval)
5552 RExC_seen |= REG_SEEN_VERBARG;
5553 } else if ( start_arg ) {
5554 vFAIL3("Verb pattern '%.*s' may not have an argument",
5555 verb_len, start_verb);
5557 ret = reg_node(pRExC_state, op);
5559 nextchar(pRExC_state);
5562 if (*RExC_parse == '?') { /* (?...) */
5563 bool is_logical = 0;
5564 const char * const seqstart = RExC_parse;
5567 paren = *RExC_parse++;
5568 ret = NULL; /* For look-ahead/behind. */
5571 case 'P': /* (?P...) variants for those used to PCRE/Python */
5572 paren = *RExC_parse++;
5573 if ( paren == '<') /* (?P<...>) named capture */
5575 else if (paren == '>') { /* (?P>name) named recursion */
5576 goto named_recursion;
5578 else if (paren == '=') { /* (?P=...) named backref */
5579 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5580 you change this make sure you change that */
5581 char* name_start = RExC_parse;
5583 SV *sv_dat = reg_scan_name(pRExC_state,
5584 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5585 if (RExC_parse == name_start || *RExC_parse != ')')
5586 vFAIL2("Sequence %.3s... not terminated",parse_start);
5589 num = add_data( pRExC_state, 1, "S" );
5590 RExC_rxi->data->data[num]=(void*)sv_dat;
5591 SvREFCNT_inc_simple_void(sv_dat);
5594 ret = reganode(pRExC_state,
5595 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5599 Set_Node_Offset(ret, parse_start+1);
5600 Set_Node_Cur_Length(ret); /* MJD */
5602 nextchar(pRExC_state);
5606 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5608 case '<': /* (?<...) */
5609 if (*RExC_parse == '!')
5611 else if (*RExC_parse != '=')
5617 case '\'': /* (?'...') */
5618 name_start= RExC_parse;
5619 svname = reg_scan_name(pRExC_state,
5620 SIZE_ONLY ? /* reverse test from the others */
5621 REG_RSN_RETURN_NAME :
5622 REG_RSN_RETURN_NULL);
5623 if (RExC_parse == name_start) {
5625 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5628 if (*RExC_parse != paren)
5629 vFAIL2("Sequence (?%c... not terminated",
5630 paren=='>' ? '<' : paren);
5634 if (!svname) /* shouldnt happen */
5636 "panic: reg_scan_name returned NULL");
5637 if (!RExC_paren_names) {
5638 RExC_paren_names= newHV();
5639 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5641 RExC_paren_name_list= newAV();
5642 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5645 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5647 sv_dat = HeVAL(he_str);
5649 /* croak baby croak */
5651 "panic: paren_name hash element allocation failed");
5652 } else if ( SvPOK(sv_dat) ) {
5653 /* (?|...) can mean we have dupes so scan to check
5654 its already been stored. Maybe a flag indicating
5655 we are inside such a construct would be useful,
5656 but the arrays are likely to be quite small, so
5657 for now we punt -- dmq */
5658 IV count = SvIV(sv_dat);
5659 I32 *pv = (I32*)SvPVX(sv_dat);
5661 for ( i = 0 ; i < count ; i++ ) {
5662 if ( pv[i] == RExC_npar ) {
5668 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5669 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5670 pv[count] = RExC_npar;
5671 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5674 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5675 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5677 SvIV_set(sv_dat, 1);
5680 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5681 SvREFCNT_dec(svname);
5684 /*sv_dump(sv_dat);*/
5686 nextchar(pRExC_state);
5688 goto capturing_parens;
5690 RExC_seen |= REG_SEEN_LOOKBEHIND;
5692 case '=': /* (?=...) */
5693 RExC_seen_zerolen++;
5695 case '!': /* (?!...) */
5696 RExC_seen_zerolen++;
5697 if (*RExC_parse == ')') {
5698 ret=reg_node(pRExC_state, OPFAIL);
5699 nextchar(pRExC_state);
5703 case '|': /* (?|...) */
5704 /* branch reset, behave like a (?:...) except that
5705 buffers in alternations share the same numbers */
5707 after_freeze = freeze_paren = RExC_npar;
5709 case ':': /* (?:...) */
5710 case '>': /* (?>...) */
5712 case '$': /* (?$...) */
5713 case '@': /* (?@...) */
5714 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5716 case '#': /* (?#...) */
5717 while (*RExC_parse && *RExC_parse != ')')
5719 if (*RExC_parse != ')')
5720 FAIL("Sequence (?#... not terminated");
5721 nextchar(pRExC_state);
5724 case '0' : /* (?0) */
5725 case 'R' : /* (?R) */
5726 if (*RExC_parse != ')')
5727 FAIL("Sequence (?R) not terminated");
5728 ret = reg_node(pRExC_state, GOSTART);
5729 *flagp |= POSTPONED;
5730 nextchar(pRExC_state);
5733 { /* named and numeric backreferences */
5735 case '&': /* (?&NAME) */
5736 parse_start = RExC_parse - 1;
5739 SV *sv_dat = reg_scan_name(pRExC_state,
5740 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5741 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5743 goto gen_recurse_regop;
5746 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5748 vFAIL("Illegal pattern");
5750 goto parse_recursion;
5752 case '-': /* (?-1) */
5753 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5754 RExC_parse--; /* rewind to let it be handled later */
5758 case '1': case '2': case '3': case '4': /* (?1) */
5759 case '5': case '6': case '7': case '8': case '9':
5762 num = atoi(RExC_parse);
5763 parse_start = RExC_parse - 1; /* MJD */
5764 if (*RExC_parse == '-')
5766 while (isDIGIT(*RExC_parse))
5768 if (*RExC_parse!=')')
5769 vFAIL("Expecting close bracket");
5772 if ( paren == '-' ) {
5774 Diagram of capture buffer numbering.
5775 Top line is the normal capture buffer numbers
5776 Botton line is the negative indexing as from
5780 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5784 num = RExC_npar + num;
5787 vFAIL("Reference to nonexistent group");
5789 } else if ( paren == '+' ) {
5790 num = RExC_npar + num - 1;
5793 ret = reganode(pRExC_state, GOSUB, num);
5795 if (num > (I32)RExC_rx->nparens) {
5797 vFAIL("Reference to nonexistent group");
5799 ARG2L_SET( ret, RExC_recurse_count++);
5801 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5802 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5806 RExC_seen |= REG_SEEN_RECURSE;
5807 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5808 Set_Node_Offset(ret, parse_start); /* MJD */
5810 *flagp |= POSTPONED;
5811 nextchar(pRExC_state);
5813 } /* named and numeric backreferences */
5816 case '?': /* (??...) */
5818 if (*RExC_parse != '{') {
5820 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5823 *flagp |= POSTPONED;
5824 paren = *RExC_parse++;
5826 case '{': /* (?{...}) */
5831 char *s = RExC_parse;
5833 RExC_seen_zerolen++;
5834 RExC_seen |= REG_SEEN_EVAL;
5835 while (count && (c = *RExC_parse)) {
5846 if (*RExC_parse != ')') {
5848 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5852 OP_4tree *sop, *rop;
5853 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5856 Perl_save_re_context(aTHX);
5857 rop = sv_compile_2op(sv, &sop, "re", &pad);
5858 sop->op_private |= OPpREFCOUNTED;
5859 /* re_dup will OpREFCNT_inc */
5860 OpREFCNT_set(sop, 1);
5863 n = add_data(pRExC_state, 3, "nop");
5864 RExC_rxi->data->data[n] = (void*)rop;
5865 RExC_rxi->data->data[n+1] = (void*)sop;
5866 RExC_rxi->data->data[n+2] = (void*)pad;
5869 else { /* First pass */
5870 if (PL_reginterp_cnt < ++RExC_seen_evals
5872 /* No compiled RE interpolated, has runtime
5873 components ===> unsafe. */
5874 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5875 if (PL_tainting && PL_tainted)
5876 FAIL("Eval-group in insecure regular expression");
5877 #if PERL_VERSION > 8
5878 if (IN_PERL_COMPILETIME)
5883 nextchar(pRExC_state);
5885 ret = reg_node(pRExC_state, LOGICAL);
5888 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5889 /* deal with the length of this later - MJD */
5892 ret = reganode(pRExC_state, EVAL, n);
5893 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5894 Set_Node_Offset(ret, parse_start);
5897 case '(': /* (?(?{...})...) and (?(?=...)...) */
5900 if (RExC_parse[0] == '?') { /* (?(?...)) */
5901 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5902 || RExC_parse[1] == '<'
5903 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5906 ret = reg_node(pRExC_state, LOGICAL);
5909 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5913 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5914 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5916 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5917 char *name_start= RExC_parse++;
5919 SV *sv_dat=reg_scan_name(pRExC_state,
5920 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5921 if (RExC_parse == name_start || *RExC_parse != ch)
5922 vFAIL2("Sequence (?(%c... not terminated",
5923 (ch == '>' ? '<' : ch));
5926 num = add_data( pRExC_state, 1, "S" );
5927 RExC_rxi->data->data[num]=(void*)sv_dat;
5928 SvREFCNT_inc_simple_void(sv_dat);
5930 ret = reganode(pRExC_state,NGROUPP,num);
5931 goto insert_if_check_paren;
5933 else if (RExC_parse[0] == 'D' &&
5934 RExC_parse[1] == 'E' &&
5935 RExC_parse[2] == 'F' &&
5936 RExC_parse[3] == 'I' &&
5937 RExC_parse[4] == 'N' &&
5938 RExC_parse[5] == 'E')
5940 ret = reganode(pRExC_state,DEFINEP,0);
5943 goto insert_if_check_paren;
5945 else if (RExC_parse[0] == 'R') {
5948 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5949 parno = atoi(RExC_parse++);
5950 while (isDIGIT(*RExC_parse))
5952 } else if (RExC_parse[0] == '&') {
5955 sv_dat = reg_scan_name(pRExC_state,
5956 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5957 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5959 ret = reganode(pRExC_state,INSUBP,parno);
5960 goto insert_if_check_paren;
5962 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5965 parno = atoi(RExC_parse++);
5967 while (isDIGIT(*RExC_parse))
5969 ret = reganode(pRExC_state, GROUPP, parno);
5971 insert_if_check_paren:
5972 if ((c = *nextchar(pRExC_state)) != ')')
5973 vFAIL("Switch condition not recognized");
5975 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5976 br = regbranch(pRExC_state, &flags, 1,depth+1);
5978 br = reganode(pRExC_state, LONGJMP, 0);
5980 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5981 c = *nextchar(pRExC_state);
5986 vFAIL("(?(DEFINE)....) does not allow branches");
5987 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5988 regbranch(pRExC_state, &flags, 1,depth+1);
5989 REGTAIL(pRExC_state, ret, lastbr);
5992 c = *nextchar(pRExC_state);
5997 vFAIL("Switch (?(condition)... contains too many branches");
5998 ender = reg_node(pRExC_state, TAIL);
5999 REGTAIL(pRExC_state, br, ender);
6001 REGTAIL(pRExC_state, lastbr, ender);
6002 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6005 REGTAIL(pRExC_state, ret, ender);
6006 RExC_size++; /* XXX WHY do we need this?!!
6007 For large programs it seems to be required
6008 but I can't figure out why. -- dmq*/
6012 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6016 RExC_parse--; /* for vFAIL to print correctly */
6017 vFAIL("Sequence (? incomplete");
6021 parse_flags: /* (?i) */
6023 U32 posflags = 0, negflags = 0;
6024 U32 *flagsp = &posflags;
6026 while (*RExC_parse) {
6027 /* && strchr("iogcmsx", *RExC_parse) */
6028 /* (?g), (?gc) and (?o) are useless here
6029 and must be globally applied -- japhy */
6030 switch (*RExC_parse) {
6031 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6032 case ONCE_PAT_MOD: /* 'o' */
6033 case GLOBAL_PAT_MOD: /* 'g' */
6034 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6035 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6036 if (! (wastedflags & wflagbit) ) {
6037 wastedflags |= wflagbit;
6040 "Useless (%s%c) - %suse /%c modifier",
6041 flagsp == &negflags ? "?-" : "?",
6043 flagsp == &negflags ? "don't " : "",
6050 case CONTINUE_PAT_MOD: /* 'c' */
6051 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6052 if (! (wastedflags & WASTED_C) ) {
6053 wastedflags |= WASTED_GC;
6056 "Useless (%sc) - %suse /gc modifier",
6057 flagsp == &negflags ? "?-" : "?",
6058 flagsp == &negflags ? "don't " : ""
6063 case KEEPCOPY_PAT_MOD: /* 'p' */
6064 if (flagsp == &negflags) {
6065 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
6066 vWARN(RExC_parse + 1,"Useless use of (?-p)");
6068 *flagsp |= RXf_PMf_KEEPCOPY;
6072 if (flagsp == &negflags) {
6074 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6078 wastedflags = 0; /* reset so (?g-c) warns twice */
6084 RExC_flags |= posflags;
6085 RExC_flags &= ~negflags;
6087 oregflags |= posflags;
6088 oregflags &= ~negflags;
6090 nextchar(pRExC_state);
6101 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6106 }} /* one for the default block, one for the switch */
6113 ret = reganode(pRExC_state, OPEN, parno);
6116 RExC_nestroot = parno;
6117 if (RExC_seen & REG_SEEN_RECURSE
6118 && !RExC_open_parens[parno-1])
6120 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6121 "Setting open paren #%"IVdf" to %d\n",
6122 (IV)parno, REG_NODE_NUM(ret)));
6123 RExC_open_parens[parno-1]= ret;
6126 Set_Node_Length(ret, 1); /* MJD */
6127 Set_Node_Offset(ret, RExC_parse); /* MJD */
6135 /* Pick up the branches, linking them together. */
6136 parse_start = RExC_parse; /* MJD */
6137 br = regbranch(pRExC_state, &flags, 1,depth+1);
6140 if (RExC_npar > after_freeze)
6141 after_freeze = RExC_npar;
6142 RExC_npar = freeze_paren;
6145 /* branch_len = (paren != 0); */
6149 if (*RExC_parse == '|') {
6150 if (!SIZE_ONLY && RExC_extralen) {
6151 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6154 reginsert(pRExC_state, BRANCH, br, depth+1);
6155 Set_Node_Length(br, paren != 0);
6156 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6160 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6162 else if (paren == ':') {
6163 *flagp |= flags&SIMPLE;
6165 if (is_open) { /* Starts with OPEN. */
6166 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6168 else if (paren != '?') /* Not Conditional */
6170 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6172 while (*RExC_parse == '|') {
6173 if (!SIZE_ONLY && RExC_extralen) {
6174 ender = reganode(pRExC_state, LONGJMP,0);
6175 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6178 RExC_extralen += 2; /* Account for LONGJMP. */
6179 nextchar(pRExC_state);
6181 if (RExC_npar > after_freeze)
6182 after_freeze = RExC_npar;
6183 RExC_npar = freeze_paren;
6185 br = regbranch(pRExC_state, &flags, 0, depth+1);
6189 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6191 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6194 if (have_branch || paren != ':') {
6195 /* Make a closing node, and hook it on the end. */
6198 ender = reg_node(pRExC_state, TAIL);
6201 ender = reganode(pRExC_state, CLOSE, parno);
6202 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6203 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6204 "Setting close paren #%"IVdf" to %d\n",
6205 (IV)parno, REG_NODE_NUM(ender)));
6206 RExC_close_parens[parno-1]= ender;
6207 if (RExC_nestroot == parno)
6210 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6211 Set_Node_Length(ender,1); /* MJD */
6217 *flagp &= ~HASWIDTH;
6220 ender = reg_node(pRExC_state, SUCCEED);
6223 ender = reg_node(pRExC_state, END);
6225 assert(!RExC_opend); /* there can only be one! */
6230 REGTAIL(pRExC_state, lastbr, ender);
6232 if (have_branch && !SIZE_ONLY) {
6234 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6236 /* Hook the tails of the branches to the closing node. */
6237 for (br = ret; br; br = regnext(br)) {
6238 const U8 op = PL_regkind[OP(br)];
6240 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6242 else if (op == BRANCHJ) {
6243 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6251 static const char parens[] = "=!<,>";
6253 if (paren && (p = strchr(parens, paren))) {
6254 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6255 int flag = (p - parens) > 1;
6258 node = SUSPEND, flag = 0;
6259 reginsert(pRExC_state, node,ret, depth+1);
6260 Set_Node_Cur_Length(ret);
6261 Set_Node_Offset(ret, parse_start + 1);
6263 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6267 /* Check for proper termination. */
6269 RExC_flags = oregflags;
6270 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6271 RExC_parse = oregcomp_parse;
6272 vFAIL("Unmatched (");
6275 else if (!paren && RExC_parse < RExC_end) {
6276 if (*RExC_parse == ')') {
6278 vFAIL("Unmatched )");
6281 FAIL("Junk on end of regexp"); /* "Can't happen". */
6285 RExC_npar = after_freeze;
6290 - regbranch - one alternative of an | operator
6292 * Implements the concatenation operator.
6295 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6298 register regnode *ret;
6299 register regnode *chain = NULL;
6300 register regnode *latest;
6301 I32 flags = 0, c = 0;
6302 GET_RE_DEBUG_FLAGS_DECL;
6304 PERL_ARGS_ASSERT_REGBRANCH;
6306 DEBUG_PARSE("brnc");
6311 if (!SIZE_ONLY && RExC_extralen)
6312 ret = reganode(pRExC_state, BRANCHJ,0);
6314 ret = reg_node(pRExC_state, BRANCH);
6315 Set_Node_Length(ret, 1);
6319 if (!first && SIZE_ONLY)
6320 RExC_extralen += 1; /* BRANCHJ */
6322 *flagp = WORST; /* Tentatively. */
6325 nextchar(pRExC_state);
6326 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6328 latest = regpiece(pRExC_state, &flags,depth+1);
6329 if (latest == NULL) {
6330 if (flags & TRYAGAIN)
6334 else if (ret == NULL)
6336 *flagp |= flags&(HASWIDTH|POSTPONED);
6337 if (chain == NULL) /* First piece. */
6338 *flagp |= flags&SPSTART;
6341 REGTAIL(pRExC_state, chain, latest);
6346 if (chain == NULL) { /* Loop ran zero times. */
6347 chain = reg_node(pRExC_state, NOTHING);
6352 *flagp |= flags&SIMPLE;
6359 - regpiece - something followed by possible [*+?]
6361 * Note that the branching code sequences used for ? and the general cases
6362 * of * and + are somewhat optimized: they use the same NOTHING node as
6363 * both the endmarker for their branch list and the body of the last branch.
6364 * It might seem that this node could be dispensed with entirely, but the
6365 * endmarker role is not redundant.
6368 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6371 register regnode *ret;
6373 register char *next;
6375 const char * const origparse = RExC_parse;
6377 I32 max = REG_INFTY;
6379 const char *maxpos = NULL;
6380 GET_RE_DEBUG_FLAGS_DECL;
6382 PERL_ARGS_ASSERT_REGPIECE;
6384 DEBUG_PARSE("piec");
6386 ret = regatom(pRExC_state, &flags,depth+1);
6388 if (flags & TRYAGAIN)
6395 if (op == '{' && regcurly(RExC_parse)) {
6397 parse_start = RExC_parse; /* MJD */
6398 next = RExC_parse + 1;
6399 while (isDIGIT(*next) || *next == ',') {
6408 if (*next == '}') { /* got one */
6412 min = atoi(RExC_parse);
6416 maxpos = RExC_parse;
6418 if (!max && *maxpos != '0')
6419 max = REG_INFTY; /* meaning "infinity" */
6420 else if (max >= REG_INFTY)
6421 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6423 nextchar(pRExC_state);
6426 if ((flags&SIMPLE)) {
6427 RExC_naughty += 2 + RExC_naughty / 2;
6428 reginsert(pRExC_state, CURLY, ret, depth+1);
6429 Set_Node_Offset(ret, parse_start+1); /* MJD */
6430 Set_Node_Cur_Length(ret);
6433 regnode * const w = reg_node(pRExC_state, WHILEM);
6436 REGTAIL(pRExC_state, ret, w);
6437 if (!SIZE_ONLY && RExC_extralen) {
6438 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6439 reginsert(pRExC_state, NOTHING,ret, depth+1);
6440 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6442 reginsert(pRExC_state, CURLYX,ret, depth+1);
6444 Set_Node_Offset(ret, parse_start+1);
6445 Set_Node_Length(ret,
6446 op == '{' ? (RExC_parse - parse_start) : 1);
6448 if (!SIZE_ONLY && RExC_extralen)
6449 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6450 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6452 RExC_whilem_seen++, RExC_extralen += 3;
6453 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6462 vFAIL("Can't do {n,m} with n > m");
6464 ARG1_SET(ret, (U16)min);
6465 ARG2_SET(ret, (U16)max);
6477 #if 0 /* Now runtime fix should be reliable. */
6479 /* if this is reinstated, don't forget to put this back into perldiag:
6481 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6483 (F) The part of the regexp subject to either the * or + quantifier
6484 could match an empty string. The {#} shows in the regular
6485 expression about where the problem was discovered.
6489 if (!(flags&HASWIDTH) && op != '?')
6490 vFAIL("Regexp *+ operand could be empty");
6493 parse_start = RExC_parse;
6494 nextchar(pRExC_state);
6496 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6498 if (op == '*' && (flags&SIMPLE)) {
6499 reginsert(pRExC_state, STAR, ret, depth+1);
6503 else if (op == '*') {
6507 else if (op == '+' && (flags&SIMPLE)) {
6508 reginsert(pRExC_state, PLUS, ret, depth+1);
6512 else if (op == '+') {
6516 else if (op == '?') {
6521 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6523 "%.*s matches null string many times",
6524 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6528 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6529 nextchar(pRExC_state);
6530 reginsert(pRExC_state, MINMOD, ret, depth+1);
6531 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6533 #ifndef REG_ALLOW_MINMOD_SUSPEND
6536 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6538 nextchar(pRExC_state);
6539 ender = reg_node(pRExC_state, SUCCEED);
6540 REGTAIL(pRExC_state, ret, ender);
6541 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6543 ender = reg_node(pRExC_state, TAIL);
6544 REGTAIL(pRExC_state, ret, ender);
6548 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6550 vFAIL("Nested quantifiers");
6557 /* reg_namedseq(pRExC_state,UVp)
6559 This is expected to be called by a parser routine that has
6560 recognized'\N' and needs to handle the rest. RExC_parse is
6561 expected to point at the first char following the N at the time
6564 If valuep is non-null then it is assumed that we are parsing inside
6565 of a charclass definition and the first codepoint in the resolved
6566 string is returned via *valuep and the routine will return NULL.
6567 In this mode if a multichar string is returned from the charnames
6568 handler a warning will be issued, and only the first char in the
6569 sequence will be examined. If the string returned is zero length
6570 then the value of *valuep is undefined and NON-NULL will
6571 be returned to indicate failure. (This will NOT be a valid pointer
6574 If value is null then it is assumed that we are parsing normal text
6575 and inserts a new EXACT node into the program containing the resolved
6576 string and returns a pointer to the new node. If the string is
6577 zerolength a NOTHING node is emitted.
6579 On success RExC_parse is set to the char following the endbrace.
6580 Parsing failures will generate a fatal errorvia vFAIL(...)
6582 NOTE: We cache all results from the charnames handler locally in
6583 the RExC_charnames hash (created on first use) to prevent a charnames
6584 handler from playing silly-buggers and returning a short string and
6585 then a long string for a given pattern. Since the regexp program
6586 size is calculated during an initial parse this would result
6587 in a buffer overrun so we cache to prevent the charname result from
6588 changing during the course of the parse.
6592 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6594 char * name; /* start of the content of the name */
6595 char * endbrace; /* endbrace following the name */
6598 STRLEN len; /* this has various purposes throughout the code */
6599 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6600 regnode *ret = NULL;
6602 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6604 if (*RExC_parse != '{') {
6605 vFAIL("Missing braces on \\N{}");
6607 name = RExC_parse+1;
6608 endbrace = strchr(RExC_parse, '}');
6611 vFAIL("Missing right brace on \\N{}");
6613 RExC_parse = endbrace + 1;
6616 /* RExC_parse points at the beginning brace,
6617 endbrace points at the last */
6618 if ( name[0]=='U' && name[1]=='+' ) {
6619 /* its a "Unicode hex" notation {U+89AB} */
6620 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6621 | PERL_SCAN_DISALLOW_PREFIX
6622 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6624 len = (STRLEN)(endbrace - name - 2);
6625 cp = grok_hex(name + 2, &len, &fl, NULL);
6626 if ( len != (STRLEN)(endbrace - name - 2) ) {
6630 if (cp > 0xff) RExC_utf8 = 1;
6635 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6636 * is going to be in utf8 and the representation changes under utf8. */
6637 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6638 U8 string[UTF8_MAXBYTES+1];
6641 tmps = uvuni_to_utf8(string, cp);
6642 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6643 } else { /* Otherwise, no need for utf8, can skip that step */
6646 sv_str= newSVpvn(&string, 1);
6649 /* fetch the charnames handler for this scope */
6650 HV * const table = GvHV(PL_hintgv);
6652 hv_fetchs(table, "charnames", FALSE) :
6654 SV *cv= cvp ? *cvp : NULL;
6657 /* create an SV with the name as argument */
6658 sv_name = newSVpvn(name, endbrace - name);
6660 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6661 vFAIL2("Constant(\\N{%s}) unknown: "
6662 "(possibly a missing \"use charnames ...\")",
6665 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6666 vFAIL2("Constant(\\N{%s}): "
6667 "$^H{charnames} is not defined",SvPVX(sv_name));
6672 if (!RExC_charnames) {
6673 /* make sure our cache is allocated */
6674 RExC_charnames = newHV();
6675 sv_2mortal(MUTABLE_SV(RExC_charnames));
6677 /* see if we have looked this one up before */
6678 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6680 sv_str = HeVAL(he_str);
6693 count= call_sv(cv, G_SCALAR);
6695 if (count == 1) { /* XXXX is this right? dmq */
6697 SvREFCNT_inc_simple_void(sv_str);
6705 if ( !sv_str || !SvOK(sv_str) ) {
6706 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6707 "did not return a defined value",SvPVX(sv_name));
6709 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6714 char *p = SvPV(sv_str, len);
6717 if ( SvUTF8(sv_str) ) {
6718 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6722 We have to turn on utf8 for high bit chars otherwise
6723 we get failures with
6725 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6726 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6728 This is different from what \x{} would do with the same
6729 codepoint, where the condition is > 0xFF.
6736 /* warn if we havent used the whole string? */
6738 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6740 "Ignoring excess chars from \\N{%s} in character class",
6744 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6746 "Ignoring zero length \\N{%s} in character class",
6751 SvREFCNT_dec(sv_name);
6753 SvREFCNT_dec(sv_str);
6754 return len ? NULL : (regnode *)&len;
6755 } else if(SvCUR(sv_str)) {
6761 char * parse_start = name-3; /* needed for the offsets */
6763 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6765 ret = reg_node(pRExC_state,
6766 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6769 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6770 sv_utf8_upgrade(sv_str);
6771 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6775 p = SvPV(sv_str, len);
6777 /* len is the length written, charlen is the size the char read */
6778 for ( len = 0; p < pend; p += charlen ) {
6780 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6782 STRLEN foldlen,numlen;
6783 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6784 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6785 /* Emit all the Unicode characters. */
6787 for (foldbuf = tmpbuf;
6791 uvc = utf8_to_uvchr(foldbuf, &numlen);
6793 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6796 /* In EBCDIC the numlen
6797 * and unilen can differ. */
6799 if (numlen >= foldlen)
6803 break; /* "Can't happen." */
6806 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6818 RExC_size += STR_SZ(len);
6821 RExC_emit += STR_SZ(len);
6823 Set_Node_Cur_Length(ret); /* MJD */
6825 nextchar(pRExC_state);
6826 } else { /* zero length */
6827 ret = reg_node(pRExC_state,NOTHING);
6830 SvREFCNT_dec(sv_str);
6833 SvREFCNT_dec(sv_name);
6843 * It returns the code point in utf8 for the value in *encp.
6844 * value: a code value in the source encoding
6845 * encp: a pointer to an Encode object
6847 * If the result from Encode is not a single character,
6848 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6851 S_reg_recode(pTHX_ const char value, SV **encp)
6854 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6855 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6856 const STRLEN newlen = SvCUR(sv);
6857 UV uv = UNICODE_REPLACEMENT;
6859 PERL_ARGS_ASSERT_REG_RECODE;
6863 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6866 if (!newlen || numlen != newlen) {
6867 uv = UNICODE_REPLACEMENT;
6875 - regatom - the lowest level
6877 Try to identify anything special at the start of the pattern. If there
6878 is, then handle it as required. This may involve generating a single regop,
6879 such as for an assertion; or it may involve recursing, such as to
6880 handle a () structure.
6882 If the string doesn't start with something special then we gobble up
6883 as much literal text as we can.
6885 Once we have been able to handle whatever type of thing started the
6886 sequence, we return.
6888 Note: we have to be careful with escapes, as they can be both literal
6889 and special, and in the case of \10 and friends can either, depending
6890 on context. Specifically there are two seperate switches for handling
6891 escape sequences, with the one for handling literal escapes requiring
6892 a dummy entry for all of the special escapes that are actually handled
6897 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6900 register regnode *ret = NULL;
6902 char *parse_start = RExC_parse;
6903 GET_RE_DEBUG_FLAGS_DECL;
6904 DEBUG_PARSE("atom");
6905 *flagp = WORST; /* Tentatively. */
6907 PERL_ARGS_ASSERT_REGATOM;
6910 switch ((U8)*RExC_parse) {
6912 RExC_seen_zerolen++;
6913 nextchar(pRExC_state);
6914 if (RExC_flags & RXf_PMf_MULTILINE)
6915 ret = reg_node(pRExC_state, MBOL);
6916 else if (RExC_flags & RXf_PMf_SINGLELINE)
6917 ret = reg_node(pRExC_state, SBOL);
6919 ret = reg_node(pRExC_state, BOL);
6920 Set_Node_Length(ret, 1); /* MJD */
6923 nextchar(pRExC_state);
6925 RExC_seen_zerolen++;
6926 if (RExC_flags & RXf_PMf_MULTILINE)
6927 ret = reg_node(pRExC_state, MEOL);
6928 else if (RExC_flags & RXf_PMf_SINGLELINE)
6929 ret = reg_node(pRExC_state, SEOL);
6931 ret = reg_node(pRExC_state, EOL);
6932 Set_Node_Length(ret, 1); /* MJD */
6935 nextchar(pRExC_state);
6936 if (RExC_flags & RXf_PMf_SINGLELINE)
6937 ret = reg_node(pRExC_state, SANY);
6939 ret = reg_node(pRExC_state, REG_ANY);
6940 *flagp |= HASWIDTH|SIMPLE;
6942 Set_Node_Length(ret, 1); /* MJD */
6946 char * const oregcomp_parse = ++RExC_parse;
6947 ret = regclass(pRExC_state,depth+1);
6948 if (*RExC_parse != ']') {
6949 RExC_parse = oregcomp_parse;
6950 vFAIL("Unmatched [");
6952 nextchar(pRExC_state);
6953 *flagp |= HASWIDTH|SIMPLE;
6954 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6958 nextchar(pRExC_state);
6959 ret = reg(pRExC_state, 1, &flags,depth+1);
6961 if (flags & TRYAGAIN) {
6962 if (RExC_parse == RExC_end) {
6963 /* Make parent create an empty node if needed. */
6971 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6975 if (flags & TRYAGAIN) {
6979 vFAIL("Internal urp");
6980 /* Supposed to be caught earlier. */
6983 if (!regcurly(RExC_parse)) {
6992 vFAIL("Quantifier follows nothing");
7000 len=0; /* silence a spurious compiler warning */
7001 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7002 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7003 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7004 ret = reganode(pRExC_state, FOLDCHAR, cp);
7005 Set_Node_Length(ret, 1); /* MJD */
7006 nextchar(pRExC_state); /* kill whitespace under /x */
7014 This switch handles escape sequences that resolve to some kind
7015 of special regop and not to literal text. Escape sequnces that
7016 resolve to literal text are handled below in the switch marked
7019 Every entry in this switch *must* have a corresponding entry
7020 in the literal escape switch. However, the opposite is not
7021 required, as the default for this switch is to jump to the
7022 literal text handling code.
7024 switch ((U8)*++RExC_parse) {
7029 /* Special Escapes */
7031 RExC_seen_zerolen++;
7032 ret = reg_node(pRExC_state, SBOL);
7034 goto finish_meta_pat;
7036 ret = reg_node(pRExC_state, GPOS);
7037 RExC_seen |= REG_SEEN_GPOS;
7039 goto finish_meta_pat;
7041 RExC_seen_zerolen++;
7042 ret = reg_node(pRExC_state, KEEPS);
7044 /* XXX:dmq : disabling in-place substitution seems to
7045 * be necessary here to avoid cases of memory corruption, as
7046 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7048 RExC_seen |= REG_SEEN_LOOKBEHIND;
7049 goto finish_meta_pat;
7051 ret = reg_node(pRExC_state, SEOL);
7053 RExC_seen_zerolen++; /* Do not optimize RE away */
7054 goto finish_meta_pat;
7056 ret = reg_node(pRExC_state, EOS);
7058 RExC_seen_zerolen++; /* Do not optimize RE away */
7059 goto finish_meta_pat;
7061 ret = reg_node(pRExC_state, CANY);
7062 RExC_seen |= REG_SEEN_CANY;
7063 *flagp |= HASWIDTH|SIMPLE;
7064 goto finish_meta_pat;
7066 ret = reg_node(pRExC_state, CLUMP);
7068 goto finish_meta_pat;
7070 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7071 *flagp |= HASWIDTH|SIMPLE;
7072 goto finish_meta_pat;
7074 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7075 *flagp |= HASWIDTH|SIMPLE;
7076 goto finish_meta_pat;
7078 RExC_seen_zerolen++;
7079 RExC_seen |= REG_SEEN_LOOKBEHIND;
7080 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7082 goto finish_meta_pat;
7084 RExC_seen_zerolen++;
7085 RExC_seen |= REG_SEEN_LOOKBEHIND;
7086 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7088 goto finish_meta_pat;
7090 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7091 *flagp |= HASWIDTH|SIMPLE;
7092 goto finish_meta_pat;
7094 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7095 *flagp |= HASWIDTH|SIMPLE;
7096 goto finish_meta_pat;
7098 ret = reg_node(pRExC_state, DIGIT);
7099 *flagp |= HASWIDTH|SIMPLE;
7100 goto finish_meta_pat;
7102 ret = reg_node(pRExC_state, NDIGIT);
7103 *flagp |= HASWIDTH|SIMPLE;
7104 goto finish_meta_pat;
7106 ret = reg_node(pRExC_state, LNBREAK);
7107 *flagp |= HASWIDTH|SIMPLE;
7108 goto finish_meta_pat;
7110 ret = reg_node(pRExC_state, HORIZWS);
7111 *flagp |= HASWIDTH|SIMPLE;
7112 goto finish_meta_pat;
7114 ret = reg_node(pRExC_state, NHORIZWS);
7115 *flagp |= HASWIDTH|SIMPLE;
7116 goto finish_meta_pat;
7118 ret = reg_node(pRExC_state, VERTWS);
7119 *flagp |= HASWIDTH|SIMPLE;
7120 goto finish_meta_pat;
7122 ret = reg_node(pRExC_state, NVERTWS);
7123 *flagp |= HASWIDTH|SIMPLE;
7125 nextchar(pRExC_state);
7126 Set_Node_Length(ret, 2); /* MJD */
7131 char* const oldregxend = RExC_end;
7133 char* parse_start = RExC_parse - 2;
7136 if (RExC_parse[1] == '{') {
7137 /* a lovely hack--pretend we saw [\pX] instead */
7138 RExC_end = strchr(RExC_parse, '}');
7140 const U8 c = (U8)*RExC_parse;
7142 RExC_end = oldregxend;
7143 vFAIL2("Missing right brace on \\%c{}", c);
7148 RExC_end = RExC_parse + 2;
7149 if (RExC_end > oldregxend)
7150 RExC_end = oldregxend;
7154 ret = regclass(pRExC_state,depth+1);
7156 RExC_end = oldregxend;
7159 Set_Node_Offset(ret, parse_start + 2);
7160 Set_Node_Cur_Length(ret);
7161 nextchar(pRExC_state);
7162 *flagp |= HASWIDTH|SIMPLE;
7166 /* Handle \N{NAME} here and not below because it can be
7167 multicharacter. join_exact() will join them up later on.
7168 Also this makes sure that things like /\N{BLAH}+/ and
7169 \N{BLAH} being multi char Just Happen. dmq*/
7171 ret= reg_namedseq(pRExC_state, NULL);
7173 case 'k': /* Handle \k<NAME> and \k'NAME' */
7176 char ch= RExC_parse[1];
7177 if (ch != '<' && ch != '\'' && ch != '{') {
7179 vFAIL2("Sequence %.2s... not terminated",parse_start);
7181 /* this pretty much dupes the code for (?P=...) in reg(), if
7182 you change this make sure you change that */
7183 char* name_start = (RExC_parse += 2);
7185 SV *sv_dat = reg_scan_name(pRExC_state,
7186 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7187 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7188 if (RExC_parse == name_start || *RExC_parse != ch)
7189 vFAIL2("Sequence %.3s... not terminated",parse_start);
7192 num = add_data( pRExC_state, 1, "S" );
7193 RExC_rxi->data->data[num]=(void*)sv_dat;
7194 SvREFCNT_inc_simple_void(sv_dat);
7198 ret = reganode(pRExC_state,
7199 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7203 /* override incorrect value set in reganode MJD */
7204 Set_Node_Offset(ret, parse_start+1);
7205 Set_Node_Cur_Length(ret); /* MJD */
7206 nextchar(pRExC_state);
7212 case '1': case '2': case '3': case '4':
7213 case '5': case '6': case '7': case '8': case '9':
7216 bool isg = *RExC_parse == 'g';
7221 if (*RExC_parse == '{') {
7225 if (*RExC_parse == '-') {
7229 if (hasbrace && !isDIGIT(*RExC_parse)) {
7230 if (isrel) RExC_parse--;
7232 goto parse_named_seq;
7234 num = atoi(RExC_parse);
7235 if (isg && num == 0)
7236 vFAIL("Reference to invalid group 0");
7238 num = RExC_npar - num;
7240 vFAIL("Reference to nonexistent or unclosed group");
7242 if (!isg && num > 9 && num >= RExC_npar)
7245 char * const parse_start = RExC_parse - 1; /* MJD */
7246 while (isDIGIT(*RExC_parse))
7248 if (parse_start == RExC_parse - 1)
7249 vFAIL("Unterminated \\g... pattern");
7251 if (*RExC_parse != '}')
7252 vFAIL("Unterminated \\g{...} pattern");
7256 if (num > (I32)RExC_rx->nparens)
7257 vFAIL("Reference to nonexistent group");
7260 ret = reganode(pRExC_state,
7261 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7265 /* override incorrect value set in reganode MJD */
7266 Set_Node_Offset(ret, parse_start+1);
7267 Set_Node_Cur_Length(ret); /* MJD */
7269 nextchar(pRExC_state);
7274 if (RExC_parse >= RExC_end)
7275 FAIL("Trailing \\");
7278 /* Do not generate "unrecognized" warnings here, we fall
7279 back into the quick-grab loop below */
7286 if (RExC_flags & RXf_PMf_EXTENDED) {
7287 if ( reg_skipcomment( pRExC_state ) )
7294 register STRLEN len;
7299 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7301 parse_start = RExC_parse - 1;
7307 ret = reg_node(pRExC_state,
7308 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7310 for (len = 0, p = RExC_parse - 1;
7311 len < 127 && p < RExC_end;
7314 char * const oldp = p;
7316 if (RExC_flags & RXf_PMf_EXTENDED)
7317 p = regwhite( pRExC_state, p );
7322 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7323 goto normal_default;
7333 /* Literal Escapes Switch
7335 This switch is meant to handle escape sequences that
7336 resolve to a literal character.
7338 Every escape sequence that represents something
7339 else, like an assertion or a char class, is handled
7340 in the switch marked 'Special Escapes' above in this
7341 routine, but also has an entry here as anything that
7342 isn't explicitly mentioned here will be treated as
7343 an unescaped equivalent literal.
7347 /* These are all the special escapes. */
7351 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7352 goto normal_default;
7353 case 'A': /* Start assertion */
7354 case 'b': case 'B': /* Word-boundary assertion*/
7355 case 'C': /* Single char !DANGEROUS! */
7356 case 'd': case 'D': /* digit class */
7357 case 'g': case 'G': /* generic-backref, pos assertion */
7358 case 'h': case 'H': /* HORIZWS */
7359 case 'k': case 'K': /* named backref, keep marker */
7360 case 'N': /* named char sequence */
7361 case 'p': case 'P': /* Unicode property */
7362 case 'R': /* LNBREAK */
7363 case 's': case 'S': /* space class */
7364 case 'v': case 'V': /* VERTWS */
7365 case 'w': case 'W': /* word class */
7366 case 'X': /* eXtended Unicode "combining character sequence" */
7367 case 'z': case 'Z': /* End of line/string assertion */
7371 /* Anything after here is an escape that resolves to a
7372 literal. (Except digits, which may or may not)
7391 ender = ASCII_TO_NATIVE('\033');
7395 ender = ASCII_TO_NATIVE('\007');
7400 char* const e = strchr(p, '}');
7404 vFAIL("Missing right brace on \\x{}");
7407 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7408 | PERL_SCAN_DISALLOW_PREFIX;
7409 STRLEN numlen = e - p - 1;
7410 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7417 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7419 ender = grok_hex(p, &numlen, &flags, NULL);
7422 if (PL_encoding && ender < 0x100)
7423 goto recode_encoding;
7427 ender = UCHARAT(p++);
7428 ender = toCTRL(ender);
7430 case '0': case '1': case '2': case '3':case '4':
7431 case '5': case '6': case '7': case '8':case '9':
7433 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7436 ender = grok_oct(p, &numlen, &flags, NULL);
7443 if (PL_encoding && ender < 0x100)
7444 goto recode_encoding;
7448 SV* enc = PL_encoding;
7449 ender = reg_recode((const char)(U8)ender, &enc);
7450 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7451 vWARN(p, "Invalid escape in the specified encoding");
7457 FAIL("Trailing \\");
7460 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7461 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7462 goto normal_default;
7467 if (UTF8_IS_START(*p) && UTF) {
7469 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7470 &numlen, UTF8_ALLOW_DEFAULT);
7477 if ( RExC_flags & RXf_PMf_EXTENDED)
7478 p = regwhite( pRExC_state, p );
7480 /* Prime the casefolded buffer. */
7481 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7483 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7488 /* Emit all the Unicode characters. */
7490 for (foldbuf = tmpbuf;
7492 foldlen -= numlen) {
7493 ender = utf8_to_uvchr(foldbuf, &numlen);
7495 const STRLEN unilen = reguni(pRExC_state, ender, s);
7498 /* In EBCDIC the numlen
7499 * and unilen can differ. */
7501 if (numlen >= foldlen)
7505 break; /* "Can't happen." */
7509 const STRLEN unilen = reguni(pRExC_state, ender, s);
7518 REGC((char)ender, s++);
7524 /* Emit all the Unicode characters. */
7526 for (foldbuf = tmpbuf;
7528 foldlen -= numlen) {
7529 ender = utf8_to_uvchr(foldbuf, &numlen);
7531 const STRLEN unilen = reguni(pRExC_state, ender, s);
7534 /* In EBCDIC the numlen
7535 * and unilen can differ. */
7537 if (numlen >= foldlen)
7545 const STRLEN unilen = reguni(pRExC_state, ender, s);
7554 REGC((char)ender, s++);
7558 Set_Node_Cur_Length(ret); /* MJD */
7559 nextchar(pRExC_state);
7561 /* len is STRLEN which is unsigned, need to copy to signed */
7564 vFAIL("Internal disaster");
7568 if (len == 1 && UNI_IS_INVARIANT(ender))
7572 RExC_size += STR_SZ(len);
7575 RExC_emit += STR_SZ(len);
7585 S_regwhite( RExC_state_t *pRExC_state, char *p )
7587 const char *e = RExC_end;
7589 PERL_ARGS_ASSERT_REGWHITE;
7594 else if (*p == '#') {
7603 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7611 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7612 Character classes ([:foo:]) can also be negated ([:^foo:]).
7613 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7614 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7615 but trigger failures because they are currently unimplemented. */
7617 #define POSIXCC_DONE(c) ((c) == ':')
7618 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7619 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7622 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7625 I32 namedclass = OOB_NAMEDCLASS;
7627 PERL_ARGS_ASSERT_REGPPOSIXCC;
7629 if (value == '[' && RExC_parse + 1 < RExC_end &&
7630 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7631 POSIXCC(UCHARAT(RExC_parse))) {
7632 const char c = UCHARAT(RExC_parse);
7633 char* const s = RExC_parse++;
7635 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7637 if (RExC_parse == RExC_end)
7638 /* Grandfather lone [:, [=, [. */
7641 const char* const t = RExC_parse++; /* skip over the c */
7644 if (UCHARAT(RExC_parse) == ']') {
7645 const char *posixcc = s + 1;
7646 RExC_parse++; /* skip over the ending ] */
7649 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7650 const I32 skip = t - posixcc;
7652 /* Initially switch on the length of the name. */
7655 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7656 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7659 /* Names all of length 5. */
7660 /* alnum alpha ascii blank cntrl digit graph lower
7661 print punct space upper */
7662 /* Offset 4 gives the best switch position. */
7663 switch (posixcc[4]) {
7665 if (memEQ(posixcc, "alph", 4)) /* alpha */
7666 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7669 if (memEQ(posixcc, "spac", 4)) /* space */
7670 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7673 if (memEQ(posixcc, "grap", 4)) /* graph */
7674 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7677 if (memEQ(posixcc, "asci", 4)) /* ascii */
7678 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7681 if (memEQ(posixcc, "blan", 4)) /* blank */
7682 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7685 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7686 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7689 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7690 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7693 if (memEQ(posixcc, "lowe", 4)) /* lower */
7694 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7695 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7696 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7699 if (memEQ(posixcc, "digi", 4)) /* digit */
7700 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7701 else if (memEQ(posixcc, "prin", 4)) /* print */
7702 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7703 else if (memEQ(posixcc, "punc", 4)) /* punct */
7704 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7709 if (memEQ(posixcc, "xdigit", 6))
7710 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7714 if (namedclass == OOB_NAMEDCLASS)
7715 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7717 assert (posixcc[skip] == ':');
7718 assert (posixcc[skip+1] == ']');
7719 } else if (!SIZE_ONLY) {
7720 /* [[=foo=]] and [[.foo.]] are still future. */
7722 /* adjust RExC_parse so the warning shows after
7724 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7726 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7729 /* Maternal grandfather:
7730 * "[:" ending in ":" but not in ":]" */
7740 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7744 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7746 if (POSIXCC(UCHARAT(RExC_parse))) {
7747 const char *s = RExC_parse;
7748 const char c = *s++;
7752 if (*s && c == *s && s[1] == ']') {
7753 if (ckWARN(WARN_REGEXP))
7755 "POSIX syntax [%c %c] belongs inside character classes",
7758 /* [[=foo=]] and [[.foo.]] are still future. */
7759 if (POSIXCC_NOTYET(c)) {
7760 /* adjust RExC_parse so the error shows after
7762 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7764 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7771 #define _C_C_T_(NAME,TEST,WORD) \
7774 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7776 for (value = 0; value < 256; value++) \
7778 ANYOF_BITMAP_SET(ret, value); \
7783 case ANYOF_N##NAME: \
7785 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7787 for (value = 0; value < 256; value++) \
7789 ANYOF_BITMAP_SET(ret, value); \
7795 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7797 for (value = 0; value < 256; value++) \
7799 ANYOF_BITMAP_SET(ret, value); \
7803 case ANYOF_N##NAME: \
7804 for (value = 0; value < 256; value++) \
7806 ANYOF_BITMAP_SET(ret, value); \
7812 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7813 so that it is possible to override the option here without having to
7814 rebuild the entire core. as we are required to do if we change regcomp.h
7815 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7817 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7818 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7821 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7822 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7824 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7828 parse a class specification and produce either an ANYOF node that
7829 matches the pattern or if the pattern matches a single char only and
7830 that char is < 256 and we are case insensitive then we produce an
7835 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7838 register UV nextvalue;
7839 register IV prevvalue = OOB_UNICODE;
7840 register IV range = 0;
7841 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7842 register regnode *ret;
7845 char *rangebegin = NULL;
7846 bool need_class = 0;
7849 bool optimize_invert = TRUE;
7850 AV* unicode_alternate = NULL;
7852 UV literal_endpoint = 0;
7854 UV stored = 0; /* number of chars stored in the class */
7856 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7857 case we need to change the emitted regop to an EXACT. */
7858 const char * orig_parse = RExC_parse;
7859 GET_RE_DEBUG_FLAGS_DECL;
7861 PERL_ARGS_ASSERT_REGCLASS;
7863 PERL_UNUSED_ARG(depth);
7866 DEBUG_PARSE("clas");
7868 /* Assume we are going to generate an ANYOF node. */
7869 ret = reganode(pRExC_state, ANYOF, 0);
7872 ANYOF_FLAGS(ret) = 0;
7874 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7878 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7882 RExC_size += ANYOF_SKIP;
7883 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7886 RExC_emit += ANYOF_SKIP;
7888 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7890 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7891 ANYOF_BITMAP_ZERO(ret);
7892 listsv = newSVpvs("# comment\n");
7895 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7897 if (!SIZE_ONLY && POSIXCC(nextvalue))
7898 checkposixcc(pRExC_state);
7900 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7901 if (UCHARAT(RExC_parse) == ']')
7905 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7909 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7912 rangebegin = RExC_parse;
7914 value = utf8n_to_uvchr((U8*)RExC_parse,
7915 RExC_end - RExC_parse,
7916 &numlen, UTF8_ALLOW_DEFAULT);
7917 RExC_parse += numlen;
7920 value = UCHARAT(RExC_parse++);
7922 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7923 if (value == '[' && POSIXCC(nextvalue))
7924 namedclass = regpposixcc(pRExC_state, value);
7925 else if (value == '\\') {
7927 value = utf8n_to_uvchr((U8*)RExC_parse,
7928 RExC_end - RExC_parse,
7929 &numlen, UTF8_ALLOW_DEFAULT);
7930 RExC_parse += numlen;
7933 value = UCHARAT(RExC_parse++);
7934 /* Some compilers cannot handle switching on 64-bit integer
7935 * values, therefore value cannot be an UV. Yes, this will
7936 * be a problem later if we want switch on Unicode.
7937 * A similar issue a little bit later when switching on
7938 * namedclass. --jhi */
7939 switch ((I32)value) {
7940 case 'w': namedclass = ANYOF_ALNUM; break;
7941 case 'W': namedclass = ANYOF_NALNUM; break;
7942 case 's': namedclass = ANYOF_SPACE; break;
7943 case 'S': namedclass = ANYOF_NSPACE; break;
7944 case 'd': namedclass = ANYOF_DIGIT; break;
7945 case 'D': namedclass = ANYOF_NDIGIT; break;
7946 case 'v': namedclass = ANYOF_VERTWS; break;
7947 case 'V': namedclass = ANYOF_NVERTWS; break;
7948 case 'h': namedclass = ANYOF_HORIZWS; break;
7949 case 'H': namedclass = ANYOF_NHORIZWS; break;
7950 case 'N': /* Handle \N{NAME} in class */
7952 /* We only pay attention to the first char of
7953 multichar strings being returned. I kinda wonder
7954 if this makes sense as it does change the behaviour
7955 from earlier versions, OTOH that behaviour was broken
7957 UV v; /* value is register so we cant & it /grrr */
7958 if (reg_namedseq(pRExC_state, &v)) {
7968 if (RExC_parse >= RExC_end)
7969 vFAIL2("Empty \\%c{}", (U8)value);
7970 if (*RExC_parse == '{') {
7971 const U8 c = (U8)value;
7972 e = strchr(RExC_parse++, '}');
7974 vFAIL2("Missing right brace on \\%c{}", c);
7975 while (isSPACE(UCHARAT(RExC_parse)))
7977 if (e == RExC_parse)
7978 vFAIL2("Empty \\%c{}", c);
7980 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7988 if (UCHARAT(RExC_parse) == '^') {
7991 value = value == 'p' ? 'P' : 'p'; /* toggle */
7992 while (isSPACE(UCHARAT(RExC_parse))) {
7997 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7998 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8001 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8002 namedclass = ANYOF_MAX; /* no official name, but it's named */
8005 case 'n': value = '\n'; break;
8006 case 'r': value = '\r'; break;
8007 case 't': value = '\t'; break;
8008 case 'f': value = '\f'; break;
8009 case 'b': value = '\b'; break;
8010 case 'e': value = ASCII_TO_NATIVE('\033');break;
8011 case 'a': value = ASCII_TO_NATIVE('\007');break;
8013 if (*RExC_parse == '{') {
8014 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8015 | PERL_SCAN_DISALLOW_PREFIX;
8016 char * const e = strchr(RExC_parse++, '}');
8018 vFAIL("Missing right brace on \\x{}");
8020 numlen = e - RExC_parse;
8021 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8025 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8027 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8028 RExC_parse += numlen;
8030 if (PL_encoding && value < 0x100)
8031 goto recode_encoding;
8034 value = UCHARAT(RExC_parse++);
8035 value = toCTRL(value);
8037 case '0': case '1': case '2': case '3': case '4':
8038 case '5': case '6': case '7': case '8': case '9':
8042 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8043 RExC_parse += numlen;
8044 if (PL_encoding && value < 0x100)
8045 goto recode_encoding;
8050 SV* enc = PL_encoding;
8051 value = reg_recode((const char)(U8)value, &enc);
8052 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
8054 "Invalid escape in the specified encoding");
8058 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
8060 "Unrecognized escape \\%c in character class passed through",
8064 } /* end of \blah */
8070 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8072 if (!SIZE_ONLY && !need_class)
8073 ANYOF_CLASS_ZERO(ret);
8077 /* a bad range like a-\d, a-[:digit:] ? */
8080 if (ckWARN(WARN_REGEXP)) {
8082 RExC_parse >= rangebegin ?
8083 RExC_parse - rangebegin : 0;
8085 "False [] range \"%*.*s\"",
8088 if (prevvalue < 256) {
8089 ANYOF_BITMAP_SET(ret, prevvalue);
8090 ANYOF_BITMAP_SET(ret, '-');
8093 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8094 Perl_sv_catpvf(aTHX_ listsv,
8095 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8099 range = 0; /* this was not a true range */
8105 const char *what = NULL;
8108 if (namedclass > OOB_NAMEDCLASS)
8109 optimize_invert = FALSE;
8110 /* Possible truncation here but in some 64-bit environments
8111 * the compiler gets heartburn about switch on 64-bit values.
8112 * A similar issue a little earlier when switching on value.
8114 switch ((I32)namedclass) {
8116 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8117 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8118 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8119 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8120 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8121 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8122 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8123 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8124 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8125 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8126 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8127 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8128 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8130 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8131 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8133 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8134 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8135 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8138 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8141 for (value = 0; value < 128; value++)
8142 ANYOF_BITMAP_SET(ret, value);
8144 for (value = 0; value < 256; value++) {
8146 ANYOF_BITMAP_SET(ret, value);
8155 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8158 for (value = 128; value < 256; value++)
8159 ANYOF_BITMAP_SET(ret, value);
8161 for (value = 0; value < 256; value++) {
8162 if (!isASCII(value))
8163 ANYOF_BITMAP_SET(ret, value);
8172 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8174 /* consecutive digits assumed */
8175 for (value = '0'; value <= '9'; value++)
8176 ANYOF_BITMAP_SET(ret, value);
8179 what = POSIX_CC_UNI_NAME("Digit");
8183 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8185 /* consecutive digits assumed */
8186 for (value = 0; value < '0'; value++)
8187 ANYOF_BITMAP_SET(ret, value);
8188 for (value = '9' + 1; value < 256; value++)
8189 ANYOF_BITMAP_SET(ret, value);
8192 what = POSIX_CC_UNI_NAME("Digit");
8195 /* this is to handle \p and \P */
8198 vFAIL("Invalid [::] class");
8202 /* Strings such as "+utf8::isWord\n" */
8203 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8206 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8209 } /* end of namedclass \blah */
8212 if (prevvalue > (IV)value) /* b-a */ {
8213 const int w = RExC_parse - rangebegin;
8214 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8215 range = 0; /* not a valid range */
8219 prevvalue = value; /* save the beginning of the range */
8220 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8221 RExC_parse[1] != ']') {
8224 /* a bad range like \w-, [:word:]- ? */
8225 if (namedclass > OOB_NAMEDCLASS) {
8226 if (ckWARN(WARN_REGEXP)) {
8228 RExC_parse >= rangebegin ?
8229 RExC_parse - rangebegin : 0;
8231 "False [] range \"%*.*s\"",
8235 ANYOF_BITMAP_SET(ret, '-');
8237 range = 1; /* yeah, it's a range! */
8238 continue; /* but do it the next time */
8242 /* now is the next time */
8243 /*stored += (value - prevvalue + 1);*/
8245 if (prevvalue < 256) {
8246 const IV ceilvalue = value < 256 ? value : 255;
8249 /* In EBCDIC [\x89-\x91] should include
8250 * the \x8e but [i-j] should not. */
8251 if (literal_endpoint == 2 &&
8252 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8253 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8255 if (isLOWER(prevvalue)) {
8256 for (i = prevvalue; i <= ceilvalue; i++)
8257 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8259 ANYOF_BITMAP_SET(ret, i);
8262 for (i = prevvalue; i <= ceilvalue; i++)
8263 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8265 ANYOF_BITMAP_SET(ret, i);
8271 for (i = prevvalue; i <= ceilvalue; i++) {
8272 if (!ANYOF_BITMAP_TEST(ret,i)) {
8274 ANYOF_BITMAP_SET(ret, i);
8278 if (value > 255 || UTF) {
8279 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8280 const UV natvalue = NATIVE_TO_UNI(value);
8281 stored+=2; /* can't optimize this class */
8282 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8283 if (prevnatvalue < natvalue) { /* what about > ? */
8284 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8285 prevnatvalue, natvalue);
8287 else if (prevnatvalue == natvalue) {
8288 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8290 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8292 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8294 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8295 if (RExC_precomp[0] == ':' &&
8296 RExC_precomp[1] == '[' &&
8297 (f == 0xDF || f == 0x92)) {
8298 f = NATIVE_TO_UNI(f);
8301 /* If folding and foldable and a single
8302 * character, insert also the folded version
8303 * to the charclass. */
8305 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8306 if ((RExC_precomp[0] == ':' &&
8307 RExC_precomp[1] == '[' &&
8309 (value == 0xFB05 || value == 0xFB06))) ?
8310 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8311 foldlen == (STRLEN)UNISKIP(f) )
8313 if (foldlen == (STRLEN)UNISKIP(f))
8315 Perl_sv_catpvf(aTHX_ listsv,
8318 /* Any multicharacter foldings
8319 * require the following transform:
8320 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8321 * where E folds into "pq" and F folds
8322 * into "rst", all other characters
8323 * fold to single characters. We save
8324 * away these multicharacter foldings,
8325 * to be later saved as part of the
8326 * additional "s" data. */
8329 if (!unicode_alternate)
8330 unicode_alternate = newAV();
8331 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8333 av_push(unicode_alternate, sv);
8337 /* If folding and the value is one of the Greek
8338 * sigmas insert a few more sigmas to make the
8339 * folding rules of the sigmas to work right.
8340 * Note that not all the possible combinations
8341 * are handled here: some of them are handled
8342 * by the standard folding rules, and some of
8343 * them (literal or EXACTF cases) are handled
8344 * during runtime in regexec.c:S_find_byclass(). */
8345 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8346 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8347 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8349 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8351 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8352 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8353 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8358 literal_endpoint = 0;
8362 range = 0; /* this range (if it was one) is done now */
8366 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8368 RExC_size += ANYOF_CLASS_ADD_SKIP;
8370 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8376 /****** !SIZE_ONLY AFTER HERE *********/
8378 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8379 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8381 /* optimize single char class to an EXACT node
8382 but *only* when its not a UTF/high char */
8383 const char * cur_parse= RExC_parse;
8384 RExC_emit = (regnode *)orig_emit;
8385 RExC_parse = (char *)orig_parse;
8386 ret = reg_node(pRExC_state,
8387 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8388 RExC_parse = (char *)cur_parse;
8389 *STRING(ret)= (char)value;
8391 RExC_emit += STR_SZ(1);
8393 SvREFCNT_dec(listsv);
8397 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8398 if ( /* If the only flag is folding (plus possibly inversion). */
8399 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8401 for (value = 0; value < 256; ++value) {
8402 if (ANYOF_BITMAP_TEST(ret, value)) {
8403 UV fold = PL_fold[value];
8406 ANYOF_BITMAP_SET(ret, fold);
8409 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8412 /* optimize inverted simple patterns (e.g. [^a-z]) */
8413 if (optimize_invert &&
8414 /* If the only flag is inversion. */
8415 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8416 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8417 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8418 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8421 AV * const av = newAV();
8423 /* The 0th element stores the character class description
8424 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8425 * to initialize the appropriate swash (which gets stored in
8426 * the 1st element), and also useful for dumping the regnode.
8427 * The 2nd element stores the multicharacter foldings,
8428 * used later (regexec.c:S_reginclass()). */
8429 av_store(av, 0, listsv);
8430 av_store(av, 1, NULL);
8431 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8432 rv = newRV_noinc(MUTABLE_SV(av));
8433 n = add_data(pRExC_state, 1, "s");
8434 RExC_rxi->data->data[n] = (void*)rv;
8442 /* reg_skipcomment()
8444 Absorbs an /x style # comments from the input stream.
8445 Returns true if there is more text remaining in the stream.
8446 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8447 terminates the pattern without including a newline.
8449 Note its the callers responsibility to ensure that we are
8455 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8459 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8461 while (RExC_parse < RExC_end)
8462 if (*RExC_parse++ == '\n') {
8467 /* we ran off the end of the pattern without ending
8468 the comment, so we have to add an \n when wrapping */
8469 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8477 Advance that parse position, and optionall absorbs
8478 "whitespace" from the inputstream.
8480 Without /x "whitespace" means (?#...) style comments only,
8481 with /x this means (?#...) and # comments and whitespace proper.
8483 Returns the RExC_parse point from BEFORE the scan occurs.
8485 This is the /x friendly way of saying RExC_parse++.
8489 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8491 char* const retval = RExC_parse++;
8493 PERL_ARGS_ASSERT_NEXTCHAR;
8496 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8497 RExC_parse[2] == '#') {
8498 while (*RExC_parse != ')') {
8499 if (RExC_parse == RExC_end)
8500 FAIL("Sequence (?#... not terminated");
8506 if (RExC_flags & RXf_PMf_EXTENDED) {
8507 if (isSPACE(*RExC_parse)) {
8511 else if (*RExC_parse == '#') {
8512 if ( reg_skipcomment( pRExC_state ) )
8521 - reg_node - emit a node
8523 STATIC regnode * /* Location. */
8524 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8527 register regnode *ptr;
8528 regnode * const ret = RExC_emit;
8529 GET_RE_DEBUG_FLAGS_DECL;
8531 PERL_ARGS_ASSERT_REG_NODE;
8534 SIZE_ALIGN(RExC_size);
8538 if (RExC_emit >= RExC_emit_bound)
8539 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8541 NODE_ALIGN_FILL(ret);
8543 FILL_ADVANCE_NODE(ptr, op);
8544 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8545 #ifdef RE_TRACK_PATTERN_OFFSETS
8546 if (RExC_offsets) { /* MJD */
8547 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8548 "reg_node", __LINE__,
8550 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8551 ? "Overwriting end of array!\n" : "OK",
8552 (UV)(RExC_emit - RExC_emit_start),
8553 (UV)(RExC_parse - RExC_start),
8554 (UV)RExC_offsets[0]));
8555 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8563 - reganode - emit a node with an argument
8565 STATIC regnode * /* Location. */
8566 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8569 register regnode *ptr;
8570 regnode * const ret = RExC_emit;
8571 GET_RE_DEBUG_FLAGS_DECL;
8573 PERL_ARGS_ASSERT_REGANODE;
8576 SIZE_ALIGN(RExC_size);
8581 assert(2==regarglen[op]+1);
8583 Anything larger than this has to allocate the extra amount.
8584 If we changed this to be:
8586 RExC_size += (1 + regarglen[op]);
8588 then it wouldn't matter. Its not clear what side effect
8589 might come from that so its not done so far.
8594 if (RExC_emit >= RExC_emit_bound)
8595 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8597 NODE_ALIGN_FILL(ret);
8599 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8600 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8601 #ifdef RE_TRACK_PATTERN_OFFSETS
8602 if (RExC_offsets) { /* MJD */
8603 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8607 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8608 "Overwriting end of array!\n" : "OK",
8609 (UV)(RExC_emit - RExC_emit_start),
8610 (UV)(RExC_parse - RExC_start),
8611 (UV)RExC_offsets[0]));
8612 Set_Cur_Node_Offset;
8620 - reguni - emit (if appropriate) a Unicode character
8623 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8627 PERL_ARGS_ASSERT_REGUNI;
8629 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8633 - reginsert - insert an operator in front of already-emitted operand
8635 * Means relocating the operand.
8638 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8641 register regnode *src;
8642 register regnode *dst;
8643 register regnode *place;
8644 const int offset = regarglen[(U8)op];
8645 const int size = NODE_STEP_REGNODE + offset;
8646 GET_RE_DEBUG_FLAGS_DECL;
8648 PERL_ARGS_ASSERT_REGINSERT;
8649 PERL_UNUSED_ARG(depth);
8650 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8651 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8660 if (RExC_open_parens) {
8662 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8663 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8664 if ( RExC_open_parens[paren] >= opnd ) {
8665 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8666 RExC_open_parens[paren] += size;
8668 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8670 if ( RExC_close_parens[paren] >= opnd ) {
8671 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8672 RExC_close_parens[paren] += size;
8674 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8679 while (src > opnd) {
8680 StructCopy(--src, --dst, regnode);
8681 #ifdef RE_TRACK_PATTERN_OFFSETS
8682 if (RExC_offsets) { /* MJD 20010112 */
8683 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8687 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8688 ? "Overwriting end of array!\n" : "OK",
8689 (UV)(src - RExC_emit_start),
8690 (UV)(dst - RExC_emit_start),
8691 (UV)RExC_offsets[0]));
8692 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8693 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8699 place = opnd; /* Op node, where operand used to be. */
8700 #ifdef RE_TRACK_PATTERN_OFFSETS
8701 if (RExC_offsets) { /* MJD */
8702 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8706 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8707 ? "Overwriting end of array!\n" : "OK",
8708 (UV)(place - RExC_emit_start),
8709 (UV)(RExC_parse - RExC_start),
8710 (UV)RExC_offsets[0]));
8711 Set_Node_Offset(place, RExC_parse);
8712 Set_Node_Length(place, 1);
8715 src = NEXTOPER(place);
8716 FILL_ADVANCE_NODE(place, op);
8717 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8718 Zero(src, offset, regnode);
8722 - regtail - set the next-pointer at the end of a node chain of p to val.
8723 - SEE ALSO: regtail_study
8725 /* TODO: All three parms should be const */
8727 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8730 register regnode *scan;
8731 GET_RE_DEBUG_FLAGS_DECL;
8733 PERL_ARGS_ASSERT_REGTAIL;
8735 PERL_UNUSED_ARG(depth);
8741 /* Find last node. */
8744 regnode * const temp = regnext(scan);
8746 SV * const mysv=sv_newmortal();
8747 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8748 regprop(RExC_rx, mysv, scan);
8749 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8750 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8751 (temp == NULL ? "->" : ""),
8752 (temp == NULL ? PL_reg_name[OP(val)] : "")
8760 if (reg_off_by_arg[OP(scan)]) {
8761 ARG_SET(scan, val - scan);
8764 NEXT_OFF(scan) = val - scan;
8770 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8771 - Look for optimizable sequences at the same time.
8772 - currently only looks for EXACT chains.
8774 This is expermental code. The idea is to use this routine to perform
8775 in place optimizations on branches and groups as they are constructed,
8776 with the long term intention of removing optimization from study_chunk so
8777 that it is purely analytical.
8779 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8780 to control which is which.
8783 /* TODO: All four parms should be const */
8786 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8789 register regnode *scan;
8791 #ifdef EXPERIMENTAL_INPLACESCAN
8794 GET_RE_DEBUG_FLAGS_DECL;
8796 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8802 /* Find last node. */
8806 regnode * const temp = regnext(scan);
8807 #ifdef EXPERIMENTAL_INPLACESCAN
8808 if (PL_regkind[OP(scan)] == EXACT)
8809 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8817 if( exact == PSEUDO )
8819 else if ( exact != OP(scan) )
8828 SV * const mysv=sv_newmortal();
8829 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8830 regprop(RExC_rx, mysv, scan);
8831 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8832 SvPV_nolen_const(mysv),
8834 PL_reg_name[exact]);
8841 SV * const mysv_val=sv_newmortal();
8842 DEBUG_PARSE_MSG("");
8843 regprop(RExC_rx, mysv_val, val);
8844 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8845 SvPV_nolen_const(mysv_val),
8846 (IV)REG_NODE_NUM(val),
8850 if (reg_off_by_arg[OP(scan)]) {
8851 ARG_SET(scan, val - scan);
8854 NEXT_OFF(scan) = val - scan;
8862 - regcurly - a little FSA that accepts {\d+,?\d*}
8865 S_regcurly(register const char *s)
8867 PERL_ARGS_ASSERT_REGCURLY;
8886 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8890 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8895 for (bit=0; bit<32; bit++) {
8896 if (flags & (1<<bit)) {
8898 PerlIO_printf(Perl_debug_log, "%s",lead);
8899 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8904 PerlIO_printf(Perl_debug_log, "\n");
8906 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8912 Perl_regdump(pTHX_ const regexp *r)
8916 SV * const sv = sv_newmortal();
8917 SV *dsv= sv_newmortal();
8919 GET_RE_DEBUG_FLAGS_DECL;
8921 PERL_ARGS_ASSERT_REGDUMP;
8923 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8925 /* Header fields of interest. */
8926 if (r->anchored_substr) {
8927 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8928 RE_SV_DUMPLEN(r->anchored_substr), 30);
8929 PerlIO_printf(Perl_debug_log,
8930 "anchored %s%s at %"IVdf" ",
8931 s, RE_SV_TAIL(r->anchored_substr),
8932 (IV)r->anchored_offset);
8933 } else if (r->anchored_utf8) {
8934 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8935 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8936 PerlIO_printf(Perl_debug_log,
8937 "anchored utf8 %s%s at %"IVdf" ",
8938 s, RE_SV_TAIL(r->anchored_utf8),
8939 (IV)r->anchored_offset);
8941 if (r->float_substr) {
8942 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8943 RE_SV_DUMPLEN(r->float_substr), 30);
8944 PerlIO_printf(Perl_debug_log,
8945 "floating %s%s at %"IVdf"..%"UVuf" ",
8946 s, RE_SV_TAIL(r->float_substr),
8947 (IV)r->float_min_offset, (UV)r->float_max_offset);
8948 } else if (r->float_utf8) {
8949 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8950 RE_SV_DUMPLEN(r->float_utf8), 30);
8951 PerlIO_printf(Perl_debug_log,
8952 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8953 s, RE_SV_TAIL(r->float_utf8),
8954 (IV)r->float_min_offset, (UV)r->float_max_offset);
8956 if (r->check_substr || r->check_utf8)
8957 PerlIO_printf(Perl_debug_log,
8959 (r->check_substr == r->float_substr
8960 && r->check_utf8 == r->float_utf8
8961 ? "(checking floating" : "(checking anchored"));
8962 if (r->extflags & RXf_NOSCAN)
8963 PerlIO_printf(Perl_debug_log, " noscan");
8964 if (r->extflags & RXf_CHECK_ALL)
8965 PerlIO_printf(Perl_debug_log, " isall");
8966 if (r->check_substr || r->check_utf8)
8967 PerlIO_printf(Perl_debug_log, ") ");
8969 if (ri->regstclass) {
8970 regprop(r, sv, ri->regstclass);
8971 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8973 if (r->extflags & RXf_ANCH) {
8974 PerlIO_printf(Perl_debug_log, "anchored");
8975 if (r->extflags & RXf_ANCH_BOL)
8976 PerlIO_printf(Perl_debug_log, "(BOL)");
8977 if (r->extflags & RXf_ANCH_MBOL)
8978 PerlIO_printf(Perl_debug_log, "(MBOL)");
8979 if (r->extflags & RXf_ANCH_SBOL)
8980 PerlIO_printf(Perl_debug_log, "(SBOL)");
8981 if (r->extflags & RXf_ANCH_GPOS)
8982 PerlIO_printf(Perl_debug_log, "(GPOS)");
8983 PerlIO_putc(Perl_debug_log, ' ');
8985 if (r->extflags & RXf_GPOS_SEEN)
8986 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8987 if (r->intflags & PREGf_SKIP)
8988 PerlIO_printf(Perl_debug_log, "plus ");
8989 if (r->intflags & PREGf_IMPLICIT)
8990 PerlIO_printf(Perl_debug_log, "implicit ");
8991 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8992 if (r->extflags & RXf_EVAL_SEEN)
8993 PerlIO_printf(Perl_debug_log, "with eval ");
8994 PerlIO_printf(Perl_debug_log, "\n");
8995 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8997 PERL_ARGS_ASSERT_REGDUMP;
8998 PERL_UNUSED_CONTEXT;
9000 #endif /* DEBUGGING */
9004 - regprop - printable representation of opcode
9006 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9009 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9010 if (flags & ANYOF_INVERT) \
9011 /*make sure the invert info is in each */ \
9012 sv_catpvs(sv, "^"); \
9018 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9023 RXi_GET_DECL(prog,progi);
9024 GET_RE_DEBUG_FLAGS_DECL;
9026 PERL_ARGS_ASSERT_REGPROP;
9030 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9031 /* It would be nice to FAIL() here, but this may be called from
9032 regexec.c, and it would be hard to supply pRExC_state. */
9033 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9034 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9036 k = PL_regkind[OP(o)];
9040 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9041 * is a crude hack but it may be the best for now since
9042 * we have no flag "this EXACTish node was UTF-8"
9044 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9045 PERL_PV_ESCAPE_UNI_DETECT |
9046 PERL_PV_PRETTY_ELLIPSES |
9047 PERL_PV_PRETTY_LTGT |
9048 PERL_PV_PRETTY_NOCLEAR
9050 } else if (k == TRIE) {
9051 /* print the details of the trie in dumpuntil instead, as
9052 * progi->data isn't available here */
9053 const char op = OP(o);
9054 const U32 n = ARG(o);
9055 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9056 (reg_ac_data *)progi->data->data[n] :
9058 const reg_trie_data * const trie
9059 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9061 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9062 DEBUG_TRIE_COMPILE_r(
9063 Perl_sv_catpvf(aTHX_ sv,
9064 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9065 (UV)trie->startstate,
9066 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9067 (UV)trie->wordcount,
9070 (UV)TRIE_CHARCOUNT(trie),
9071 (UV)trie->uniquecharcount
9074 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9076 int rangestart = -1;
9077 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9079 for (i = 0; i <= 256; i++) {
9080 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9081 if (rangestart == -1)
9083 } else if (rangestart != -1) {
9084 if (i <= rangestart + 3)
9085 for (; rangestart < i; rangestart++)
9086 put_byte(sv, rangestart);
9088 put_byte(sv, rangestart);
9090 put_byte(sv, i - 1);
9098 } else if (k == CURLY) {
9099 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9100 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9101 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9103 else if (k == WHILEM && o->flags) /* Ordinal/of */
9104 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9105 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9106 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9107 if ( RXp_PAREN_NAMES(prog) ) {
9108 if ( k != REF || OP(o) < NREF) {
9109 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9110 SV **name= av_fetch(list, ARG(o), 0 );
9112 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9115 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9116 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9117 I32 *nums=(I32*)SvPVX(sv_dat);
9118 SV **name= av_fetch(list, nums[0], 0 );
9121 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9122 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9123 (n ? "," : ""), (IV)nums[n]);
9125 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9129 } else if (k == GOSUB)
9130 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9131 else if (k == VERB) {
9133 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9134 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9135 } else if (k == LOGICAL)
9136 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9137 else if (k == FOLDCHAR)
9138 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9139 else if (k == ANYOF) {
9140 int i, rangestart = -1;
9141 const U8 flags = ANYOF_FLAGS(o);
9144 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9145 static const char * const anyofs[] = {
9178 if (flags & ANYOF_LOCALE)
9179 sv_catpvs(sv, "{loc}");
9180 if (flags & ANYOF_FOLD)
9181 sv_catpvs(sv, "{i}");
9182 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9183 if (flags & ANYOF_INVERT)
9186 /* output what the standard cp 0-255 bitmap matches */
9187 for (i = 0; i <= 256; i++) {
9188 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9189 if (rangestart == -1)
9191 } else if (rangestart != -1) {
9192 if (i <= rangestart + 3)
9193 for (; rangestart < i; rangestart++)
9194 put_byte(sv, rangestart);
9196 put_byte(sv, rangestart);
9198 put_byte(sv, i - 1);
9205 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9206 /* output any special charclass tests (used mostly under use locale) */
9207 if (o->flags & ANYOF_CLASS)
9208 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9209 if (ANYOF_CLASS_TEST(o,i)) {
9210 sv_catpv(sv, anyofs[i]);
9214 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9216 /* output information about the unicode matching */
9217 if (flags & ANYOF_UNICODE)
9218 sv_catpvs(sv, "{unicode}");
9219 else if (flags & ANYOF_UNICODE_ALL)
9220 sv_catpvs(sv, "{unicode_all}");
9224 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9228 U8 s[UTF8_MAXBYTES_CASE+1];
9230 for (i = 0; i <= 256; i++) { /* just the first 256 */
9231 uvchr_to_utf8(s, i);
9233 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9234 if (rangestart == -1)
9236 } else if (rangestart != -1) {
9237 if (i <= rangestart + 3)
9238 for (; rangestart < i; rangestart++) {
9239 const U8 * const e = uvchr_to_utf8(s,rangestart);
9241 for(p = s; p < e; p++)
9245 const U8 *e = uvchr_to_utf8(s,rangestart);
9247 for (p = s; p < e; p++)
9250 e = uvchr_to_utf8(s, i-1);
9251 for (p = s; p < e; p++)
9258 sv_catpvs(sv, "..."); /* et cetera */
9262 char *s = savesvpv(lv);
9263 char * const origs = s;
9265 while (*s && *s != '\n')
9269 const char * const t = ++s;
9287 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9289 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9290 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9292 PERL_UNUSED_CONTEXT;
9293 PERL_UNUSED_ARG(sv);
9295 PERL_UNUSED_ARG(prog);
9296 #endif /* DEBUGGING */
9300 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9301 { /* Assume that RE_INTUIT is set */
9303 GET_RE_DEBUG_FLAGS_DECL;
9305 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9306 PERL_UNUSED_CONTEXT;
9310 const char * const s = SvPV_nolen_const(prog->check_substr
9311 ? prog->check_substr : prog->check_utf8);
9313 if (!PL_colorset) reginitcolors();
9314 PerlIO_printf(Perl_debug_log,
9315 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9317 prog->check_substr ? "" : "utf8 ",
9318 PL_colors[5],PL_colors[0],
9321 (strlen(s) > 60 ? "..." : ""));
9324 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9330 handles refcounting and freeing the perl core regexp structure. When
9331 it is necessary to actually free the structure the first thing it
9332 does is call the 'free' method of the regexp_engine associated to to
9333 the regexp, allowing the handling of the void *pprivate; member
9334 first. (This routine is not overridable by extensions, which is why
9335 the extensions free is called first.)
9337 See regdupe and regdupe_internal if you change anything here.
9339 #ifndef PERL_IN_XSUB_RE
9341 Perl_pregfree(pTHX_ REGEXP *r)
9344 GET_RE_DEBUG_FLAGS_DECL;
9346 if (!r || (--r->refcnt > 0))
9349 ReREFCNT_dec(r->mother_re);
9351 CALLREGFREE_PVT(r); /* free the private data */
9352 if (RXp_PAREN_NAMES(r))
9353 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9354 Safefree(RX_WRAPPED(r));
9357 if (r->anchored_substr)
9358 SvREFCNT_dec(r->anchored_substr);
9359 if (r->anchored_utf8)
9360 SvREFCNT_dec(r->anchored_utf8);
9361 if (r->float_substr)
9362 SvREFCNT_dec(r->float_substr);
9364 SvREFCNT_dec(r->float_utf8);
9365 Safefree(r->substrs);
9367 RX_MATCH_COPY_FREE(r);
9368 #ifdef PERL_OLD_COPY_ON_WRITE
9370 SvREFCNT_dec(r->saved_copy);
9379 This is a hacky workaround to the structural issue of match results
9380 being stored in the regexp structure which is in turn stored in
9381 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9382 could be PL_curpm in multiple contexts, and could require multiple
9383 result sets being associated with the pattern simultaneously, such
9384 as when doing a recursive match with (??{$qr})
9386 The solution is to make a lightweight copy of the regexp structure
9387 when a qr// is returned from the code executed by (??{$qr}) this
9388 lightweight copy doesnt actually own any of its data except for
9389 the starp/end and the actual regexp structure itself.
9395 Perl_reg_temp_copy (pTHX_ REGEXP *r) {
9397 register const I32 npar = r->nparens+1;
9399 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9401 (void)ReREFCNT_inc(r);
9402 Newx(ret, 1, regexp);
9403 StructCopy(r, ret, regexp);
9404 Newx(ret->offs, npar, regexp_paren_pair);
9405 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9408 Newx(ret->substrs, 1, struct reg_substr_data);
9409 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9411 SvREFCNT_inc_void(ret->anchored_substr);
9412 SvREFCNT_inc_void(ret->anchored_utf8);
9413 SvREFCNT_inc_void(ret->float_substr);
9414 SvREFCNT_inc_void(ret->float_utf8);
9416 /* check_substr and check_utf8, if non-NULL, point to either their
9417 anchored or float namesakes, and don't hold a second reference. */
9419 RX_MATCH_COPIED_off(ret);
9420 #ifdef PERL_OLD_COPY_ON_WRITE
9421 ret->saved_copy = NULL;
9430 /* regfree_internal()
9432 Free the private data in a regexp. This is overloadable by
9433 extensions. Perl takes care of the regexp structure in pregfree(),
9434 this covers the *pprivate pointer which technically perldoesnt
9435 know about, however of course we have to handle the
9436 regexp_internal structure when no extension is in use.
9438 Note this is called before freeing anything in the regexp
9443 Perl_regfree_internal(pTHX_ REGEXP * const r)
9447 GET_RE_DEBUG_FLAGS_DECL;
9449 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9455 SV *dsv= sv_newmortal();
9456 RE_PV_QUOTED_DECL(s, RX_UTF8(r),
9457 dsv, RX_PRECOMP(r), RX_PRELEN(r), 60);
9458 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9459 PL_colors[4],PL_colors[5],s);
9462 #ifdef RE_TRACK_PATTERN_OFFSETS
9464 Safefree(ri->u.offsets); /* 20010421 MJD */
9467 int n = ri->data->count;
9468 PAD* new_comppad = NULL;
9473 /* If you add a ->what type here, update the comment in regcomp.h */
9474 switch (ri->data->what[n]) {
9478 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9481 Safefree(ri->data->data[n]);
9484 new_comppad = MUTABLE_AV(ri->data->data[n]);
9487 if (new_comppad == NULL)
9488 Perl_croak(aTHX_ "panic: pregfree comppad");
9489 PAD_SAVE_LOCAL(old_comppad,
9490 /* Watch out for global destruction's random ordering. */
9491 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9494 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9497 op_free((OP_4tree*)ri->data->data[n]);
9499 PAD_RESTORE_LOCAL(old_comppad);
9500 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9506 { /* Aho Corasick add-on structure for a trie node.
9507 Used in stclass optimization only */
9509 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9511 refcount = --aho->refcount;
9514 PerlMemShared_free(aho->states);
9515 PerlMemShared_free(aho->fail);
9516 /* do this last!!!! */
9517 PerlMemShared_free(ri->data->data[n]);
9518 PerlMemShared_free(ri->regstclass);
9524 /* trie structure. */
9526 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9528 refcount = --trie->refcount;
9531 PerlMemShared_free(trie->charmap);
9532 PerlMemShared_free(trie->states);
9533 PerlMemShared_free(trie->trans);
9535 PerlMemShared_free(trie->bitmap);
9537 PerlMemShared_free(trie->wordlen);
9539 PerlMemShared_free(trie->jump);
9541 PerlMemShared_free(trie->nextword);
9542 /* do this last!!!! */
9543 PerlMemShared_free(ri->data->data[n]);
9548 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9551 Safefree(ri->data->what);
9558 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9559 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9560 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9561 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9564 re_dup - duplicate a regexp.
9566 This routine is expected to clone a given regexp structure. It is only
9567 compiled under USE_ITHREADS.
9569 After all of the core data stored in struct regexp is duplicated
9570 the regexp_engine.dupe method is used to copy any private data
9571 stored in the *pprivate pointer. This allows extensions to handle
9572 any duplication it needs to do.
9574 See pregfree() and regfree_internal() if you change anything here.
9576 #if defined(USE_ITHREADS)
9577 #ifndef PERL_IN_XSUB_RE
9579 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9586 PERL_ARGS_ASSERT_RE_DUP;
9589 return (REGEXP *)NULL;
9591 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9594 npar = r->nparens+1;
9595 Newx(ret, 1, regexp);
9596 StructCopy(r, ret, regexp);
9597 Newx(ret->offs, npar, regexp_paren_pair);
9598 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9600 /* no need to copy these */
9601 Newx(ret->swap, npar, regexp_paren_pair);
9605 /* Do it this way to avoid reading from *r after the StructCopy().
9606 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9607 cache, it doesn't matter. */
9608 const bool anchored = r->check_substr
9609 ? r->check_substr == r->anchored_substr
9610 : r->check_utf8 == r->anchored_utf8;
9611 Newx(ret->substrs, 1, struct reg_substr_data);
9612 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9614 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9615 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9616 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9617 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9619 /* check_substr and check_utf8, if non-NULL, point to either their
9620 anchored or float namesakes, and don't hold a second reference. */
9622 if (ret->check_substr) {
9624 assert(r->check_utf8 == r->anchored_utf8);
9625 ret->check_substr = ret->anchored_substr;
9626 ret->check_utf8 = ret->anchored_utf8;
9628 assert(r->check_substr == r->float_substr);
9629 assert(r->check_utf8 == r->float_utf8);
9630 ret->check_substr = ret->float_substr;
9631 ret->check_utf8 = ret->float_utf8;
9633 } else if (ret->check_utf8) {
9635 ret->check_utf8 = ret->anchored_utf8;
9637 ret->check_utf8 = ret->float_utf8;
9642 precomp_offset = RX_PRECOMP(ret) - ret->wrapped;
9644 RX_WRAPPED(ret) = SAVEPVN(RX_WRAPPED(ret), RX_WRAPLEN(ret)+1);
9645 RX_PRECOMP(ret) = ret->wrapped + precomp_offset;
9646 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9649 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9651 if (RX_MATCH_COPIED(ret))
9652 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9655 #ifdef PERL_OLD_COPY_ON_WRITE
9656 ret->saved_copy = NULL;
9659 ret->mother_re = NULL;
9662 ptr_table_store(PL_ptr_table, r, ret);
9665 #endif /* PERL_IN_XSUB_RE */
9670 This is the internal complement to regdupe() which is used to copy
9671 the structure pointed to by the *pprivate pointer in the regexp.
9672 This is the core version of the extension overridable cloning hook.
9673 The regexp structure being duplicated will be copied by perl prior
9674 to this and will be provided as the regexp *r argument, however
9675 with the /old/ structures pprivate pointer value. Thus this routine
9676 may override any copying normally done by perl.
9678 It returns a pointer to the new regexp_internal structure.
9682 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9685 regexp_internal *reti;
9689 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9691 npar = r->nparens+1;
9694 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9695 Copy(ri->program, reti->program, len+1, regnode);
9698 reti->regstclass = NULL;
9702 const int count = ri->data->count;
9705 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9706 char, struct reg_data);
9707 Newx(d->what, count, U8);
9710 for (i = 0; i < count; i++) {
9711 d->what[i] = ri->data->what[i];
9712 switch (d->what[i]) {
9713 /* legal options are one of: sSfpontTu
9714 see also regcomp.h and pregfree() */
9717 case 'p': /* actually an AV, but the dup function is identical. */
9718 case 'u': /* actually an HV, but the dup function is identical. */
9719 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9722 /* This is cheating. */
9723 Newx(d->data[i], 1, struct regnode_charclass_class);
9724 StructCopy(ri->data->data[i], d->data[i],
9725 struct regnode_charclass_class);
9726 reti->regstclass = (regnode*)d->data[i];
9729 /* Compiled op trees are readonly and in shared memory,
9730 and can thus be shared without duplication. */
9732 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9736 /* Trie stclasses are readonly and can thus be shared
9737 * without duplication. We free the stclass in pregfree
9738 * when the corresponding reg_ac_data struct is freed.
9740 reti->regstclass= ri->regstclass;
9744 ((reg_trie_data*)ri->data->data[i])->refcount++;
9748 d->data[i] = ri->data->data[i];
9751 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9760 reti->name_list_idx = ri->name_list_idx;
9762 #ifdef RE_TRACK_PATTERN_OFFSETS
9763 if (ri->u.offsets) {
9764 Newx(reti->u.offsets, 2*len+1, U32);
9765 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9768 SetProgLen(reti,len);
9774 #endif /* USE_ITHREADS */
9779 converts a regexp embedded in a MAGIC struct to its stringified form,
9780 caching the converted form in the struct and returns the cached
9783 If lp is nonnull then it is used to return the length of the
9786 If flags is nonnull and the returned string contains UTF8 then
9787 (*flags & 1) will be true.
9789 If haseval is nonnull then it is used to return whether the pattern
9792 Normally called via macro:
9794 CALLREG_STRINGIFY(mg,&len,&utf8);
9798 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9800 See sv_2pv_flags() in sv.c for an example of internal usage.
9803 #ifndef PERL_IN_XSUB_RE
9806 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9808 const REGEXP * const re = (REGEXP *)mg->mg_obj;
9810 *haseval = RX_SEEN_EVALS(re);
9812 *flags = RX_UTF8(re) ? 1 : 0;
9814 *lp = RX_WRAPLEN(re);
9815 return RX_WRAPPED(re);
9819 - regnext - dig the "next" pointer out of a node
9822 Perl_regnext(pTHX_ register regnode *p)
9825 register I32 offset;
9830 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9839 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9842 STRLEN l1 = strlen(pat1);
9843 STRLEN l2 = strlen(pat2);
9846 const char *message;
9848 PERL_ARGS_ASSERT_RE_CROAK2;
9854 Copy(pat1, buf, l1 , char);
9855 Copy(pat2, buf + l1, l2 , char);
9856 buf[l1 + l2] = '\n';
9857 buf[l1 + l2 + 1] = '\0';
9859 /* ANSI variant takes additional second argument */
9860 va_start(args, pat2);
9864 msv = vmess(buf, &args);
9866 message = SvPV_const(msv,l1);
9869 Copy(message, buf, l1 , char);
9870 buf[l1-1] = '\0'; /* Overwrite \n */
9871 Perl_croak(aTHX_ "%s", buf);
9874 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9876 #ifndef PERL_IN_XSUB_RE
9878 Perl_save_re_context(pTHX)
9882 struct re_save_state *state;
9884 SAVEVPTR(PL_curcop);
9885 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9887 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9888 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9889 SSPUSHINT(SAVEt_RE_STATE);
9891 Copy(&PL_reg_state, state, 1, struct re_save_state);
9893 PL_reg_start_tmp = 0;
9894 PL_reg_start_tmpl = 0;
9895 PL_reg_oldsaved = NULL;
9896 PL_reg_oldsavedlen = 0;
9898 PL_reg_leftiter = 0;
9899 PL_reg_poscache = NULL;
9900 PL_reg_poscache_size = 0;
9901 #ifdef PERL_OLD_COPY_ON_WRITE
9905 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9907 const REGEXP * const rx = PM_GETRE(PL_curpm);
9910 for (i = 1; i <= RX_NPARENS(rx); i++) {
9911 char digits[TYPE_CHARS(long)];
9912 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9913 GV *const *const gvp
9914 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9917 GV * const gv = *gvp;
9918 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9928 clear_re(pTHX_ void *r)
9931 ReREFCNT_dec((REGEXP *)r);
9937 S_put_byte(pTHX_ SV *sv, int c)
9939 PERL_ARGS_ASSERT_PUT_BYTE;
9941 /* Our definition of isPRINT() ignores locales, so only bytes that are
9942 not part of UTF-8 are considered printable. I assume that the same
9943 holds for UTF-EBCDIC.
9944 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9945 which Wikipedia says:
9947 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9948 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9949 identical, to the ASCII delete (DEL) or rubout control character.
9950 ) So the old condition can be simplified to !isPRINT(c) */
9952 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9954 const char string = c;
9955 if (c == '-' || c == ']' || c == '\\' || c == '^')
9956 sv_catpvs(sv, "\\");
9957 sv_catpvn(sv, &string, 1);
9962 #define CLEAR_OPTSTART \
9963 if (optstart) STMT_START { \
9964 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9968 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9970 STATIC const regnode *
9971 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9972 const regnode *last, const regnode *plast,
9973 SV* sv, I32 indent, U32 depth)
9976 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9977 register const regnode *next;
9978 const regnode *optstart= NULL;
9981 GET_RE_DEBUG_FLAGS_DECL;
9983 PERL_ARGS_ASSERT_DUMPUNTIL;
9985 #ifdef DEBUG_DUMPUNTIL
9986 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9987 last ? last-start : 0,plast ? plast-start : 0);
9990 if (plast && plast < last)
9993 while (PL_regkind[op] != END && (!last || node < last)) {
9994 /* While that wasn't END last time... */
9997 if (op == CLOSE || op == WHILEM)
9999 next = regnext((regnode *)node);
10002 if (OP(node) == OPTIMIZED) {
10003 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10010 regprop(r, sv, node);
10011 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10012 (int)(2*indent + 1), "", SvPVX_const(sv));
10014 if (OP(node) != OPTIMIZED) {
10015 if (next == NULL) /* Next ptr. */
10016 PerlIO_printf(Perl_debug_log, " (0)");
10017 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10018 PerlIO_printf(Perl_debug_log, " (FAIL)");
10020 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10021 (void)PerlIO_putc(Perl_debug_log, '\n');
10025 if (PL_regkind[(U8)op] == BRANCHJ) {
10028 register const regnode *nnode = (OP(next) == LONGJMP
10029 ? regnext((regnode *)next)
10031 if (last && nnode > last)
10033 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10036 else if (PL_regkind[(U8)op] == BRANCH) {
10038 DUMPUNTIL(NEXTOPER(node), next);
10040 else if ( PL_regkind[(U8)op] == TRIE ) {
10041 const regnode *this_trie = node;
10042 const char op = OP(node);
10043 const U32 n = ARG(node);
10044 const reg_ac_data * const ac = op>=AHOCORASICK ?
10045 (reg_ac_data *)ri->data->data[n] :
10047 const reg_trie_data * const trie =
10048 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10050 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10052 const regnode *nextbranch= NULL;
10055 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10056 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10058 PerlIO_printf(Perl_debug_log, "%*s%s ",
10059 (int)(2*(indent+3)), "",
10060 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10061 PL_colors[0], PL_colors[1],
10062 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10063 PERL_PV_PRETTY_ELLIPSES |
10064 PERL_PV_PRETTY_LTGT
10069 U16 dist= trie->jump[word_idx+1];
10070 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10071 (UV)((dist ? this_trie + dist : next) - start));
10074 nextbranch= this_trie + trie->jump[0];
10075 DUMPUNTIL(this_trie + dist, nextbranch);
10077 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10078 nextbranch= regnext((regnode *)nextbranch);
10080 PerlIO_printf(Perl_debug_log, "\n");
10083 if (last && next > last)
10088 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10089 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10090 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10092 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10094 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10096 else if ( op == PLUS || op == STAR) {
10097 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10099 else if (op == ANYOF) {
10100 /* arglen 1 + class block */
10101 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10102 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10103 node = NEXTOPER(node);
10105 else if (PL_regkind[(U8)op] == EXACT) {
10106 /* Literal string, where present. */
10107 node += NODE_SZ_STR(node) - 1;
10108 node = NEXTOPER(node);
10111 node = NEXTOPER(node);
10112 node += regarglen[(U8)op];
10114 if (op == CURLYX || op == OPEN)
10118 #ifdef DEBUG_DUMPUNTIL
10119 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10124 #endif /* DEBUGGING */
10128 * c-indentation-style: bsd
10129 * c-basic-offset: 4
10130 * indent-tabs-mode: t
10133 * ex: set ts=8 sts=4 sw=4 noet: