5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
75 #undef PERL_IN_XSUB_RE
76 #define PERL_IN_XSUB_RE 1
78 #undef PERL_IN_XSUB_RE
80 #ifndef PERL_IN_XSUB_RE
85 #ifdef PERL_IN_XSUB_RE
96 # if defined(BUGGY_MSC6)
97 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
98 # pragma optimize("a",off)
99 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
100 # pragma optimize("w",on )
101 # endif /* BUGGY_MSC6 */
105 #define STATIC static
108 typedef struct RExC_state_t {
109 U32 flags; /* are we folding, multilining? */
110 char *precomp; /* uncompiled string. */
111 REGEXP *rx_sv; /* The SV that is the regexp. */
112 regexp *rx; /* perl core regexp structure */
113 regexp_internal *rxi; /* internal data for regexp object pprivate field */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit_bound; /* First regnode outside of the allocated space */
120 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
124 I32 size; /* Code size. */
125 I32 npar; /* Capture buffer count, (OPEN). */
126 I32 cpar; /* Capture buffer count, (CLOSE). */
127 I32 nestroot; /* root parens we are in - used by accept */
131 regnode **open_parens; /* pointers to open parens */
132 regnode **close_parens; /* pointers to close parens */
133 regnode *opend; /* END node in program */
134 I32 utf8; /* whether the pattern is utf8 or not */
135 I32 orig_utf8; /* whether the pattern was originally in utf8 */
136 /* XXX use this for future optimisation of case
137 * where pattern must be upgraded to utf8. */
138 HV *paren_names; /* Paren names */
140 regnode **recurse; /* Recurse regops */
141 I32 recurse_count; /* Number of recurse regops */
143 char *starttry; /* -Dr: where regtry was called. */
144 #define RExC_starttry (pRExC_state->starttry)
147 const char *lastparse;
149 AV *paren_name_list; /* idx -> name */
150 #define RExC_lastparse (pRExC_state->lastparse)
151 #define RExC_lastnum (pRExC_state->lastnum)
152 #define RExC_paren_name_list (pRExC_state->paren_name_list)
156 #define RExC_flags (pRExC_state->flags)
157 #define RExC_precomp (pRExC_state->precomp)
158 #define RExC_rx_sv (pRExC_state->rx_sv)
159 #define RExC_rx (pRExC_state->rx)
160 #define RExC_rxi (pRExC_state->rxi)
161 #define RExC_start (pRExC_state->start)
162 #define RExC_end (pRExC_state->end)
163 #define RExC_parse (pRExC_state->parse)
164 #define RExC_whilem_seen (pRExC_state->whilem_seen)
165 #ifdef RE_TRACK_PATTERN_OFFSETS
166 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
168 #define RExC_emit (pRExC_state->emit)
169 #define RExC_emit_start (pRExC_state->emit_start)
170 #define RExC_emit_bound (pRExC_state->emit_bound)
171 #define RExC_naughty (pRExC_state->naughty)
172 #define RExC_sawback (pRExC_state->sawback)
173 #define RExC_seen (pRExC_state->seen)
174 #define RExC_size (pRExC_state->size)
175 #define RExC_npar (pRExC_state->npar)
176 #define RExC_nestroot (pRExC_state->nestroot)
177 #define RExC_extralen (pRExC_state->extralen)
178 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
179 #define RExC_seen_evals (pRExC_state->seen_evals)
180 #define RExC_utf8 (pRExC_state->utf8)
181 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
182 #define RExC_open_parens (pRExC_state->open_parens)
183 #define RExC_close_parens (pRExC_state->close_parens)
184 #define RExC_opend (pRExC_state->opend)
185 #define RExC_paren_names (pRExC_state->paren_names)
186 #define RExC_recurse (pRExC_state->recurse)
187 #define RExC_recurse_count (pRExC_state->recurse_count)
190 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
191 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
192 ((*s) == '{' && regcurly(s)))
195 #undef SPSTART /* dratted cpp namespace... */
198 * Flags to be passed up and down.
200 #define WORST 0 /* Worst case. */
201 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
203 #define SPSTART 0x04 /* Starts with * or +. */
204 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
205 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
207 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
209 /* whether trie related optimizations are enabled */
210 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
211 #define TRIE_STUDY_OPT
212 #define FULL_TRIE_STUDY
218 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
219 #define PBITVAL(paren) (1 << ((paren) & 7))
220 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
221 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
222 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225 /* About scan_data_t.
227 During optimisation we recurse through the regexp program performing
228 various inplace (keyhole style) optimisations. In addition study_chunk
229 and scan_commit populate this data structure with information about
230 what strings MUST appear in the pattern. We look for the longest
231 string that must appear for at a fixed location, and we look for the
232 longest string that may appear at a floating location. So for instance
237 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
238 strings (because they follow a .* construct). study_chunk will identify
239 both FOO and BAR as being the longest fixed and floating strings respectively.
241 The strings can be composites, for instance
245 will result in a composite fixed substring 'foo'.
247 For each string some basic information is maintained:
249 - offset or min_offset
250 This is the position the string must appear at, or not before.
251 It also implicitly (when combined with minlenp) tells us how many
252 character must match before the string we are searching.
253 Likewise when combined with minlenp and the length of the string
254 tells us how many characters must appear after the string we have
258 Only used for floating strings. This is the rightmost point that
259 the string can appear at. Ifset to I32 max it indicates that the
260 string can occur infinitely far to the right.
263 A pointer to the minimum length of the pattern that the string
264 was found inside. This is important as in the case of positive
265 lookahead or positive lookbehind we can have multiple patterns
270 The minimum length of the pattern overall is 3, the minimum length
271 of the lookahead part is 3, but the minimum length of the part that
272 will actually match is 1. So 'FOO's minimum length is 3, but the
273 minimum length for the F is 1. This is important as the minimum length
274 is used to determine offsets in front of and behind the string being
275 looked for. Since strings can be composites this is the length of the
276 pattern at the time it was commited with a scan_commit. Note that
277 the length is calculated by study_chunk, so that the minimum lengths
278 are not known until the full pattern has been compiled, thus the
279 pointer to the value.
283 In the case of lookbehind the string being searched for can be
284 offset past the start point of the final matching string.
285 If this value was just blithely removed from the min_offset it would
286 invalidate some of the calculations for how many chars must match
287 before or after (as they are derived from min_offset and minlen and
288 the length of the string being searched for).
289 When the final pattern is compiled and the data is moved from the
290 scan_data_t structure into the regexp structure the information
291 about lookbehind is factored in, with the information that would
292 have been lost precalculated in the end_shift field for the
295 The fields pos_min and pos_delta are used to store the minimum offset
296 and the delta to the maximum offset at the current point in the pattern.
300 typedef struct scan_data_t {
301 /*I32 len_min; unused */
302 /*I32 len_delta; unused */
306 I32 last_end; /* min value, <0 unless valid. */
309 SV **longest; /* Either &l_fixed, or &l_float. */
310 SV *longest_fixed; /* longest fixed string found in pattern */
311 I32 offset_fixed; /* offset where it starts */
312 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
313 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
314 SV *longest_float; /* longest floating string found in pattern */
315 I32 offset_float_min; /* earliest point in string it can appear */
316 I32 offset_float_max; /* latest point in string it can appear */
317 I32 *minlen_float; /* pointer to the minlen relevent to the string */
318 I32 lookbehind_float; /* is the position of the string modified by LB */
322 struct regnode_charclass_class *start_class;
326 * Forward declarations for pregcomp()'s friends.
329 static const scan_data_t zero_scan_data =
330 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
332 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
333 #define SF_BEFORE_SEOL 0x0001
334 #define SF_BEFORE_MEOL 0x0002
335 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
336 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
339 # define SF_FIX_SHIFT_EOL (0+2)
340 # define SF_FL_SHIFT_EOL (0+4)
342 # define SF_FIX_SHIFT_EOL (+2)
343 # define SF_FL_SHIFT_EOL (+4)
346 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
347 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
349 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
350 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
351 #define SF_IS_INF 0x0040
352 #define SF_HAS_PAR 0x0080
353 #define SF_IN_PAR 0x0100
354 #define SF_HAS_EVAL 0x0200
355 #define SCF_DO_SUBSTR 0x0400
356 #define SCF_DO_STCLASS_AND 0x0800
357 #define SCF_DO_STCLASS_OR 0x1000
358 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
359 #define SCF_WHILEM_VISITED_POS 0x2000
361 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
362 #define SCF_SEEN_ACCEPT 0x8000
364 #define UTF (RExC_utf8 != 0)
365 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
366 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
368 #define OOB_UNICODE 12345678
369 #define OOB_NAMEDCLASS -1
371 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
372 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
375 /* length of regex to show in messages that don't mark a position within */
376 #define RegexLengthToShowInErrorMessages 127
379 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
380 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
381 * op/pragma/warn/regcomp.
383 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
384 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
386 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
389 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
390 * arg. Show regex, up to a maximum length. If it's too long, chop and add
393 #define _FAIL(code) STMT_START { \
394 const char *ellipses = ""; \
395 IV len = RExC_end - RExC_precomp; \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
399 if (len > RegexLengthToShowInErrorMessages) { \
400 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
401 len = RegexLengthToShowInErrorMessages - 10; \
407 #define FAIL(msg) _FAIL( \
408 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
409 msg, (int)len, RExC_precomp, ellipses))
411 #define FAIL2(msg,arg) _FAIL( \
412 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
413 arg, (int)len, RExC_precomp, ellipses))
416 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
418 #define Simple_vFAIL(m) STMT_START { \
419 const IV offset = RExC_parse - RExC_precomp; \
420 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
421 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
427 #define vFAIL(m) STMT_START { \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
434 * Like Simple_vFAIL(), but accepts two arguments.
436 #define Simple_vFAIL2(m,a1) STMT_START { \
437 const IV offset = RExC_parse - RExC_precomp; \
438 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
439 (int)offset, RExC_precomp, RExC_precomp + offset); \
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
445 #define vFAIL2(m,a1) STMT_START { \
447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
448 Simple_vFAIL2(m, a1); \
453 * Like Simple_vFAIL(), but accepts three arguments.
455 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
456 const IV offset = RExC_parse - RExC_precomp; \
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
462 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
464 #define vFAIL3(m,a1,a2) STMT_START { \
466 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
467 Simple_vFAIL3(m, a1, a2); \
471 * Like Simple_vFAIL(), but accepts four arguments.
473 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
474 const IV offset = RExC_parse - RExC_precomp; \
475 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
476 (int)offset, RExC_precomp, RExC_precomp + offset); \
479 #define ckWARNreg(loc,m) STMT_START { \
480 const IV offset = loc - RExC_precomp; \
481 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
482 (int)offset, RExC_precomp, RExC_precomp + offset); \
485 #define ckWARNregdep(loc,m) STMT_START { \
486 const IV offset = loc - RExC_precomp; \
487 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
489 (int)offset, RExC_precomp, RExC_precomp + offset); \
492 #define ckWARN2reg(loc, m, a1) STMT_START { \
493 const IV offset = loc - RExC_precomp; \
494 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
498 #define vWARN3(loc, m, a1, a2) STMT_START { \
499 const IV offset = loc - RExC_precomp; \
500 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
501 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
504 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
505 const IV offset = loc - RExC_precomp; \
506 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
507 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
511 const IV offset = loc - RExC_precomp; \
512 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
513 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
516 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
517 const IV offset = loc - RExC_precomp; \
518 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
519 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
523 const IV offset = loc - RExC_precomp; \
524 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
525 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
529 /* Allow for side effects in s */
530 #define REGC(c,s) STMT_START { \
531 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
534 /* Macros for recording node offsets. 20001227 mjd@plover.com
535 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
536 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
537 * Element 0 holds the number n.
538 * Position is 1 indexed.
540 #ifndef RE_TRACK_PATTERN_OFFSETS
541 #define Set_Node_Offset_To_R(node,byte)
542 #define Set_Node_Offset(node,byte)
543 #define Set_Cur_Node_Offset
544 #define Set_Node_Length_To_R(node,len)
545 #define Set_Node_Length(node,len)
546 #define Set_Node_Cur_Length(node)
547 #define Node_Offset(n)
548 #define Node_Length(n)
549 #define Set_Node_Offset_Length(node,offset,len)
550 #define ProgLen(ri) ri->u.proglen
551 #define SetProgLen(ri,x) ri->u.proglen = x
553 #define ProgLen(ri) ri->u.offsets[0]
554 #define SetProgLen(ri,x) ri->u.offsets[0] = x
555 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
557 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
558 __LINE__, (int)(node), (int)(byte))); \
560 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
562 RExC_offsets[2*(node)-1] = (byte); \
567 #define Set_Node_Offset(node,byte) \
568 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
569 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
571 #define Set_Node_Length_To_R(node,len) STMT_START { \
573 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
574 __LINE__, (int)(node), (int)(len))); \
576 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
578 RExC_offsets[2*(node)] = (len); \
583 #define Set_Node_Length(node,len) \
584 Set_Node_Length_To_R((node)-RExC_emit_start, len)
585 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
586 #define Set_Node_Cur_Length(node) \
587 Set_Node_Length(node, RExC_parse - parse_start)
589 /* Get offsets and lengths */
590 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
591 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
593 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
594 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
595 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
599 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
600 #define EXPERIMENTAL_INPLACESCAN
601 #endif /*RE_TRACK_PATTERN_OFFSETS*/
603 #define DEBUG_STUDYDATA(str,data,depth) \
604 DEBUG_OPTIMISE_MORE_r(if(data){ \
605 PerlIO_printf(Perl_debug_log, \
606 "%*s" str "Pos:%"IVdf"/%"IVdf \
607 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
608 (int)(depth)*2, "", \
609 (IV)((data)->pos_min), \
610 (IV)((data)->pos_delta), \
611 (UV)((data)->flags), \
612 (IV)((data)->whilem_c), \
613 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
614 is_inf ? "INF " : "" \
616 if ((data)->last_found) \
617 PerlIO_printf(Perl_debug_log, \
618 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
619 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
620 SvPVX_const((data)->last_found), \
621 (IV)((data)->last_end), \
622 (IV)((data)->last_start_min), \
623 (IV)((data)->last_start_max), \
624 ((data)->longest && \
625 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
626 SvPVX_const((data)->longest_fixed), \
627 (IV)((data)->offset_fixed), \
628 ((data)->longest && \
629 (data)->longest==&((data)->longest_float)) ? "*" : "", \
630 SvPVX_const((data)->longest_float), \
631 (IV)((data)->offset_float_min), \
632 (IV)((data)->offset_float_max) \
634 PerlIO_printf(Perl_debug_log,"\n"); \
637 static void clear_re(pTHX_ void *r);
639 /* Mark that we cannot extend a found fixed substring at this point.
640 Update the longest found anchored substring and the longest found
641 floating substrings if needed. */
644 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
646 const STRLEN l = CHR_SVLEN(data->last_found);
647 const STRLEN old_l = CHR_SVLEN(*data->longest);
648 GET_RE_DEBUG_FLAGS_DECL;
650 PERL_ARGS_ASSERT_SCAN_COMMIT;
652 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
653 SvSetMagicSV(*data->longest, data->last_found);
654 if (*data->longest == data->longest_fixed) {
655 data->offset_fixed = l ? data->last_start_min : data->pos_min;
656 if (data->flags & SF_BEFORE_EOL)
658 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
660 data->flags &= ~SF_FIX_BEFORE_EOL;
661 data->minlen_fixed=minlenp;
662 data->lookbehind_fixed=0;
664 else { /* *data->longest == data->longest_float */
665 data->offset_float_min = l ? data->last_start_min : data->pos_min;
666 data->offset_float_max = (l
667 ? data->last_start_max
668 : data->pos_min + data->pos_delta);
669 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
670 data->offset_float_max = I32_MAX;
671 if (data->flags & SF_BEFORE_EOL)
673 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
675 data->flags &= ~SF_FL_BEFORE_EOL;
676 data->minlen_float=minlenp;
677 data->lookbehind_float=0;
680 SvCUR_set(data->last_found, 0);
682 SV * const sv = data->last_found;
683 if (SvUTF8(sv) && SvMAGICAL(sv)) {
684 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
690 data->flags &= ~SF_BEFORE_EOL;
691 DEBUG_STUDYDATA("commit: ",data,0);
694 /* Can match anything (initialization) */
696 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
698 PERL_ARGS_ASSERT_CL_ANYTHING;
700 ANYOF_CLASS_ZERO(cl);
701 ANYOF_BITMAP_SETALL(cl);
702 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
704 cl->flags |= ANYOF_LOCALE;
707 /* Can match anything (initialization) */
709 S_cl_is_anything(const struct regnode_charclass_class *cl)
713 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
715 for (value = 0; value <= ANYOF_MAX; value += 2)
716 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
718 if (!(cl->flags & ANYOF_UNICODE_ALL))
720 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
725 /* Can match anything (initialization) */
727 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
729 PERL_ARGS_ASSERT_CL_INIT;
731 Zero(cl, 1, struct regnode_charclass_class);
733 cl_anything(pRExC_state, cl);
737 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
739 PERL_ARGS_ASSERT_CL_INIT_ZERO;
741 Zero(cl, 1, struct regnode_charclass_class);
743 cl_anything(pRExC_state, cl);
745 cl->flags |= ANYOF_LOCALE;
748 /* 'And' a given class with another one. Can create false positives */
749 /* We assume that cl is not inverted */
751 S_cl_and(struct regnode_charclass_class *cl,
752 const struct regnode_charclass_class *and_with)
754 PERL_ARGS_ASSERT_CL_AND;
756 assert(and_with->type == ANYOF);
757 if (!(and_with->flags & ANYOF_CLASS)
758 && !(cl->flags & ANYOF_CLASS)
759 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
760 && !(and_with->flags & ANYOF_FOLD)
761 && !(cl->flags & ANYOF_FOLD)) {
764 if (and_with->flags & ANYOF_INVERT)
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= ~and_with->bitmap[i];
768 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
769 cl->bitmap[i] &= and_with->bitmap[i];
770 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
771 if (!(and_with->flags & ANYOF_EOS))
772 cl->flags &= ~ANYOF_EOS;
774 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
775 !(and_with->flags & ANYOF_INVERT)) {
776 cl->flags &= ~ANYOF_UNICODE_ALL;
777 cl->flags |= ANYOF_UNICODE;
778 ARG_SET(cl, ARG(and_with));
780 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
781 !(and_with->flags & ANYOF_INVERT))
782 cl->flags &= ~ANYOF_UNICODE_ALL;
783 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
784 !(and_with->flags & ANYOF_INVERT))
785 cl->flags &= ~ANYOF_UNICODE;
788 /* 'OR' a given class with another one. Can create false positives */
789 /* We assume that cl is not inverted */
791 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
793 PERL_ARGS_ASSERT_CL_OR;
795 if (or_with->flags & ANYOF_INVERT) {
797 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
798 * <= (B1 | !B2) | (CL1 | !CL2)
799 * which is wasteful if CL2 is small, but we ignore CL2:
800 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
801 * XXXX Can we handle case-fold? Unclear:
802 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
803 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
805 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
806 && !(or_with->flags & ANYOF_FOLD)
807 && !(cl->flags & ANYOF_FOLD) ) {
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] |= ~or_with->bitmap[i];
812 } /* XXXX: logic is complicated otherwise */
814 cl_anything(pRExC_state, cl);
817 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
818 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
819 && (!(or_with->flags & ANYOF_FOLD)
820 || (cl->flags & ANYOF_FOLD)) ) {
823 /* OR char bitmap and class bitmap separately */
824 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
825 cl->bitmap[i] |= or_with->bitmap[i];
826 if (or_with->flags & ANYOF_CLASS) {
827 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
828 cl->classflags[i] |= or_with->classflags[i];
829 cl->flags |= ANYOF_CLASS;
832 else { /* XXXX: logic is complicated, leave it along for a moment. */
833 cl_anything(pRExC_state, cl);
836 if (or_with->flags & ANYOF_EOS)
837 cl->flags |= ANYOF_EOS;
839 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
840 ARG(cl) != ARG(or_with)) {
841 cl->flags |= ANYOF_UNICODE_ALL;
842 cl->flags &= ~ANYOF_UNICODE;
844 if (or_with->flags & ANYOF_UNICODE_ALL) {
845 cl->flags |= ANYOF_UNICODE_ALL;
846 cl->flags &= ~ANYOF_UNICODE;
850 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
851 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
852 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
853 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
858 dump_trie(trie,widecharmap,revcharmap)
859 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
860 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
862 These routines dump out a trie in a somewhat readable format.
863 The _interim_ variants are used for debugging the interim
864 tables that are used to generate the final compressed
865 representation which is what dump_trie expects.
867 Part of the reason for their existance is to provide a form
868 of documentation as to how the different representations function.
873 Dumps the final compressed table form of the trie to Perl_debug_log.
874 Used for debugging make_trie().
878 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
879 AV *revcharmap, U32 depth)
882 SV *sv=sv_newmortal();
883 int colwidth= widecharmap ? 6 : 4;
884 GET_RE_DEBUG_FLAGS_DECL;
886 PERL_ARGS_ASSERT_DUMP_TRIE;
888 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
889 (int)depth * 2 + 2,"",
890 "Match","Base","Ofs" );
892 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
893 SV ** const tmp = av_fetch( revcharmap, state, 0);
895 PerlIO_printf( Perl_debug_log, "%*s",
897 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
898 PL_colors[0], PL_colors[1],
899 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
900 PERL_PV_ESCAPE_FIRSTCHAR
905 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
906 (int)depth * 2 + 2,"");
908 for( state = 0 ; state < trie->uniquecharcount ; state++ )
909 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
910 PerlIO_printf( Perl_debug_log, "\n");
912 for( state = 1 ; state < trie->statecount ; state++ ) {
913 const U32 base = trie->states[ state ].trans.base;
915 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
917 if ( trie->states[ state ].wordnum ) {
918 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
920 PerlIO_printf( Perl_debug_log, "%6s", "" );
923 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
928 while( ( base + ofs < trie->uniquecharcount ) ||
929 ( base + ofs - trie->uniquecharcount < trie->lasttrans
930 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
933 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
935 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
936 if ( ( base + ofs >= trie->uniquecharcount ) &&
937 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
938 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
940 PerlIO_printf( Perl_debug_log, "%*"UVXf,
942 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
944 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
948 PerlIO_printf( Perl_debug_log, "]");
951 PerlIO_printf( Perl_debug_log, "\n" );
955 Dumps a fully constructed but uncompressed trie in list form.
956 List tries normally only are used for construction when the number of
957 possible chars (trie->uniquecharcount) is very high.
958 Used for debugging make_trie().
961 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
962 HV *widecharmap, AV *revcharmap, U32 next_alloc,
966 SV *sv=sv_newmortal();
967 int colwidth= widecharmap ? 6 : 4;
968 GET_RE_DEBUG_FLAGS_DECL;
970 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
972 /* print out the table precompression. */
973 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
974 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
975 "------:-----+-----------------\n" );
977 for( state=1 ; state < next_alloc ; state ++ ) {
980 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
981 (int)depth * 2 + 2,"", (UV)state );
982 if ( ! trie->states[ state ].wordnum ) {
983 PerlIO_printf( Perl_debug_log, "%5s| ","");
985 PerlIO_printf( Perl_debug_log, "W%4x| ",
986 trie->states[ state ].wordnum
989 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
990 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
992 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
994 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
995 PL_colors[0], PL_colors[1],
996 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
997 PERL_PV_ESCAPE_FIRSTCHAR
999 TRIE_LIST_ITEM(state,charid).forid,
1000 (UV)TRIE_LIST_ITEM(state,charid).newstate
1003 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1004 (int)((depth * 2) + 14), "");
1007 PerlIO_printf( Perl_debug_log, "\n");
1012 Dumps a fully constructed but uncompressed trie in table form.
1013 This is the normal DFA style state transition table, with a few
1014 twists to facilitate compression later.
1015 Used for debugging make_trie().
1018 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1019 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1024 SV *sv=sv_newmortal();
1025 int colwidth= widecharmap ? 6 : 4;
1026 GET_RE_DEBUG_FLAGS_DECL;
1028 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1031 print out the table precompression so that we can do a visual check
1032 that they are identical.
1035 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1037 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1038 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1040 PerlIO_printf( Perl_debug_log, "%*s",
1042 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1043 PL_colors[0], PL_colors[1],
1044 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1045 PERL_PV_ESCAPE_FIRSTCHAR
1051 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1053 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1054 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1057 PerlIO_printf( Perl_debug_log, "\n" );
1059 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1061 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1062 (int)depth * 2 + 2,"",
1063 (UV)TRIE_NODENUM( state ) );
1065 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1066 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1068 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1070 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1072 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1073 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1075 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1076 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1083 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1084 startbranch: the first branch in the whole branch sequence
1085 first : start branch of sequence of branch-exact nodes.
1086 May be the same as startbranch
1087 last : Thing following the last branch.
1088 May be the same as tail.
1089 tail : item following the branch sequence
1090 count : words in the sequence
1091 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1092 depth : indent depth
1094 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1096 A trie is an N'ary tree where the branches are determined by digital
1097 decomposition of the key. IE, at the root node you look up the 1st character and
1098 follow that branch repeat until you find the end of the branches. Nodes can be
1099 marked as "accepting" meaning they represent a complete word. Eg:
1103 would convert into the following structure. Numbers represent states, letters
1104 following numbers represent valid transitions on the letter from that state, if
1105 the number is in square brackets it represents an accepting state, otherwise it
1106 will be in parenthesis.
1108 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1112 (1) +-i->(6)-+-s->[7]
1114 +-s->(3)-+-h->(4)-+-e->[5]
1116 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1118 This shows that when matching against the string 'hers' we will begin at state 1
1119 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1120 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1121 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1122 single traverse. We store a mapping from accepting to state to which word was
1123 matched, and then when we have multiple possibilities we try to complete the
1124 rest of the regex in the order in which they occured in the alternation.
1126 The only prior NFA like behaviour that would be changed by the TRIE support is
1127 the silent ignoring of duplicate alternations which are of the form:
1129 / (DUPE|DUPE) X? (?{ ... }) Y /x
1131 Thus EVAL blocks follwing a trie may be called a different number of times with
1132 and without the optimisation. With the optimisations dupes will be silently
1133 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1134 the following demonstrates:
1136 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1138 which prints out 'word' three times, but
1140 'words'=~/(word|word|word)(?{ print $1 })S/
1142 which doesnt print it out at all. This is due to other optimisations kicking in.
1144 Example of what happens on a structural level:
1146 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1148 1: CURLYM[1] {1,32767}(18)
1159 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1160 and should turn into:
1162 1: CURLYM[1] {1,32767}(18)
1164 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1172 Cases where tail != last would be like /(?foo|bar)baz/:
1182 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1183 and would end up looking like:
1186 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1193 d = uvuni_to_utf8_flags(d, uv, 0);
1195 is the recommended Unicode-aware way of saying
1200 #define TRIE_STORE_REVCHAR \
1203 SV *zlopp = newSV(2); \
1204 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1205 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1206 SvCUR_set(zlopp, kapow - flrbbbbb); \
1209 av_push(revcharmap, zlopp); \
1211 char ooooff = (char)uvc; \
1212 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1216 #define TRIE_READ_CHAR STMT_START { \
1220 if ( foldlen > 0 ) { \
1221 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1226 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1227 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1228 foldlen -= UNISKIP( uvc ); \
1229 scan = foldbuf + UNISKIP( uvc ); \
1232 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1242 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1243 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1244 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1245 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1247 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1248 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1249 TRIE_LIST_CUR( state )++; \
1252 #define TRIE_LIST_NEW(state) STMT_START { \
1253 Newxz( trie->states[ state ].trans.list, \
1254 4, reg_trie_trans_le ); \
1255 TRIE_LIST_CUR( state ) = 1; \
1256 TRIE_LIST_LEN( state ) = 4; \
1259 #define TRIE_HANDLE_WORD(state) STMT_START { \
1260 U16 dupe= trie->states[ state ].wordnum; \
1261 regnode * const noper_next = regnext( noper ); \
1263 if (trie->wordlen) \
1264 trie->wordlen[ curword ] = wordlen; \
1266 /* store the word for dumping */ \
1268 if (OP(noper) != NOTHING) \
1269 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1271 tmp = newSVpvn_utf8( "", 0, UTF ); \
1272 av_push( trie_words, tmp ); \
1277 if ( noper_next < tail ) { \
1279 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1280 trie->jump[curword] = (U16)(noper_next - convert); \
1282 jumper = noper_next; \
1284 nextbranch= regnext(cur); \
1288 /* So it's a dupe. This means we need to maintain a */\
1289 /* linked-list from the first to the next. */\
1290 /* we only allocate the nextword buffer when there */\
1291 /* a dupe, so first time we have to do the allocation */\
1292 if (!trie->nextword) \
1293 trie->nextword = (U16 *) \
1294 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1295 while ( trie->nextword[dupe] ) \
1296 dupe= trie->nextword[dupe]; \
1297 trie->nextword[dupe]= curword; \
1299 /* we haven't inserted this word yet. */ \
1300 trie->states[ state ].wordnum = curword; \
1305 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1306 ( ( base + charid >= ucharcount \
1307 && base + charid < ubound \
1308 && state == trie->trans[ base - ucharcount + charid ].check \
1309 && trie->trans[ base - ucharcount + charid ].next ) \
1310 ? trie->trans[ base - ucharcount + charid ].next \
1311 : ( state==1 ? special : 0 ) \
1315 #define MADE_JUMP_TRIE 2
1316 #define MADE_EXACT_TRIE 4
1319 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1322 /* first pass, loop through and scan words */
1323 reg_trie_data *trie;
1324 HV *widecharmap = NULL;
1325 AV *revcharmap = newAV();
1327 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1332 regnode *jumper = NULL;
1333 regnode *nextbranch = NULL;
1334 regnode *convert = NULL;
1335 /* we just use folder as a flag in utf8 */
1336 const U8 * const folder = ( flags == EXACTF
1338 : ( flags == EXACTFL
1345 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1346 AV *trie_words = NULL;
1347 /* along with revcharmap, this only used during construction but both are
1348 * useful during debugging so we store them in the struct when debugging.
1351 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1352 STRLEN trie_charcount=0;
1354 SV *re_trie_maxbuff;
1355 GET_RE_DEBUG_FLAGS_DECL;
1357 PERL_ARGS_ASSERT_MAKE_TRIE;
1359 PERL_UNUSED_ARG(depth);
1362 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1364 trie->startstate = 1;
1365 trie->wordcount = word_count;
1366 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1367 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1368 if (!(UTF && folder))
1369 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1371 trie_words = newAV();
1374 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1375 if (!SvIOK(re_trie_maxbuff)) {
1376 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1379 PerlIO_printf( Perl_debug_log,
1380 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1381 (int)depth * 2 + 2, "",
1382 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1383 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1387 /* Find the node we are going to overwrite */
1388 if ( first == startbranch && OP( last ) != BRANCH ) {
1389 /* whole branch chain */
1392 /* branch sub-chain */
1393 convert = NEXTOPER( first );
1396 /* -- First loop and Setup --
1398 We first traverse the branches and scan each word to determine if it
1399 contains widechars, and how many unique chars there are, this is
1400 important as we have to build a table with at least as many columns as we
1403 We use an array of integers to represent the character codes 0..255
1404 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1405 native representation of the character value as the key and IV's for the
1408 *TODO* If we keep track of how many times each character is used we can
1409 remap the columns so that the table compression later on is more
1410 efficient in terms of memory by ensuring most common value is in the
1411 middle and the least common are on the outside. IMO this would be better
1412 than a most to least common mapping as theres a decent chance the most
1413 common letter will share a node with the least common, meaning the node
1414 will not be compressable. With a middle is most common approach the worst
1415 case is when we have the least common nodes twice.
1419 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1420 regnode * const noper = NEXTOPER( cur );
1421 const U8 *uc = (U8*)STRING( noper );
1422 const U8 * const e = uc + STR_LEN( noper );
1424 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1425 const U8 *scan = (U8*)NULL;
1426 U32 wordlen = 0; /* required init */
1428 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1430 if (OP(noper) == NOTHING) {
1434 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1435 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1436 regardless of encoding */
1438 for ( ; uc < e ; uc += len ) {
1439 TRIE_CHARCOUNT(trie)++;
1443 if ( !trie->charmap[ uvc ] ) {
1444 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1446 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1450 /* store the codepoint in the bitmap, and if its ascii
1451 also store its folded equivelent. */
1452 TRIE_BITMAP_SET(trie,uvc);
1454 /* store the folded codepoint */
1455 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1458 /* store first byte of utf8 representation of
1459 codepoints in the 127 < uvc < 256 range */
1460 if (127 < uvc && uvc < 192) {
1461 TRIE_BITMAP_SET(trie,194);
1462 } else if (191 < uvc ) {
1463 TRIE_BITMAP_SET(trie,195);
1464 /* && uvc < 256 -- we know uvc is < 256 already */
1467 set_bit = 0; /* We've done our bit :-) */
1472 widecharmap = newHV();
1474 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1477 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1479 if ( !SvTRUE( *svpp ) ) {
1480 sv_setiv( *svpp, ++trie->uniquecharcount );
1485 if( cur == first ) {
1488 } else if (chars < trie->minlen) {
1490 } else if (chars > trie->maxlen) {
1494 } /* end first pass */
1495 DEBUG_TRIE_COMPILE_r(
1496 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1497 (int)depth * 2 + 2,"",
1498 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1499 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1500 (int)trie->minlen, (int)trie->maxlen )
1502 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1505 We now know what we are dealing with in terms of unique chars and
1506 string sizes so we can calculate how much memory a naive
1507 representation using a flat table will take. If it's over a reasonable
1508 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1509 conservative but potentially much slower representation using an array
1512 At the end we convert both representations into the same compressed
1513 form that will be used in regexec.c for matching with. The latter
1514 is a form that cannot be used to construct with but has memory
1515 properties similar to the list form and access properties similar
1516 to the table form making it both suitable for fast searches and
1517 small enough that its feasable to store for the duration of a program.
1519 See the comment in the code where the compressed table is produced
1520 inplace from the flat tabe representation for an explanation of how
1521 the compression works.
1526 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1528 Second Pass -- Array Of Lists Representation
1530 Each state will be represented by a list of charid:state records
1531 (reg_trie_trans_le) the first such element holds the CUR and LEN
1532 points of the allocated array. (See defines above).
1534 We build the initial structure using the lists, and then convert
1535 it into the compressed table form which allows faster lookups
1536 (but cant be modified once converted).
1539 STRLEN transcount = 1;
1541 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1542 "%*sCompiling trie using list compiler\n",
1543 (int)depth * 2 + 2, ""));
1545 trie->states = (reg_trie_state *)
1546 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1547 sizeof(reg_trie_state) );
1551 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1553 regnode * const noper = NEXTOPER( cur );
1554 U8 *uc = (U8*)STRING( noper );
1555 const U8 * const e = uc + STR_LEN( noper );
1556 U32 state = 1; /* required init */
1557 U16 charid = 0; /* sanity init */
1558 U8 *scan = (U8*)NULL; /* sanity init */
1559 STRLEN foldlen = 0; /* required init */
1560 U32 wordlen = 0; /* required init */
1561 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1563 if (OP(noper) != NOTHING) {
1564 for ( ; uc < e ; uc += len ) {
1569 charid = trie->charmap[ uvc ];
1571 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1575 charid=(U16)SvIV( *svpp );
1578 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1585 if ( !trie->states[ state ].trans.list ) {
1586 TRIE_LIST_NEW( state );
1588 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1589 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1590 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1595 newstate = next_alloc++;
1596 TRIE_LIST_PUSH( state, charid, newstate );
1601 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1605 TRIE_HANDLE_WORD(state);
1607 } /* end second pass */
1609 /* next alloc is the NEXT state to be allocated */
1610 trie->statecount = next_alloc;
1611 trie->states = (reg_trie_state *)
1612 PerlMemShared_realloc( trie->states,
1614 * sizeof(reg_trie_state) );
1616 /* and now dump it out before we compress it */
1617 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1618 revcharmap, next_alloc,
1622 trie->trans = (reg_trie_trans *)
1623 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1630 for( state=1 ; state < next_alloc ; state ++ ) {
1634 DEBUG_TRIE_COMPILE_MORE_r(
1635 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1639 if (trie->states[state].trans.list) {
1640 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1644 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1645 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1646 if ( forid < minid ) {
1648 } else if ( forid > maxid ) {
1652 if ( transcount < tp + maxid - minid + 1) {
1654 trie->trans = (reg_trie_trans *)
1655 PerlMemShared_realloc( trie->trans,
1657 * sizeof(reg_trie_trans) );
1658 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1660 base = trie->uniquecharcount + tp - minid;
1661 if ( maxid == minid ) {
1663 for ( ; zp < tp ; zp++ ) {
1664 if ( ! trie->trans[ zp ].next ) {
1665 base = trie->uniquecharcount + zp - minid;
1666 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1667 trie->trans[ zp ].check = state;
1673 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1674 trie->trans[ tp ].check = state;
1679 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1680 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1681 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1682 trie->trans[ tid ].check = state;
1684 tp += ( maxid - minid + 1 );
1686 Safefree(trie->states[ state ].trans.list);
1689 DEBUG_TRIE_COMPILE_MORE_r(
1690 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1693 trie->states[ state ].trans.base=base;
1695 trie->lasttrans = tp + 1;
1699 Second Pass -- Flat Table Representation.
1701 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1702 We know that we will need Charcount+1 trans at most to store the data
1703 (one row per char at worst case) So we preallocate both structures
1704 assuming worst case.
1706 We then construct the trie using only the .next slots of the entry
1709 We use the .check field of the first entry of the node temporarily to
1710 make compression both faster and easier by keeping track of how many non
1711 zero fields are in the node.
1713 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1716 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1717 number representing the first entry of the node, and state as a
1718 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1719 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1720 are 2 entrys per node. eg:
1728 The table is internally in the right hand, idx form. However as we also
1729 have to deal with the states array which is indexed by nodenum we have to
1730 use TRIE_NODENUM() to convert.
1733 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1734 "%*sCompiling trie using table compiler\n",
1735 (int)depth * 2 + 2, ""));
1737 trie->trans = (reg_trie_trans *)
1738 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1739 * trie->uniquecharcount + 1,
1740 sizeof(reg_trie_trans) );
1741 trie->states = (reg_trie_state *)
1742 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1743 sizeof(reg_trie_state) );
1744 next_alloc = trie->uniquecharcount + 1;
1747 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1749 regnode * const noper = NEXTOPER( cur );
1750 const U8 *uc = (U8*)STRING( noper );
1751 const U8 * const e = uc + STR_LEN( noper );
1753 U32 state = 1; /* required init */
1755 U16 charid = 0; /* sanity init */
1756 U32 accept_state = 0; /* sanity init */
1757 U8 *scan = (U8*)NULL; /* sanity init */
1759 STRLEN foldlen = 0; /* required init */
1760 U32 wordlen = 0; /* required init */
1761 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1763 if ( OP(noper) != NOTHING ) {
1764 for ( ; uc < e ; uc += len ) {
1769 charid = trie->charmap[ uvc ];
1771 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1772 charid = svpp ? (U16)SvIV(*svpp) : 0;
1776 if ( !trie->trans[ state + charid ].next ) {
1777 trie->trans[ state + charid ].next = next_alloc;
1778 trie->trans[ state ].check++;
1779 next_alloc += trie->uniquecharcount;
1781 state = trie->trans[ state + charid ].next;
1783 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1785 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1788 accept_state = TRIE_NODENUM( state );
1789 TRIE_HANDLE_WORD(accept_state);
1791 } /* end second pass */
1793 /* and now dump it out before we compress it */
1794 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1796 next_alloc, depth+1));
1800 * Inplace compress the table.*
1802 For sparse data sets the table constructed by the trie algorithm will
1803 be mostly 0/FAIL transitions or to put it another way mostly empty.
1804 (Note that leaf nodes will not contain any transitions.)
1806 This algorithm compresses the tables by eliminating most such
1807 transitions, at the cost of a modest bit of extra work during lookup:
1809 - Each states[] entry contains a .base field which indicates the
1810 index in the state[] array wheres its transition data is stored.
1812 - If .base is 0 there are no valid transitions from that node.
1814 - If .base is nonzero then charid is added to it to find an entry in
1817 -If trans[states[state].base+charid].check!=state then the
1818 transition is taken to be a 0/Fail transition. Thus if there are fail
1819 transitions at the front of the node then the .base offset will point
1820 somewhere inside the previous nodes data (or maybe even into a node
1821 even earlier), but the .check field determines if the transition is
1825 The following process inplace converts the table to the compressed
1826 table: We first do not compress the root node 1,and mark its all its
1827 .check pointers as 1 and set its .base pointer as 1 as well. This
1828 allows to do a DFA construction from the compressed table later, and
1829 ensures that any .base pointers we calculate later are greater than
1832 - We set 'pos' to indicate the first entry of the second node.
1834 - We then iterate over the columns of the node, finding the first and
1835 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1836 and set the .check pointers accordingly, and advance pos
1837 appropriately and repreat for the next node. Note that when we copy
1838 the next pointers we have to convert them from the original
1839 NODEIDX form to NODENUM form as the former is not valid post
1842 - If a node has no transitions used we mark its base as 0 and do not
1843 advance the pos pointer.
1845 - If a node only has one transition we use a second pointer into the
1846 structure to fill in allocated fail transitions from other states.
1847 This pointer is independent of the main pointer and scans forward
1848 looking for null transitions that are allocated to a state. When it
1849 finds one it writes the single transition into the "hole". If the
1850 pointer doesnt find one the single transition is appended as normal.
1852 - Once compressed we can Renew/realloc the structures to release the
1855 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1856 specifically Fig 3.47 and the associated pseudocode.
1860 const U32 laststate = TRIE_NODENUM( next_alloc );
1863 trie->statecount = laststate;
1865 for ( state = 1 ; state < laststate ; state++ ) {
1867 const U32 stateidx = TRIE_NODEIDX( state );
1868 const U32 o_used = trie->trans[ stateidx ].check;
1869 U32 used = trie->trans[ stateidx ].check;
1870 trie->trans[ stateidx ].check = 0;
1872 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1873 if ( flag || trie->trans[ stateidx + charid ].next ) {
1874 if ( trie->trans[ stateidx + charid ].next ) {
1876 for ( ; zp < pos ; zp++ ) {
1877 if ( ! trie->trans[ zp ].next ) {
1881 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1882 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1883 trie->trans[ zp ].check = state;
1884 if ( ++zp > pos ) pos = zp;
1891 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1893 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1894 trie->trans[ pos ].check = state;
1899 trie->lasttrans = pos + 1;
1900 trie->states = (reg_trie_state *)
1901 PerlMemShared_realloc( trie->states, laststate
1902 * sizeof(reg_trie_state) );
1903 DEBUG_TRIE_COMPILE_MORE_r(
1904 PerlIO_printf( Perl_debug_log,
1905 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1906 (int)depth * 2 + 2,"",
1907 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1910 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1913 } /* end table compress */
1915 DEBUG_TRIE_COMPILE_MORE_r(
1916 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1917 (int)depth * 2 + 2, "",
1918 (UV)trie->statecount,
1919 (UV)trie->lasttrans)
1921 /* resize the trans array to remove unused space */
1922 trie->trans = (reg_trie_trans *)
1923 PerlMemShared_realloc( trie->trans, trie->lasttrans
1924 * sizeof(reg_trie_trans) );
1926 /* and now dump out the compressed format */
1927 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1929 { /* Modify the program and insert the new TRIE node*/
1930 U8 nodetype =(U8)(flags & 0xFF);
1934 regnode *optimize = NULL;
1935 #ifdef RE_TRACK_PATTERN_OFFSETS
1938 U32 mjd_nodelen = 0;
1939 #endif /* RE_TRACK_PATTERN_OFFSETS */
1940 #endif /* DEBUGGING */
1942 This means we convert either the first branch or the first Exact,
1943 depending on whether the thing following (in 'last') is a branch
1944 or not and whther first is the startbranch (ie is it a sub part of
1945 the alternation or is it the whole thing.)
1946 Assuming its a sub part we conver the EXACT otherwise we convert
1947 the whole branch sequence, including the first.
1949 /* Find the node we are going to overwrite */
1950 if ( first != startbranch || OP( last ) == BRANCH ) {
1951 /* branch sub-chain */
1952 NEXT_OFF( first ) = (U16)(last - first);
1953 #ifdef RE_TRACK_PATTERN_OFFSETS
1955 mjd_offset= Node_Offset((convert));
1956 mjd_nodelen= Node_Length((convert));
1959 /* whole branch chain */
1961 #ifdef RE_TRACK_PATTERN_OFFSETS
1964 const regnode *nop = NEXTOPER( convert );
1965 mjd_offset= Node_Offset((nop));
1966 mjd_nodelen= Node_Length((nop));
1970 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1971 (int)depth * 2 + 2, "",
1972 (UV)mjd_offset, (UV)mjd_nodelen)
1975 /* But first we check to see if there is a common prefix we can
1976 split out as an EXACT and put in front of the TRIE node. */
1977 trie->startstate= 1;
1978 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1980 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1984 const U32 base = trie->states[ state ].trans.base;
1986 if ( trie->states[state].wordnum )
1989 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1990 if ( ( base + ofs >= trie->uniquecharcount ) &&
1991 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1992 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1994 if ( ++count > 1 ) {
1995 SV **tmp = av_fetch( revcharmap, ofs, 0);
1996 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1997 if ( state == 1 ) break;
1999 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2001 PerlIO_printf(Perl_debug_log,
2002 "%*sNew Start State=%"UVuf" Class: [",
2003 (int)depth * 2 + 2, "",
2006 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2007 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2009 TRIE_BITMAP_SET(trie,*ch);
2011 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2013 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2017 TRIE_BITMAP_SET(trie,*ch);
2019 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2020 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2026 SV **tmp = av_fetch( revcharmap, idx, 0);
2028 char *ch = SvPV( *tmp, len );
2030 SV *sv=sv_newmortal();
2031 PerlIO_printf( Perl_debug_log,
2032 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2033 (int)depth * 2 + 2, "",
2035 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2036 PL_colors[0], PL_colors[1],
2037 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2038 PERL_PV_ESCAPE_FIRSTCHAR
2043 OP( convert ) = nodetype;
2044 str=STRING(convert);
2047 STR_LEN(convert) += len;
2053 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2059 regnode *n = convert+NODE_SZ_STR(convert);
2060 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2061 trie->startstate = state;
2062 trie->minlen -= (state - 1);
2063 trie->maxlen -= (state - 1);
2065 /* At least the UNICOS C compiler choked on this
2066 * being argument to DEBUG_r(), so let's just have
2069 #ifdef PERL_EXT_RE_BUILD
2075 regnode *fix = convert;
2076 U32 word = trie->wordcount;
2078 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2079 while( ++fix < n ) {
2080 Set_Node_Offset_Length(fix, 0, 0);
2083 SV ** const tmp = av_fetch( trie_words, word, 0 );
2085 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2086 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2088 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2096 NEXT_OFF(convert) = (U16)(tail - convert);
2097 DEBUG_r(optimize= n);
2103 if ( trie->maxlen ) {
2104 NEXT_OFF( convert ) = (U16)(tail - convert);
2105 ARG_SET( convert, data_slot );
2106 /* Store the offset to the first unabsorbed branch in
2107 jump[0], which is otherwise unused by the jump logic.
2108 We use this when dumping a trie and during optimisation. */
2110 trie->jump[0] = (U16)(nextbranch - convert);
2113 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2114 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2116 OP( convert ) = TRIEC;
2117 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2118 PerlMemShared_free(trie->bitmap);
2121 OP( convert ) = TRIE;
2123 /* store the type in the flags */
2124 convert->flags = nodetype;
2128 + regarglen[ OP( convert ) ];
2130 /* XXX We really should free up the resource in trie now,
2131 as we won't use them - (which resources?) dmq */
2133 /* needed for dumping*/
2134 DEBUG_r(if (optimize) {
2135 regnode *opt = convert;
2137 while ( ++opt < optimize) {
2138 Set_Node_Offset_Length(opt,0,0);
2141 Try to clean up some of the debris left after the
2144 while( optimize < jumper ) {
2145 mjd_nodelen += Node_Length((optimize));
2146 OP( optimize ) = OPTIMIZED;
2147 Set_Node_Offset_Length(optimize,0,0);
2150 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2152 } /* end node insert */
2153 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2154 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2156 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2157 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2159 SvREFCNT_dec(revcharmap);
2163 : trie->startstate>1
2169 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2171 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2173 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2174 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2177 We find the fail state for each state in the trie, this state is the longest proper
2178 suffix of the current states 'word' that is also a proper prefix of another word in our
2179 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2180 the DFA not to have to restart after its tried and failed a word at a given point, it
2181 simply continues as though it had been matching the other word in the first place.
2183 'abcdgu'=~/abcdefg|cdgu/
2184 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2185 fail, which would bring use to the state representing 'd' in the second word where we would
2186 try 'g' and succeed, prodceding to match 'cdgu'.
2188 /* add a fail transition */
2189 const U32 trie_offset = ARG(source);
2190 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2192 const U32 ucharcount = trie->uniquecharcount;
2193 const U32 numstates = trie->statecount;
2194 const U32 ubound = trie->lasttrans + ucharcount;
2198 U32 base = trie->states[ 1 ].trans.base;
2201 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2202 GET_RE_DEBUG_FLAGS_DECL;
2204 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2206 PERL_UNUSED_ARG(depth);
2210 ARG_SET( stclass, data_slot );
2211 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2212 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2213 aho->trie=trie_offset;
2214 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2215 Copy( trie->states, aho->states, numstates, reg_trie_state );
2216 Newxz( q, numstates, U32);
2217 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2220 /* initialize fail[0..1] to be 1 so that we always have
2221 a valid final fail state */
2222 fail[ 0 ] = fail[ 1 ] = 1;
2224 for ( charid = 0; charid < ucharcount ; charid++ ) {
2225 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2227 q[ q_write ] = newstate;
2228 /* set to point at the root */
2229 fail[ q[ q_write++ ] ]=1;
2232 while ( q_read < q_write) {
2233 const U32 cur = q[ q_read++ % numstates ];
2234 base = trie->states[ cur ].trans.base;
2236 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2237 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2239 U32 fail_state = cur;
2242 fail_state = fail[ fail_state ];
2243 fail_base = aho->states[ fail_state ].trans.base;
2244 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2246 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2247 fail[ ch_state ] = fail_state;
2248 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2250 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2252 q[ q_write++ % numstates] = ch_state;
2256 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2257 when we fail in state 1, this allows us to use the
2258 charclass scan to find a valid start char. This is based on the principle
2259 that theres a good chance the string being searched contains lots of stuff
2260 that cant be a start char.
2262 fail[ 0 ] = fail[ 1 ] = 0;
2263 DEBUG_TRIE_COMPILE_r({
2264 PerlIO_printf(Perl_debug_log,
2265 "%*sStclass Failtable (%"UVuf" states): 0",
2266 (int)(depth * 2), "", (UV)numstates
2268 for( q_read=1; q_read<numstates; q_read++ ) {
2269 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2271 PerlIO_printf(Perl_debug_log, "\n");
2274 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2279 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2280 * These need to be revisited when a newer toolchain becomes available.
2282 #if defined(__sparc64__) && defined(__GNUC__)
2283 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2284 # undef SPARC64_GCC_WORKAROUND
2285 # define SPARC64_GCC_WORKAROUND 1
2289 #define DEBUG_PEEP(str,scan,depth) \
2290 DEBUG_OPTIMISE_r({if (scan){ \
2291 SV * const mysv=sv_newmortal(); \
2292 regnode *Next = regnext(scan); \
2293 regprop(RExC_rx, mysv, scan); \
2294 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2295 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2296 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2303 #define JOIN_EXACT(scan,min,flags) \
2304 if (PL_regkind[OP(scan)] == EXACT) \
2305 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2308 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2309 /* Merge several consecutive EXACTish nodes into one. */
2310 regnode *n = regnext(scan);
2312 regnode *next = scan + NODE_SZ_STR(scan);
2316 regnode *stop = scan;
2317 GET_RE_DEBUG_FLAGS_DECL;
2319 PERL_UNUSED_ARG(depth);
2322 PERL_ARGS_ASSERT_JOIN_EXACT;
2323 #ifndef EXPERIMENTAL_INPLACESCAN
2324 PERL_UNUSED_ARG(flags);
2325 PERL_UNUSED_ARG(val);
2327 DEBUG_PEEP("join",scan,depth);
2329 /* Skip NOTHING, merge EXACT*. */
2331 ( PL_regkind[OP(n)] == NOTHING ||
2332 (stringok && (OP(n) == OP(scan))))
2334 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2336 if (OP(n) == TAIL || n > next)
2338 if (PL_regkind[OP(n)] == NOTHING) {
2339 DEBUG_PEEP("skip:",n,depth);
2340 NEXT_OFF(scan) += NEXT_OFF(n);
2341 next = n + NODE_STEP_REGNODE;
2348 else if (stringok) {
2349 const unsigned int oldl = STR_LEN(scan);
2350 regnode * const nnext = regnext(n);
2352 DEBUG_PEEP("merg",n,depth);
2355 if (oldl + STR_LEN(n) > U8_MAX)
2357 NEXT_OFF(scan) += NEXT_OFF(n);
2358 STR_LEN(scan) += STR_LEN(n);
2359 next = n + NODE_SZ_STR(n);
2360 /* Now we can overwrite *n : */
2361 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2369 #ifdef EXPERIMENTAL_INPLACESCAN
2370 if (flags && !NEXT_OFF(n)) {
2371 DEBUG_PEEP("atch", val, depth);
2372 if (reg_off_by_arg[OP(n)]) {
2373 ARG_SET(n, val - n);
2376 NEXT_OFF(n) = val - n;
2383 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2385 Two problematic code points in Unicode casefolding of EXACT nodes:
2387 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2388 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2394 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2395 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2397 This means that in case-insensitive matching (or "loose matching",
2398 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2399 length of the above casefolded versions) can match a target string
2400 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2401 This would rather mess up the minimum length computation.
2403 What we'll do is to look for the tail four bytes, and then peek
2404 at the preceding two bytes to see whether we need to decrease
2405 the minimum length by four (six minus two).
2407 Thanks to the design of UTF-8, there cannot be false matches:
2408 A sequence of valid UTF-8 bytes cannot be a subsequence of
2409 another valid sequence of UTF-8 bytes.
2412 char * const s0 = STRING(scan), *s, *t;
2413 char * const s1 = s0 + STR_LEN(scan) - 1;
2414 char * const s2 = s1 - 4;
2415 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2416 const char t0[] = "\xaf\x49\xaf\x42";
2418 const char t0[] = "\xcc\x88\xcc\x81";
2420 const char * const t1 = t0 + 3;
2423 s < s2 && (t = ninstr(s, s1, t0, t1));
2426 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2427 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2429 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2430 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2438 n = scan + NODE_SZ_STR(scan);
2440 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2447 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2451 /* REx optimizer. Converts nodes into quickier variants "in place".
2452 Finds fixed substrings. */
2454 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2455 to the position after last scanned or to NULL. */
2457 #define INIT_AND_WITHP \
2458 assert(!and_withp); \
2459 Newx(and_withp,1,struct regnode_charclass_class); \
2460 SAVEFREEPV(and_withp)
2462 /* this is a chain of data about sub patterns we are processing that
2463 need to be handled seperately/specially in study_chunk. Its so
2464 we can simulate recursion without losing state. */
2466 typedef struct scan_frame {
2467 regnode *last; /* last node to process in this frame */
2468 regnode *next; /* next node to process when last is reached */
2469 struct scan_frame *prev; /*previous frame*/
2470 I32 stop; /* what stopparen do we use */
2474 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2476 #define CASE_SYNST_FNC(nAmE) \
2478 if (flags & SCF_DO_STCLASS_AND) { \
2479 for (value = 0; value < 256; value++) \
2480 if (!is_ ## nAmE ## _cp(value)) \
2481 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2484 for (value = 0; value < 256; value++) \
2485 if (is_ ## nAmE ## _cp(value)) \
2486 ANYOF_BITMAP_SET(data->start_class, value); \
2490 if (flags & SCF_DO_STCLASS_AND) { \
2491 for (value = 0; value < 256; value++) \
2492 if (is_ ## nAmE ## _cp(value)) \
2493 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2496 for (value = 0; value < 256; value++) \
2497 if (!is_ ## nAmE ## _cp(value)) \
2498 ANYOF_BITMAP_SET(data->start_class, value); \
2505 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2506 I32 *minlenp, I32 *deltap,
2511 struct regnode_charclass_class *and_withp,
2512 U32 flags, U32 depth)
2513 /* scanp: Start here (read-write). */
2514 /* deltap: Write maxlen-minlen here. */
2515 /* last: Stop before this one. */
2516 /* data: string data about the pattern */
2517 /* stopparen: treat close N as END */
2518 /* recursed: which subroutines have we recursed into */
2519 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2522 I32 min = 0, pars = 0, code;
2523 regnode *scan = *scanp, *next;
2525 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2526 int is_inf_internal = 0; /* The studied chunk is infinite */
2527 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2528 scan_data_t data_fake;
2529 SV *re_trie_maxbuff = NULL;
2530 regnode *first_non_open = scan;
2531 I32 stopmin = I32_MAX;
2532 scan_frame *frame = NULL;
2533 GET_RE_DEBUG_FLAGS_DECL;
2535 PERL_ARGS_ASSERT_STUDY_CHUNK;
2538 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2542 while (first_non_open && OP(first_non_open) == OPEN)
2543 first_non_open=regnext(first_non_open);
2548 while ( scan && OP(scan) != END && scan < last ){
2549 /* Peephole optimizer: */
2550 DEBUG_STUDYDATA("Peep:", data,depth);
2551 DEBUG_PEEP("Peep",scan,depth);
2552 JOIN_EXACT(scan,&min,0);
2554 /* Follow the next-chain of the current node and optimize
2555 away all the NOTHINGs from it. */
2556 if (OP(scan) != CURLYX) {
2557 const int max = (reg_off_by_arg[OP(scan)]
2559 /* I32 may be smaller than U16 on CRAYs! */
2560 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2561 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2565 /* Skip NOTHING and LONGJMP. */
2566 while ((n = regnext(n))
2567 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2568 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2569 && off + noff < max)
2571 if (reg_off_by_arg[OP(scan)])
2574 NEXT_OFF(scan) = off;
2579 /* The principal pseudo-switch. Cannot be a switch, since we
2580 look into several different things. */
2581 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2582 || OP(scan) == IFTHEN) {
2583 next = regnext(scan);
2585 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2587 if (OP(next) == code || code == IFTHEN) {
2588 /* NOTE - There is similar code to this block below for handling
2589 TRIE nodes on a re-study. If you change stuff here check there
2591 I32 max1 = 0, min1 = I32_MAX, num = 0;
2592 struct regnode_charclass_class accum;
2593 regnode * const startbranch=scan;
2595 if (flags & SCF_DO_SUBSTR)
2596 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2597 if (flags & SCF_DO_STCLASS)
2598 cl_init_zero(pRExC_state, &accum);
2600 while (OP(scan) == code) {
2601 I32 deltanext, minnext, f = 0, fake;
2602 struct regnode_charclass_class this_class;
2605 data_fake.flags = 0;
2607 data_fake.whilem_c = data->whilem_c;
2608 data_fake.last_closep = data->last_closep;
2611 data_fake.last_closep = &fake;
2613 data_fake.pos_delta = delta;
2614 next = regnext(scan);
2615 scan = NEXTOPER(scan);
2617 scan = NEXTOPER(scan);
2618 if (flags & SCF_DO_STCLASS) {
2619 cl_init(pRExC_state, &this_class);
2620 data_fake.start_class = &this_class;
2621 f = SCF_DO_STCLASS_AND;
2623 if (flags & SCF_WHILEM_VISITED_POS)
2624 f |= SCF_WHILEM_VISITED_POS;
2626 /* we suppose the run is continuous, last=next...*/
2627 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2629 stopparen, recursed, NULL, f,depth+1);
2632 if (max1 < minnext + deltanext)
2633 max1 = minnext + deltanext;
2634 if (deltanext == I32_MAX)
2635 is_inf = is_inf_internal = 1;
2637 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2639 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2640 if ( stopmin > minnext)
2641 stopmin = min + min1;
2642 flags &= ~SCF_DO_SUBSTR;
2644 data->flags |= SCF_SEEN_ACCEPT;
2647 if (data_fake.flags & SF_HAS_EVAL)
2648 data->flags |= SF_HAS_EVAL;
2649 data->whilem_c = data_fake.whilem_c;
2651 if (flags & SCF_DO_STCLASS)
2652 cl_or(pRExC_state, &accum, &this_class);
2654 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2656 if (flags & SCF_DO_SUBSTR) {
2657 data->pos_min += min1;
2658 data->pos_delta += max1 - min1;
2659 if (max1 != min1 || is_inf)
2660 data->longest = &(data->longest_float);
2663 delta += max1 - min1;
2664 if (flags & SCF_DO_STCLASS_OR) {
2665 cl_or(pRExC_state, data->start_class, &accum);
2667 cl_and(data->start_class, and_withp);
2668 flags &= ~SCF_DO_STCLASS;
2671 else if (flags & SCF_DO_STCLASS_AND) {
2673 cl_and(data->start_class, &accum);
2674 flags &= ~SCF_DO_STCLASS;
2677 /* Switch to OR mode: cache the old value of
2678 * data->start_class */
2680 StructCopy(data->start_class, and_withp,
2681 struct regnode_charclass_class);
2682 flags &= ~SCF_DO_STCLASS_AND;
2683 StructCopy(&accum, data->start_class,
2684 struct regnode_charclass_class);
2685 flags |= SCF_DO_STCLASS_OR;
2686 data->start_class->flags |= ANYOF_EOS;
2690 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2693 Assuming this was/is a branch we are dealing with: 'scan' now
2694 points at the item that follows the branch sequence, whatever
2695 it is. We now start at the beginning of the sequence and look
2702 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2704 If we can find such a subseqence we need to turn the first
2705 element into a trie and then add the subsequent branch exact
2706 strings to the trie.
2710 1. patterns where the whole set of branch can be converted.
2712 2. patterns where only a subset can be converted.
2714 In case 1 we can replace the whole set with a single regop
2715 for the trie. In case 2 we need to keep the start and end
2718 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2719 becomes BRANCH TRIE; BRANCH X;
2721 There is an additional case, that being where there is a
2722 common prefix, which gets split out into an EXACT like node
2723 preceding the TRIE node.
2725 If x(1..n)==tail then we can do a simple trie, if not we make
2726 a "jump" trie, such that when we match the appropriate word
2727 we "jump" to the appopriate tail node. Essentailly we turn
2728 a nested if into a case structure of sorts.
2733 if (!re_trie_maxbuff) {
2734 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2735 if (!SvIOK(re_trie_maxbuff))
2736 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2738 if ( SvIV(re_trie_maxbuff)>=0 ) {
2740 regnode *first = (regnode *)NULL;
2741 regnode *last = (regnode *)NULL;
2742 regnode *tail = scan;
2747 SV * const mysv = sv_newmortal(); /* for dumping */
2749 /* var tail is used because there may be a TAIL
2750 regop in the way. Ie, the exacts will point to the
2751 thing following the TAIL, but the last branch will
2752 point at the TAIL. So we advance tail. If we
2753 have nested (?:) we may have to move through several
2757 while ( OP( tail ) == TAIL ) {
2758 /* this is the TAIL generated by (?:) */
2759 tail = regnext( tail );
2764 regprop(RExC_rx, mysv, tail );
2765 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2766 (int)depth * 2 + 2, "",
2767 "Looking for TRIE'able sequences. Tail node is: ",
2768 SvPV_nolen_const( mysv )
2774 step through the branches, cur represents each
2775 branch, noper is the first thing to be matched
2776 as part of that branch and noper_next is the
2777 regnext() of that node. if noper is an EXACT
2778 and noper_next is the same as scan (our current
2779 position in the regex) then the EXACT branch is
2780 a possible optimization target. Once we have
2781 two or more consequetive such branches we can
2782 create a trie of the EXACT's contents and stich
2783 it in place. If the sequence represents all of
2784 the branches we eliminate the whole thing and
2785 replace it with a single TRIE. If it is a
2786 subsequence then we need to stitch it in. This
2787 means the first branch has to remain, and needs
2788 to be repointed at the item on the branch chain
2789 following the last branch optimized. This could
2790 be either a BRANCH, in which case the
2791 subsequence is internal, or it could be the
2792 item following the branch sequence in which
2793 case the subsequence is at the end.
2797 /* dont use tail as the end marker for this traverse */
2798 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2799 regnode * const noper = NEXTOPER( cur );
2800 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2801 regnode * const noper_next = regnext( noper );
2805 regprop(RExC_rx, mysv, cur);
2806 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2807 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2809 regprop(RExC_rx, mysv, noper);
2810 PerlIO_printf( Perl_debug_log, " -> %s",
2811 SvPV_nolen_const(mysv));
2814 regprop(RExC_rx, mysv, noper_next );
2815 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2816 SvPV_nolen_const(mysv));
2818 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2819 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2821 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2822 : PL_regkind[ OP( noper ) ] == EXACT )
2823 || OP(noper) == NOTHING )
2825 && noper_next == tail
2830 if ( !first || optype == NOTHING ) {
2831 if (!first) first = cur;
2832 optype = OP( noper );
2838 Currently we do not believe that the trie logic can
2839 handle case insensitive matching properly when the
2840 pattern is not unicode (thus forcing unicode semantics).
2842 If/when this is fixed the following define can be swapped
2843 in below to fully enable trie logic.
2845 #define TRIE_TYPE_IS_SAFE 1
2848 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2850 if ( last && TRIE_TYPE_IS_SAFE ) {
2851 make_trie( pRExC_state,
2852 startbranch, first, cur, tail, count,
2855 if ( PL_regkind[ OP( noper ) ] == EXACT
2857 && noper_next == tail
2862 optype = OP( noper );
2872 regprop(RExC_rx, mysv, cur);
2873 PerlIO_printf( Perl_debug_log,
2874 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2875 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2879 if ( last && TRIE_TYPE_IS_SAFE ) {
2880 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2881 #ifdef TRIE_STUDY_OPT
2882 if ( ((made == MADE_EXACT_TRIE &&
2883 startbranch == first)
2884 || ( first_non_open == first )) &&
2886 flags |= SCF_TRIE_RESTUDY;
2887 if ( startbranch == first
2890 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2900 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2901 scan = NEXTOPER(NEXTOPER(scan));
2902 } else /* single branch is optimized. */
2903 scan = NEXTOPER(scan);
2905 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2906 scan_frame *newframe = NULL;
2911 if (OP(scan) != SUSPEND) {
2912 /* set the pointer */
2913 if (OP(scan) == GOSUB) {
2915 RExC_recurse[ARG2L(scan)] = scan;
2916 start = RExC_open_parens[paren-1];
2917 end = RExC_close_parens[paren-1];
2920 start = RExC_rxi->program + 1;
2924 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2925 SAVEFREEPV(recursed);
2927 if (!PAREN_TEST(recursed,paren+1)) {
2928 PAREN_SET(recursed,paren+1);
2929 Newx(newframe,1,scan_frame);
2931 if (flags & SCF_DO_SUBSTR) {
2932 SCAN_COMMIT(pRExC_state,data,minlenp);
2933 data->longest = &(data->longest_float);
2935 is_inf = is_inf_internal = 1;
2936 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2937 cl_anything(pRExC_state, data->start_class);
2938 flags &= ~SCF_DO_STCLASS;
2941 Newx(newframe,1,scan_frame);
2944 end = regnext(scan);
2949 SAVEFREEPV(newframe);
2950 newframe->next = regnext(scan);
2951 newframe->last = last;
2952 newframe->stop = stopparen;
2953 newframe->prev = frame;
2963 else if (OP(scan) == EXACT) {
2964 I32 l = STR_LEN(scan);
2967 const U8 * const s = (U8*)STRING(scan);
2968 l = utf8_length(s, s + l);
2969 uc = utf8_to_uvchr(s, NULL);
2971 uc = *((U8*)STRING(scan));
2974 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2975 /* The code below prefers earlier match for fixed
2976 offset, later match for variable offset. */
2977 if (data->last_end == -1) { /* Update the start info. */
2978 data->last_start_min = data->pos_min;
2979 data->last_start_max = is_inf
2980 ? I32_MAX : data->pos_min + data->pos_delta;
2982 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2984 SvUTF8_on(data->last_found);
2986 SV * const sv = data->last_found;
2987 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2988 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2989 if (mg && mg->mg_len >= 0)
2990 mg->mg_len += utf8_length((U8*)STRING(scan),
2991 (U8*)STRING(scan)+STR_LEN(scan));
2993 data->last_end = data->pos_min + l;
2994 data->pos_min += l; /* As in the first entry. */
2995 data->flags &= ~SF_BEFORE_EOL;
2997 if (flags & SCF_DO_STCLASS_AND) {
2998 /* Check whether it is compatible with what we know already! */
3002 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3003 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3004 && (!(data->start_class->flags & ANYOF_FOLD)
3005 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3008 ANYOF_CLASS_ZERO(data->start_class);
3009 ANYOF_BITMAP_ZERO(data->start_class);
3011 ANYOF_BITMAP_SET(data->start_class, uc);
3012 data->start_class->flags &= ~ANYOF_EOS;
3014 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3016 else if (flags & SCF_DO_STCLASS_OR) {
3017 /* false positive possible if the class is case-folded */
3019 ANYOF_BITMAP_SET(data->start_class, uc);
3021 data->start_class->flags |= ANYOF_UNICODE_ALL;
3022 data->start_class->flags &= ~ANYOF_EOS;
3023 cl_and(data->start_class, and_withp);
3025 flags &= ~SCF_DO_STCLASS;
3027 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3028 I32 l = STR_LEN(scan);
3029 UV uc = *((U8*)STRING(scan));
3031 /* Search for fixed substrings supports EXACT only. */
3032 if (flags & SCF_DO_SUBSTR) {
3034 SCAN_COMMIT(pRExC_state, data, minlenp);
3037 const U8 * const s = (U8 *)STRING(scan);
3038 l = utf8_length(s, s + l);
3039 uc = utf8_to_uvchr(s, NULL);
3042 if (flags & SCF_DO_SUBSTR)
3044 if (flags & SCF_DO_STCLASS_AND) {
3045 /* Check whether it is compatible with what we know already! */
3049 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3050 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3051 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3053 ANYOF_CLASS_ZERO(data->start_class);
3054 ANYOF_BITMAP_ZERO(data->start_class);
3056 ANYOF_BITMAP_SET(data->start_class, uc);
3057 data->start_class->flags &= ~ANYOF_EOS;
3058 data->start_class->flags |= ANYOF_FOLD;
3059 if (OP(scan) == EXACTFL)
3060 data->start_class->flags |= ANYOF_LOCALE;
3063 else if (flags & SCF_DO_STCLASS_OR) {
3064 if (data->start_class->flags & ANYOF_FOLD) {
3065 /* false positive possible if the class is case-folded.
3066 Assume that the locale settings are the same... */
3068 ANYOF_BITMAP_SET(data->start_class, uc);
3069 data->start_class->flags &= ~ANYOF_EOS;
3071 cl_and(data->start_class, and_withp);
3073 flags &= ~SCF_DO_STCLASS;
3075 else if (strchr((const char*)PL_varies,OP(scan))) {
3076 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3077 I32 f = flags, pos_before = 0;
3078 regnode * const oscan = scan;
3079 struct regnode_charclass_class this_class;
3080 struct regnode_charclass_class *oclass = NULL;
3081 I32 next_is_eval = 0;
3083 switch (PL_regkind[OP(scan)]) {
3084 case WHILEM: /* End of (?:...)* . */
3085 scan = NEXTOPER(scan);
3088 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3089 next = NEXTOPER(scan);
3090 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3092 maxcount = REG_INFTY;
3093 next = regnext(scan);
3094 scan = NEXTOPER(scan);
3098 if (flags & SCF_DO_SUBSTR)
3103 if (flags & SCF_DO_STCLASS) {
3105 maxcount = REG_INFTY;
3106 next = regnext(scan);
3107 scan = NEXTOPER(scan);
3110 is_inf = is_inf_internal = 1;
3111 scan = regnext(scan);
3112 if (flags & SCF_DO_SUBSTR) {
3113 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3114 data->longest = &(data->longest_float);
3116 goto optimize_curly_tail;
3118 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3119 && (scan->flags == stopparen))
3124 mincount = ARG1(scan);
3125 maxcount = ARG2(scan);
3127 next = regnext(scan);
3128 if (OP(scan) == CURLYX) {
3129 I32 lp = (data ? *(data->last_closep) : 0);
3130 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3132 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3133 next_is_eval = (OP(scan) == EVAL);
3135 if (flags & SCF_DO_SUBSTR) {
3136 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3137 pos_before = data->pos_min;
3141 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3143 data->flags |= SF_IS_INF;
3145 if (flags & SCF_DO_STCLASS) {
3146 cl_init(pRExC_state, &this_class);
3147 oclass = data->start_class;
3148 data->start_class = &this_class;
3149 f |= SCF_DO_STCLASS_AND;
3150 f &= ~SCF_DO_STCLASS_OR;
3152 /* These are the cases when once a subexpression
3153 fails at a particular position, it cannot succeed
3154 even after backtracking at the enclosing scope.
3156 XXXX what if minimal match and we are at the
3157 initial run of {n,m}? */
3158 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3159 f &= ~SCF_WHILEM_VISITED_POS;
3161 /* This will finish on WHILEM, setting scan, or on NULL: */
3162 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3163 last, data, stopparen, recursed, NULL,
3165 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3167 if (flags & SCF_DO_STCLASS)
3168 data->start_class = oclass;
3169 if (mincount == 0 || minnext == 0) {
3170 if (flags & SCF_DO_STCLASS_OR) {
3171 cl_or(pRExC_state, data->start_class, &this_class);
3173 else if (flags & SCF_DO_STCLASS_AND) {
3174 /* Switch to OR mode: cache the old value of
3175 * data->start_class */
3177 StructCopy(data->start_class, and_withp,
3178 struct regnode_charclass_class);
3179 flags &= ~SCF_DO_STCLASS_AND;
3180 StructCopy(&this_class, data->start_class,
3181 struct regnode_charclass_class);
3182 flags |= SCF_DO_STCLASS_OR;
3183 data->start_class->flags |= ANYOF_EOS;
3185 } else { /* Non-zero len */
3186 if (flags & SCF_DO_STCLASS_OR) {
3187 cl_or(pRExC_state, data->start_class, &this_class);
3188 cl_and(data->start_class, and_withp);
3190 else if (flags & SCF_DO_STCLASS_AND)
3191 cl_and(data->start_class, &this_class);
3192 flags &= ~SCF_DO_STCLASS;
3194 if (!scan) /* It was not CURLYX, but CURLY. */
3196 if ( /* ? quantifier ok, except for (?{ ... }) */
3197 (next_is_eval || !(mincount == 0 && maxcount == 1))
3198 && (minnext == 0) && (deltanext == 0)
3199 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3200 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3202 ckWARNreg(RExC_parse,
3203 "Quantifier unexpected on zero-length expression");
3206 min += minnext * mincount;
3207 is_inf_internal |= ((maxcount == REG_INFTY
3208 && (minnext + deltanext) > 0)
3209 || deltanext == I32_MAX);
3210 is_inf |= is_inf_internal;
3211 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3213 /* Try powerful optimization CURLYX => CURLYN. */
3214 if ( OP(oscan) == CURLYX && data
3215 && data->flags & SF_IN_PAR
3216 && !(data->flags & SF_HAS_EVAL)
3217 && !deltanext && minnext == 1 ) {
3218 /* Try to optimize to CURLYN. */
3219 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3220 regnode * const nxt1 = nxt;
3227 if (!strchr((const char*)PL_simple,OP(nxt))
3228 && !(PL_regkind[OP(nxt)] == EXACT
3229 && STR_LEN(nxt) == 1))
3235 if (OP(nxt) != CLOSE)
3237 if (RExC_open_parens) {
3238 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3239 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3241 /* Now we know that nxt2 is the only contents: */
3242 oscan->flags = (U8)ARG(nxt);
3244 OP(nxt1) = NOTHING; /* was OPEN. */
3247 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3248 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3249 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3250 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3251 OP(nxt + 1) = OPTIMIZED; /* was count. */
3252 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3257 /* Try optimization CURLYX => CURLYM. */
3258 if ( OP(oscan) == CURLYX && data
3259 && !(data->flags & SF_HAS_PAR)
3260 && !(data->flags & SF_HAS_EVAL)
3261 && !deltanext /* atom is fixed width */
3262 && minnext != 0 /* CURLYM can't handle zero width */
3264 /* XXXX How to optimize if data == 0? */
3265 /* Optimize to a simpler form. */
3266 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3270 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3271 && (OP(nxt2) != WHILEM))
3273 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3274 /* Need to optimize away parenths. */
3275 if (data->flags & SF_IN_PAR) {
3276 /* Set the parenth number. */
3277 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3279 if (OP(nxt) != CLOSE)
3280 FAIL("Panic opt close");
3281 oscan->flags = (U8)ARG(nxt);
3282 if (RExC_open_parens) {
3283 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3284 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3286 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3287 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3290 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3291 OP(nxt + 1) = OPTIMIZED; /* was count. */
3292 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3293 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3296 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3297 regnode *nnxt = regnext(nxt1);
3300 if (reg_off_by_arg[OP(nxt1)])
3301 ARG_SET(nxt1, nxt2 - nxt1);
3302 else if (nxt2 - nxt1 < U16_MAX)
3303 NEXT_OFF(nxt1) = nxt2 - nxt1;
3305 OP(nxt) = NOTHING; /* Cannot beautify */
3310 /* Optimize again: */
3311 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3312 NULL, stopparen, recursed, NULL, 0,depth+1);
3317 else if ((OP(oscan) == CURLYX)
3318 && (flags & SCF_WHILEM_VISITED_POS)
3319 /* See the comment on a similar expression above.
3320 However, this time it not a subexpression
3321 we care about, but the expression itself. */
3322 && (maxcount == REG_INFTY)
3323 && data && ++data->whilem_c < 16) {
3324 /* This stays as CURLYX, we can put the count/of pair. */
3325 /* Find WHILEM (as in regexec.c) */
3326 regnode *nxt = oscan + NEXT_OFF(oscan);
3328 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3330 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3331 | (RExC_whilem_seen << 4)); /* On WHILEM */
3333 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3335 if (flags & SCF_DO_SUBSTR) {
3336 SV *last_str = NULL;
3337 int counted = mincount != 0;
3339 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3340 #if defined(SPARC64_GCC_WORKAROUND)
3343 const char *s = NULL;
3346 if (pos_before >= data->last_start_min)
3349 b = data->last_start_min;
3352 s = SvPV_const(data->last_found, l);
3353 old = b - data->last_start_min;
3356 I32 b = pos_before >= data->last_start_min
3357 ? pos_before : data->last_start_min;
3359 const char * const s = SvPV_const(data->last_found, l);
3360 I32 old = b - data->last_start_min;
3364 old = utf8_hop((U8*)s, old) - (U8*)s;
3367 /* Get the added string: */
3368 last_str = newSVpvn_utf8(s + old, l, UTF);
3369 if (deltanext == 0 && pos_before == b) {
3370 /* What was added is a constant string */
3372 SvGROW(last_str, (mincount * l) + 1);
3373 repeatcpy(SvPVX(last_str) + l,
3374 SvPVX_const(last_str), l, mincount - 1);
3375 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3376 /* Add additional parts. */
3377 SvCUR_set(data->last_found,
3378 SvCUR(data->last_found) - l);
3379 sv_catsv(data->last_found, last_str);
3381 SV * sv = data->last_found;
3383 SvUTF8(sv) && SvMAGICAL(sv) ?
3384 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3385 if (mg && mg->mg_len >= 0)
3386 mg->mg_len += CHR_SVLEN(last_str) - l;
3388 data->last_end += l * (mincount - 1);
3391 /* start offset must point into the last copy */
3392 data->last_start_min += minnext * (mincount - 1);
3393 data->last_start_max += is_inf ? I32_MAX
3394 : (maxcount - 1) * (minnext + data->pos_delta);
3397 /* It is counted once already... */
3398 data->pos_min += minnext * (mincount - counted);
3399 data->pos_delta += - counted * deltanext +
3400 (minnext + deltanext) * maxcount - minnext * mincount;
3401 if (mincount != maxcount) {
3402 /* Cannot extend fixed substrings found inside
3404 SCAN_COMMIT(pRExC_state,data,minlenp);
3405 if (mincount && last_str) {
3406 SV * const sv = data->last_found;
3407 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3408 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3412 sv_setsv(sv, last_str);
3413 data->last_end = data->pos_min;
3414 data->last_start_min =
3415 data->pos_min - CHR_SVLEN(last_str);
3416 data->last_start_max = is_inf
3418 : data->pos_min + data->pos_delta
3419 - CHR_SVLEN(last_str);
3421 data->longest = &(data->longest_float);
3423 SvREFCNT_dec(last_str);
3425 if (data && (fl & SF_HAS_EVAL))
3426 data->flags |= SF_HAS_EVAL;
3427 optimize_curly_tail:
3428 if (OP(oscan) != CURLYX) {
3429 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3431 NEXT_OFF(oscan) += NEXT_OFF(next);
3434 default: /* REF and CLUMP only? */
3435 if (flags & SCF_DO_SUBSTR) {
3436 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3437 data->longest = &(data->longest_float);
3439 is_inf = is_inf_internal = 1;
3440 if (flags & SCF_DO_STCLASS_OR)
3441 cl_anything(pRExC_state, data->start_class);
3442 flags &= ~SCF_DO_STCLASS;
3446 else if (OP(scan) == LNBREAK) {
3447 if (flags & SCF_DO_STCLASS) {
3449 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3450 if (flags & SCF_DO_STCLASS_AND) {
3451 for (value = 0; value < 256; value++)
3452 if (!is_VERTWS_cp(value))
3453 ANYOF_BITMAP_CLEAR(data->start_class, value);
3456 for (value = 0; value < 256; value++)
3457 if (is_VERTWS_cp(value))
3458 ANYOF_BITMAP_SET(data->start_class, value);
3460 if (flags & SCF_DO_STCLASS_OR)
3461 cl_and(data->start_class, and_withp);
3462 flags &= ~SCF_DO_STCLASS;
3466 if (flags & SCF_DO_SUBSTR) {
3467 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3469 data->pos_delta += 1;
3470 data->longest = &(data->longest_float);
3474 else if (OP(scan) == FOLDCHAR) {
3475 int d = ARG(scan)==0xDF ? 1 : 2;
3476 flags &= ~SCF_DO_STCLASS;
3479 if (flags & SCF_DO_SUBSTR) {
3480 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3482 data->pos_delta += d;
3483 data->longest = &(data->longest_float);
3486 else if (strchr((const char*)PL_simple,OP(scan))) {
3489 if (flags & SCF_DO_SUBSTR) {
3490 SCAN_COMMIT(pRExC_state,data,minlenp);
3494 if (flags & SCF_DO_STCLASS) {
3495 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3497 /* Some of the logic below assumes that switching
3498 locale on will only add false positives. */
3499 switch (PL_regkind[OP(scan)]) {
3503 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3504 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3505 cl_anything(pRExC_state, data->start_class);
3508 if (OP(scan) == SANY)
3510 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3511 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3512 || (data->start_class->flags & ANYOF_CLASS));
3513 cl_anything(pRExC_state, data->start_class);
3515 if (flags & SCF_DO_STCLASS_AND || !value)
3516 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3519 if (flags & SCF_DO_STCLASS_AND)
3520 cl_and(data->start_class,
3521 (struct regnode_charclass_class*)scan);
3523 cl_or(pRExC_state, data->start_class,
3524 (struct regnode_charclass_class*)scan);
3527 if (flags & SCF_DO_STCLASS_AND) {
3528 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3530 for (value = 0; value < 256; value++)
3531 if (!isALNUM(value))
3532 ANYOF_BITMAP_CLEAR(data->start_class, value);
3536 if (data->start_class->flags & ANYOF_LOCALE)
3537 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3539 for (value = 0; value < 256; value++)
3541 ANYOF_BITMAP_SET(data->start_class, value);
3546 if (flags & SCF_DO_STCLASS_AND) {
3547 if (data->start_class->flags & ANYOF_LOCALE)
3548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3551 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3552 data->start_class->flags |= ANYOF_LOCALE;
3556 if (flags & SCF_DO_STCLASS_AND) {
3557 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3558 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3559 for (value = 0; value < 256; value++)
3561 ANYOF_BITMAP_CLEAR(data->start_class, value);
3565 if (data->start_class->flags & ANYOF_LOCALE)
3566 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3568 for (value = 0; value < 256; value++)
3569 if (!isALNUM(value))
3570 ANYOF_BITMAP_SET(data->start_class, value);
3575 if (flags & SCF_DO_STCLASS_AND) {
3576 if (data->start_class->flags & ANYOF_LOCALE)
3577 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3580 data->start_class->flags |= ANYOF_LOCALE;
3581 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3585 if (flags & SCF_DO_STCLASS_AND) {
3586 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3587 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3588 for (value = 0; value < 256; value++)
3589 if (!isSPACE(value))
3590 ANYOF_BITMAP_CLEAR(data->start_class, value);
3594 if (data->start_class->flags & ANYOF_LOCALE)
3595 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3597 for (value = 0; value < 256; value++)
3599 ANYOF_BITMAP_SET(data->start_class, value);
3604 if (flags & SCF_DO_STCLASS_AND) {
3605 if (data->start_class->flags & ANYOF_LOCALE)
3606 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3609 data->start_class->flags |= ANYOF_LOCALE;
3610 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3614 if (flags & SCF_DO_STCLASS_AND) {
3615 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3617 for (value = 0; value < 256; value++)
3619 ANYOF_BITMAP_CLEAR(data->start_class, value);
3623 if (data->start_class->flags & ANYOF_LOCALE)
3624 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3626 for (value = 0; value < 256; value++)
3627 if (!isSPACE(value))
3628 ANYOF_BITMAP_SET(data->start_class, value);
3633 if (flags & SCF_DO_STCLASS_AND) {
3634 if (data->start_class->flags & ANYOF_LOCALE) {
3635 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3636 for (value = 0; value < 256; value++)
3637 if (!isSPACE(value))
3638 ANYOF_BITMAP_CLEAR(data->start_class, value);
3642 data->start_class->flags |= ANYOF_LOCALE;
3643 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3647 if (flags & SCF_DO_STCLASS_AND) {
3648 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3649 for (value = 0; value < 256; value++)
3650 if (!isDIGIT(value))
3651 ANYOF_BITMAP_CLEAR(data->start_class, value);
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3657 for (value = 0; value < 256; value++)
3659 ANYOF_BITMAP_SET(data->start_class, value);
3664 if (flags & SCF_DO_STCLASS_AND) {
3665 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3666 for (value = 0; value < 256; value++)
3668 ANYOF_BITMAP_CLEAR(data->start_class, value);
3671 if (data->start_class->flags & ANYOF_LOCALE)
3672 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3674 for (value = 0; value < 256; value++)
3675 if (!isDIGIT(value))
3676 ANYOF_BITMAP_SET(data->start_class, value);
3680 CASE_SYNST_FNC(VERTWS);
3681 CASE_SYNST_FNC(HORIZWS);
3684 if (flags & SCF_DO_STCLASS_OR)
3685 cl_and(data->start_class, and_withp);
3686 flags &= ~SCF_DO_STCLASS;
3689 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3690 data->flags |= (OP(scan) == MEOL
3694 else if ( PL_regkind[OP(scan)] == BRANCHJ
3695 /* Lookbehind, or need to calculate parens/evals/stclass: */
3696 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3697 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3698 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3699 || OP(scan) == UNLESSM )
3701 /* Negative Lookahead/lookbehind
3702 In this case we can't do fixed string optimisation.
3705 I32 deltanext, minnext, fake = 0;
3707 struct regnode_charclass_class intrnl;
3710 data_fake.flags = 0;
3712 data_fake.whilem_c = data->whilem_c;
3713 data_fake.last_closep = data->last_closep;
3716 data_fake.last_closep = &fake;
3717 data_fake.pos_delta = delta;
3718 if ( flags & SCF_DO_STCLASS && !scan->flags
3719 && OP(scan) == IFMATCH ) { /* Lookahead */
3720 cl_init(pRExC_state, &intrnl);
3721 data_fake.start_class = &intrnl;
3722 f |= SCF_DO_STCLASS_AND;
3724 if (flags & SCF_WHILEM_VISITED_POS)
3725 f |= SCF_WHILEM_VISITED_POS;
3726 next = regnext(scan);
3727 nscan = NEXTOPER(NEXTOPER(scan));
3728 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3729 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3732 FAIL("Variable length lookbehind not implemented");
3734 else if (minnext > (I32)U8_MAX) {
3735 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3737 scan->flags = (U8)minnext;
3740 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3742 if (data_fake.flags & SF_HAS_EVAL)
3743 data->flags |= SF_HAS_EVAL;
3744 data->whilem_c = data_fake.whilem_c;
3746 if (f & SCF_DO_STCLASS_AND) {
3747 if (flags & SCF_DO_STCLASS_OR) {
3748 /* OR before, AND after: ideally we would recurse with
3749 * data_fake to get the AND applied by study of the
3750 * remainder of the pattern, and then derecurse;
3751 * *** HACK *** for now just treat as "no information".
3752 * See [perl #56690].
3754 cl_init(pRExC_state, data->start_class);
3756 /* AND before and after: combine and continue */
3757 const int was = (data->start_class->flags & ANYOF_EOS);
3759 cl_and(data->start_class, &intrnl);
3761 data->start_class->flags |= ANYOF_EOS;
3765 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3767 /* Positive Lookahead/lookbehind
3768 In this case we can do fixed string optimisation,
3769 but we must be careful about it. Note in the case of
3770 lookbehind the positions will be offset by the minimum
3771 length of the pattern, something we won't know about
3772 until after the recurse.
3774 I32 deltanext, fake = 0;
3776 struct regnode_charclass_class intrnl;
3778 /* We use SAVEFREEPV so that when the full compile
3779 is finished perl will clean up the allocated
3780 minlens when its all done. This was we don't
3781 have to worry about freeing them when we know
3782 they wont be used, which would be a pain.
3785 Newx( minnextp, 1, I32 );
3786 SAVEFREEPV(minnextp);
3789 StructCopy(data, &data_fake, scan_data_t);
3790 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3793 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3794 data_fake.last_found=newSVsv(data->last_found);
3798 data_fake.last_closep = &fake;
3799 data_fake.flags = 0;
3800 data_fake.pos_delta = delta;
3802 data_fake.flags |= SF_IS_INF;
3803 if ( flags & SCF_DO_STCLASS && !scan->flags
3804 && OP(scan) == IFMATCH ) { /* Lookahead */
3805 cl_init(pRExC_state, &intrnl);
3806 data_fake.start_class = &intrnl;
3807 f |= SCF_DO_STCLASS_AND;
3809 if (flags & SCF_WHILEM_VISITED_POS)
3810 f |= SCF_WHILEM_VISITED_POS;
3811 next = regnext(scan);
3812 nscan = NEXTOPER(NEXTOPER(scan));
3814 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3815 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3818 FAIL("Variable length lookbehind not implemented");
3820 else if (*minnextp > (I32)U8_MAX) {
3821 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3823 scan->flags = (U8)*minnextp;
3828 if (f & SCF_DO_STCLASS_AND) {
3829 const int was = (data->start_class->flags & ANYOF_EOS);
3831 cl_and(data->start_class, &intrnl);
3833 data->start_class->flags |= ANYOF_EOS;
3836 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3838 if (data_fake.flags & SF_HAS_EVAL)
3839 data->flags |= SF_HAS_EVAL;
3840 data->whilem_c = data_fake.whilem_c;
3841 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3842 if (RExC_rx->minlen<*minnextp)
3843 RExC_rx->minlen=*minnextp;
3844 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3845 SvREFCNT_dec(data_fake.last_found);
3847 if ( data_fake.minlen_fixed != minlenp )
3849 data->offset_fixed= data_fake.offset_fixed;
3850 data->minlen_fixed= data_fake.minlen_fixed;
3851 data->lookbehind_fixed+= scan->flags;
3853 if ( data_fake.minlen_float != minlenp )
3855 data->minlen_float= data_fake.minlen_float;
3856 data->offset_float_min=data_fake.offset_float_min;
3857 data->offset_float_max=data_fake.offset_float_max;
3858 data->lookbehind_float+= scan->flags;
3867 else if (OP(scan) == OPEN) {
3868 if (stopparen != (I32)ARG(scan))
3871 else if (OP(scan) == CLOSE) {
3872 if (stopparen == (I32)ARG(scan)) {
3875 if ((I32)ARG(scan) == is_par) {
3876 next = regnext(scan);
3878 if ( next && (OP(next) != WHILEM) && next < last)
3879 is_par = 0; /* Disable optimization */
3882 *(data->last_closep) = ARG(scan);
3884 else if (OP(scan) == EVAL) {
3886 data->flags |= SF_HAS_EVAL;
3888 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3889 if (flags & SCF_DO_SUBSTR) {
3890 SCAN_COMMIT(pRExC_state,data,minlenp);
3891 flags &= ~SCF_DO_SUBSTR;
3893 if (data && OP(scan)==ACCEPT) {
3894 data->flags |= SCF_SEEN_ACCEPT;
3899 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3901 if (flags & SCF_DO_SUBSTR) {
3902 SCAN_COMMIT(pRExC_state,data,minlenp);
3903 data->longest = &(data->longest_float);
3905 is_inf = is_inf_internal = 1;
3906 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3907 cl_anything(pRExC_state, data->start_class);
3908 flags &= ~SCF_DO_STCLASS;
3910 else if (OP(scan) == GPOS) {
3911 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3912 !(delta || is_inf || (data && data->pos_delta)))
3914 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3915 RExC_rx->extflags |= RXf_ANCH_GPOS;
3916 if (RExC_rx->gofs < (U32)min)
3917 RExC_rx->gofs = min;
3919 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3923 #ifdef TRIE_STUDY_OPT
3924 #ifdef FULL_TRIE_STUDY
3925 else if (PL_regkind[OP(scan)] == TRIE) {
3926 /* NOTE - There is similar code to this block above for handling
3927 BRANCH nodes on the initial study. If you change stuff here
3929 regnode *trie_node= scan;
3930 regnode *tail= regnext(scan);
3931 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3932 I32 max1 = 0, min1 = I32_MAX;
3933 struct regnode_charclass_class accum;
3935 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3936 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3937 if (flags & SCF_DO_STCLASS)
3938 cl_init_zero(pRExC_state, &accum);
3944 const regnode *nextbranch= NULL;
3947 for ( word=1 ; word <= trie->wordcount ; word++)
3949 I32 deltanext=0, minnext=0, f = 0, fake;
3950 struct regnode_charclass_class this_class;
3952 data_fake.flags = 0;
3954 data_fake.whilem_c = data->whilem_c;
3955 data_fake.last_closep = data->last_closep;
3958 data_fake.last_closep = &fake;
3959 data_fake.pos_delta = delta;
3960 if (flags & SCF_DO_STCLASS) {
3961 cl_init(pRExC_state, &this_class);
3962 data_fake.start_class = &this_class;
3963 f = SCF_DO_STCLASS_AND;
3965 if (flags & SCF_WHILEM_VISITED_POS)
3966 f |= SCF_WHILEM_VISITED_POS;
3968 if (trie->jump[word]) {
3970 nextbranch = trie_node + trie->jump[0];
3971 scan= trie_node + trie->jump[word];
3972 /* We go from the jump point to the branch that follows
3973 it. Note this means we need the vestigal unused branches
3974 even though they arent otherwise used.
3976 minnext = study_chunk(pRExC_state, &scan, minlenp,
3977 &deltanext, (regnode *)nextbranch, &data_fake,
3978 stopparen, recursed, NULL, f,depth+1);
3980 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3981 nextbranch= regnext((regnode*)nextbranch);
3983 if (min1 > (I32)(minnext + trie->minlen))
3984 min1 = minnext + trie->minlen;
3985 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3986 max1 = minnext + deltanext + trie->maxlen;
3987 if (deltanext == I32_MAX)
3988 is_inf = is_inf_internal = 1;
3990 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3992 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3993 if ( stopmin > min + min1)
3994 stopmin = min + min1;
3995 flags &= ~SCF_DO_SUBSTR;
3997 data->flags |= SCF_SEEN_ACCEPT;
4000 if (data_fake.flags & SF_HAS_EVAL)
4001 data->flags |= SF_HAS_EVAL;
4002 data->whilem_c = data_fake.whilem_c;
4004 if (flags & SCF_DO_STCLASS)
4005 cl_or(pRExC_state, &accum, &this_class);
4008 if (flags & SCF_DO_SUBSTR) {
4009 data->pos_min += min1;
4010 data->pos_delta += max1 - min1;
4011 if (max1 != min1 || is_inf)
4012 data->longest = &(data->longest_float);
4015 delta += max1 - min1;
4016 if (flags & SCF_DO_STCLASS_OR) {
4017 cl_or(pRExC_state, data->start_class, &accum);
4019 cl_and(data->start_class, and_withp);
4020 flags &= ~SCF_DO_STCLASS;
4023 else if (flags & SCF_DO_STCLASS_AND) {
4025 cl_and(data->start_class, &accum);
4026 flags &= ~SCF_DO_STCLASS;
4029 /* Switch to OR mode: cache the old value of
4030 * data->start_class */
4032 StructCopy(data->start_class, and_withp,
4033 struct regnode_charclass_class);
4034 flags &= ~SCF_DO_STCLASS_AND;
4035 StructCopy(&accum, data->start_class,
4036 struct regnode_charclass_class);
4037 flags |= SCF_DO_STCLASS_OR;
4038 data->start_class->flags |= ANYOF_EOS;
4045 else if (PL_regkind[OP(scan)] == TRIE) {
4046 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4049 min += trie->minlen;
4050 delta += (trie->maxlen - trie->minlen);
4051 flags &= ~SCF_DO_STCLASS; /* xxx */
4052 if (flags & SCF_DO_SUBSTR) {
4053 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4054 data->pos_min += trie->minlen;
4055 data->pos_delta += (trie->maxlen - trie->minlen);
4056 if (trie->maxlen != trie->minlen)
4057 data->longest = &(data->longest_float);
4059 if (trie->jump) /* no more substrings -- for now /grr*/
4060 flags &= ~SCF_DO_SUBSTR;
4062 #endif /* old or new */
4063 #endif /* TRIE_STUDY_OPT */
4065 /* Else: zero-length, ignore. */
4066 scan = regnext(scan);
4071 stopparen = frame->stop;
4072 frame = frame->prev;
4073 goto fake_study_recurse;
4078 DEBUG_STUDYDATA("pre-fin:",data,depth);
4081 *deltap = is_inf_internal ? I32_MAX : delta;
4082 if (flags & SCF_DO_SUBSTR && is_inf)
4083 data->pos_delta = I32_MAX - data->pos_min;
4084 if (is_par > (I32)U8_MAX)
4086 if (is_par && pars==1 && data) {
4087 data->flags |= SF_IN_PAR;
4088 data->flags &= ~SF_HAS_PAR;
4090 else if (pars && data) {
4091 data->flags |= SF_HAS_PAR;
4092 data->flags &= ~SF_IN_PAR;
4094 if (flags & SCF_DO_STCLASS_OR)
4095 cl_and(data->start_class, and_withp);
4096 if (flags & SCF_TRIE_RESTUDY)
4097 data->flags |= SCF_TRIE_RESTUDY;
4099 DEBUG_STUDYDATA("post-fin:",data,depth);
4101 return min < stopmin ? min : stopmin;
4105 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4107 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4109 PERL_ARGS_ASSERT_ADD_DATA;
4111 Renewc(RExC_rxi->data,
4112 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4113 char, struct reg_data);
4115 Renew(RExC_rxi->data->what, count + n, U8);
4117 Newx(RExC_rxi->data->what, n, U8);
4118 RExC_rxi->data->count = count + n;
4119 Copy(s, RExC_rxi->data->what + count, n, U8);
4123 /*XXX: todo make this not included in a non debugging perl */
4124 #ifndef PERL_IN_XSUB_RE
4126 Perl_reginitcolors(pTHX)
4129 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4131 char *t = savepv(s);
4135 t = strchr(t, '\t');
4141 PL_colors[i] = t = (char *)"";
4146 PL_colors[i++] = (char *)"";
4153 #ifdef TRIE_STUDY_OPT
4154 #define CHECK_RESTUDY_GOTO \
4156 (data.flags & SCF_TRIE_RESTUDY) \
4160 #define CHECK_RESTUDY_GOTO
4164 - pregcomp - compile a regular expression into internal code
4166 * We can't allocate space until we know how big the compiled form will be,
4167 * but we can't compile it (and thus know how big it is) until we've got a
4168 * place to put the code. So we cheat: we compile it twice, once with code
4169 * generation turned off and size counting turned on, and once "for real".
4170 * This also means that we don't allocate space until we are sure that the
4171 * thing really will compile successfully, and we never have to move the
4172 * code and thus invalidate pointers into it. (Note that it has to be in
4173 * one piece because free() must be able to free it all.) [NB: not true in perl]
4175 * Beware that the optimization-preparation code in here knows about some
4176 * of the structure of the compiled regexp. [I'll say.]
4181 #ifndef PERL_IN_XSUB_RE
4182 #define RE_ENGINE_PTR &reh_regexp_engine
4184 extern const struct regexp_engine my_reg_engine;
4185 #define RE_ENGINE_PTR &my_reg_engine
4188 #ifndef PERL_IN_XSUB_RE
4190 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4193 HV * const table = GvHV(PL_hintgv);
4195 PERL_ARGS_ASSERT_PREGCOMP;
4197 /* Dispatch a request to compile a regexp to correct
4200 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4201 GET_RE_DEBUG_FLAGS_DECL;
4202 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4203 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4205 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4208 return CALLREGCOMP_ENG(eng, pattern, flags);
4211 return Perl_re_compile(aTHX_ pattern, flags);
4216 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4221 register regexp_internal *ri;
4223 char *exp = SvPV(pattern, plen);
4224 char* xend = exp + plen;
4231 RExC_state_t RExC_state;
4232 RExC_state_t * const pRExC_state = &RExC_state;
4233 #ifdef TRIE_STUDY_OPT
4235 RExC_state_t copyRExC_state;
4237 GET_RE_DEBUG_FLAGS_DECL;
4239 PERL_ARGS_ASSERT_RE_COMPILE;
4241 DEBUG_r(if (!PL_colorset) reginitcolors());
4243 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4246 SV *dsv= sv_newmortal();
4247 RE_PV_QUOTED_DECL(s, RExC_utf8,
4248 dsv, exp, plen, 60);
4249 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4250 PL_colors[4],PL_colors[5],s);
4255 RExC_flags = pm_flags;
4259 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4260 RExC_seen_evals = 0;
4263 /* First pass: determine size, legality. */
4271 RExC_emit = &PL_regdummy;
4272 RExC_whilem_seen = 0;
4273 RExC_open_parens = NULL;
4274 RExC_close_parens = NULL;
4276 RExC_paren_names = NULL;
4278 RExC_paren_name_list = NULL;
4280 RExC_recurse = NULL;
4281 RExC_recurse_count = 0;
4283 #if 0 /* REGC() is (currently) a NOP at the first pass.
4284 * Clever compilers notice this and complain. --jhi */
4285 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4287 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4288 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4289 RExC_precomp = NULL;
4292 if (RExC_utf8 && !RExC_orig_utf8) {
4293 /* It's possible to write a regexp in ascii that represents Unicode
4294 codepoints outside of the byte range, such as via \x{100}. If we
4295 detect such a sequence we have to convert the entire pattern to utf8
4296 and then recompile, as our sizing calculation will have been based
4297 on 1 byte == 1 character, but we will need to use utf8 to encode
4298 at least some part of the pattern, and therefore must convert the whole
4300 XXX: somehow figure out how to make this less expensive...
4303 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4304 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4305 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4307 RExC_orig_utf8 = RExC_utf8;
4309 goto redo_first_pass;
4312 PerlIO_printf(Perl_debug_log,
4313 "Required size %"IVdf" nodes\n"
4314 "Starting second pass (creation)\n",
4317 RExC_lastparse=NULL;
4319 /* Small enough for pointer-storage convention?
4320 If extralen==0, this means that we will not need long jumps. */
4321 if (RExC_size >= 0x10000L && RExC_extralen)
4322 RExC_size += RExC_extralen;
4325 if (RExC_whilem_seen > 15)
4326 RExC_whilem_seen = 15;
4328 /* Allocate space and zero-initialize. Note, the two step process
4329 of zeroing when in debug mode, thus anything assigned has to
4330 happen after that */
4331 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4332 r = (struct regexp*)SvANY(rx);
4333 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4334 char, regexp_internal);
4335 if ( r == NULL || ri == NULL )
4336 FAIL("Regexp out of space");
4338 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4339 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4341 /* bulk initialize base fields with 0. */
4342 Zero(ri, sizeof(regexp_internal), char);
4345 /* non-zero initialization begins here */
4347 r->engine= RE_ENGINE_PTR;
4348 r->extflags = pm_flags;
4350 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4351 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4352 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4353 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4354 >> RXf_PMf_STD_PMMOD_SHIFT);
4355 const char *fptr = STD_PAT_MODS; /*"msix"*/
4357 const STRLEN wraplen = plen + has_minus + has_p + has_runon
4358 + (sizeof(STD_PAT_MODS) - 1)
4359 + (sizeof("(?:)") - 1);
4361 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4362 SvCUR_set(rx, wraplen);
4364 SvFLAGS(rx) |= SvUTF8(pattern);
4367 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4369 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4370 char *colon = r + 1;
4373 while((ch = *fptr++)) {
4387 Copy(RExC_precomp, p, plen, char);
4388 assert ((RX_WRAPPED(rx) - p) < 16);
4389 r->pre_prefix = p - RX_WRAPPED(rx);
4398 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4400 if (RExC_seen & REG_SEEN_RECURSE) {
4401 Newxz(RExC_open_parens, RExC_npar,regnode *);
4402 SAVEFREEPV(RExC_open_parens);
4403 Newxz(RExC_close_parens,RExC_npar,regnode *);
4404 SAVEFREEPV(RExC_close_parens);
4407 /* Useful during FAIL. */
4408 #ifdef RE_TRACK_PATTERN_OFFSETS
4409 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4410 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4411 "%s %"UVuf" bytes for offset annotations.\n",
4412 ri->u.offsets ? "Got" : "Couldn't get",
4413 (UV)((2*RExC_size+1) * sizeof(U32))));
4415 SetProgLen(ri,RExC_size);
4419 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4421 /* Second pass: emit code. */
4422 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4427 RExC_emit_start = ri->program;
4428 RExC_emit = ri->program;
4429 RExC_emit_bound = ri->program + RExC_size + 1;
4431 /* Store the count of eval-groups for security checks: */
4432 RExC_rx->seen_evals = RExC_seen_evals;
4433 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4434 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4438 /* XXXX To minimize changes to RE engine we always allocate
4439 3-units-long substrs field. */
4440 Newx(r->substrs, 1, struct reg_substr_data);
4441 if (RExC_recurse_count) {
4442 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4443 SAVEFREEPV(RExC_recurse);
4447 r->minlen = minlen = sawplus = sawopen = 0;
4448 Zero(r->substrs, 1, struct reg_substr_data);
4450 #ifdef TRIE_STUDY_OPT
4452 StructCopy(&zero_scan_data, &data, scan_data_t);
4453 copyRExC_state = RExC_state;
4456 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4458 RExC_state = copyRExC_state;
4459 if (seen & REG_TOP_LEVEL_BRANCHES)
4460 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4462 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4463 if (data.last_found) {
4464 SvREFCNT_dec(data.longest_fixed);
4465 SvREFCNT_dec(data.longest_float);
4466 SvREFCNT_dec(data.last_found);
4468 StructCopy(&zero_scan_data, &data, scan_data_t);
4471 StructCopy(&zero_scan_data, &data, scan_data_t);
4474 /* Dig out information for optimizations. */
4475 r->extflags = RExC_flags; /* was pm_op */
4476 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4479 SvUTF8_on(rx); /* Unicode in it? */
4480 ri->regstclass = NULL;
4481 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4482 r->intflags |= PREGf_NAUGHTY;
4483 scan = ri->program + 1; /* First BRANCH. */
4485 /* testing for BRANCH here tells us whether there is "must appear"
4486 data in the pattern. If there is then we can use it for optimisations */
4487 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4489 STRLEN longest_float_length, longest_fixed_length;
4490 struct regnode_charclass_class ch_class; /* pointed to by data */
4492 I32 last_close = 0; /* pointed to by data */
4493 regnode *first= scan;
4494 regnode *first_next= regnext(first);
4497 * Skip introductions and multiplicators >= 1
4498 * so that we can extract the 'meat' of the pattern that must
4499 * match in the large if() sequence following.
4500 * NOTE that EXACT is NOT covered here, as it is normally
4501 * picked up by the optimiser separately.
4503 * This is unfortunate as the optimiser isnt handling lookahead
4504 * properly currently.
4507 while ((OP(first) == OPEN && (sawopen = 1)) ||
4508 /* An OR of *one* alternative - should not happen now. */
4509 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4510 /* for now we can't handle lookbehind IFMATCH*/
4511 (OP(first) == IFMATCH && !first->flags) ||
4512 (OP(first) == PLUS) ||
4513 (OP(first) == MINMOD) ||
4514 /* An {n,m} with n>0 */
4515 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4516 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4519 * the only op that could be a regnode is PLUS, all the rest
4520 * will be regnode_1 or regnode_2.
4523 if (OP(first) == PLUS)
4526 first += regarglen[OP(first)];
4528 first = NEXTOPER(first);
4529 first_next= regnext(first);
4532 /* Starting-point info. */
4534 DEBUG_PEEP("first:",first,0);
4535 /* Ignore EXACT as we deal with it later. */
4536 if (PL_regkind[OP(first)] == EXACT) {
4537 if (OP(first) == EXACT)
4538 NOOP; /* Empty, get anchored substr later. */
4539 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4540 ri->regstclass = first;
4543 else if (PL_regkind[OP(first)] == TRIE &&
4544 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4547 /* this can happen only on restudy */
4548 if ( OP(first) == TRIE ) {
4549 struct regnode_1 *trieop = (struct regnode_1 *)
4550 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4551 StructCopy(first,trieop,struct regnode_1);
4552 trie_op=(regnode *)trieop;
4554 struct regnode_charclass *trieop = (struct regnode_charclass *)
4555 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4556 StructCopy(first,trieop,struct regnode_charclass);
4557 trie_op=(regnode *)trieop;
4560 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4561 ri->regstclass = trie_op;
4564 else if (strchr((const char*)PL_simple,OP(first)))
4565 ri->regstclass = first;
4566 else if (PL_regkind[OP(first)] == BOUND ||
4567 PL_regkind[OP(first)] == NBOUND)
4568 ri->regstclass = first;
4569 else if (PL_regkind[OP(first)] == BOL) {
4570 r->extflags |= (OP(first) == MBOL
4572 : (OP(first) == SBOL
4575 first = NEXTOPER(first);
4578 else if (OP(first) == GPOS) {
4579 r->extflags |= RXf_ANCH_GPOS;
4580 first = NEXTOPER(first);
4583 else if ((!sawopen || !RExC_sawback) &&
4584 (OP(first) == STAR &&
4585 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4586 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4588 /* turn .* into ^.* with an implied $*=1 */
4590 (OP(NEXTOPER(first)) == REG_ANY)
4593 r->extflags |= type;
4594 r->intflags |= PREGf_IMPLICIT;
4595 first = NEXTOPER(first);
4598 if (sawplus && (!sawopen || !RExC_sawback)
4599 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4600 /* x+ must match at the 1st pos of run of x's */
4601 r->intflags |= PREGf_SKIP;
4603 /* Scan is after the zeroth branch, first is atomic matcher. */
4604 #ifdef TRIE_STUDY_OPT
4607 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4608 (IV)(first - scan + 1))
4612 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4613 (IV)(first - scan + 1))
4619 * If there's something expensive in the r.e., find the
4620 * longest literal string that must appear and make it the
4621 * regmust. Resolve ties in favor of later strings, since
4622 * the regstart check works with the beginning of the r.e.
4623 * and avoiding duplication strengthens checking. Not a
4624 * strong reason, but sufficient in the absence of others.
4625 * [Now we resolve ties in favor of the earlier string if
4626 * it happens that c_offset_min has been invalidated, since the
4627 * earlier string may buy us something the later one won't.]
4630 data.longest_fixed = newSVpvs("");
4631 data.longest_float = newSVpvs("");
4632 data.last_found = newSVpvs("");
4633 data.longest = &(data.longest_fixed);
4635 if (!ri->regstclass) {
4636 cl_init(pRExC_state, &ch_class);
4637 data.start_class = &ch_class;
4638 stclass_flag = SCF_DO_STCLASS_AND;
4639 } else /* XXXX Check for BOUND? */
4641 data.last_closep = &last_close;
4643 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4644 &data, -1, NULL, NULL,
4645 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4651 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4652 && data.last_start_min == 0 && data.last_end > 0
4653 && !RExC_seen_zerolen
4654 && !(RExC_seen & REG_SEEN_VERBARG)
4655 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4656 r->extflags |= RXf_CHECK_ALL;
4657 scan_commit(pRExC_state, &data,&minlen,0);
4658 SvREFCNT_dec(data.last_found);
4660 /* Note that code very similar to this but for anchored string
4661 follows immediately below, changes may need to be made to both.
4664 longest_float_length = CHR_SVLEN(data.longest_float);
4665 if (longest_float_length
4666 || (data.flags & SF_FL_BEFORE_EOL
4667 && (!(data.flags & SF_FL_BEFORE_MEOL)
4668 || (RExC_flags & RXf_PMf_MULTILINE))))
4672 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4673 && data.offset_fixed == data.offset_float_min
4674 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4675 goto remove_float; /* As in (a)+. */
4677 /* copy the information about the longest float from the reg_scan_data
4678 over to the program. */
4679 if (SvUTF8(data.longest_float)) {
4680 r->float_utf8 = data.longest_float;
4681 r->float_substr = NULL;
4683 r->float_substr = data.longest_float;
4684 r->float_utf8 = NULL;
4686 /* float_end_shift is how many chars that must be matched that
4687 follow this item. We calculate it ahead of time as once the
4688 lookbehind offset is added in we lose the ability to correctly
4690 ml = data.minlen_float ? *(data.minlen_float)
4691 : (I32)longest_float_length;
4692 r->float_end_shift = ml - data.offset_float_min
4693 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4694 + data.lookbehind_float;
4695 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4696 r->float_max_offset = data.offset_float_max;
4697 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4698 r->float_max_offset -= data.lookbehind_float;
4700 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4701 && (!(data.flags & SF_FL_BEFORE_MEOL)
4702 || (RExC_flags & RXf_PMf_MULTILINE)));
4703 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4707 r->float_substr = r->float_utf8 = NULL;
4708 SvREFCNT_dec(data.longest_float);
4709 longest_float_length = 0;
4712 /* Note that code very similar to this but for floating string
4713 is immediately above, changes may need to be made to both.
4716 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4717 if (longest_fixed_length
4718 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4719 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4720 || (RExC_flags & RXf_PMf_MULTILINE))))
4724 /* copy the information about the longest fixed
4725 from the reg_scan_data over to the program. */
4726 if (SvUTF8(data.longest_fixed)) {
4727 r->anchored_utf8 = data.longest_fixed;
4728 r->anchored_substr = NULL;
4730 r->anchored_substr = data.longest_fixed;
4731 r->anchored_utf8 = NULL;
4733 /* fixed_end_shift is how many chars that must be matched that
4734 follow this item. We calculate it ahead of time as once the
4735 lookbehind offset is added in we lose the ability to correctly
4737 ml = data.minlen_fixed ? *(data.minlen_fixed)
4738 : (I32)longest_fixed_length;
4739 r->anchored_end_shift = ml - data.offset_fixed
4740 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4741 + data.lookbehind_fixed;
4742 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4744 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4745 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4746 || (RExC_flags & RXf_PMf_MULTILINE)));
4747 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4750 r->anchored_substr = r->anchored_utf8 = NULL;
4751 SvREFCNT_dec(data.longest_fixed);
4752 longest_fixed_length = 0;
4755 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4756 ri->regstclass = NULL;
4757 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4759 && !(data.start_class->flags & ANYOF_EOS)
4760 && !cl_is_anything(data.start_class))
4762 const U32 n = add_data(pRExC_state, 1, "f");
4764 Newx(RExC_rxi->data->data[n], 1,
4765 struct regnode_charclass_class);
4766 StructCopy(data.start_class,
4767 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4768 struct regnode_charclass_class);
4769 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4770 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4771 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4772 regprop(r, sv, (regnode*)data.start_class);
4773 PerlIO_printf(Perl_debug_log,
4774 "synthetic stclass \"%s\".\n",
4775 SvPVX_const(sv));});
4778 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4779 if (longest_fixed_length > longest_float_length) {
4780 r->check_end_shift = r->anchored_end_shift;
4781 r->check_substr = r->anchored_substr;
4782 r->check_utf8 = r->anchored_utf8;
4783 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4784 if (r->extflags & RXf_ANCH_SINGLE)
4785 r->extflags |= RXf_NOSCAN;
4788 r->check_end_shift = r->float_end_shift;
4789 r->check_substr = r->float_substr;
4790 r->check_utf8 = r->float_utf8;
4791 r->check_offset_min = r->float_min_offset;
4792 r->check_offset_max = r->float_max_offset;
4794 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4795 This should be changed ASAP! */
4796 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4797 r->extflags |= RXf_USE_INTUIT;
4798 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4799 r->extflags |= RXf_INTUIT_TAIL;
4801 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4802 if ( (STRLEN)minlen < longest_float_length )
4803 minlen= longest_float_length;
4804 if ( (STRLEN)minlen < longest_fixed_length )
4805 minlen= longest_fixed_length;
4809 /* Several toplevels. Best we can is to set minlen. */
4811 struct regnode_charclass_class ch_class;
4814 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4816 scan = ri->program + 1;
4817 cl_init(pRExC_state, &ch_class);
4818 data.start_class = &ch_class;
4819 data.last_closep = &last_close;
4822 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4823 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4827 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4828 = r->float_substr = r->float_utf8 = NULL;
4829 if (!(data.start_class->flags & ANYOF_EOS)
4830 && !cl_is_anything(data.start_class))
4832 const U32 n = add_data(pRExC_state, 1, "f");
4834 Newx(RExC_rxi->data->data[n], 1,
4835 struct regnode_charclass_class);
4836 StructCopy(data.start_class,
4837 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4838 struct regnode_charclass_class);
4839 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4840 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4841 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4842 regprop(r, sv, (regnode*)data.start_class);
4843 PerlIO_printf(Perl_debug_log,
4844 "synthetic stclass \"%s\".\n",
4845 SvPVX_const(sv));});
4849 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4850 the "real" pattern. */
4852 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4853 (IV)minlen, (IV)r->minlen);
4855 r->minlenret = minlen;
4856 if (r->minlen < minlen)
4859 if (RExC_seen & REG_SEEN_GPOS)
4860 r->extflags |= RXf_GPOS_SEEN;
4861 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4862 r->extflags |= RXf_LOOKBEHIND_SEEN;
4863 if (RExC_seen & REG_SEEN_EVAL)
4864 r->extflags |= RXf_EVAL_SEEN;
4865 if (RExC_seen & REG_SEEN_CANY)
4866 r->extflags |= RXf_CANY_SEEN;
4867 if (RExC_seen & REG_SEEN_VERBARG)
4868 r->intflags |= PREGf_VERBARG_SEEN;
4869 if (RExC_seen & REG_SEEN_CUTGROUP)
4870 r->intflags |= PREGf_CUTGROUP_SEEN;
4871 if (RExC_paren_names)
4872 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4874 RXp_PAREN_NAMES(r) = NULL;
4876 #ifdef STUPID_PATTERN_CHECKS
4877 if (RX_PRELEN(rx) == 0)
4878 r->extflags |= RXf_NULL;
4879 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4880 /* XXX: this should happen BEFORE we compile */
4881 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4882 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4883 r->extflags |= RXf_WHITE;
4884 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4885 r->extflags |= RXf_START_ONLY;
4887 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4888 /* XXX: this should happen BEFORE we compile */
4889 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4891 regnode *first = ri->program + 1;
4893 U8 nop = OP(NEXTOPER(first));
4895 if (PL_regkind[fop] == NOTHING && nop == END)
4896 r->extflags |= RXf_NULL;
4897 else if (PL_regkind[fop] == BOL && nop == END)
4898 r->extflags |= RXf_START_ONLY;
4899 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4900 r->extflags |= RXf_WHITE;
4904 if (RExC_paren_names) {
4905 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4906 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4909 ri->name_list_idx = 0;
4911 if (RExC_recurse_count) {
4912 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4913 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4914 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4917 Newxz(r->offs, RExC_npar, regexp_paren_pair);
4918 /* assume we don't need to swap parens around before we match */
4921 PerlIO_printf(Perl_debug_log,"Final program:\n");
4924 #ifdef RE_TRACK_PATTERN_OFFSETS
4925 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4926 const U32 len = ri->u.offsets[0];
4928 GET_RE_DEBUG_FLAGS_DECL;
4929 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4930 for (i = 1; i <= len; i++) {
4931 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4932 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4933 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4935 PerlIO_printf(Perl_debug_log, "\n");
4941 #undef RE_ENGINE_PTR
4945 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4948 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4950 PERL_UNUSED_ARG(value);
4952 if (flags & RXapif_FETCH) {
4953 return reg_named_buff_fetch(rx, key, flags);
4954 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4955 Perl_croak(aTHX_ "%s", PL_no_modify);
4957 } else if (flags & RXapif_EXISTS) {
4958 return reg_named_buff_exists(rx, key, flags)
4961 } else if (flags & RXapif_REGNAMES) {
4962 return reg_named_buff_all(rx, flags);
4963 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4964 return reg_named_buff_scalar(rx, flags);
4966 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4972 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4975 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4976 PERL_UNUSED_ARG(lastkey);
4978 if (flags & RXapif_FIRSTKEY)
4979 return reg_named_buff_firstkey(rx, flags);
4980 else if (flags & RXapif_NEXTKEY)
4981 return reg_named_buff_nextkey(rx, flags);
4983 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4989 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4992 AV *retarray = NULL;
4994 struct regexp *const rx = (struct regexp *)SvANY(r);
4996 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4998 if (flags & RXapif_ALL)
5001 if (rx && RXp_PAREN_NAMES(rx)) {
5002 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5005 SV* sv_dat=HeVAL(he_str);
5006 I32 *nums=(I32*)SvPVX(sv_dat);
5007 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5008 if ((I32)(rx->nparens) >= nums[i]
5009 && rx->offs[nums[i]].start != -1
5010 && rx->offs[nums[i]].end != -1)
5013 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5017 ret = newSVsv(&PL_sv_undef);
5020 av_push(retarray, ret);
5023 return newRV_noinc(MUTABLE_SV(retarray));
5030 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5033 struct regexp *const rx = (struct regexp *)SvANY(r);
5035 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5037 if (rx && RXp_PAREN_NAMES(rx)) {
5038 if (flags & RXapif_ALL) {
5039 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5041 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5055 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5057 struct regexp *const rx = (struct regexp *)SvANY(r);
5059 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5061 if ( rx && RXp_PAREN_NAMES(rx) ) {
5062 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5064 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5071 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5073 struct regexp *const rx = (struct regexp *)SvANY(r);
5074 GET_RE_DEBUG_FLAGS_DECL;
5076 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5078 if (rx && RXp_PAREN_NAMES(rx)) {
5079 HV *hv = RXp_PAREN_NAMES(rx);
5081 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5084 SV* sv_dat = HeVAL(temphe);
5085 I32 *nums = (I32*)SvPVX(sv_dat);
5086 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5087 if ((I32)(rx->lastparen) >= nums[i] &&
5088 rx->offs[nums[i]].start != -1 &&
5089 rx->offs[nums[i]].end != -1)
5095 if (parno || flags & RXapif_ALL) {
5096 return newSVhek(HeKEY_hek(temphe));
5104 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5109 struct regexp *const rx = (struct regexp *)SvANY(r);
5111 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5113 if (rx && RXp_PAREN_NAMES(rx)) {
5114 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5115 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5116 } else if (flags & RXapif_ONE) {
5117 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5118 av = MUTABLE_AV(SvRV(ret));
5119 length = av_len(av);
5121 return newSViv(length + 1);
5123 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5127 return &PL_sv_undef;
5131 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5133 struct regexp *const rx = (struct regexp *)SvANY(r);
5136 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5138 if (rx && RXp_PAREN_NAMES(rx)) {
5139 HV *hv= RXp_PAREN_NAMES(rx);
5141 (void)hv_iterinit(hv);
5142 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5145 SV* sv_dat = HeVAL(temphe);
5146 I32 *nums = (I32*)SvPVX(sv_dat);
5147 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5148 if ((I32)(rx->lastparen) >= nums[i] &&
5149 rx->offs[nums[i]].start != -1 &&
5150 rx->offs[nums[i]].end != -1)
5156 if (parno || flags & RXapif_ALL) {
5157 av_push(av, newSVhek(HeKEY_hek(temphe)));
5162 return newRV_noinc(MUTABLE_SV(av));
5166 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5169 struct regexp *const rx = (struct regexp *)SvANY(r);
5174 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5177 sv_setsv(sv,&PL_sv_undef);
5181 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5183 i = rx->offs[0].start;
5187 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5189 s = rx->subbeg + rx->offs[0].end;
5190 i = rx->sublen - rx->offs[0].end;
5193 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5194 (s1 = rx->offs[paren].start) != -1 &&
5195 (t1 = rx->offs[paren].end) != -1)
5199 s = rx->subbeg + s1;
5201 sv_setsv(sv,&PL_sv_undef);
5204 assert(rx->sublen >= (s - rx->subbeg) + i );
5206 const int oldtainted = PL_tainted;
5208 sv_setpvn(sv, s, i);
5209 PL_tainted = oldtainted;
5210 if ( (rx->extflags & RXf_CANY_SEEN)
5211 ? (RXp_MATCH_UTF8(rx)
5212 && (!i || is_utf8_string((U8*)s, i)))
5213 : (RXp_MATCH_UTF8(rx)) )
5220 if (RXp_MATCH_TAINTED(rx)) {
5221 if (SvTYPE(sv) >= SVt_PVMG) {
5222 MAGIC* const mg = SvMAGIC(sv);
5225 SvMAGIC_set(sv, mg->mg_moremagic);
5227 if ((mgt = SvMAGIC(sv))) {
5228 mg->mg_moremagic = mgt;
5229 SvMAGIC_set(sv, mg);
5239 sv_setsv(sv,&PL_sv_undef);
5245 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5246 SV const * const value)
5248 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5250 PERL_UNUSED_ARG(rx);
5251 PERL_UNUSED_ARG(paren);
5252 PERL_UNUSED_ARG(value);
5255 Perl_croak(aTHX_ "%s", PL_no_modify);
5259 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5262 struct regexp *const rx = (struct regexp *)SvANY(r);
5266 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5268 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5270 /* $` / ${^PREMATCH} */
5271 case RX_BUFF_IDX_PREMATCH:
5272 if (rx->offs[0].start != -1) {
5273 i = rx->offs[0].start;
5281 /* $' / ${^POSTMATCH} */
5282 case RX_BUFF_IDX_POSTMATCH:
5283 if (rx->offs[0].end != -1) {
5284 i = rx->sublen - rx->offs[0].end;
5286 s1 = rx->offs[0].end;
5292 /* $& / ${^MATCH}, $1, $2, ... */
5294 if (paren <= (I32)rx->nparens &&
5295 (s1 = rx->offs[paren].start) != -1 &&
5296 (t1 = rx->offs[paren].end) != -1)
5301 if (ckWARN(WARN_UNINITIALIZED))
5302 report_uninit((const SV *)sv);
5307 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5308 const char * const s = rx->subbeg + s1;
5313 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5320 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5322 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5323 PERL_UNUSED_ARG(rx);
5327 return newSVpvs("Regexp");
5330 /* Scans the name of a named buffer from the pattern.
5331 * If flags is REG_RSN_RETURN_NULL returns null.
5332 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5333 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5334 * to the parsed name as looked up in the RExC_paren_names hash.
5335 * If there is an error throws a vFAIL().. type exception.
5338 #define REG_RSN_RETURN_NULL 0
5339 #define REG_RSN_RETURN_NAME 1
5340 #define REG_RSN_RETURN_DATA 2
5343 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5345 char *name_start = RExC_parse;
5347 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5349 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5350 /* skip IDFIRST by using do...while */
5353 RExC_parse += UTF8SKIP(RExC_parse);
5354 } while (isALNUM_utf8((U8*)RExC_parse));
5358 } while (isALNUM(*RExC_parse));
5363 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5364 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5365 if ( flags == REG_RSN_RETURN_NAME)
5367 else if (flags==REG_RSN_RETURN_DATA) {
5370 if ( ! sv_name ) /* should not happen*/
5371 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5372 if (RExC_paren_names)
5373 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5375 sv_dat = HeVAL(he_str);
5377 vFAIL("Reference to nonexistent named group");
5381 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5388 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5389 int rem=(int)(RExC_end - RExC_parse); \
5398 if (RExC_lastparse!=RExC_parse) \
5399 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5402 iscut ? "..." : "<" \
5405 PerlIO_printf(Perl_debug_log,"%16s",""); \
5408 num = RExC_size + 1; \
5410 num=REG_NODE_NUM(RExC_emit); \
5411 if (RExC_lastnum!=num) \
5412 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5414 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5415 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5416 (int)((depth*2)), "", \
5420 RExC_lastparse=RExC_parse; \
5425 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5426 DEBUG_PARSE_MSG((funcname)); \
5427 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5429 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5430 DEBUG_PARSE_MSG((funcname)); \
5431 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5434 - reg - regular expression, i.e. main body or parenthesized thing
5436 * Caller must absorb opening parenthesis.
5438 * Combining parenthesis handling with the base level of regular expression
5439 * is a trifle forced, but the need to tie the tails of the branches to what
5440 * follows makes it hard to avoid.
5442 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5444 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5446 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5450 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5451 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5454 register regnode *ret; /* Will be the head of the group. */
5455 register regnode *br;
5456 register regnode *lastbr;
5457 register regnode *ender = NULL;
5458 register I32 parno = 0;
5460 U32 oregflags = RExC_flags;
5461 bool have_branch = 0;
5463 I32 freeze_paren = 0;
5464 I32 after_freeze = 0;
5466 /* for (?g), (?gc), and (?o) warnings; warning
5467 about (?c) will warn about (?g) -- japhy */
5469 #define WASTED_O 0x01
5470 #define WASTED_G 0x02
5471 #define WASTED_C 0x04
5472 #define WASTED_GC (0x02|0x04)
5473 I32 wastedflags = 0x00;
5475 char * parse_start = RExC_parse; /* MJD */
5476 char * const oregcomp_parse = RExC_parse;
5478 GET_RE_DEBUG_FLAGS_DECL;
5480 PERL_ARGS_ASSERT_REG;
5481 DEBUG_PARSE("reg ");
5483 *flagp = 0; /* Tentatively. */
5486 /* Make an OPEN node, if parenthesized. */
5488 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5489 char *start_verb = RExC_parse;
5490 STRLEN verb_len = 0;
5491 char *start_arg = NULL;
5492 unsigned char op = 0;
5494 int internal_argval = 0; /* internal_argval is only useful if !argok */
5495 while ( *RExC_parse && *RExC_parse != ')' ) {
5496 if ( *RExC_parse == ':' ) {
5497 start_arg = RExC_parse + 1;
5503 verb_len = RExC_parse - start_verb;
5506 while ( *RExC_parse && *RExC_parse != ')' )
5508 if ( *RExC_parse != ')' )
5509 vFAIL("Unterminated verb pattern argument");
5510 if ( RExC_parse == start_arg )
5513 if ( *RExC_parse != ')' )
5514 vFAIL("Unterminated verb pattern");
5517 switch ( *start_verb ) {
5518 case 'A': /* (*ACCEPT) */
5519 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5521 internal_argval = RExC_nestroot;
5524 case 'C': /* (*COMMIT) */
5525 if ( memEQs(start_verb,verb_len,"COMMIT") )
5528 case 'F': /* (*FAIL) */
5529 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5534 case ':': /* (*:NAME) */
5535 case 'M': /* (*MARK:NAME) */
5536 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5541 case 'P': /* (*PRUNE) */
5542 if ( memEQs(start_verb,verb_len,"PRUNE") )
5545 case 'S': /* (*SKIP) */
5546 if ( memEQs(start_verb,verb_len,"SKIP") )
5549 case 'T': /* (*THEN) */
5550 /* [19:06] <TimToady> :: is then */
5551 if ( memEQs(start_verb,verb_len,"THEN") ) {
5553 RExC_seen |= REG_SEEN_CUTGROUP;
5559 vFAIL3("Unknown verb pattern '%.*s'",
5560 verb_len, start_verb);
5563 if ( start_arg && internal_argval ) {
5564 vFAIL3("Verb pattern '%.*s' may not have an argument",
5565 verb_len, start_verb);
5566 } else if ( argok < 0 && !start_arg ) {
5567 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5568 verb_len, start_verb);
5570 ret = reganode(pRExC_state, op, internal_argval);
5571 if ( ! internal_argval && ! SIZE_ONLY ) {
5573 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5574 ARG(ret) = add_data( pRExC_state, 1, "S" );
5575 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5582 if (!internal_argval)
5583 RExC_seen |= REG_SEEN_VERBARG;
5584 } else if ( start_arg ) {
5585 vFAIL3("Verb pattern '%.*s' may not have an argument",
5586 verb_len, start_verb);
5588 ret = reg_node(pRExC_state, op);
5590 nextchar(pRExC_state);
5593 if (*RExC_parse == '?') { /* (?...) */
5594 bool is_logical = 0;
5595 const char * const seqstart = RExC_parse;
5598 paren = *RExC_parse++;
5599 ret = NULL; /* For look-ahead/behind. */
5602 case 'P': /* (?P...) variants for those used to PCRE/Python */
5603 paren = *RExC_parse++;
5604 if ( paren == '<') /* (?P<...>) named capture */
5606 else if (paren == '>') { /* (?P>name) named recursion */
5607 goto named_recursion;
5609 else if (paren == '=') { /* (?P=...) named backref */
5610 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5611 you change this make sure you change that */
5612 char* name_start = RExC_parse;
5614 SV *sv_dat = reg_scan_name(pRExC_state,
5615 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5616 if (RExC_parse == name_start || *RExC_parse != ')')
5617 vFAIL2("Sequence %.3s... not terminated",parse_start);
5620 num = add_data( pRExC_state, 1, "S" );
5621 RExC_rxi->data->data[num]=(void*)sv_dat;
5622 SvREFCNT_inc_simple_void(sv_dat);
5625 ret = reganode(pRExC_state,
5626 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5630 Set_Node_Offset(ret, parse_start+1);
5631 Set_Node_Cur_Length(ret); /* MJD */
5633 nextchar(pRExC_state);
5637 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5639 case '<': /* (?<...) */
5640 if (*RExC_parse == '!')
5642 else if (*RExC_parse != '=')
5648 case '\'': /* (?'...') */
5649 name_start= RExC_parse;
5650 svname = reg_scan_name(pRExC_state,
5651 SIZE_ONLY ? /* reverse test from the others */
5652 REG_RSN_RETURN_NAME :
5653 REG_RSN_RETURN_NULL);
5654 if (RExC_parse == name_start) {
5656 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5659 if (*RExC_parse != paren)
5660 vFAIL2("Sequence (?%c... not terminated",
5661 paren=='>' ? '<' : paren);
5665 if (!svname) /* shouldnt happen */
5667 "panic: reg_scan_name returned NULL");
5668 if (!RExC_paren_names) {
5669 RExC_paren_names= newHV();
5670 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5672 RExC_paren_name_list= newAV();
5673 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5676 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5678 sv_dat = HeVAL(he_str);
5680 /* croak baby croak */
5682 "panic: paren_name hash element allocation failed");
5683 } else if ( SvPOK(sv_dat) ) {
5684 /* (?|...) can mean we have dupes so scan to check
5685 its already been stored. Maybe a flag indicating
5686 we are inside such a construct would be useful,
5687 but the arrays are likely to be quite small, so
5688 for now we punt -- dmq */
5689 IV count = SvIV(sv_dat);
5690 I32 *pv = (I32*)SvPVX(sv_dat);
5692 for ( i = 0 ; i < count ; i++ ) {
5693 if ( pv[i] == RExC_npar ) {
5699 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5700 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5701 pv[count] = RExC_npar;
5702 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5705 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5706 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5708 SvIV_set(sv_dat, 1);
5711 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5712 SvREFCNT_dec(svname);
5715 /*sv_dump(sv_dat);*/
5717 nextchar(pRExC_state);
5719 goto capturing_parens;
5721 RExC_seen |= REG_SEEN_LOOKBEHIND;
5723 case '=': /* (?=...) */
5724 RExC_seen_zerolen++;
5726 case '!': /* (?!...) */
5727 RExC_seen_zerolen++;
5728 if (*RExC_parse == ')') {
5729 ret=reg_node(pRExC_state, OPFAIL);
5730 nextchar(pRExC_state);
5734 case '|': /* (?|...) */
5735 /* branch reset, behave like a (?:...) except that
5736 buffers in alternations share the same numbers */
5738 after_freeze = freeze_paren = RExC_npar;
5740 case ':': /* (?:...) */
5741 case '>': /* (?>...) */
5743 case '$': /* (?$...) */
5744 case '@': /* (?@...) */
5745 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5747 case '#': /* (?#...) */
5748 while (*RExC_parse && *RExC_parse != ')')
5750 if (*RExC_parse != ')')
5751 FAIL("Sequence (?#... not terminated");
5752 nextchar(pRExC_state);
5755 case '0' : /* (?0) */
5756 case 'R' : /* (?R) */
5757 if (*RExC_parse != ')')
5758 FAIL("Sequence (?R) not terminated");
5759 ret = reg_node(pRExC_state, GOSTART);
5760 *flagp |= POSTPONED;
5761 nextchar(pRExC_state);
5764 { /* named and numeric backreferences */
5766 case '&': /* (?&NAME) */
5767 parse_start = RExC_parse - 1;
5770 SV *sv_dat = reg_scan_name(pRExC_state,
5771 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5772 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5774 goto gen_recurse_regop;
5777 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5779 vFAIL("Illegal pattern");
5781 goto parse_recursion;
5783 case '-': /* (?-1) */
5784 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5785 RExC_parse--; /* rewind to let it be handled later */
5789 case '1': case '2': case '3': case '4': /* (?1) */
5790 case '5': case '6': case '7': case '8': case '9':
5793 num = atoi(RExC_parse);
5794 parse_start = RExC_parse - 1; /* MJD */
5795 if (*RExC_parse == '-')
5797 while (isDIGIT(*RExC_parse))
5799 if (*RExC_parse!=')')
5800 vFAIL("Expecting close bracket");
5803 if ( paren == '-' ) {
5805 Diagram of capture buffer numbering.
5806 Top line is the normal capture buffer numbers
5807 Botton line is the negative indexing as from
5811 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5815 num = RExC_npar + num;
5818 vFAIL("Reference to nonexistent group");
5820 } else if ( paren == '+' ) {
5821 num = RExC_npar + num - 1;
5824 ret = reganode(pRExC_state, GOSUB, num);
5826 if (num > (I32)RExC_rx->nparens) {
5828 vFAIL("Reference to nonexistent group");
5830 ARG2L_SET( ret, RExC_recurse_count++);
5832 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5833 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5837 RExC_seen |= REG_SEEN_RECURSE;
5838 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5839 Set_Node_Offset(ret, parse_start); /* MJD */
5841 *flagp |= POSTPONED;
5842 nextchar(pRExC_state);
5844 } /* named and numeric backreferences */
5847 case '?': /* (??...) */
5849 if (*RExC_parse != '{') {
5851 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5854 *flagp |= POSTPONED;
5855 paren = *RExC_parse++;
5857 case '{': /* (?{...}) */
5862 char *s = RExC_parse;
5864 RExC_seen_zerolen++;
5865 RExC_seen |= REG_SEEN_EVAL;
5866 while (count && (c = *RExC_parse)) {
5877 if (*RExC_parse != ')') {
5879 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5883 OP_4tree *sop, *rop;
5884 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5887 Perl_save_re_context(aTHX);
5888 rop = sv_compile_2op(sv, &sop, "re", &pad);
5889 sop->op_private |= OPpREFCOUNTED;
5890 /* re_dup will OpREFCNT_inc */
5891 OpREFCNT_set(sop, 1);
5894 n = add_data(pRExC_state, 3, "nop");
5895 RExC_rxi->data->data[n] = (void*)rop;
5896 RExC_rxi->data->data[n+1] = (void*)sop;
5897 RExC_rxi->data->data[n+2] = (void*)pad;
5900 else { /* First pass */
5901 if (PL_reginterp_cnt < ++RExC_seen_evals
5903 /* No compiled RE interpolated, has runtime
5904 components ===> unsafe. */
5905 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5906 if (PL_tainting && PL_tainted)
5907 FAIL("Eval-group in insecure regular expression");
5908 #if PERL_VERSION > 8
5909 if (IN_PERL_COMPILETIME)
5914 nextchar(pRExC_state);
5916 ret = reg_node(pRExC_state, LOGICAL);
5919 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5920 /* deal with the length of this later - MJD */
5923 ret = reganode(pRExC_state, EVAL, n);
5924 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5925 Set_Node_Offset(ret, parse_start);
5928 case '(': /* (?(?{...})...) and (?(?=...)...) */
5931 if (RExC_parse[0] == '?') { /* (?(?...)) */
5932 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5933 || RExC_parse[1] == '<'
5934 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5937 ret = reg_node(pRExC_state, LOGICAL);
5940 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5944 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5945 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5947 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5948 char *name_start= RExC_parse++;
5950 SV *sv_dat=reg_scan_name(pRExC_state,
5951 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5952 if (RExC_parse == name_start || *RExC_parse != ch)
5953 vFAIL2("Sequence (?(%c... not terminated",
5954 (ch == '>' ? '<' : ch));
5957 num = add_data( pRExC_state, 1, "S" );
5958 RExC_rxi->data->data[num]=(void*)sv_dat;
5959 SvREFCNT_inc_simple_void(sv_dat);
5961 ret = reganode(pRExC_state,NGROUPP,num);
5962 goto insert_if_check_paren;
5964 else if (RExC_parse[0] == 'D' &&
5965 RExC_parse[1] == 'E' &&
5966 RExC_parse[2] == 'F' &&
5967 RExC_parse[3] == 'I' &&
5968 RExC_parse[4] == 'N' &&
5969 RExC_parse[5] == 'E')
5971 ret = reganode(pRExC_state,DEFINEP,0);
5974 goto insert_if_check_paren;
5976 else if (RExC_parse[0] == 'R') {
5979 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5980 parno = atoi(RExC_parse++);
5981 while (isDIGIT(*RExC_parse))
5983 } else if (RExC_parse[0] == '&') {
5986 sv_dat = reg_scan_name(pRExC_state,
5987 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5988 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5990 ret = reganode(pRExC_state,INSUBP,parno);
5991 goto insert_if_check_paren;
5993 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5996 parno = atoi(RExC_parse++);
5998 while (isDIGIT(*RExC_parse))
6000 ret = reganode(pRExC_state, GROUPP, parno);
6002 insert_if_check_paren:
6003 if ((c = *nextchar(pRExC_state)) != ')')
6004 vFAIL("Switch condition not recognized");
6006 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6007 br = regbranch(pRExC_state, &flags, 1,depth+1);
6009 br = reganode(pRExC_state, LONGJMP, 0);
6011 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6012 c = *nextchar(pRExC_state);
6017 vFAIL("(?(DEFINE)....) does not allow branches");
6018 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6019 regbranch(pRExC_state, &flags, 1,depth+1);
6020 REGTAIL(pRExC_state, ret, lastbr);
6023 c = *nextchar(pRExC_state);
6028 vFAIL("Switch (?(condition)... contains too many branches");
6029 ender = reg_node(pRExC_state, TAIL);
6030 REGTAIL(pRExC_state, br, ender);
6032 REGTAIL(pRExC_state, lastbr, ender);
6033 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6036 REGTAIL(pRExC_state, ret, ender);
6037 RExC_size++; /* XXX WHY do we need this?!!
6038 For large programs it seems to be required
6039 but I can't figure out why. -- dmq*/
6043 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6047 RExC_parse--; /* for vFAIL to print correctly */
6048 vFAIL("Sequence (? incomplete");
6052 parse_flags: /* (?i) */
6054 U32 posflags = 0, negflags = 0;
6055 U32 *flagsp = &posflags;
6057 while (*RExC_parse) {
6058 /* && strchr("iogcmsx", *RExC_parse) */
6059 /* (?g), (?gc) and (?o) are useless here
6060 and must be globally applied -- japhy */
6061 switch (*RExC_parse) {
6062 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6063 case ONCE_PAT_MOD: /* 'o' */
6064 case GLOBAL_PAT_MOD: /* 'g' */
6065 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6066 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6067 if (! (wastedflags & wflagbit) ) {
6068 wastedflags |= wflagbit;
6071 "Useless (%s%c) - %suse /%c modifier",
6072 flagsp == &negflags ? "?-" : "?",
6074 flagsp == &negflags ? "don't " : "",
6081 case CONTINUE_PAT_MOD: /* 'c' */
6082 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6083 if (! (wastedflags & WASTED_C) ) {
6084 wastedflags |= WASTED_GC;
6087 "Useless (%sc) - %suse /gc modifier",
6088 flagsp == &negflags ? "?-" : "?",
6089 flagsp == &negflags ? "don't " : ""
6094 case KEEPCOPY_PAT_MOD: /* 'p' */
6095 if (flagsp == &negflags) {
6097 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6099 *flagsp |= RXf_PMf_KEEPCOPY;
6103 if (flagsp == &negflags) {
6105 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6109 wastedflags = 0; /* reset so (?g-c) warns twice */
6115 RExC_flags |= posflags;
6116 RExC_flags &= ~negflags;
6118 oregflags |= posflags;
6119 oregflags &= ~negflags;
6121 nextchar(pRExC_state);
6132 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6137 }} /* one for the default block, one for the switch */
6144 ret = reganode(pRExC_state, OPEN, parno);
6147 RExC_nestroot = parno;
6148 if (RExC_seen & REG_SEEN_RECURSE
6149 && !RExC_open_parens[parno-1])
6151 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6152 "Setting open paren #%"IVdf" to %d\n",
6153 (IV)parno, REG_NODE_NUM(ret)));
6154 RExC_open_parens[parno-1]= ret;
6157 Set_Node_Length(ret, 1); /* MJD */
6158 Set_Node_Offset(ret, RExC_parse); /* MJD */
6166 /* Pick up the branches, linking them together. */
6167 parse_start = RExC_parse; /* MJD */
6168 br = regbranch(pRExC_state, &flags, 1,depth+1);
6171 if (RExC_npar > after_freeze)
6172 after_freeze = RExC_npar;
6173 RExC_npar = freeze_paren;
6176 /* branch_len = (paren != 0); */
6180 if (*RExC_parse == '|') {
6181 if (!SIZE_ONLY && RExC_extralen) {
6182 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6185 reginsert(pRExC_state, BRANCH, br, depth+1);
6186 Set_Node_Length(br, paren != 0);
6187 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6191 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6193 else if (paren == ':') {
6194 *flagp |= flags&SIMPLE;
6196 if (is_open) { /* Starts with OPEN. */
6197 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6199 else if (paren != '?') /* Not Conditional */
6201 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6203 while (*RExC_parse == '|') {
6204 if (!SIZE_ONLY && RExC_extralen) {
6205 ender = reganode(pRExC_state, LONGJMP,0);
6206 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6209 RExC_extralen += 2; /* Account for LONGJMP. */
6210 nextchar(pRExC_state);
6212 if (RExC_npar > after_freeze)
6213 after_freeze = RExC_npar;
6214 RExC_npar = freeze_paren;
6216 br = regbranch(pRExC_state, &flags, 0, depth+1);
6220 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6222 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6225 if (have_branch || paren != ':') {
6226 /* Make a closing node, and hook it on the end. */
6229 ender = reg_node(pRExC_state, TAIL);
6232 ender = reganode(pRExC_state, CLOSE, parno);
6233 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6234 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6235 "Setting close paren #%"IVdf" to %d\n",
6236 (IV)parno, REG_NODE_NUM(ender)));
6237 RExC_close_parens[parno-1]= ender;
6238 if (RExC_nestroot == parno)
6241 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6242 Set_Node_Length(ender,1); /* MJD */
6248 *flagp &= ~HASWIDTH;
6251 ender = reg_node(pRExC_state, SUCCEED);
6254 ender = reg_node(pRExC_state, END);
6256 assert(!RExC_opend); /* there can only be one! */
6261 REGTAIL(pRExC_state, lastbr, ender);
6263 if (have_branch && !SIZE_ONLY) {
6265 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6267 /* Hook the tails of the branches to the closing node. */
6268 for (br = ret; br; br = regnext(br)) {
6269 const U8 op = PL_regkind[OP(br)];
6271 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6273 else if (op == BRANCHJ) {
6274 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6282 static const char parens[] = "=!<,>";
6284 if (paren && (p = strchr(parens, paren))) {
6285 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6286 int flag = (p - parens) > 1;
6289 node = SUSPEND, flag = 0;
6290 reginsert(pRExC_state, node,ret, depth+1);
6291 Set_Node_Cur_Length(ret);
6292 Set_Node_Offset(ret, parse_start + 1);
6294 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6298 /* Check for proper termination. */
6300 RExC_flags = oregflags;
6301 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6302 RExC_parse = oregcomp_parse;
6303 vFAIL("Unmatched (");
6306 else if (!paren && RExC_parse < RExC_end) {
6307 if (*RExC_parse == ')') {
6309 vFAIL("Unmatched )");
6312 FAIL("Junk on end of regexp"); /* "Can't happen". */
6316 RExC_npar = after_freeze;
6321 - regbranch - one alternative of an | operator
6323 * Implements the concatenation operator.
6326 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6329 register regnode *ret;
6330 register regnode *chain = NULL;
6331 register regnode *latest;
6332 I32 flags = 0, c = 0;
6333 GET_RE_DEBUG_FLAGS_DECL;
6335 PERL_ARGS_ASSERT_REGBRANCH;
6337 DEBUG_PARSE("brnc");
6342 if (!SIZE_ONLY && RExC_extralen)
6343 ret = reganode(pRExC_state, BRANCHJ,0);
6345 ret = reg_node(pRExC_state, BRANCH);
6346 Set_Node_Length(ret, 1);
6350 if (!first && SIZE_ONLY)
6351 RExC_extralen += 1; /* BRANCHJ */
6353 *flagp = WORST; /* Tentatively. */
6356 nextchar(pRExC_state);
6357 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6359 latest = regpiece(pRExC_state, &flags,depth+1);
6360 if (latest == NULL) {
6361 if (flags & TRYAGAIN)
6365 else if (ret == NULL)
6367 *flagp |= flags&(HASWIDTH|POSTPONED);
6368 if (chain == NULL) /* First piece. */
6369 *flagp |= flags&SPSTART;
6372 REGTAIL(pRExC_state, chain, latest);
6377 if (chain == NULL) { /* Loop ran zero times. */
6378 chain = reg_node(pRExC_state, NOTHING);
6383 *flagp |= flags&SIMPLE;
6390 - regpiece - something followed by possible [*+?]
6392 * Note that the branching code sequences used for ? and the general cases
6393 * of * and + are somewhat optimized: they use the same NOTHING node as
6394 * both the endmarker for their branch list and the body of the last branch.
6395 * It might seem that this node could be dispensed with entirely, but the
6396 * endmarker role is not redundant.
6399 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6402 register regnode *ret;
6404 register char *next;
6406 const char * const origparse = RExC_parse;
6408 I32 max = REG_INFTY;
6410 const char *maxpos = NULL;
6411 GET_RE_DEBUG_FLAGS_DECL;
6413 PERL_ARGS_ASSERT_REGPIECE;
6415 DEBUG_PARSE("piec");
6417 ret = regatom(pRExC_state, &flags,depth+1);
6419 if (flags & TRYAGAIN)
6426 if (op == '{' && regcurly(RExC_parse)) {
6428 parse_start = RExC_parse; /* MJD */
6429 next = RExC_parse + 1;
6430 while (isDIGIT(*next) || *next == ',') {
6439 if (*next == '}') { /* got one */
6443 min = atoi(RExC_parse);
6447 maxpos = RExC_parse;
6449 if (!max && *maxpos != '0')
6450 max = REG_INFTY; /* meaning "infinity" */
6451 else if (max >= REG_INFTY)
6452 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6454 nextchar(pRExC_state);
6457 if ((flags&SIMPLE)) {
6458 RExC_naughty += 2 + RExC_naughty / 2;
6459 reginsert(pRExC_state, CURLY, ret, depth+1);
6460 Set_Node_Offset(ret, parse_start+1); /* MJD */
6461 Set_Node_Cur_Length(ret);
6464 regnode * const w = reg_node(pRExC_state, WHILEM);
6467 REGTAIL(pRExC_state, ret, w);
6468 if (!SIZE_ONLY && RExC_extralen) {
6469 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6470 reginsert(pRExC_state, NOTHING,ret, depth+1);
6471 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6473 reginsert(pRExC_state, CURLYX,ret, depth+1);
6475 Set_Node_Offset(ret, parse_start+1);
6476 Set_Node_Length(ret,
6477 op == '{' ? (RExC_parse - parse_start) : 1);
6479 if (!SIZE_ONLY && RExC_extralen)
6480 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6481 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6483 RExC_whilem_seen++, RExC_extralen += 3;
6484 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6493 vFAIL("Can't do {n,m} with n > m");
6495 ARG1_SET(ret, (U16)min);
6496 ARG2_SET(ret, (U16)max);
6508 #if 0 /* Now runtime fix should be reliable. */
6510 /* if this is reinstated, don't forget to put this back into perldiag:
6512 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6514 (F) The part of the regexp subject to either the * or + quantifier
6515 could match an empty string. The {#} shows in the regular
6516 expression about where the problem was discovered.
6520 if (!(flags&HASWIDTH) && op != '?')
6521 vFAIL("Regexp *+ operand could be empty");
6524 parse_start = RExC_parse;
6525 nextchar(pRExC_state);
6527 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6529 if (op == '*' && (flags&SIMPLE)) {
6530 reginsert(pRExC_state, STAR, ret, depth+1);
6534 else if (op == '*') {
6538 else if (op == '+' && (flags&SIMPLE)) {
6539 reginsert(pRExC_state, PLUS, ret, depth+1);
6543 else if (op == '+') {
6547 else if (op == '?') {
6552 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6553 ckWARN3reg(RExC_parse,
6554 "%.*s matches null string many times",
6555 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6559 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6560 nextchar(pRExC_state);
6561 reginsert(pRExC_state, MINMOD, ret, depth+1);
6562 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6564 #ifndef REG_ALLOW_MINMOD_SUSPEND
6567 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6569 nextchar(pRExC_state);
6570 ender = reg_node(pRExC_state, SUCCEED);
6571 REGTAIL(pRExC_state, ret, ender);
6572 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6574 ender = reg_node(pRExC_state, TAIL);
6575 REGTAIL(pRExC_state, ret, ender);
6579 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6581 vFAIL("Nested quantifiers");
6588 /* reg_namedseq(pRExC_state,UVp)
6590 This is expected to be called by a parser routine that has
6591 recognized '\N' and needs to handle the rest. RExC_parse is
6592 expected to point at the first char following the N at the time
6595 The \N may be inside (indicated by valuep not being NULL) or outside a
6598 \N may begin either a named sequence, or if outside a character class, mean
6599 to match a non-newline. For non single-quoted regexes, the tokenizer has
6600 attempted to decide which, and in the case of a named sequence converted it
6601 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6602 where c1... are the characters in the sequence. For single-quoted regexes,
6603 the tokenizer passes the \N sequence through unchanged; this code will not
6604 attempt to determine this nor expand those. The net effect is that if the
6605 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6606 signals that this \N occurrence means to match a non-newline.
6608 Only the \N{U+...} form should occur in a character class, for the same
6609 reason that '.' inside a character class means to just match a period: it
6610 just doesn't make sense.
6612 If valuep is non-null then it is assumed that we are parsing inside
6613 of a charclass definition and the first codepoint in the resolved
6614 string is returned via *valuep and the routine will return NULL.
6615 In this mode if a multichar string is returned from the charnames
6616 handler, a warning will be issued, and only the first char in the
6617 sequence will be examined. If the string returned is zero length
6618 then the value of *valuep is undefined and NON-NULL will
6619 be returned to indicate failure. (This will NOT be a valid pointer
6622 If valuep is null then it is assumed that we are parsing normal text and a
6623 new EXACT node is inserted into the program containing the resolved string,
6624 and a pointer to the new node is returned. But if the string is zero length
6625 a NOTHING node is emitted instead.
6627 On success RExC_parse is set to the char following the endbrace.
6628 Parsing failures will generate a fatal error via vFAIL(...)
6631 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6633 char * endbrace; /* '}' following the name */
6634 regnode *ret = NULL;
6636 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6640 GET_RE_DEBUG_FLAGS_DECL;
6642 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6646 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6647 * modifier. The other meaning does not */
6648 p = (RExC_flags & RXf_PMf_EXTENDED)
6649 ? regwhite( pRExC_state, RExC_parse )
6652 /* Disambiguate between \N meaning a named character versus \N meaning
6653 * [^\n]. The former is assumed when it can't be the latter. */
6654 if (*p != '{' || regcurly(p)) {
6657 /* no bare \N in a charclass */
6658 vFAIL("\\N in a character class must be a named character: \\N{...}");
6660 nextchar(pRExC_state);
6661 ret = reg_node(pRExC_state, REG_ANY);
6662 *flagp |= HASWIDTH|SIMPLE;
6665 Set_Node_Length(ret, 1); /* MJD */
6669 /* Here, we have decided it should be a named sequence */
6671 /* The test above made sure that the next real character is a '{', but
6672 * under the /x modifier, it could be separated by space (or a comment and
6673 * \n) and this is not allowed (for consistency with \x{...} and the
6674 * tokenizer handling of \N{NAME}). */
6675 if (*RExC_parse != '{') {
6676 vFAIL("Missing braces on \\N{}");
6679 RExC_parse++; /* Skip past the '{' */
6681 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6682 || ! (endbrace == RExC_parse /* nothing between the {} */
6683 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6684 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6686 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6687 vFAIL("\\N{NAME} must be resolved by the lexer");
6690 if (endbrace == RExC_parse) { /* empty: \N{} */
6692 RExC_parse = endbrace + 1;
6693 return reg_node(pRExC_state,NOTHING);
6697 ckWARNreg(RExC_parse,
6698 "Ignoring zero length \\N{} in character class"
6700 RExC_parse = endbrace + 1;
6703 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6706 RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6707 RExC_parse += 2; /* Skip past the 'U+' */
6709 if (valuep) { /* In a bracketed char class */
6710 /* We only pay attention to the first char of
6711 multichar strings being returned. I kinda wonder
6712 if this makes sense as it does change the behaviour
6713 from earlier versions, OTOH that behaviour was broken
6714 as well. XXX Solution is to recharacterize as
6715 [rest-of-class]|multi1|multi2... */
6717 STRLEN length_of_hex;
6718 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6719 | PERL_SCAN_DISALLOW_PREFIX
6720 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6722 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6723 if (endchar < endbrace) {
6724 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6727 length_of_hex = (STRLEN)(endchar - RExC_parse);
6728 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6730 /* The tokenizer should have guaranteed validity, but it's possible to
6731 * bypass it by using single quoting, so check */
6732 if (length_of_hex == 0
6733 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6735 RExC_parse += length_of_hex; /* Includes all the valid */
6736 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6737 ? UTF8SKIP(RExC_parse)
6739 /* Guard against malformed utf8 */
6740 if (RExC_parse >= endchar) RExC_parse = endchar;
6741 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6744 RExC_parse = endbrace + 1;
6745 if (endchar == endbrace) return NULL;
6747 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6749 else { /* Not a char class */
6750 char *s; /* String to put in generated EXACT node */
6751 STRLEN len = 0; /* Its current length */
6752 char *endchar; /* Points to '.' or '}' ending cur char in the input
6755 ret = reg_node(pRExC_state,
6756 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6759 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6760 * the input which is of the form now 'c1.c2.c3...}' until find the
6761 * ending brace or exeed length 255. The characters that exceed this
6762 * limit are dropped. The limit could be relaxed should it become
6763 * desirable by reparsing this as (?:\N{NAME}), so could generate
6764 * multiple EXACT nodes, as is done for just regular input. But this
6765 * is primarily a named character, and not intended to be a huge long
6766 * string, so 255 bytes should be good enough */
6768 STRLEN length_of_hex;
6769 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6770 | PERL_SCAN_DISALLOW_PREFIX
6771 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6772 UV cp; /* Ord of current character */
6774 /* Code points are separated by dots. If none, there is only one
6775 * code point, and is terminated by the brace */
6776 endchar = RExC_parse + strcspn(RExC_parse, ".}");
6778 /* The values are Unicode even on EBCDIC machines */
6779 length_of_hex = (STRLEN)(endchar - RExC_parse);
6780 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6781 if ( length_of_hex == 0
6782 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6784 RExC_parse += length_of_hex; /* Includes all the valid */
6785 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6786 ? UTF8SKIP(RExC_parse)
6788 /* Guard against malformed utf8 */
6789 if (RExC_parse >= endchar) RExC_parse = endchar;
6790 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6793 if (! FOLD) { /* Not folding, just append to the string */
6796 /* Quit before adding this character if would exceed limit */
6797 if (len + UNISKIP(cp) > U8_MAX) break;
6799 unilen = reguni(pRExC_state, cp, s);
6804 } else { /* Folding, output the folded equivalent */
6805 STRLEN foldlen,numlen;
6806 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6807 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6809 /* Quit before exceeding size limit */
6810 if (len + foldlen > U8_MAX) break;
6812 for (foldbuf = tmpbuf;
6816 cp = utf8_to_uvchr(foldbuf, &numlen);
6818 const STRLEN unilen = reguni(pRExC_state, cp, s);
6821 /* In EBCDIC the numlen and unilen can differ. */
6823 if (numlen >= foldlen)
6827 break; /* "Can't happen." */
6831 /* Point to the beginning of the next character in the sequence. */
6832 RExC_parse = endchar + 1;
6834 /* Quit if no more characters */
6835 if (RExC_parse >= endbrace) break;
6840 if (RExC_parse < endbrace) {
6841 ckWARNreg(RExC_parse - 1,
6842 "Using just the first characters returned by \\N{}");
6845 RExC_size += STR_SZ(len);
6848 RExC_emit += STR_SZ(len);
6851 RExC_parse = endbrace + 1;
6853 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6854 with malformed in t/re/pat_advanced.t */
6856 Set_Node_Cur_Length(ret); /* MJD */
6857 nextchar(pRExC_state);
6867 * It returns the code point in utf8 for the value in *encp.
6868 * value: a code value in the source encoding
6869 * encp: a pointer to an Encode object
6871 * If the result from Encode is not a single character,
6872 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6875 S_reg_recode(pTHX_ const char value, SV **encp)
6878 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6879 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6880 const STRLEN newlen = SvCUR(sv);
6881 UV uv = UNICODE_REPLACEMENT;
6883 PERL_ARGS_ASSERT_REG_RECODE;
6887 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6890 if (!newlen || numlen != newlen) {
6891 uv = UNICODE_REPLACEMENT;
6899 - regatom - the lowest level
6901 Try to identify anything special at the start of the pattern. If there
6902 is, then handle it as required. This may involve generating a single regop,
6903 such as for an assertion; or it may involve recursing, such as to
6904 handle a () structure.
6906 If the string doesn't start with something special then we gobble up
6907 as much literal text as we can.
6909 Once we have been able to handle whatever type of thing started the
6910 sequence, we return.
6912 Note: we have to be careful with escapes, as they can be both literal
6913 and special, and in the case of \10 and friends can either, depending
6914 on context. Specifically there are two seperate switches for handling
6915 escape sequences, with the one for handling literal escapes requiring
6916 a dummy entry for all of the special escapes that are actually handled
6921 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6924 register regnode *ret = NULL;
6926 char *parse_start = RExC_parse;
6927 GET_RE_DEBUG_FLAGS_DECL;
6928 DEBUG_PARSE("atom");
6929 *flagp = WORST; /* Tentatively. */
6931 PERL_ARGS_ASSERT_REGATOM;
6934 switch ((U8)*RExC_parse) {
6936 RExC_seen_zerolen++;
6937 nextchar(pRExC_state);
6938 if (RExC_flags & RXf_PMf_MULTILINE)
6939 ret = reg_node(pRExC_state, MBOL);
6940 else if (RExC_flags & RXf_PMf_SINGLELINE)
6941 ret = reg_node(pRExC_state, SBOL);
6943 ret = reg_node(pRExC_state, BOL);
6944 Set_Node_Length(ret, 1); /* MJD */
6947 nextchar(pRExC_state);
6949 RExC_seen_zerolen++;
6950 if (RExC_flags & RXf_PMf_MULTILINE)
6951 ret = reg_node(pRExC_state, MEOL);
6952 else if (RExC_flags & RXf_PMf_SINGLELINE)
6953 ret = reg_node(pRExC_state, SEOL);
6955 ret = reg_node(pRExC_state, EOL);
6956 Set_Node_Length(ret, 1); /* MJD */
6959 nextchar(pRExC_state);
6960 if (RExC_flags & RXf_PMf_SINGLELINE)
6961 ret = reg_node(pRExC_state, SANY);
6963 ret = reg_node(pRExC_state, REG_ANY);
6964 *flagp |= HASWIDTH|SIMPLE;
6966 Set_Node_Length(ret, 1); /* MJD */
6970 char * const oregcomp_parse = ++RExC_parse;
6971 ret = regclass(pRExC_state,depth+1);
6972 if (*RExC_parse != ']') {
6973 RExC_parse = oregcomp_parse;
6974 vFAIL("Unmatched [");
6976 nextchar(pRExC_state);
6977 *flagp |= HASWIDTH|SIMPLE;
6978 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6982 nextchar(pRExC_state);
6983 ret = reg(pRExC_state, 1, &flags,depth+1);
6985 if (flags & TRYAGAIN) {
6986 if (RExC_parse == RExC_end) {
6987 /* Make parent create an empty node if needed. */
6995 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6999 if (flags & TRYAGAIN) {
7003 vFAIL("Internal urp");
7004 /* Supposed to be caught earlier. */
7007 if (!regcurly(RExC_parse)) {
7016 vFAIL("Quantifier follows nothing");
7024 len=0; /* silence a spurious compiler warning */
7025 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7026 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7027 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7028 ret = reganode(pRExC_state, FOLDCHAR, cp);
7029 Set_Node_Length(ret, 1); /* MJD */
7030 nextchar(pRExC_state); /* kill whitespace under /x */
7038 This switch handles escape sequences that resolve to some kind
7039 of special regop and not to literal text. Escape sequnces that
7040 resolve to literal text are handled below in the switch marked
7043 Every entry in this switch *must* have a corresponding entry
7044 in the literal escape switch. However, the opposite is not
7045 required, as the default for this switch is to jump to the
7046 literal text handling code.
7048 switch ((U8)*++RExC_parse) {
7053 /* Special Escapes */
7055 RExC_seen_zerolen++;
7056 ret = reg_node(pRExC_state, SBOL);
7058 goto finish_meta_pat;
7060 ret = reg_node(pRExC_state, GPOS);
7061 RExC_seen |= REG_SEEN_GPOS;
7063 goto finish_meta_pat;
7065 RExC_seen_zerolen++;
7066 ret = reg_node(pRExC_state, KEEPS);
7068 /* XXX:dmq : disabling in-place substitution seems to
7069 * be necessary here to avoid cases of memory corruption, as
7070 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7072 RExC_seen |= REG_SEEN_LOOKBEHIND;
7073 goto finish_meta_pat;
7075 ret = reg_node(pRExC_state, SEOL);
7077 RExC_seen_zerolen++; /* Do not optimize RE away */
7078 goto finish_meta_pat;
7080 ret = reg_node(pRExC_state, EOS);
7082 RExC_seen_zerolen++; /* Do not optimize RE away */
7083 goto finish_meta_pat;
7085 ret = reg_node(pRExC_state, CANY);
7086 RExC_seen |= REG_SEEN_CANY;
7087 *flagp |= HASWIDTH|SIMPLE;
7088 goto finish_meta_pat;
7090 ret = reg_node(pRExC_state, CLUMP);
7092 goto finish_meta_pat;
7094 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
7095 *flagp |= HASWIDTH|SIMPLE;
7096 goto finish_meta_pat;
7098 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
7099 *flagp |= HASWIDTH|SIMPLE;
7100 goto finish_meta_pat;
7102 RExC_seen_zerolen++;
7103 RExC_seen |= REG_SEEN_LOOKBEHIND;
7104 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
7106 goto finish_meta_pat;
7108 RExC_seen_zerolen++;
7109 RExC_seen |= REG_SEEN_LOOKBEHIND;
7110 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
7112 goto finish_meta_pat;
7114 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
7115 *flagp |= HASWIDTH|SIMPLE;
7116 goto finish_meta_pat;
7118 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
7119 *flagp |= HASWIDTH|SIMPLE;
7120 goto finish_meta_pat;
7122 ret = reg_node(pRExC_state, DIGIT);
7123 *flagp |= HASWIDTH|SIMPLE;
7124 goto finish_meta_pat;
7126 ret = reg_node(pRExC_state, NDIGIT);
7127 *flagp |= HASWIDTH|SIMPLE;
7128 goto finish_meta_pat;
7130 ret = reg_node(pRExC_state, LNBREAK);
7131 *flagp |= HASWIDTH|SIMPLE;
7132 goto finish_meta_pat;
7134 ret = reg_node(pRExC_state, HORIZWS);
7135 *flagp |= HASWIDTH|SIMPLE;
7136 goto finish_meta_pat;
7138 ret = reg_node(pRExC_state, NHORIZWS);
7139 *flagp |= HASWIDTH|SIMPLE;
7140 goto finish_meta_pat;
7142 ret = reg_node(pRExC_state, VERTWS);
7143 *flagp |= HASWIDTH|SIMPLE;
7144 goto finish_meta_pat;
7146 ret = reg_node(pRExC_state, NVERTWS);
7147 *flagp |= HASWIDTH|SIMPLE;
7149 nextchar(pRExC_state);
7150 Set_Node_Length(ret, 2); /* MJD */
7155 char* const oldregxend = RExC_end;
7157 char* parse_start = RExC_parse - 2;
7160 if (RExC_parse[1] == '{') {
7161 /* a lovely hack--pretend we saw [\pX] instead */
7162 RExC_end = strchr(RExC_parse, '}');
7164 const U8 c = (U8)*RExC_parse;
7166 RExC_end = oldregxend;
7167 vFAIL2("Missing right brace on \\%c{}", c);
7172 RExC_end = RExC_parse + 2;
7173 if (RExC_end > oldregxend)
7174 RExC_end = oldregxend;
7178 ret = regclass(pRExC_state,depth+1);
7180 RExC_end = oldregxend;
7183 Set_Node_Offset(ret, parse_start + 2);
7184 Set_Node_Cur_Length(ret);
7185 nextchar(pRExC_state);
7186 *flagp |= HASWIDTH|SIMPLE;
7190 /* Handle \N and \N{NAME} here and not below because it can be
7191 multicharacter. join_exact() will join them up later on.
7192 Also this makes sure that things like /\N{BLAH}+/ and
7193 \N{BLAH} being multi char Just Happen. dmq*/
7195 ret= reg_namedseq(pRExC_state, NULL, flagp);
7197 case 'k': /* Handle \k<NAME> and \k'NAME' */
7200 char ch= RExC_parse[1];
7201 if (ch != '<' && ch != '\'' && ch != '{') {
7203 vFAIL2("Sequence %.2s... not terminated",parse_start);
7205 /* this pretty much dupes the code for (?P=...) in reg(), if
7206 you change this make sure you change that */
7207 char* name_start = (RExC_parse += 2);
7209 SV *sv_dat = reg_scan_name(pRExC_state,
7210 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7211 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7212 if (RExC_parse == name_start || *RExC_parse != ch)
7213 vFAIL2("Sequence %.3s... not terminated",parse_start);
7216 num = add_data( pRExC_state, 1, "S" );
7217 RExC_rxi->data->data[num]=(void*)sv_dat;
7218 SvREFCNT_inc_simple_void(sv_dat);
7222 ret = reganode(pRExC_state,
7223 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7227 /* override incorrect value set in reganode MJD */
7228 Set_Node_Offset(ret, parse_start+1);
7229 Set_Node_Cur_Length(ret); /* MJD */
7230 nextchar(pRExC_state);
7236 case '1': case '2': case '3': case '4':
7237 case '5': case '6': case '7': case '8': case '9':
7240 bool isg = *RExC_parse == 'g';
7245 if (*RExC_parse == '{') {
7249 if (*RExC_parse == '-') {
7253 if (hasbrace && !isDIGIT(*RExC_parse)) {
7254 if (isrel) RExC_parse--;
7256 goto parse_named_seq;
7258 num = atoi(RExC_parse);
7259 if (isg && num == 0)
7260 vFAIL("Reference to invalid group 0");
7262 num = RExC_npar - num;
7264 vFAIL("Reference to nonexistent or unclosed group");
7266 if (!isg && num > 9 && num >= RExC_npar)
7269 char * const parse_start = RExC_parse - 1; /* MJD */
7270 while (isDIGIT(*RExC_parse))
7272 if (parse_start == RExC_parse - 1)
7273 vFAIL("Unterminated \\g... pattern");
7275 if (*RExC_parse != '}')
7276 vFAIL("Unterminated \\g{...} pattern");
7280 if (num > (I32)RExC_rx->nparens)
7281 vFAIL("Reference to nonexistent group");
7284 ret = reganode(pRExC_state,
7285 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7289 /* override incorrect value set in reganode MJD */
7290 Set_Node_Offset(ret, parse_start+1);
7291 Set_Node_Cur_Length(ret); /* MJD */
7293 nextchar(pRExC_state);
7298 if (RExC_parse >= RExC_end)
7299 FAIL("Trailing \\");
7302 /* Do not generate "unrecognized" warnings here, we fall
7303 back into the quick-grab loop below */
7310 if (RExC_flags & RXf_PMf_EXTENDED) {
7311 if ( reg_skipcomment( pRExC_state ) )
7318 register STRLEN len;
7323 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7325 parse_start = RExC_parse - 1;
7331 ret = reg_node(pRExC_state,
7332 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7334 for (len = 0, p = RExC_parse - 1;
7335 len < 127 && p < RExC_end;
7338 char * const oldp = p;
7340 if (RExC_flags & RXf_PMf_EXTENDED)
7341 p = regwhite( pRExC_state, p );
7346 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7347 goto normal_default;
7357 /* Literal Escapes Switch
7359 This switch is meant to handle escape sequences that
7360 resolve to a literal character.
7362 Every escape sequence that represents something
7363 else, like an assertion or a char class, is handled
7364 in the switch marked 'Special Escapes' above in this
7365 routine, but also has an entry here as anything that
7366 isn't explicitly mentioned here will be treated as
7367 an unescaped equivalent literal.
7371 /* These are all the special escapes. */
7375 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7376 goto normal_default;
7377 case 'A': /* Start assertion */
7378 case 'b': case 'B': /* Word-boundary assertion*/
7379 case 'C': /* Single char !DANGEROUS! */
7380 case 'd': case 'D': /* digit class */
7381 case 'g': case 'G': /* generic-backref, pos assertion */
7382 case 'h': case 'H': /* HORIZWS */
7383 case 'k': case 'K': /* named backref, keep marker */
7384 case 'N': /* named char sequence */
7385 case 'p': case 'P': /* Unicode property */
7386 case 'R': /* LNBREAK */
7387 case 's': case 'S': /* space class */
7388 case 'v': case 'V': /* VERTWS */
7389 case 'w': case 'W': /* word class */
7390 case 'X': /* eXtended Unicode "combining character sequence" */
7391 case 'z': case 'Z': /* End of line/string assertion */
7395 /* Anything after here is an escape that resolves to a
7396 literal. (Except digits, which may or may not)
7415 ender = ASCII_TO_NATIVE('\033');
7419 ender = ASCII_TO_NATIVE('\007');
7424 char* const e = strchr(p, '}');
7428 vFAIL("Missing right brace on \\x{}");
7431 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7432 | PERL_SCAN_DISALLOW_PREFIX;
7433 STRLEN numlen = e - p - 1;
7434 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7441 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7443 ender = grok_hex(p, &numlen, &flags, NULL);
7446 if (PL_encoding && ender < 0x100)
7447 goto recode_encoding;
7451 ender = UCHARAT(p++);
7452 ender = toCTRL(ender);
7454 case '0': case '1': case '2': case '3':case '4':
7455 case '5': case '6': case '7': case '8':case '9':
7457 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7460 ender = grok_oct(p, &numlen, &flags, NULL);
7462 /* An octal above 0xff is interpreted differently
7463 * depending on if the re is in utf8 or not. If it
7464 * is in utf8, the value will be itself, otherwise
7465 * it is interpreted as modulo 0x100. It has been
7466 * decided to discourage the use of octal above the
7467 * single-byte range. For now, warn only when
7468 * it ends up modulo */
7469 if (SIZE_ONLY && ender >= 0x100
7470 && ! UTF && ! PL_encoding) {
7471 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7479 if (PL_encoding && ender < 0x100)
7480 goto recode_encoding;
7484 SV* enc = PL_encoding;
7485 ender = reg_recode((const char)(U8)ender, &enc);
7486 if (!enc && SIZE_ONLY)
7487 ckWARNreg(p, "Invalid escape in the specified encoding");
7493 FAIL("Trailing \\");
7496 if (!SIZE_ONLY&& isALPHA(*p))
7497 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7498 goto normal_default;
7503 if (UTF8_IS_START(*p) && UTF) {
7505 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7506 &numlen, UTF8_ALLOW_DEFAULT);
7513 if ( RExC_flags & RXf_PMf_EXTENDED)
7514 p = regwhite( pRExC_state, p );
7516 /* Prime the casefolded buffer. */
7517 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7519 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7524 /* Emit all the Unicode characters. */
7526 for (foldbuf = tmpbuf;
7528 foldlen -= numlen) {
7529 ender = utf8_to_uvchr(foldbuf, &numlen);
7531 const STRLEN unilen = reguni(pRExC_state, ender, s);
7534 /* In EBCDIC the numlen
7535 * and unilen can differ. */
7537 if (numlen >= foldlen)
7541 break; /* "Can't happen." */
7545 const STRLEN unilen = reguni(pRExC_state, ender, s);
7554 REGC((char)ender, s++);
7560 /* Emit all the Unicode characters. */
7562 for (foldbuf = tmpbuf;
7564 foldlen -= numlen) {
7565 ender = utf8_to_uvchr(foldbuf, &numlen);
7567 const STRLEN unilen = reguni(pRExC_state, ender, s);
7570 /* In EBCDIC the numlen
7571 * and unilen can differ. */
7573 if (numlen >= foldlen)
7581 const STRLEN unilen = reguni(pRExC_state, ender, s);
7590 REGC((char)ender, s++);
7594 Set_Node_Cur_Length(ret); /* MJD */
7595 nextchar(pRExC_state);
7597 /* len is STRLEN which is unsigned, need to copy to signed */
7600 vFAIL("Internal disaster");
7604 if (len == 1 && UNI_IS_INVARIANT(ender))
7608 RExC_size += STR_SZ(len);
7611 RExC_emit += STR_SZ(len);
7621 S_regwhite( RExC_state_t *pRExC_state, char *p )
7623 const char *e = RExC_end;
7625 PERL_ARGS_ASSERT_REGWHITE;
7630 else if (*p == '#') {
7639 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7647 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7648 Character classes ([:foo:]) can also be negated ([:^foo:]).
7649 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7650 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7651 but trigger failures because they are currently unimplemented. */
7653 #define POSIXCC_DONE(c) ((c) == ':')
7654 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7655 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7658 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7661 I32 namedclass = OOB_NAMEDCLASS;
7663 PERL_ARGS_ASSERT_REGPPOSIXCC;
7665 if (value == '[' && RExC_parse + 1 < RExC_end &&
7666 /* I smell either [: or [= or [. -- POSIX has been here, right? */
7667 POSIXCC(UCHARAT(RExC_parse))) {
7668 const char c = UCHARAT(RExC_parse);
7669 char* const s = RExC_parse++;
7671 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7673 if (RExC_parse == RExC_end)
7674 /* Grandfather lone [:, [=, [. */
7677 const char* const t = RExC_parse++; /* skip over the c */
7680 if (UCHARAT(RExC_parse) == ']') {
7681 const char *posixcc = s + 1;
7682 RExC_parse++; /* skip over the ending ] */
7685 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7686 const I32 skip = t - posixcc;
7688 /* Initially switch on the length of the name. */
7691 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7692 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7695 /* Names all of length 5. */
7696 /* alnum alpha ascii blank cntrl digit graph lower
7697 print punct space upper */
7698 /* Offset 4 gives the best switch position. */
7699 switch (posixcc[4]) {
7701 if (memEQ(posixcc, "alph", 4)) /* alpha */
7702 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7705 if (memEQ(posixcc, "spac", 4)) /* space */
7706 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7709 if (memEQ(posixcc, "grap", 4)) /* graph */
7710 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7713 if (memEQ(posixcc, "asci", 4)) /* ascii */
7714 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7717 if (memEQ(posixcc, "blan", 4)) /* blank */
7718 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7721 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7722 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7725 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7726 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7729 if (memEQ(posixcc, "lowe", 4)) /* lower */
7730 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7731 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7732 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7735 if (memEQ(posixcc, "digi", 4)) /* digit */
7736 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7737 else if (memEQ(posixcc, "prin", 4)) /* print */
7738 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7739 else if (memEQ(posixcc, "punc", 4)) /* punct */
7740 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7745 if (memEQ(posixcc, "xdigit", 6))
7746 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7750 if (namedclass == OOB_NAMEDCLASS)
7751 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7753 assert (posixcc[skip] == ':');
7754 assert (posixcc[skip+1] == ']');
7755 } else if (!SIZE_ONLY) {
7756 /* [[=foo=]] and [[.foo.]] are still future. */
7758 /* adjust RExC_parse so the warning shows after
7760 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7762 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7765 /* Maternal grandfather:
7766 * "[:" ending in ":" but not in ":]" */
7776 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7780 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7782 if (POSIXCC(UCHARAT(RExC_parse))) {
7783 const char *s = RExC_parse;
7784 const char c = *s++;
7788 if (*s && c == *s && s[1] == ']') {
7790 "POSIX syntax [%c %c] belongs inside character classes",
7793 /* [[=foo=]] and [[.foo.]] are still future. */
7794 if (POSIXCC_NOTYET(c)) {
7795 /* adjust RExC_parse so the error shows after
7797 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7799 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7806 #define _C_C_T_(NAME,TEST,WORD) \
7809 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7811 for (value = 0; value < 256; value++) \
7813 ANYOF_BITMAP_SET(ret, value); \
7818 case ANYOF_N##NAME: \
7820 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7822 for (value = 0; value < 256; value++) \
7824 ANYOF_BITMAP_SET(ret, value); \
7830 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7832 for (value = 0; value < 256; value++) \
7834 ANYOF_BITMAP_SET(ret, value); \
7838 case ANYOF_N##NAME: \
7839 for (value = 0; value < 256; value++) \
7841 ANYOF_BITMAP_SET(ret, value); \
7847 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7848 so that it is possible to override the option here without having to
7849 rebuild the entire core. as we are required to do if we change regcomp.h
7850 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7852 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7853 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7856 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7857 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7859 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7863 parse a class specification and produce either an ANYOF node that
7864 matches the pattern or if the pattern matches a single char only and
7865 that char is < 256 and we are case insensitive then we produce an
7870 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7873 register UV nextvalue;
7874 register IV prevvalue = OOB_UNICODE;
7875 register IV range = 0;
7876 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7877 register regnode *ret;
7880 char *rangebegin = NULL;
7881 bool need_class = 0;
7884 bool optimize_invert = TRUE;
7885 AV* unicode_alternate = NULL;
7887 UV literal_endpoint = 0;
7889 UV stored = 0; /* number of chars stored in the class */
7891 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7892 case we need to change the emitted regop to an EXACT. */
7893 const char * orig_parse = RExC_parse;
7894 GET_RE_DEBUG_FLAGS_DECL;
7896 PERL_ARGS_ASSERT_REGCLASS;
7898 PERL_UNUSED_ARG(depth);
7901 DEBUG_PARSE("clas");
7903 /* Assume we are going to generate an ANYOF node. */
7904 ret = reganode(pRExC_state, ANYOF, 0);
7907 ANYOF_FLAGS(ret) = 0;
7909 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7913 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7917 RExC_size += ANYOF_SKIP;
7918 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7921 RExC_emit += ANYOF_SKIP;
7923 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7925 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7926 ANYOF_BITMAP_ZERO(ret);
7927 listsv = newSVpvs("# comment\n");
7930 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7932 if (!SIZE_ONLY && POSIXCC(nextvalue))
7933 checkposixcc(pRExC_state);
7935 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7936 if (UCHARAT(RExC_parse) == ']')
7940 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7944 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7947 rangebegin = RExC_parse;
7949 value = utf8n_to_uvchr((U8*)RExC_parse,
7950 RExC_end - RExC_parse,
7951 &numlen, UTF8_ALLOW_DEFAULT);
7952 RExC_parse += numlen;
7955 value = UCHARAT(RExC_parse++);
7957 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7958 if (value == '[' && POSIXCC(nextvalue))
7959 namedclass = regpposixcc(pRExC_state, value);
7960 else if (value == '\\') {
7962 value = utf8n_to_uvchr((U8*)RExC_parse,
7963 RExC_end - RExC_parse,
7964 &numlen, UTF8_ALLOW_DEFAULT);
7965 RExC_parse += numlen;
7968 value = UCHARAT(RExC_parse++);
7969 /* Some compilers cannot handle switching on 64-bit integer
7970 * values, therefore value cannot be an UV. Yes, this will
7971 * be a problem later if we want switch on Unicode.
7972 * A similar issue a little bit later when switching on
7973 * namedclass. --jhi */
7974 switch ((I32)value) {
7975 case 'w': namedclass = ANYOF_ALNUM; break;
7976 case 'W': namedclass = ANYOF_NALNUM; break;
7977 case 's': namedclass = ANYOF_SPACE; break;
7978 case 'S': namedclass = ANYOF_NSPACE; break;
7979 case 'd': namedclass = ANYOF_DIGIT; break;
7980 case 'D': namedclass = ANYOF_NDIGIT; break;
7981 case 'v': namedclass = ANYOF_VERTWS; break;
7982 case 'V': namedclass = ANYOF_NVERTWS; break;
7983 case 'h': namedclass = ANYOF_HORIZWS; break;
7984 case 'H': namedclass = ANYOF_NHORIZWS; break;
7985 case 'N': /* Handle \N{NAME} in class */
7987 /* We only pay attention to the first char of
7988 multichar strings being returned. I kinda wonder
7989 if this makes sense as it does change the behaviour
7990 from earlier versions, OTOH that behaviour was broken
7992 UV v; /* value is register so we cant & it /grrr */
7993 if (reg_namedseq(pRExC_state, &v, NULL)) {
8003 if (RExC_parse >= RExC_end)
8004 vFAIL2("Empty \\%c{}", (U8)value);
8005 if (*RExC_parse == '{') {
8006 const U8 c = (U8)value;
8007 e = strchr(RExC_parse++, '}');
8009 vFAIL2("Missing right brace on \\%c{}", c);
8010 while (isSPACE(UCHARAT(RExC_parse)))
8012 if (e == RExC_parse)
8013 vFAIL2("Empty \\%c{}", c);
8015 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8023 if (UCHARAT(RExC_parse) == '^') {
8026 value = value == 'p' ? 'P' : 'p'; /* toggle */
8027 while (isSPACE(UCHARAT(RExC_parse))) {
8032 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8033 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8036 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8037 namedclass = ANYOF_MAX; /* no official name, but it's named */
8040 case 'n': value = '\n'; break;
8041 case 'r': value = '\r'; break;
8042 case 't': value = '\t'; break;
8043 case 'f': value = '\f'; break;
8044 case 'b': value = '\b'; break;
8045 case 'e': value = ASCII_TO_NATIVE('\033');break;
8046 case 'a': value = ASCII_TO_NATIVE('\007');break;
8048 if (*RExC_parse == '{') {
8049 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8050 | PERL_SCAN_DISALLOW_PREFIX;
8051 char * const e = strchr(RExC_parse++, '}');
8053 vFAIL("Missing right brace on \\x{}");
8055 numlen = e - RExC_parse;
8056 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8060 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8062 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8063 RExC_parse += numlen;
8065 if (PL_encoding && value < 0x100)
8066 goto recode_encoding;
8069 value = UCHARAT(RExC_parse++);
8070 value = toCTRL(value);
8072 case '0': case '1': case '2': case '3': case '4':
8073 case '5': case '6': case '7': case '8': case '9':
8077 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8078 RExC_parse += numlen;
8079 if (PL_encoding && value < 0x100)
8080 goto recode_encoding;
8085 SV* enc = PL_encoding;
8086 value = reg_recode((const char)(U8)value, &enc);
8087 if (!enc && SIZE_ONLY)
8088 ckWARNreg(RExC_parse,
8089 "Invalid escape in the specified encoding");
8093 if (!SIZE_ONLY && isALPHA(value))
8094 ckWARN2reg(RExC_parse,
8095 "Unrecognized escape \\%c in character class passed through",
8099 } /* end of \blah */
8105 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8107 if (!SIZE_ONLY && !need_class)
8108 ANYOF_CLASS_ZERO(ret);
8112 /* a bad range like a-\d, a-[:digit:] ? */
8116 RExC_parse >= rangebegin ?
8117 RExC_parse - rangebegin : 0;
8118 ckWARN4reg(RExC_parse,
8119 "False [] range \"%*.*s\"",
8122 if (prevvalue < 256) {
8123 ANYOF_BITMAP_SET(ret, prevvalue);
8124 ANYOF_BITMAP_SET(ret, '-');
8127 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8128 Perl_sv_catpvf(aTHX_ listsv,
8129 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8133 range = 0; /* this was not a true range */
8139 const char *what = NULL;
8142 if (namedclass > OOB_NAMEDCLASS)
8143 optimize_invert = FALSE;
8144 /* Possible truncation here but in some 64-bit environments
8145 * the compiler gets heartburn about switch on 64-bit values.
8146 * A similar issue a little earlier when switching on value.
8148 switch ((I32)namedclass) {
8150 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8151 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8152 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8153 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8154 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8155 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8156 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8157 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8158 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8159 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8160 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8161 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8162 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8164 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8165 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8167 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8168 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8169 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8172 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8175 for (value = 0; value < 128; value++)
8176 ANYOF_BITMAP_SET(ret, value);
8178 for (value = 0; value < 256; value++) {
8180 ANYOF_BITMAP_SET(ret, value);
8189 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8192 for (value = 128; value < 256; value++)
8193 ANYOF_BITMAP_SET(ret, value);
8195 for (value = 0; value < 256; value++) {
8196 if (!isASCII(value))
8197 ANYOF_BITMAP_SET(ret, value);
8206 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8208 /* consecutive digits assumed */
8209 for (value = '0'; value <= '9'; value++)
8210 ANYOF_BITMAP_SET(ret, value);
8213 what = POSIX_CC_UNI_NAME("Digit");
8217 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8219 /* consecutive digits assumed */
8220 for (value = 0; value < '0'; value++)
8221 ANYOF_BITMAP_SET(ret, value);
8222 for (value = '9' + 1; value < 256; value++)
8223 ANYOF_BITMAP_SET(ret, value);
8226 what = POSIX_CC_UNI_NAME("Digit");
8229 /* this is to handle \p and \P */
8232 vFAIL("Invalid [::] class");
8236 /* Strings such as "+utf8::isWord\n" */
8237 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8240 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8243 } /* end of namedclass \blah */
8246 if (prevvalue > (IV)value) /* b-a */ {
8247 const int w = RExC_parse - rangebegin;
8248 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8249 range = 0; /* not a valid range */
8253 prevvalue = value; /* save the beginning of the range */
8254 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8255 RExC_parse[1] != ']') {
8258 /* a bad range like \w-, [:word:]- ? */
8259 if (namedclass > OOB_NAMEDCLASS) {
8260 if (ckWARN(WARN_REGEXP)) {
8262 RExC_parse >= rangebegin ?
8263 RExC_parse - rangebegin : 0;
8265 "False [] range \"%*.*s\"",
8269 ANYOF_BITMAP_SET(ret, '-');
8271 range = 1; /* yeah, it's a range! */
8272 continue; /* but do it the next time */
8276 /* now is the next time */
8277 /*stored += (value - prevvalue + 1);*/
8279 if (prevvalue < 256) {
8280 const IV ceilvalue = value < 256 ? value : 255;
8283 /* In EBCDIC [\x89-\x91] should include
8284 * the \x8e but [i-j] should not. */
8285 if (literal_endpoint == 2 &&
8286 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8287 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8289 if (isLOWER(prevvalue)) {
8290 for (i = prevvalue; i <= ceilvalue; i++)
8291 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8293 ANYOF_BITMAP_SET(ret, i);
8296 for (i = prevvalue; i <= ceilvalue; i++)
8297 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8299 ANYOF_BITMAP_SET(ret, i);
8305 for (i = prevvalue; i <= ceilvalue; i++) {
8306 if (!ANYOF_BITMAP_TEST(ret,i)) {
8308 ANYOF_BITMAP_SET(ret, i);
8312 if (value > 255 || UTF) {
8313 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8314 const UV natvalue = NATIVE_TO_UNI(value);
8315 stored+=2; /* can't optimize this class */
8316 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8317 if (prevnatvalue < natvalue) { /* what about > ? */
8318 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8319 prevnatvalue, natvalue);
8321 else if (prevnatvalue == natvalue) {
8322 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8324 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8326 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8328 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8329 if (RExC_precomp[0] == ':' &&
8330 RExC_precomp[1] == '[' &&
8331 (f == 0xDF || f == 0x92)) {
8332 f = NATIVE_TO_UNI(f);
8335 /* If folding and foldable and a single
8336 * character, insert also the folded version
8337 * to the charclass. */
8339 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8340 if ((RExC_precomp[0] == ':' &&
8341 RExC_precomp[1] == '[' &&
8343 (value == 0xFB05 || value == 0xFB06))) ?
8344 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8345 foldlen == (STRLEN)UNISKIP(f) )
8347 if (foldlen == (STRLEN)UNISKIP(f))
8349 Perl_sv_catpvf(aTHX_ listsv,
8352 /* Any multicharacter foldings
8353 * require the following transform:
8354 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8355 * where E folds into "pq" and F folds
8356 * into "rst", all other characters
8357 * fold to single characters. We save
8358 * away these multicharacter foldings,
8359 * to be later saved as part of the
8360 * additional "s" data. */
8363 if (!unicode_alternate)
8364 unicode_alternate = newAV();
8365 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8367 av_push(unicode_alternate, sv);
8371 /* If folding and the value is one of the Greek
8372 * sigmas insert a few more sigmas to make the
8373 * folding rules of the sigmas to work right.
8374 * Note that not all the possible combinations
8375 * are handled here: some of them are handled
8376 * by the standard folding rules, and some of
8377 * them (literal or EXACTF cases) are handled
8378 * during runtime in regexec.c:S_find_byclass(). */
8379 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8380 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8381 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8382 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8383 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8385 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8386 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8387 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8392 literal_endpoint = 0;
8396 range = 0; /* this range (if it was one) is done now */
8400 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8402 RExC_size += ANYOF_CLASS_ADD_SKIP;
8404 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8410 /****** !SIZE_ONLY AFTER HERE *********/
8412 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8413 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8415 /* optimize single char class to an EXACT node
8416 but *only* when its not a UTF/high char */
8417 const char * cur_parse= RExC_parse;
8418 RExC_emit = (regnode *)orig_emit;
8419 RExC_parse = (char *)orig_parse;
8420 ret = reg_node(pRExC_state,
8421 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8422 RExC_parse = (char *)cur_parse;
8423 *STRING(ret)= (char)value;
8425 RExC_emit += STR_SZ(1);
8426 SvREFCNT_dec(listsv);
8429 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8430 if ( /* If the only flag is folding (plus possibly inversion). */
8431 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8433 for (value = 0; value < 256; ++value) {
8434 if (ANYOF_BITMAP_TEST(ret, value)) {
8435 UV fold = PL_fold[value];
8438 ANYOF_BITMAP_SET(ret, fold);
8441 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8444 /* optimize inverted simple patterns (e.g. [^a-z]) */
8445 if (optimize_invert &&
8446 /* If the only flag is inversion. */
8447 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8448 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8449 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8450 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8453 AV * const av = newAV();
8455 /* The 0th element stores the character class description
8456 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8457 * to initialize the appropriate swash (which gets stored in
8458 * the 1st element), and also useful for dumping the regnode.
8459 * The 2nd element stores the multicharacter foldings,
8460 * used later (regexec.c:S_reginclass()). */
8461 av_store(av, 0, listsv);
8462 av_store(av, 1, NULL);
8463 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8464 rv = newRV_noinc(MUTABLE_SV(av));
8465 n = add_data(pRExC_state, 1, "s");
8466 RExC_rxi->data->data[n] = (void*)rv;
8474 /* reg_skipcomment()
8476 Absorbs an /x style # comments from the input stream.
8477 Returns true if there is more text remaining in the stream.
8478 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8479 terminates the pattern without including a newline.
8481 Note its the callers responsibility to ensure that we are
8487 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8491 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8493 while (RExC_parse < RExC_end)
8494 if (*RExC_parse++ == '\n') {
8499 /* we ran off the end of the pattern without ending
8500 the comment, so we have to add an \n when wrapping */
8501 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8509 Advance that parse position, and optionall absorbs
8510 "whitespace" from the inputstream.
8512 Without /x "whitespace" means (?#...) style comments only,
8513 with /x this means (?#...) and # comments and whitespace proper.
8515 Returns the RExC_parse point from BEFORE the scan occurs.
8517 This is the /x friendly way of saying RExC_parse++.
8521 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8523 char* const retval = RExC_parse++;
8525 PERL_ARGS_ASSERT_NEXTCHAR;
8528 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8529 RExC_parse[2] == '#') {
8530 while (*RExC_parse != ')') {
8531 if (RExC_parse == RExC_end)
8532 FAIL("Sequence (?#... not terminated");
8538 if (RExC_flags & RXf_PMf_EXTENDED) {
8539 if (isSPACE(*RExC_parse)) {
8543 else if (*RExC_parse == '#') {
8544 if ( reg_skipcomment( pRExC_state ) )
8553 - reg_node - emit a node
8555 STATIC regnode * /* Location. */
8556 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8559 register regnode *ptr;
8560 regnode * const ret = RExC_emit;
8561 GET_RE_DEBUG_FLAGS_DECL;
8563 PERL_ARGS_ASSERT_REG_NODE;
8566 SIZE_ALIGN(RExC_size);
8570 if (RExC_emit >= RExC_emit_bound)
8571 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8573 NODE_ALIGN_FILL(ret);
8575 FILL_ADVANCE_NODE(ptr, op);
8576 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8577 #ifdef RE_TRACK_PATTERN_OFFSETS
8578 if (RExC_offsets) { /* MJD */
8579 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8580 "reg_node", __LINE__,
8582 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8583 ? "Overwriting end of array!\n" : "OK",
8584 (UV)(RExC_emit - RExC_emit_start),
8585 (UV)(RExC_parse - RExC_start),
8586 (UV)RExC_offsets[0]));
8587 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8595 - reganode - emit a node with an argument
8597 STATIC regnode * /* Location. */
8598 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8601 register regnode *ptr;
8602 regnode * const ret = RExC_emit;
8603 GET_RE_DEBUG_FLAGS_DECL;
8605 PERL_ARGS_ASSERT_REGANODE;
8608 SIZE_ALIGN(RExC_size);
8613 assert(2==regarglen[op]+1);
8615 Anything larger than this has to allocate the extra amount.
8616 If we changed this to be:
8618 RExC_size += (1 + regarglen[op]);
8620 then it wouldn't matter. Its not clear what side effect
8621 might come from that so its not done so far.
8626 if (RExC_emit >= RExC_emit_bound)
8627 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8629 NODE_ALIGN_FILL(ret);
8631 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8632 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8633 #ifdef RE_TRACK_PATTERN_OFFSETS
8634 if (RExC_offsets) { /* MJD */
8635 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8639 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8640 "Overwriting end of array!\n" : "OK",
8641 (UV)(RExC_emit - RExC_emit_start),
8642 (UV)(RExC_parse - RExC_start),
8643 (UV)RExC_offsets[0]));
8644 Set_Cur_Node_Offset;
8652 - reguni - emit (if appropriate) a Unicode character
8655 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8659 PERL_ARGS_ASSERT_REGUNI;
8661 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8665 - reginsert - insert an operator in front of already-emitted operand
8667 * Means relocating the operand.
8670 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8673 register regnode *src;
8674 register regnode *dst;
8675 register regnode *place;
8676 const int offset = regarglen[(U8)op];
8677 const int size = NODE_STEP_REGNODE + offset;
8678 GET_RE_DEBUG_FLAGS_DECL;
8680 PERL_ARGS_ASSERT_REGINSERT;
8681 PERL_UNUSED_ARG(depth);
8682 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8683 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8692 if (RExC_open_parens) {
8694 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8695 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8696 if ( RExC_open_parens[paren] >= opnd ) {
8697 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8698 RExC_open_parens[paren] += size;
8700 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8702 if ( RExC_close_parens[paren] >= opnd ) {
8703 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8704 RExC_close_parens[paren] += size;
8706 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8711 while (src > opnd) {
8712 StructCopy(--src, --dst, regnode);
8713 #ifdef RE_TRACK_PATTERN_OFFSETS
8714 if (RExC_offsets) { /* MJD 20010112 */
8715 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8719 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8720 ? "Overwriting end of array!\n" : "OK",
8721 (UV)(src - RExC_emit_start),
8722 (UV)(dst - RExC_emit_start),
8723 (UV)RExC_offsets[0]));
8724 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8725 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8731 place = opnd; /* Op node, where operand used to be. */
8732 #ifdef RE_TRACK_PATTERN_OFFSETS
8733 if (RExC_offsets) { /* MJD */
8734 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8738 (UV)(place - RExC_emit_start) > RExC_offsets[0]
8739 ? "Overwriting end of array!\n" : "OK",
8740 (UV)(place - RExC_emit_start),
8741 (UV)(RExC_parse - RExC_start),
8742 (UV)RExC_offsets[0]));
8743 Set_Node_Offset(place, RExC_parse);
8744 Set_Node_Length(place, 1);
8747 src = NEXTOPER(place);
8748 FILL_ADVANCE_NODE(place, op);
8749 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8750 Zero(src, offset, regnode);
8754 - regtail - set the next-pointer at the end of a node chain of p to val.
8755 - SEE ALSO: regtail_study
8757 /* TODO: All three parms should be const */
8759 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8762 register regnode *scan;
8763 GET_RE_DEBUG_FLAGS_DECL;
8765 PERL_ARGS_ASSERT_REGTAIL;
8767 PERL_UNUSED_ARG(depth);
8773 /* Find last node. */
8776 regnode * const temp = regnext(scan);
8778 SV * const mysv=sv_newmortal();
8779 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8780 regprop(RExC_rx, mysv, scan);
8781 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8782 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8783 (temp == NULL ? "->" : ""),
8784 (temp == NULL ? PL_reg_name[OP(val)] : "")
8792 if (reg_off_by_arg[OP(scan)]) {
8793 ARG_SET(scan, val - scan);
8796 NEXT_OFF(scan) = val - scan;
8802 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8803 - Look for optimizable sequences at the same time.
8804 - currently only looks for EXACT chains.
8806 This is expermental code. The idea is to use this routine to perform
8807 in place optimizations on branches and groups as they are constructed,
8808 with the long term intention of removing optimization from study_chunk so
8809 that it is purely analytical.
8811 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8812 to control which is which.
8815 /* TODO: All four parms should be const */
8818 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8821 register regnode *scan;
8823 #ifdef EXPERIMENTAL_INPLACESCAN
8826 GET_RE_DEBUG_FLAGS_DECL;
8828 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8834 /* Find last node. */
8838 regnode * const temp = regnext(scan);
8839 #ifdef EXPERIMENTAL_INPLACESCAN
8840 if (PL_regkind[OP(scan)] == EXACT)
8841 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8849 if( exact == PSEUDO )
8851 else if ( exact != OP(scan) )
8860 SV * const mysv=sv_newmortal();
8861 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8862 regprop(RExC_rx, mysv, scan);
8863 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8864 SvPV_nolen_const(mysv),
8866 PL_reg_name[exact]);
8873 SV * const mysv_val=sv_newmortal();
8874 DEBUG_PARSE_MSG("");
8875 regprop(RExC_rx, mysv_val, val);
8876 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8877 SvPV_nolen_const(mysv_val),
8878 (IV)REG_NODE_NUM(val),
8882 if (reg_off_by_arg[OP(scan)]) {
8883 ARG_SET(scan, val - scan);
8886 NEXT_OFF(scan) = val - scan;
8894 - regcurly - a little FSA that accepts {\d+,?\d*}
8896 #ifndef PERL_IN_XSUB_RE
8898 Perl_regcurly(register const char *s)
8900 PERL_ARGS_ASSERT_REGCURLY;
8919 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8923 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8928 for (bit=0; bit<32; bit++) {
8929 if (flags & (1<<bit)) {
8931 PerlIO_printf(Perl_debug_log, "%s",lead);
8932 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8937 PerlIO_printf(Perl_debug_log, "\n");
8939 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8945 Perl_regdump(pTHX_ const regexp *r)
8949 SV * const sv = sv_newmortal();
8950 SV *dsv= sv_newmortal();
8952 GET_RE_DEBUG_FLAGS_DECL;
8954 PERL_ARGS_ASSERT_REGDUMP;
8956 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8958 /* Header fields of interest. */
8959 if (r->anchored_substr) {
8960 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8961 RE_SV_DUMPLEN(r->anchored_substr), 30);
8962 PerlIO_printf(Perl_debug_log,
8963 "anchored %s%s at %"IVdf" ",
8964 s, RE_SV_TAIL(r->anchored_substr),
8965 (IV)r->anchored_offset);
8966 } else if (r->anchored_utf8) {
8967 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8968 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8969 PerlIO_printf(Perl_debug_log,
8970 "anchored utf8 %s%s at %"IVdf" ",
8971 s, RE_SV_TAIL(r->anchored_utf8),
8972 (IV)r->anchored_offset);
8974 if (r->float_substr) {
8975 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8976 RE_SV_DUMPLEN(r->float_substr), 30);
8977 PerlIO_printf(Perl_debug_log,
8978 "floating %s%s at %"IVdf"..%"UVuf" ",
8979 s, RE_SV_TAIL(r->float_substr),
8980 (IV)r->float_min_offset, (UV)r->float_max_offset);
8981 } else if (r->float_utf8) {
8982 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8983 RE_SV_DUMPLEN(r->float_utf8), 30);
8984 PerlIO_printf(Perl_debug_log,
8985 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8986 s, RE_SV_TAIL(r->float_utf8),
8987 (IV)r->float_min_offset, (UV)r->float_max_offset);
8989 if (r->check_substr || r->check_utf8)
8990 PerlIO_printf(Perl_debug_log,
8992 (r->check_substr == r->float_substr
8993 && r->check_utf8 == r->float_utf8
8994 ? "(checking floating" : "(checking anchored"));
8995 if (r->extflags & RXf_NOSCAN)
8996 PerlIO_printf(Perl_debug_log, " noscan");
8997 if (r->extflags & RXf_CHECK_ALL)
8998 PerlIO_printf(Perl_debug_log, " isall");
8999 if (r->check_substr || r->check_utf8)
9000 PerlIO_printf(Perl_debug_log, ") ");
9002 if (ri->regstclass) {
9003 regprop(r, sv, ri->regstclass);
9004 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9006 if (r->extflags & RXf_ANCH) {
9007 PerlIO_printf(Perl_debug_log, "anchored");
9008 if (r->extflags & RXf_ANCH_BOL)
9009 PerlIO_printf(Perl_debug_log, "(BOL)");
9010 if (r->extflags & RXf_ANCH_MBOL)
9011 PerlIO_printf(Perl_debug_log, "(MBOL)");
9012 if (r->extflags & RXf_ANCH_SBOL)
9013 PerlIO_printf(Perl_debug_log, "(SBOL)");
9014 if (r->extflags & RXf_ANCH_GPOS)
9015 PerlIO_printf(Perl_debug_log, "(GPOS)");
9016 PerlIO_putc(Perl_debug_log, ' ');
9018 if (r->extflags & RXf_GPOS_SEEN)
9019 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9020 if (r->intflags & PREGf_SKIP)
9021 PerlIO_printf(Perl_debug_log, "plus ");
9022 if (r->intflags & PREGf_IMPLICIT)
9023 PerlIO_printf(Perl_debug_log, "implicit ");
9024 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9025 if (r->extflags & RXf_EVAL_SEEN)
9026 PerlIO_printf(Perl_debug_log, "with eval ");
9027 PerlIO_printf(Perl_debug_log, "\n");
9028 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9030 PERL_ARGS_ASSERT_REGDUMP;
9031 PERL_UNUSED_CONTEXT;
9033 #endif /* DEBUGGING */
9037 - regprop - printable representation of opcode
9039 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9042 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9043 if (flags & ANYOF_INVERT) \
9044 /*make sure the invert info is in each */ \
9045 sv_catpvs(sv, "^"); \
9051 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9056 RXi_GET_DECL(prog,progi);
9057 GET_RE_DEBUG_FLAGS_DECL;
9059 PERL_ARGS_ASSERT_REGPROP;
9063 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9064 /* It would be nice to FAIL() here, but this may be called from
9065 regexec.c, and it would be hard to supply pRExC_state. */
9066 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9067 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9069 k = PL_regkind[OP(o)];
9073 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9074 * is a crude hack but it may be the best for now since
9075 * we have no flag "this EXACTish node was UTF-8"
9077 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9078 PERL_PV_ESCAPE_UNI_DETECT |
9079 PERL_PV_PRETTY_ELLIPSES |
9080 PERL_PV_PRETTY_LTGT |
9081 PERL_PV_PRETTY_NOCLEAR
9083 } else if (k == TRIE) {
9084 /* print the details of the trie in dumpuntil instead, as
9085 * progi->data isn't available here */
9086 const char op = OP(o);
9087 const U32 n = ARG(o);
9088 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9089 (reg_ac_data *)progi->data->data[n] :
9091 const reg_trie_data * const trie
9092 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9094 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9095 DEBUG_TRIE_COMPILE_r(
9096 Perl_sv_catpvf(aTHX_ sv,
9097 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9098 (UV)trie->startstate,
9099 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9100 (UV)trie->wordcount,
9103 (UV)TRIE_CHARCOUNT(trie),
9104 (UV)trie->uniquecharcount
9107 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9109 int rangestart = -1;
9110 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9112 for (i = 0; i <= 256; i++) {
9113 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9114 if (rangestart == -1)
9116 } else if (rangestart != -1) {
9117 if (i <= rangestart + 3)
9118 for (; rangestart < i; rangestart++)
9119 put_byte(sv, rangestart);
9121 put_byte(sv, rangestart);
9123 put_byte(sv, i - 1);
9131 } else if (k == CURLY) {
9132 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9133 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9134 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9136 else if (k == WHILEM && o->flags) /* Ordinal/of */
9137 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9138 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9139 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9140 if ( RXp_PAREN_NAMES(prog) ) {
9141 if ( k != REF || OP(o) < NREF) {
9142 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9143 SV **name= av_fetch(list, ARG(o), 0 );
9145 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9148 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9149 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9150 I32 *nums=(I32*)SvPVX(sv_dat);
9151 SV **name= av_fetch(list, nums[0], 0 );
9154 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9155 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9156 (n ? "," : ""), (IV)nums[n]);
9158 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9162 } else if (k == GOSUB)
9163 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9164 else if (k == VERB) {
9166 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9167 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9168 } else if (k == LOGICAL)
9169 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9170 else if (k == FOLDCHAR)
9171 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9172 else if (k == ANYOF) {
9173 int i, rangestart = -1;
9174 const U8 flags = ANYOF_FLAGS(o);
9177 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9178 static const char * const anyofs[] = {
9211 if (flags & ANYOF_LOCALE)
9212 sv_catpvs(sv, "{loc}");
9213 if (flags & ANYOF_FOLD)
9214 sv_catpvs(sv, "{i}");
9215 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9216 if (flags & ANYOF_INVERT)
9219 /* output what the standard cp 0-255 bitmap matches */
9220 for (i = 0; i <= 256; i++) {
9221 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9222 if (rangestart == -1)
9224 } else if (rangestart != -1) {
9225 if (i <= rangestart + 3)
9226 for (; rangestart < i; rangestart++)
9227 put_byte(sv, rangestart);
9229 put_byte(sv, rangestart);
9231 put_byte(sv, i - 1);
9238 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9239 /* output any special charclass tests (used mostly under use locale) */
9240 if (o->flags & ANYOF_CLASS)
9241 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9242 if (ANYOF_CLASS_TEST(o,i)) {
9243 sv_catpv(sv, anyofs[i]);
9247 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9249 /* output information about the unicode matching */
9250 if (flags & ANYOF_UNICODE)
9251 sv_catpvs(sv, "{unicode}");
9252 else if (flags & ANYOF_UNICODE_ALL)
9253 sv_catpvs(sv, "{unicode_all}");
9257 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9261 U8 s[UTF8_MAXBYTES_CASE+1];
9263 for (i = 0; i <= 256; i++) { /* just the first 256 */
9264 uvchr_to_utf8(s, i);
9266 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9267 if (rangestart == -1)
9269 } else if (rangestart != -1) {
9270 if (i <= rangestart + 3)
9271 for (; rangestart < i; rangestart++) {
9272 const U8 * const e = uvchr_to_utf8(s,rangestart);
9274 for(p = s; p < e; p++)
9278 const U8 *e = uvchr_to_utf8(s,rangestart);
9280 for (p = s; p < e; p++)
9283 e = uvchr_to_utf8(s, i-1);
9284 for (p = s; p < e; p++)
9291 sv_catpvs(sv, "..."); /* et cetera */
9295 char *s = savesvpv(lv);
9296 char * const origs = s;
9298 while (*s && *s != '\n')
9302 const char * const t = ++s;
9320 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9322 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9323 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9325 PERL_UNUSED_CONTEXT;
9326 PERL_UNUSED_ARG(sv);
9328 PERL_UNUSED_ARG(prog);
9329 #endif /* DEBUGGING */
9333 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9334 { /* Assume that RE_INTUIT is set */
9336 struct regexp *const prog = (struct regexp *)SvANY(r);
9337 GET_RE_DEBUG_FLAGS_DECL;
9339 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9340 PERL_UNUSED_CONTEXT;
9344 const char * const s = SvPV_nolen_const(prog->check_substr
9345 ? prog->check_substr : prog->check_utf8);
9347 if (!PL_colorset) reginitcolors();
9348 PerlIO_printf(Perl_debug_log,
9349 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9351 prog->check_substr ? "" : "utf8 ",
9352 PL_colors[5],PL_colors[0],
9355 (strlen(s) > 60 ? "..." : ""));
9358 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9364 handles refcounting and freeing the perl core regexp structure. When
9365 it is necessary to actually free the structure the first thing it
9366 does is call the 'free' method of the regexp_engine associated to to
9367 the regexp, allowing the handling of the void *pprivate; member
9368 first. (This routine is not overridable by extensions, which is why
9369 the extensions free is called first.)
9371 See regdupe and regdupe_internal if you change anything here.
9373 #ifndef PERL_IN_XSUB_RE
9375 Perl_pregfree(pTHX_ REGEXP *r)
9381 Perl_pregfree2(pTHX_ REGEXP *rx)
9384 struct regexp *const r = (struct regexp *)SvANY(rx);
9385 GET_RE_DEBUG_FLAGS_DECL;
9387 PERL_ARGS_ASSERT_PREGFREE2;
9390 ReREFCNT_dec(r->mother_re);
9392 CALLREGFREE_PVT(rx); /* free the private data */
9393 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9396 SvREFCNT_dec(r->anchored_substr);
9397 SvREFCNT_dec(r->anchored_utf8);
9398 SvREFCNT_dec(r->float_substr);
9399 SvREFCNT_dec(r->float_utf8);
9400 Safefree(r->substrs);
9402 RX_MATCH_COPY_FREE(rx);
9403 #ifdef PERL_OLD_COPY_ON_WRITE
9404 SvREFCNT_dec(r->saved_copy);
9411 This is a hacky workaround to the structural issue of match results
9412 being stored in the regexp structure which is in turn stored in
9413 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9414 could be PL_curpm in multiple contexts, and could require multiple
9415 result sets being associated with the pattern simultaneously, such
9416 as when doing a recursive match with (??{$qr})
9418 The solution is to make a lightweight copy of the regexp structure
9419 when a qr// is returned from the code executed by (??{$qr}) this
9420 lightweight copy doesnt actually own any of its data except for
9421 the starp/end and the actual regexp structure itself.
9427 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9430 struct regexp *const r = (struct regexp *)SvANY(rx);
9431 register const I32 npar = r->nparens+1;
9433 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9436 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9437 ret = (struct regexp *)SvANY(ret_x);
9439 (void)ReREFCNT_inc(rx);
9440 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9441 by pointing directly at the buffer, but flagging that the allocated
9442 space in the copy is zero. As we've just done a struct copy, it's now
9443 a case of zero-ing that, rather than copying the current length. */
9444 SvPV_set(ret_x, RX_WRAPPED(rx));
9445 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9446 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9447 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9448 SvLEN_set(ret_x, 0);
9449 SvSTASH_set(ret_x, NULL);
9450 SvMAGIC_set(ret_x, NULL);
9451 Newx(ret->offs, npar, regexp_paren_pair);
9452 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9454 Newx(ret->substrs, 1, struct reg_substr_data);
9455 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9457 SvREFCNT_inc_void(ret->anchored_substr);
9458 SvREFCNT_inc_void(ret->anchored_utf8);
9459 SvREFCNT_inc_void(ret->float_substr);
9460 SvREFCNT_inc_void(ret->float_utf8);
9462 /* check_substr and check_utf8, if non-NULL, point to either their
9463 anchored or float namesakes, and don't hold a second reference. */
9465 RX_MATCH_COPIED_off(ret_x);
9466 #ifdef PERL_OLD_COPY_ON_WRITE
9467 ret->saved_copy = NULL;
9469 ret->mother_re = rx;
9475 /* regfree_internal()
9477 Free the private data in a regexp. This is overloadable by
9478 extensions. Perl takes care of the regexp structure in pregfree(),
9479 this covers the *pprivate pointer which technically perldoesnt
9480 know about, however of course we have to handle the
9481 regexp_internal structure when no extension is in use.
9483 Note this is called before freeing anything in the regexp
9488 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9491 struct regexp *const r = (struct regexp *)SvANY(rx);
9493 GET_RE_DEBUG_FLAGS_DECL;
9495 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9501 SV *dsv= sv_newmortal();
9502 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9503 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9504 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9505 PL_colors[4],PL_colors[5],s);
9508 #ifdef RE_TRACK_PATTERN_OFFSETS
9510 Safefree(ri->u.offsets); /* 20010421 MJD */
9513 int n = ri->data->count;
9514 PAD* new_comppad = NULL;
9519 /* If you add a ->what type here, update the comment in regcomp.h */
9520 switch (ri->data->what[n]) {
9524 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9527 Safefree(ri->data->data[n]);
9530 new_comppad = MUTABLE_AV(ri->data->data[n]);
9533 if (new_comppad == NULL)
9534 Perl_croak(aTHX_ "panic: pregfree comppad");
9535 PAD_SAVE_LOCAL(old_comppad,
9536 /* Watch out for global destruction's random ordering. */
9537 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9540 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9543 op_free((OP_4tree*)ri->data->data[n]);
9545 PAD_RESTORE_LOCAL(old_comppad);
9546 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9552 { /* Aho Corasick add-on structure for a trie node.
9553 Used in stclass optimization only */
9555 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9557 refcount = --aho->refcount;
9560 PerlMemShared_free(aho->states);
9561 PerlMemShared_free(aho->fail);
9562 /* do this last!!!! */
9563 PerlMemShared_free(ri->data->data[n]);
9564 PerlMemShared_free(ri->regstclass);
9570 /* trie structure. */
9572 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9574 refcount = --trie->refcount;
9577 PerlMemShared_free(trie->charmap);
9578 PerlMemShared_free(trie->states);
9579 PerlMemShared_free(trie->trans);
9581 PerlMemShared_free(trie->bitmap);
9583 PerlMemShared_free(trie->wordlen);
9585 PerlMemShared_free(trie->jump);
9587 PerlMemShared_free(trie->nextword);
9588 /* do this last!!!! */
9589 PerlMemShared_free(ri->data->data[n]);
9594 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9597 Safefree(ri->data->what);
9604 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9605 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9606 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9607 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9610 re_dup - duplicate a regexp.
9612 This routine is expected to clone a given regexp structure. It is only
9613 compiled under USE_ITHREADS.
9615 After all of the core data stored in struct regexp is duplicated
9616 the regexp_engine.dupe method is used to copy any private data
9617 stored in the *pprivate pointer. This allows extensions to handle
9618 any duplication it needs to do.
9620 See pregfree() and regfree_internal() if you change anything here.
9622 #if defined(USE_ITHREADS)
9623 #ifndef PERL_IN_XSUB_RE
9625 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9629 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9630 struct regexp *ret = (struct regexp *)SvANY(dstr);
9632 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9634 npar = r->nparens+1;
9635 Newx(ret->offs, npar, regexp_paren_pair);
9636 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9638 /* no need to copy these */
9639 Newx(ret->swap, npar, regexp_paren_pair);
9643 /* Do it this way to avoid reading from *r after the StructCopy().
9644 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9645 cache, it doesn't matter. */
9646 const bool anchored = r->check_substr
9647 ? r->check_substr == r->anchored_substr
9648 : r->check_utf8 == r->anchored_utf8;
9649 Newx(ret->substrs, 1, struct reg_substr_data);
9650 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9652 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9653 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9654 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9655 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9657 /* check_substr and check_utf8, if non-NULL, point to either their
9658 anchored or float namesakes, and don't hold a second reference. */
9660 if (ret->check_substr) {
9662 assert(r->check_utf8 == r->anchored_utf8);
9663 ret->check_substr = ret->anchored_substr;
9664 ret->check_utf8 = ret->anchored_utf8;
9666 assert(r->check_substr == r->float_substr);
9667 assert(r->check_utf8 == r->float_utf8);
9668 ret->check_substr = ret->float_substr;
9669 ret->check_utf8 = ret->float_utf8;
9671 } else if (ret->check_utf8) {
9673 ret->check_utf8 = ret->anchored_utf8;
9675 ret->check_utf8 = ret->float_utf8;
9680 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9683 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9685 if (RX_MATCH_COPIED(dstr))
9686 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
9689 #ifdef PERL_OLD_COPY_ON_WRITE
9690 ret->saved_copy = NULL;
9693 if (ret->mother_re) {
9694 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9695 /* Our storage points directly to our mother regexp, but that's
9696 1: a buffer in a different thread
9697 2: something we no longer hold a reference on
9698 so we need to copy it locally. */
9699 /* Note we need to sue SvCUR() on our mother_re, because it, in
9700 turn, may well be pointing to its own mother_re. */
9701 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9702 SvCUR(ret->mother_re)+1));
9703 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9705 ret->mother_re = NULL;
9709 #endif /* PERL_IN_XSUB_RE */
9714 This is the internal complement to regdupe() which is used to copy
9715 the structure pointed to by the *pprivate pointer in the regexp.
9716 This is the core version of the extension overridable cloning hook.
9717 The regexp structure being duplicated will be copied by perl prior
9718 to this and will be provided as the regexp *r argument, however
9719 with the /old/ structures pprivate pointer value. Thus this routine
9720 may override any copying normally done by perl.
9722 It returns a pointer to the new regexp_internal structure.
9726 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9729 struct regexp *const r = (struct regexp *)SvANY(rx);
9730 regexp_internal *reti;
9734 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9736 npar = r->nparens+1;
9739 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9740 Copy(ri->program, reti->program, len+1, regnode);
9743 reti->regstclass = NULL;
9747 const int count = ri->data->count;
9750 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9751 char, struct reg_data);
9752 Newx(d->what, count, U8);
9755 for (i = 0; i < count; i++) {
9756 d->what[i] = ri->data->what[i];
9757 switch (d->what[i]) {
9758 /* legal options are one of: sSfpontTu
9759 see also regcomp.h and pregfree() */
9762 case 'p': /* actually an AV, but the dup function is identical. */
9763 case 'u': /* actually an HV, but the dup function is identical. */
9764 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9767 /* This is cheating. */
9768 Newx(d->data[i], 1, struct regnode_charclass_class);
9769 StructCopy(ri->data->data[i], d->data[i],
9770 struct regnode_charclass_class);
9771 reti->regstclass = (regnode*)d->data[i];
9774 /* Compiled op trees are readonly and in shared memory,
9775 and can thus be shared without duplication. */
9777 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9781 /* Trie stclasses are readonly and can thus be shared
9782 * without duplication. We free the stclass in pregfree
9783 * when the corresponding reg_ac_data struct is freed.
9785 reti->regstclass= ri->regstclass;
9789 ((reg_trie_data*)ri->data->data[i])->refcount++;
9793 d->data[i] = ri->data->data[i];
9796 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9805 reti->name_list_idx = ri->name_list_idx;
9807 #ifdef RE_TRACK_PATTERN_OFFSETS
9808 if (ri->u.offsets) {
9809 Newx(reti->u.offsets, 2*len+1, U32);
9810 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9813 SetProgLen(reti,len);
9819 #endif /* USE_ITHREADS */
9821 #ifndef PERL_IN_XSUB_RE
9824 - regnext - dig the "next" pointer out of a node
9827 Perl_regnext(pTHX_ register regnode *p)
9830 register I32 offset;
9835 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9844 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9847 STRLEN l1 = strlen(pat1);
9848 STRLEN l2 = strlen(pat2);
9851 const char *message;
9853 PERL_ARGS_ASSERT_RE_CROAK2;
9859 Copy(pat1, buf, l1 , char);
9860 Copy(pat2, buf + l1, l2 , char);
9861 buf[l1 + l2] = '\n';
9862 buf[l1 + l2 + 1] = '\0';
9864 /* ANSI variant takes additional second argument */
9865 va_start(args, pat2);
9869 msv = vmess(buf, &args);
9871 message = SvPV_const(msv,l1);
9874 Copy(message, buf, l1 , char);
9875 buf[l1-1] = '\0'; /* Overwrite \n */
9876 Perl_croak(aTHX_ "%s", buf);
9879 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9881 #ifndef PERL_IN_XSUB_RE
9883 Perl_save_re_context(pTHX)
9887 struct re_save_state *state;
9889 SAVEVPTR(PL_curcop);
9890 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9892 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9893 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9894 SSPUSHINT(SAVEt_RE_STATE);
9896 Copy(&PL_reg_state, state, 1, struct re_save_state);
9898 PL_reg_start_tmp = 0;
9899 PL_reg_start_tmpl = 0;
9900 PL_reg_oldsaved = NULL;
9901 PL_reg_oldsavedlen = 0;
9903 PL_reg_leftiter = 0;
9904 PL_reg_poscache = NULL;
9905 PL_reg_poscache_size = 0;
9906 #ifdef PERL_OLD_COPY_ON_WRITE
9910 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9912 const REGEXP * const rx = PM_GETRE(PL_curpm);
9915 for (i = 1; i <= RX_NPARENS(rx); i++) {
9916 char digits[TYPE_CHARS(long)];
9917 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9918 GV *const *const gvp
9919 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9922 GV * const gv = *gvp;
9923 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9933 clear_re(pTHX_ void *r)
9936 ReREFCNT_dec((REGEXP *)r);
9942 S_put_byte(pTHX_ SV *sv, int c)
9944 PERL_ARGS_ASSERT_PUT_BYTE;
9946 /* Our definition of isPRINT() ignores locales, so only bytes that are
9947 not part of UTF-8 are considered printable. I assume that the same
9948 holds for UTF-EBCDIC.
9949 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9950 which Wikipedia says:
9952 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9953 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9954 identical, to the ASCII delete (DEL) or rubout control character.
9955 ) So the old condition can be simplified to !isPRINT(c) */
9957 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9959 const char string = c;
9960 if (c == '-' || c == ']' || c == '\\' || c == '^')
9961 sv_catpvs(sv, "\\");
9962 sv_catpvn(sv, &string, 1);
9967 #define CLEAR_OPTSTART \
9968 if (optstart) STMT_START { \
9969 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9973 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9975 STATIC const regnode *
9976 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9977 const regnode *last, const regnode *plast,
9978 SV* sv, I32 indent, U32 depth)
9981 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9982 register const regnode *next;
9983 const regnode *optstart= NULL;
9986 GET_RE_DEBUG_FLAGS_DECL;
9988 PERL_ARGS_ASSERT_DUMPUNTIL;
9990 #ifdef DEBUG_DUMPUNTIL
9991 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9992 last ? last-start : 0,plast ? plast-start : 0);
9995 if (plast && plast < last)
9998 while (PL_regkind[op] != END && (!last || node < last)) {
9999 /* While that wasn't END last time... */
10002 if (op == CLOSE || op == WHILEM)
10004 next = regnext((regnode *)node);
10007 if (OP(node) == OPTIMIZED) {
10008 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10015 regprop(r, sv, node);
10016 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10017 (int)(2*indent + 1), "", SvPVX_const(sv));
10019 if (OP(node) != OPTIMIZED) {
10020 if (next == NULL) /* Next ptr. */
10021 PerlIO_printf(Perl_debug_log, " (0)");
10022 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10023 PerlIO_printf(Perl_debug_log, " (FAIL)");
10025 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10026 (void)PerlIO_putc(Perl_debug_log, '\n');
10030 if (PL_regkind[(U8)op] == BRANCHJ) {
10033 register const regnode *nnode = (OP(next) == LONGJMP
10034 ? regnext((regnode *)next)
10036 if (last && nnode > last)
10038 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10041 else if (PL_regkind[(U8)op] == BRANCH) {
10043 DUMPUNTIL(NEXTOPER(node), next);
10045 else if ( PL_regkind[(U8)op] == TRIE ) {
10046 const regnode *this_trie = node;
10047 const char op = OP(node);
10048 const U32 n = ARG(node);
10049 const reg_ac_data * const ac = op>=AHOCORASICK ?
10050 (reg_ac_data *)ri->data->data[n] :
10052 const reg_trie_data * const trie =
10053 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10055 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10057 const regnode *nextbranch= NULL;
10060 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10061 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10063 PerlIO_printf(Perl_debug_log, "%*s%s ",
10064 (int)(2*(indent+3)), "",
10065 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10066 PL_colors[0], PL_colors[1],
10067 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10068 PERL_PV_PRETTY_ELLIPSES |
10069 PERL_PV_PRETTY_LTGT
10074 U16 dist= trie->jump[word_idx+1];
10075 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10076 (UV)((dist ? this_trie + dist : next) - start));
10079 nextbranch= this_trie + trie->jump[0];
10080 DUMPUNTIL(this_trie + dist, nextbranch);
10082 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10083 nextbranch= regnext((regnode *)nextbranch);
10085 PerlIO_printf(Perl_debug_log, "\n");
10088 if (last && next > last)
10093 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10094 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10095 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10097 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10099 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10101 else if ( op == PLUS || op == STAR) {
10102 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10104 else if (op == ANYOF) {
10105 /* arglen 1 + class block */
10106 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10107 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10108 node = NEXTOPER(node);
10110 else if (PL_regkind[(U8)op] == EXACT) {
10111 /* Literal string, where present. */
10112 node += NODE_SZ_STR(node) - 1;
10113 node = NEXTOPER(node);
10116 node = NEXTOPER(node);
10117 node += regarglen[(U8)op];
10119 if (op == CURLYX || op == OPEN)
10123 #ifdef DEBUG_DUMPUNTIL
10124 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10129 #endif /* DEBUGGING */
10133 * c-indentation-style: bsd
10134 * c-basic-offset: 4
10135 * indent-tabs-mode: t
10138 * ex: set ts=8 sts=4 sw=4 noet: